llama_cpp 0.9.3 → 0.9.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +177 -98
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +154 -30
- data/ext/llama_cpp/src/ggml.h +11 -3
- data/ext/llama_cpp/src/llama.cpp +316 -122
- data/ext/llama_cpp/src/llama.h +72 -4
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
|
4826
4826
|
static struct ggml_tensor * ggml_soft_max_impl(
|
4827
4827
|
struct ggml_context * ctx,
|
4828
4828
|
struct ggml_tensor * a,
|
4829
|
+
struct ggml_tensor * mask,
|
4830
|
+
float scale,
|
4829
4831
|
bool inplace) {
|
4832
|
+
GGML_ASSERT(ggml_is_contiguous(a));
|
4833
|
+
if (mask) {
|
4834
|
+
GGML_ASSERT(ggml_is_contiguous(mask));
|
4835
|
+
GGML_ASSERT(mask->ne[2] == 1);
|
4836
|
+
GGML_ASSERT(mask->ne[3] == 1);
|
4837
|
+
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
|
4838
|
+
}
|
4839
|
+
|
4830
4840
|
bool is_node = false;
|
4831
4841
|
|
4832
4842
|
if (a->grad) {
|
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
4835
4845
|
|
4836
4846
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
4837
4847
|
|
4848
|
+
float params[] = { scale };
|
4849
|
+
ggml_set_op_params(result, params, sizeof(params));
|
4850
|
+
|
4838
4851
|
result->op = GGML_OP_SOFT_MAX;
|
4839
4852
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
4840
4853
|
result->src[0] = a;
|
4854
|
+
result->src[1] = mask;
|
4841
4855
|
|
4842
4856
|
return result;
|
4843
4857
|
}
|
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
4845
4859
|
struct ggml_tensor * ggml_soft_max(
|
4846
4860
|
struct ggml_context * ctx,
|
4847
4861
|
struct ggml_tensor * a) {
|
4848
|
-
return ggml_soft_max_impl(ctx, a, false);
|
4862
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
|
4849
4863
|
}
|
4850
4864
|
|
4851
4865
|
struct ggml_tensor * ggml_soft_max_inplace(
|
4852
4866
|
struct ggml_context * ctx,
|
4853
4867
|
struct ggml_tensor * a) {
|
4854
|
-
return ggml_soft_max_impl(ctx, a, true);
|
4868
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
|
4869
|
+
}
|
4870
|
+
|
4871
|
+
struct ggml_tensor * ggml_soft_max_ext(
|
4872
|
+
struct ggml_context * ctx,
|
4873
|
+
struct ggml_tensor * a,
|
4874
|
+
struct ggml_tensor * mask,
|
4875
|
+
float scale) {
|
4876
|
+
return ggml_soft_max_impl(ctx, a, mask, scale, false);
|
4855
4877
|
}
|
4856
4878
|
|
4857
4879
|
// ggml_soft_max_back
|
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
9373
9395
|
// TODO: find the optimal values for these
|
9374
9396
|
if (ggml_is_contiguous(src0) &&
|
9375
9397
|
ggml_is_contiguous(src1) &&
|
9376
|
-
|
9398
|
+
//src0->type == GGML_TYPE_F32 &&
|
9377
9399
|
src1->type == GGML_TYPE_F32 &&
|
9378
9400
|
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
9379
9401
|
|
@@ -9611,10 +9633,12 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9611
9633
|
const int ith = params->ith;
|
9612
9634
|
const int nth = params->nth;
|
9613
9635
|
|
9636
|
+
GGML_ASSERT(ne0 == ne00);
|
9637
|
+
GGML_ASSERT(ne1 == ne10);
|
9638
|
+
GGML_ASSERT(ne2 == ne02);
|
9614
9639
|
GGML_ASSERT(ne02 == ne12);
|
9615
|
-
GGML_ASSERT(ne03 == ne13);
|
9616
|
-
GGML_ASSERT(ne2 == ne12);
|
9617
9640
|
GGML_ASSERT(ne3 == ne13);
|
9641
|
+
GGML_ASSERT(ne03 == ne13);
|
9618
9642
|
|
9619
9643
|
// we don't support permuted src0 or src1
|
9620
9644
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9625,18 +9649,25 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9625
9649
|
// GGML_ASSERT(nb1 <= nb2);
|
9626
9650
|
// GGML_ASSERT(nb2 <= nb3);
|
9627
9651
|
|
9628
|
-
GGML_ASSERT(ne0 == ne00);
|
9629
|
-
GGML_ASSERT(ne1 == ne10);
|
9630
|
-
GGML_ASSERT(ne2 == ne02);
|
9631
|
-
GGML_ASSERT(ne3 == ne03);
|
9632
|
-
|
9633
9652
|
// nb01 >= nb00 - src0 is not transposed
|
9634
9653
|
// compute by src0 rows
|
9635
9654
|
|
9636
9655
|
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
9637
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9656
|
+
// TODO: #if defined(GGML_USE_CLBLAST)
|
9657
|
+
|
9658
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9659
|
+
bool use_blas = ggml_is_matrix(src0) &&
|
9660
|
+
ggml_is_matrix(src1) &&
|
9661
|
+
ggml_is_contiguous(src0) &&
|
9662
|
+
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
9663
|
+
#endif
|
9638
9664
|
|
9639
9665
|
if (params->type == GGML_TASK_INIT) {
|
9666
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
9667
|
+
if (use_blas) {
|
9668
|
+
return;
|
9669
|
+
}
|
9670
|
+
#endif
|
9640
9671
|
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
9641
9672
|
return;
|
9642
9673
|
}
|
@@ -9645,6 +9676,50 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9645
9676
|
return;
|
9646
9677
|
}
|
9647
9678
|
|
9679
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9680
|
+
if (use_blas) {
|
9681
|
+
if (params->ith != 0) { // All threads other than the first do no work.
|
9682
|
+
return;
|
9683
|
+
}
|
9684
|
+
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
9685
|
+
// src0: (k,n)
|
9686
|
+
// src1: (k,m)
|
9687
|
+
// dst: (m,n)
|
9688
|
+
//
|
9689
|
+
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
9690
|
+
// Also expressed as (major,minor)
|
9691
|
+
// a: (m,k): so src1 transposed
|
9692
|
+
// b: (k,n): so src0
|
9693
|
+
// c: (m,n)
|
9694
|
+
//
|
9695
|
+
// However, if ggml_is_transposed(src1) is true, then
|
9696
|
+
// src1->data already contains a transposed version, so sgemm mustn't
|
9697
|
+
// transpose it further.
|
9698
|
+
|
9699
|
+
int n = src0->ne[0];
|
9700
|
+
int k = src0->ne[1];
|
9701
|
+
int m = src1->ne[0];
|
9702
|
+
|
9703
|
+
int transposeA, lda;
|
9704
|
+
|
9705
|
+
if (!ggml_is_transposed(src1)) {
|
9706
|
+
transposeA = CblasTrans;
|
9707
|
+
lda = m;
|
9708
|
+
} else {
|
9709
|
+
transposeA = CblasNoTrans;
|
9710
|
+
lda = k;
|
9711
|
+
}
|
9712
|
+
|
9713
|
+
float * a = (float *) ((char *) src1->data);
|
9714
|
+
float * b = (float *) ((char *) src0->data);
|
9715
|
+
float * c = (float *) ((char *) dst->data);
|
9716
|
+
|
9717
|
+
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
9718
|
+
|
9719
|
+
return;
|
9720
|
+
}
|
9721
|
+
#endif
|
9722
|
+
|
9648
9723
|
// dst[:,:,:,:] = 0
|
9649
9724
|
// for i2,i3:
|
9650
9725
|
// for i1:
|
@@ -10498,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
|
|
10498
10573
|
static void ggml_compute_forward_soft_max_f32(
|
10499
10574
|
const struct ggml_compute_params * params,
|
10500
10575
|
const struct ggml_tensor * src0,
|
10501
|
-
struct ggml_tensor * dst) {
|
10502
|
-
|
10503
|
-
|
10504
|
-
|
10576
|
+
const struct ggml_tensor * src1,
|
10577
|
+
struct ggml_tensor * dst) {
|
10578
|
+
assert(ggml_is_contiguous(dst));
|
10579
|
+
assert(ggml_are_same_shape(src0, dst));
|
10505
10580
|
|
10506
10581
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10507
10582
|
return;
|
10508
10583
|
}
|
10509
10584
|
|
10585
|
+
float scale = 1.0f;
|
10586
|
+
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
10587
|
+
|
10510
10588
|
// TODO: handle transposed/permuted matrices
|
10511
10589
|
|
10512
10590
|
const int ith = params->ith;
|
10513
10591
|
const int nth = params->nth;
|
10514
10592
|
|
10593
|
+
const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
10594
|
+
|
10515
10595
|
const int nc = src0->ne[0];
|
10516
10596
|
const int nr = ggml_nrows(src0);
|
10517
10597
|
|
@@ -10522,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
|
|
10522
10602
|
const int ir0 = dr*ith;
|
10523
10603
|
const int ir1 = MIN(ir0 + dr, nr);
|
10524
10604
|
|
10605
|
+
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
10606
|
+
|
10525
10607
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
10526
|
-
float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10527
|
-
float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10608
|
+
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10609
|
+
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10610
|
+
|
10611
|
+
// broadcast the mask across rows
|
10612
|
+
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
10613
|
+
|
10614
|
+
ggml_vec_cpy_f32 (nc, wp, sp);
|
10615
|
+
ggml_vec_scale_f32(nc, wp, scale);
|
10616
|
+
if (mp) {
|
10617
|
+
ggml_vec_acc_f32(nc, wp, mp);
|
10618
|
+
}
|
10528
10619
|
|
10529
10620
|
#ifndef NDEBUG
|
10530
10621
|
for (int i = 0; i < nc; ++i) {
|
10531
10622
|
//printf("p[%d] = %f\n", i, p[i]);
|
10532
|
-
assert(!isnan(sp[i]));
|
10623
|
+
assert(!isnan(wp[i]));
|
10533
10624
|
}
|
10534
10625
|
#endif
|
10535
10626
|
|
10536
10627
|
float max = -INFINITY;
|
10537
|
-
ggml_vec_max_f32(nc, &max, sp);
|
10628
|
+
ggml_vec_max_f32(nc, &max, wp);
|
10538
10629
|
|
10539
10630
|
ggml_float sum = 0.0;
|
10540
10631
|
|
10541
10632
|
uint16_t scvt;
|
10542
10633
|
for (int i = 0; i < nc; i++) {
|
10543
|
-
if (sp[i] == -INFINITY) {
|
10634
|
+
if (wp[i] == -INFINITY) {
|
10544
10635
|
dp[i] = 0.0f;
|
10545
10636
|
} else {
|
10546
|
-
// const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
|
10547
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
|
10637
|
+
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
|
10638
|
+
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
|
10548
10639
|
memcpy(&scvt, &s, sizeof(scvt));
|
10549
10640
|
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
10550
10641
|
sum += (ggml_float)val;
|
@@ -10569,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
|
|
10569
10660
|
static void ggml_compute_forward_soft_max(
|
10570
10661
|
const struct ggml_compute_params * params,
|
10571
10662
|
const struct ggml_tensor * src0,
|
10572
|
-
struct ggml_tensor * dst) {
|
10663
|
+
const struct ggml_tensor * src1,
|
10664
|
+
struct ggml_tensor * dst) {
|
10573
10665
|
switch (src0->type) {
|
10574
10666
|
case GGML_TYPE_F32:
|
10575
10667
|
{
|
10576
|
-
ggml_compute_forward_soft_max_f32(params, src0, dst);
|
10668
|
+
ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
|
10577
10669
|
} break;
|
10578
10670
|
default:
|
10579
10671
|
{
|
@@ -13810,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13810
13902
|
} break;
|
13811
13903
|
case GGML_OP_SOFT_MAX:
|
13812
13904
|
{
|
13813
|
-
ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
|
13905
|
+
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
|
13814
13906
|
} break;
|
13815
13907
|
case GGML_OP_SOFT_MAX_BACK:
|
13816
13908
|
{
|
@@ -15636,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15636
15728
|
{
|
15637
15729
|
n_tasks = 1;
|
15638
15730
|
} break;
|
15639
|
-
case GGML_OP_COUNT:
|
15640
|
-
{
|
15641
|
-
GGML_ASSERT(false);
|
15642
|
-
} break;
|
15643
15731
|
default:
|
15644
15732
|
{
|
15645
|
-
|
15733
|
+
fprintf(stderr, "%s: op not implemented: ", __func__);
|
15734
|
+
if (node->op < GGML_OP_COUNT) {
|
15735
|
+
fprintf(stderr, "%s\n", ggml_op_name(node->op));
|
15736
|
+
} else {
|
15737
|
+
fprintf(stderr, "%d\n", node->op);
|
15738
|
+
}
|
15646
15739
|
GGML_ASSERT(false);
|
15647
15740
|
} break;
|
15648
15741
|
}
|
@@ -15845,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15845
15938
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
15846
15939
|
}
|
15847
15940
|
} break;
|
15941
|
+
case GGML_OP_SOFT_MAX:
|
15942
|
+
{
|
15943
|
+
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
15944
|
+
|
15945
|
+
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
15946
|
+
} break;
|
15848
15947
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
15849
15948
|
{
|
15850
15949
|
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
@@ -18399,24 +18498,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
|
18399
18498
|
}
|
18400
18499
|
|
18401
18500
|
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
|
18501
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18402
18502
|
return ctx->kv[key_id].key.data;
|
18403
18503
|
}
|
18404
18504
|
|
18405
18505
|
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
|
18506
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18406
18507
|
return ctx->kv[key_id].type;
|
18407
18508
|
}
|
18408
18509
|
|
18409
18510
|
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
|
18511
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18410
18512
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18411
18513
|
return ctx->kv[key_id].value.arr.type;
|
18412
18514
|
}
|
18413
18515
|
|
18414
18516
|
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
|
18517
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18415
18518
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18416
18519
|
return ctx->kv[key_id].value.arr.data;
|
18417
18520
|
}
|
18418
18521
|
|
18419
18522
|
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
18523
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18420
18524
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18421
18525
|
struct gguf_kv * kv = &ctx->kv[key_id];
|
18422
18526
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
@@ -18424,70 +18528,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
|
|
18424
18528
|
}
|
18425
18529
|
|
18426
18530
|
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
|
18531
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18427
18532
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18428
18533
|
return ctx->kv[key_id].value.arr.n;
|
18429
18534
|
}
|
18430
18535
|
|
18431
18536
|
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
|
18537
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18432
18538
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
|
18433
18539
|
return ctx->kv[key_id].value.uint8;
|
18434
18540
|
}
|
18435
18541
|
|
18436
18542
|
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
|
18543
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18437
18544
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
|
18438
18545
|
return ctx->kv[key_id].value.int8;
|
18439
18546
|
}
|
18440
18547
|
|
18441
18548
|
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
|
18549
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18442
18550
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
|
18443
18551
|
return ctx->kv[key_id].value.uint16;
|
18444
18552
|
}
|
18445
18553
|
|
18446
18554
|
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
|
18555
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18447
18556
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
|
18448
18557
|
return ctx->kv[key_id].value.int16;
|
18449
18558
|
}
|
18450
18559
|
|
18451
18560
|
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
|
18561
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18452
18562
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
|
18453
18563
|
return ctx->kv[key_id].value.uint32;
|
18454
18564
|
}
|
18455
18565
|
|
18456
18566
|
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
|
18567
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18457
18568
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
|
18458
18569
|
return ctx->kv[key_id].value.int32;
|
18459
18570
|
}
|
18460
18571
|
|
18461
18572
|
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
|
18573
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18462
18574
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
|
18463
18575
|
return ctx->kv[key_id].value.float32;
|
18464
18576
|
}
|
18465
18577
|
|
18466
18578
|
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
|
18579
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18467
18580
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
|
18468
18581
|
return ctx->kv[key_id].value.uint64;
|
18469
18582
|
}
|
18470
18583
|
|
18471
18584
|
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
|
18585
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18472
18586
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
|
18473
18587
|
return ctx->kv[key_id].value.int64;
|
18474
18588
|
}
|
18475
18589
|
|
18476
18590
|
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
|
18591
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18477
18592
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
|
18478
18593
|
return ctx->kv[key_id].value.float64;
|
18479
18594
|
}
|
18480
18595
|
|
18481
18596
|
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
|
18597
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18482
18598
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
|
18483
18599
|
return ctx->kv[key_id].value.bool_;
|
18484
18600
|
}
|
18485
18601
|
|
18486
18602
|
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
|
18603
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18487
18604
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
|
18488
18605
|
return ctx->kv[key_id].value.str.data;
|
18489
18606
|
}
|
18490
18607
|
|
18608
|
+
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
|
18609
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18610
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
|
18611
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
|
18612
|
+
return &ctx->kv[key_id].value;
|
18613
|
+
}
|
18614
|
+
|
18491
18615
|
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
18492
18616
|
return ctx->header.n_tensors;
|
18493
18617
|
}
|
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -244,11 +244,10 @@
|
|
244
244
|
#define GGML_ASSERT(x) \
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
|
-
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
fflush(stderr); \
|
249
247
|
fflush(stdout); \
|
248
|
+
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
250
249
|
ggml_print_backtrace(); \
|
251
|
-
|
250
|
+
abort(); \
|
252
251
|
} \
|
253
252
|
} while (0)
|
254
253
|
|
@@ -1283,6 +1282,14 @@ extern "C" {
|
|
1283
1282
|
struct ggml_context * ctx,
|
1284
1283
|
struct ggml_tensor * a);
|
1285
1284
|
|
1285
|
+
// fused soft_max(a*scale + mask)
|
1286
|
+
// mask is optional
|
1287
|
+
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1288
|
+
struct ggml_context * ctx,
|
1289
|
+
struct ggml_tensor * a,
|
1290
|
+
struct ggml_tensor * mask,
|
1291
|
+
float scale);
|
1292
|
+
|
1286
1293
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
1287
1294
|
struct ggml_context * ctx,
|
1288
1295
|
struct ggml_tensor * a,
|
@@ -2045,6 +2052,7 @@ extern "C" {
|
|
2045
2052
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
2046
2053
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
2047
2054
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
2055
|
+
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
2048
2056
|
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
2049
2057
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
2050
2058
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|