llama_cpp 0.9.3 → 0.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/src/ggml-alloc.c +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +177 -98
- data/ext/llama_cpp/src/ggml-metal.m +29 -17
- data/ext/llama_cpp/src/ggml-metal.metal +93 -93
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +154 -30
- data/ext/llama_cpp/src/ggml.h +11 -3
- data/ext/llama_cpp/src/llama.cpp +316 -122
- data/ext/llama_cpp/src/llama.h +72 -4
- data/lib/llama_cpp/version.rb +2 -2
- metadata +3 -3
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -4826,7 +4826,17 @@ struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
|
4826
4826
|
static struct ggml_tensor * ggml_soft_max_impl(
|
4827
4827
|
struct ggml_context * ctx,
|
4828
4828
|
struct ggml_tensor * a,
|
4829
|
+
struct ggml_tensor * mask,
|
4830
|
+
float scale,
|
4829
4831
|
bool inplace) {
|
4832
|
+
GGML_ASSERT(ggml_is_contiguous(a));
|
4833
|
+
if (mask) {
|
4834
|
+
GGML_ASSERT(ggml_is_contiguous(mask));
|
4835
|
+
GGML_ASSERT(mask->ne[2] == 1);
|
4836
|
+
GGML_ASSERT(mask->ne[3] == 1);
|
4837
|
+
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
|
4838
|
+
}
|
4839
|
+
|
4830
4840
|
bool is_node = false;
|
4831
4841
|
|
4832
4842
|
if (a->grad) {
|
@@ -4835,9 +4845,13 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
4835
4845
|
|
4836
4846
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
4837
4847
|
|
4848
|
+
float params[] = { scale };
|
4849
|
+
ggml_set_op_params(result, params, sizeof(params));
|
4850
|
+
|
4838
4851
|
result->op = GGML_OP_SOFT_MAX;
|
4839
4852
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
4840
4853
|
result->src[0] = a;
|
4854
|
+
result->src[1] = mask;
|
4841
4855
|
|
4842
4856
|
return result;
|
4843
4857
|
}
|
@@ -4845,13 +4859,21 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|
4845
4859
|
struct ggml_tensor * ggml_soft_max(
|
4846
4860
|
struct ggml_context * ctx,
|
4847
4861
|
struct ggml_tensor * a) {
|
4848
|
-
return ggml_soft_max_impl(ctx, a, false);
|
4862
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
|
4849
4863
|
}
|
4850
4864
|
|
4851
4865
|
struct ggml_tensor * ggml_soft_max_inplace(
|
4852
4866
|
struct ggml_context * ctx,
|
4853
4867
|
struct ggml_tensor * a) {
|
4854
|
-
return ggml_soft_max_impl(ctx, a, true);
|
4868
|
+
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
|
4869
|
+
}
|
4870
|
+
|
4871
|
+
struct ggml_tensor * ggml_soft_max_ext(
|
4872
|
+
struct ggml_context * ctx,
|
4873
|
+
struct ggml_tensor * a,
|
4874
|
+
struct ggml_tensor * mask,
|
4875
|
+
float scale) {
|
4876
|
+
return ggml_soft_max_impl(ctx, a, mask, scale, false);
|
4855
4877
|
}
|
4856
4878
|
|
4857
4879
|
// ggml_soft_max_back
|
@@ -9373,7 +9395,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|
9373
9395
|
// TODO: find the optimal values for these
|
9374
9396
|
if (ggml_is_contiguous(src0) &&
|
9375
9397
|
ggml_is_contiguous(src1) &&
|
9376
|
-
|
9398
|
+
//src0->type == GGML_TYPE_F32 &&
|
9377
9399
|
src1->type == GGML_TYPE_F32 &&
|
9378
9400
|
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
9379
9401
|
|
@@ -9611,10 +9633,12 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9611
9633
|
const int ith = params->ith;
|
9612
9634
|
const int nth = params->nth;
|
9613
9635
|
|
9636
|
+
GGML_ASSERT(ne0 == ne00);
|
9637
|
+
GGML_ASSERT(ne1 == ne10);
|
9638
|
+
GGML_ASSERT(ne2 == ne02);
|
9614
9639
|
GGML_ASSERT(ne02 == ne12);
|
9615
|
-
GGML_ASSERT(ne03 == ne13);
|
9616
|
-
GGML_ASSERT(ne2 == ne12);
|
9617
9640
|
GGML_ASSERT(ne3 == ne13);
|
9641
|
+
GGML_ASSERT(ne03 == ne13);
|
9618
9642
|
|
9619
9643
|
// we don't support permuted src0 or src1
|
9620
9644
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9625,18 +9649,25 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9625
9649
|
// GGML_ASSERT(nb1 <= nb2);
|
9626
9650
|
// GGML_ASSERT(nb2 <= nb3);
|
9627
9651
|
|
9628
|
-
GGML_ASSERT(ne0 == ne00);
|
9629
|
-
GGML_ASSERT(ne1 == ne10);
|
9630
|
-
GGML_ASSERT(ne2 == ne02);
|
9631
|
-
GGML_ASSERT(ne3 == ne03);
|
9632
|
-
|
9633
9652
|
// nb01 >= nb00 - src0 is not transposed
|
9634
9653
|
// compute by src0 rows
|
9635
9654
|
|
9636
9655
|
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
9637
|
-
// TODO: #if defined(
|
9656
|
+
// TODO: #if defined(GGML_USE_CLBLAST)
|
9657
|
+
|
9658
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9659
|
+
bool use_blas = ggml_is_matrix(src0) &&
|
9660
|
+
ggml_is_matrix(src1) &&
|
9661
|
+
ggml_is_contiguous(src0) &&
|
9662
|
+
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
9663
|
+
#endif
|
9638
9664
|
|
9639
9665
|
if (params->type == GGML_TASK_INIT) {
|
9666
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
9667
|
+
if (use_blas) {
|
9668
|
+
return;
|
9669
|
+
}
|
9670
|
+
#endif
|
9640
9671
|
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
9641
9672
|
return;
|
9642
9673
|
}
|
@@ -9645,6 +9676,50 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9645
9676
|
return;
|
9646
9677
|
}
|
9647
9678
|
|
9679
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9680
|
+
if (use_blas) {
|
9681
|
+
if (params->ith != 0) { // All threads other than the first do no work.
|
9682
|
+
return;
|
9683
|
+
}
|
9684
|
+
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
9685
|
+
// src0: (k,n)
|
9686
|
+
// src1: (k,m)
|
9687
|
+
// dst: (m,n)
|
9688
|
+
//
|
9689
|
+
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
9690
|
+
// Also expressed as (major,minor)
|
9691
|
+
// a: (m,k): so src1 transposed
|
9692
|
+
// b: (k,n): so src0
|
9693
|
+
// c: (m,n)
|
9694
|
+
//
|
9695
|
+
// However, if ggml_is_transposed(src1) is true, then
|
9696
|
+
// src1->data already contains a transposed version, so sgemm mustn't
|
9697
|
+
// transpose it further.
|
9698
|
+
|
9699
|
+
int n = src0->ne[0];
|
9700
|
+
int k = src0->ne[1];
|
9701
|
+
int m = src1->ne[0];
|
9702
|
+
|
9703
|
+
int transposeA, lda;
|
9704
|
+
|
9705
|
+
if (!ggml_is_transposed(src1)) {
|
9706
|
+
transposeA = CblasTrans;
|
9707
|
+
lda = m;
|
9708
|
+
} else {
|
9709
|
+
transposeA = CblasNoTrans;
|
9710
|
+
lda = k;
|
9711
|
+
}
|
9712
|
+
|
9713
|
+
float * a = (float *) ((char *) src1->data);
|
9714
|
+
float * b = (float *) ((char *) src0->data);
|
9715
|
+
float * c = (float *) ((char *) dst->data);
|
9716
|
+
|
9717
|
+
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
9718
|
+
|
9719
|
+
return;
|
9720
|
+
}
|
9721
|
+
#endif
|
9722
|
+
|
9648
9723
|
// dst[:,:,:,:] = 0
|
9649
9724
|
// for i2,i3:
|
9650
9725
|
// for i1:
|
@@ -10498,20 +10573,25 @@ static void ggml_compute_forward_diag_mask_zero(
|
|
10498
10573
|
static void ggml_compute_forward_soft_max_f32(
|
10499
10574
|
const struct ggml_compute_params * params,
|
10500
10575
|
const struct ggml_tensor * src0,
|
10501
|
-
struct ggml_tensor * dst) {
|
10502
|
-
|
10503
|
-
|
10504
|
-
|
10576
|
+
const struct ggml_tensor * src1,
|
10577
|
+
struct ggml_tensor * dst) {
|
10578
|
+
assert(ggml_is_contiguous(dst));
|
10579
|
+
assert(ggml_are_same_shape(src0, dst));
|
10505
10580
|
|
10506
10581
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
10507
10582
|
return;
|
10508
10583
|
}
|
10509
10584
|
|
10585
|
+
float scale = 1.0f;
|
10586
|
+
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
10587
|
+
|
10510
10588
|
// TODO: handle transposed/permuted matrices
|
10511
10589
|
|
10512
10590
|
const int ith = params->ith;
|
10513
10591
|
const int nth = params->nth;
|
10514
10592
|
|
10593
|
+
const int64_t ne11 = src1 ? src1->ne[1] : 1;
|
10594
|
+
|
10515
10595
|
const int nc = src0->ne[0];
|
10516
10596
|
const int nr = ggml_nrows(src0);
|
10517
10597
|
|
@@ -10522,29 +10602,40 @@ static void ggml_compute_forward_soft_max_f32(
|
|
10522
10602
|
const int ir0 = dr*ith;
|
10523
10603
|
const int ir1 = MIN(ir0 + dr, nr);
|
10524
10604
|
|
10605
|
+
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
|
10606
|
+
|
10525
10607
|
for (int i1 = ir0; i1 < ir1; i1++) {
|
10526
|
-
float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10527
|
-
float *dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10608
|
+
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
|
10609
|
+
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
10610
|
+
|
10611
|
+
// broadcast the mask across rows
|
10612
|
+
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
10613
|
+
|
10614
|
+
ggml_vec_cpy_f32 (nc, wp, sp);
|
10615
|
+
ggml_vec_scale_f32(nc, wp, scale);
|
10616
|
+
if (mp) {
|
10617
|
+
ggml_vec_acc_f32(nc, wp, mp);
|
10618
|
+
}
|
10528
10619
|
|
10529
10620
|
#ifndef NDEBUG
|
10530
10621
|
for (int i = 0; i < nc; ++i) {
|
10531
10622
|
//printf("p[%d] = %f\n", i, p[i]);
|
10532
|
-
assert(!isnan(sp[i]));
|
10623
|
+
assert(!isnan(wp[i]));
|
10533
10624
|
}
|
10534
10625
|
#endif
|
10535
10626
|
|
10536
10627
|
float max = -INFINITY;
|
10537
|
-
ggml_vec_max_f32(nc, &max, sp);
|
10628
|
+
ggml_vec_max_f32(nc, &max, wp);
|
10538
10629
|
|
10539
10630
|
ggml_float sum = 0.0;
|
10540
10631
|
|
10541
10632
|
uint16_t scvt;
|
10542
10633
|
for (int i = 0; i < nc; i++) {
|
10543
|
-
if (sp[i] == -INFINITY) {
|
10634
|
+
if (wp[i] == -INFINITY) {
|
10544
10635
|
dp[i] = 0.0f;
|
10545
10636
|
} else {
|
10546
|
-
// const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
|
10547
|
-
ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max);
|
10637
|
+
// const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
|
10638
|
+
ggml_fp16_t s = GGML_FP32_TO_FP16(wp[i] - max);
|
10548
10639
|
memcpy(&scvt, &s, sizeof(scvt));
|
10549
10640
|
const float val = GGML_FP16_TO_FP32(ggml_table_exp_f16[scvt]);
|
10550
10641
|
sum += (ggml_float)val;
|
@@ -10569,11 +10660,12 @@ static void ggml_compute_forward_soft_max_f32(
|
|
10569
10660
|
static void ggml_compute_forward_soft_max(
|
10570
10661
|
const struct ggml_compute_params * params,
|
10571
10662
|
const struct ggml_tensor * src0,
|
10572
|
-
struct ggml_tensor * dst) {
|
10663
|
+
const struct ggml_tensor * src1,
|
10664
|
+
struct ggml_tensor * dst) {
|
10573
10665
|
switch (src0->type) {
|
10574
10666
|
case GGML_TYPE_F32:
|
10575
10667
|
{
|
10576
|
-
ggml_compute_forward_soft_max_f32(params, src0, dst);
|
10668
|
+
ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
|
10577
10669
|
} break;
|
10578
10670
|
default:
|
10579
10671
|
{
|
@@ -13810,7 +13902,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
13810
13902
|
} break;
|
13811
13903
|
case GGML_OP_SOFT_MAX:
|
13812
13904
|
{
|
13813
|
-
ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
|
13905
|
+
ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
|
13814
13906
|
} break;
|
13815
13907
|
case GGML_OP_SOFT_MAX_BACK:
|
13816
13908
|
{
|
@@ -15636,13 +15728,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|
15636
15728
|
{
|
15637
15729
|
n_tasks = 1;
|
15638
15730
|
} break;
|
15639
|
-
case GGML_OP_COUNT:
|
15640
|
-
{
|
15641
|
-
GGML_ASSERT(false);
|
15642
|
-
} break;
|
15643
15731
|
default:
|
15644
15732
|
{
|
15645
|
-
|
15733
|
+
fprintf(stderr, "%s: op not implemented: ", __func__);
|
15734
|
+
if (node->op < GGML_OP_COUNT) {
|
15735
|
+
fprintf(stderr, "%s\n", ggml_op_name(node->op));
|
15736
|
+
} else {
|
15737
|
+
fprintf(stderr, "%d\n", node->op);
|
15738
|
+
}
|
15646
15739
|
GGML_ASSERT(false);
|
15647
15740
|
} break;
|
15648
15741
|
}
|
@@ -15845,6 +15938,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
15845
15938
|
cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
|
15846
15939
|
}
|
15847
15940
|
} break;
|
15941
|
+
case GGML_OP_SOFT_MAX:
|
15942
|
+
{
|
15943
|
+
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
15944
|
+
|
15945
|
+
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
|
15946
|
+
} break;
|
15848
15947
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
15849
15948
|
{
|
15850
15949
|
GGML_ASSERT(node->src[0]->ne[3] == 1);
|
@@ -18399,24 +18498,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
|
18399
18498
|
}
|
18400
18499
|
|
18401
18500
|
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
|
18501
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18402
18502
|
return ctx->kv[key_id].key.data;
|
18403
18503
|
}
|
18404
18504
|
|
18405
18505
|
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
|
18506
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18406
18507
|
return ctx->kv[key_id].type;
|
18407
18508
|
}
|
18408
18509
|
|
18409
18510
|
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
|
18511
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18410
18512
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18411
18513
|
return ctx->kv[key_id].value.arr.type;
|
18412
18514
|
}
|
18413
18515
|
|
18414
18516
|
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
|
18517
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18415
18518
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18416
18519
|
return ctx->kv[key_id].value.arr.data;
|
18417
18520
|
}
|
18418
18521
|
|
18419
18522
|
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
18523
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18420
18524
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18421
18525
|
struct gguf_kv * kv = &ctx->kv[key_id];
|
18422
18526
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
@@ -18424,70 +18528,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
|
|
18424
18528
|
}
|
18425
18529
|
|
18426
18530
|
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
|
18531
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18427
18532
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18428
18533
|
return ctx->kv[key_id].value.arr.n;
|
18429
18534
|
}
|
18430
18535
|
|
18431
18536
|
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
|
18537
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18432
18538
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
|
18433
18539
|
return ctx->kv[key_id].value.uint8;
|
18434
18540
|
}
|
18435
18541
|
|
18436
18542
|
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
|
18543
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18437
18544
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
|
18438
18545
|
return ctx->kv[key_id].value.int8;
|
18439
18546
|
}
|
18440
18547
|
|
18441
18548
|
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
|
18549
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18442
18550
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
|
18443
18551
|
return ctx->kv[key_id].value.uint16;
|
18444
18552
|
}
|
18445
18553
|
|
18446
18554
|
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
|
18555
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18447
18556
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
|
18448
18557
|
return ctx->kv[key_id].value.int16;
|
18449
18558
|
}
|
18450
18559
|
|
18451
18560
|
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
|
18561
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18452
18562
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
|
18453
18563
|
return ctx->kv[key_id].value.uint32;
|
18454
18564
|
}
|
18455
18565
|
|
18456
18566
|
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
|
18567
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18457
18568
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
|
18458
18569
|
return ctx->kv[key_id].value.int32;
|
18459
18570
|
}
|
18460
18571
|
|
18461
18572
|
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
|
18573
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18462
18574
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
|
18463
18575
|
return ctx->kv[key_id].value.float32;
|
18464
18576
|
}
|
18465
18577
|
|
18466
18578
|
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
|
18579
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18467
18580
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
|
18468
18581
|
return ctx->kv[key_id].value.uint64;
|
18469
18582
|
}
|
18470
18583
|
|
18471
18584
|
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
|
18585
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18472
18586
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
|
18473
18587
|
return ctx->kv[key_id].value.int64;
|
18474
18588
|
}
|
18475
18589
|
|
18476
18590
|
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
|
18591
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18477
18592
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
|
18478
18593
|
return ctx->kv[key_id].value.float64;
|
18479
18594
|
}
|
18480
18595
|
|
18481
18596
|
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
|
18597
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18482
18598
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
|
18483
18599
|
return ctx->kv[key_id].value.bool_;
|
18484
18600
|
}
|
18485
18601
|
|
18486
18602
|
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
|
18603
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18487
18604
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
|
18488
18605
|
return ctx->kv[key_id].value.str.data;
|
18489
18606
|
}
|
18490
18607
|
|
18608
|
+
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
|
18609
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18610
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
|
18611
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
|
18612
|
+
return &ctx->kv[key_id].value;
|
18613
|
+
}
|
18614
|
+
|
18491
18615
|
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
18492
18616
|
return ctx->header.n_tensors;
|
18493
18617
|
}
|
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -244,11 +244,10 @@
|
|
244
244
|
#define GGML_ASSERT(x) \
|
245
245
|
do { \
|
246
246
|
if (!(x)) { \
|
247
|
-
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
248
|
-
fflush(stderr); \
|
249
247
|
fflush(stdout); \
|
248
|
+
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
250
249
|
ggml_print_backtrace(); \
|
251
|
-
|
250
|
+
abort(); \
|
252
251
|
} \
|
253
252
|
} while (0)
|
254
253
|
|
@@ -1283,6 +1282,14 @@ extern "C" {
|
|
1283
1282
|
struct ggml_context * ctx,
|
1284
1283
|
struct ggml_tensor * a);
|
1285
1284
|
|
1285
|
+
// fused soft_max(a*scale + mask)
|
1286
|
+
// mask is optional
|
1287
|
+
GGML_API struct ggml_tensor * ggml_soft_max_ext(
|
1288
|
+
struct ggml_context * ctx,
|
1289
|
+
struct ggml_tensor * a,
|
1290
|
+
struct ggml_tensor * mask,
|
1291
|
+
float scale);
|
1292
|
+
|
1286
1293
|
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
1287
1294
|
struct ggml_context * ctx,
|
1288
1295
|
struct ggml_tensor * a,
|
@@ -2045,6 +2052,7 @@ extern "C" {
|
|
2045
2052
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
2046
2053
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
2047
2054
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
2055
|
+
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
2048
2056
|
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
2049
2057
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
2050
2058
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|