llama_cpp 0.9.3 → 0.9.4
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +68 -40
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +86 -8
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +264 -84
- data/ext/llama_cpp/src/llama.h +71 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9611
9611
|
const int ith = params->ith;
|
9612
9612
|
const int nth = params->nth;
|
9613
9613
|
|
9614
|
+
GGML_ASSERT(ne0 == ne00);
|
9615
|
+
GGML_ASSERT(ne1 == ne10);
|
9616
|
+
GGML_ASSERT(ne2 == ne02);
|
9614
9617
|
GGML_ASSERT(ne02 == ne12);
|
9615
|
-
GGML_ASSERT(ne03 == ne13);
|
9616
|
-
GGML_ASSERT(ne2 == ne12);
|
9617
9618
|
GGML_ASSERT(ne3 == ne13);
|
9619
|
+
GGML_ASSERT(ne03 == ne13);
|
9618
9620
|
|
9619
9621
|
// we don't support permuted src0 or src1
|
9620
9622
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9625,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9625
9627
|
// GGML_ASSERT(nb1 <= nb2);
|
9626
9628
|
// GGML_ASSERT(nb2 <= nb3);
|
9627
9629
|
|
9628
|
-
GGML_ASSERT(ne0 == ne00);
|
9629
|
-
GGML_ASSERT(ne1 == ne10);
|
9630
|
-
GGML_ASSERT(ne2 == ne02);
|
9631
|
-
GGML_ASSERT(ne3 == ne03);
|
9632
|
-
|
9633
9630
|
// nb01 >= nb00 - src0 is not transposed
|
9634
9631
|
// compute by src0 rows
|
9635
9632
|
|
9636
9633
|
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
9637
|
-
// TODO: #if defined(
|
9634
|
+
// TODO: #if defined(GGML_USE_CLBLAST)
|
9635
|
+
|
9636
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9637
|
+
bool use_blas = ggml_is_matrix(src0) &&
|
9638
|
+
ggml_is_matrix(src1) &&
|
9639
|
+
ggml_is_contiguous(src0) &&
|
9640
|
+
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
9641
|
+
#endif
|
9638
9642
|
|
9639
9643
|
if (params->type == GGML_TASK_INIT) {
|
9644
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
9645
|
+
if (use_blas) {
|
9646
|
+
return;
|
9647
|
+
}
|
9648
|
+
#endif
|
9640
9649
|
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
9641
9650
|
return;
|
9642
9651
|
}
|
@@ -9645,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9645
9654
|
return;
|
9646
9655
|
}
|
9647
9656
|
|
9657
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9658
|
+
if (use_blas) {
|
9659
|
+
if (params->ith != 0) { // All threads other than the first do no work.
|
9660
|
+
return;
|
9661
|
+
}
|
9662
|
+
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
9663
|
+
// src0: (k,n)
|
9664
|
+
// src1: (k,m)
|
9665
|
+
// dst: (m,n)
|
9666
|
+
//
|
9667
|
+
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
9668
|
+
// Also expressed as (major,minor)
|
9669
|
+
// a: (m,k): so src1 transposed
|
9670
|
+
// b: (k,n): so src0
|
9671
|
+
// c: (m,n)
|
9672
|
+
//
|
9673
|
+
// However, if ggml_is_transposed(src1) is true, then
|
9674
|
+
// src1->data already contains a transposed version, so sgemm mustn't
|
9675
|
+
// transpose it further.
|
9676
|
+
|
9677
|
+
int n = src0->ne[0];
|
9678
|
+
int k = src0->ne[1];
|
9679
|
+
int m = src1->ne[0];
|
9680
|
+
|
9681
|
+
int transposeA, lda;
|
9682
|
+
|
9683
|
+
if (!ggml_is_transposed(src1)) {
|
9684
|
+
transposeA = CblasTrans;
|
9685
|
+
lda = m;
|
9686
|
+
} else {
|
9687
|
+
transposeA = CblasNoTrans;
|
9688
|
+
lda = k;
|
9689
|
+
}
|
9690
|
+
|
9691
|
+
float * a = (float *) ((char *) src1->data);
|
9692
|
+
float * b = (float *) ((char *) src0->data);
|
9693
|
+
float * c = (float *) ((char *) dst->data);
|
9694
|
+
|
9695
|
+
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
9696
|
+
|
9697
|
+
return;
|
9698
|
+
}
|
9699
|
+
#endif
|
9700
|
+
|
9648
9701
|
// dst[:,:,:,:] = 0
|
9649
9702
|
// for i2,i3:
|
9650
9703
|
// for i1:
|
@@ -18399,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
|
18399
18452
|
}
|
18400
18453
|
|
18401
18454
|
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
|
18455
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18402
18456
|
return ctx->kv[key_id].key.data;
|
18403
18457
|
}
|
18404
18458
|
|
18405
18459
|
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
|
18460
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18406
18461
|
return ctx->kv[key_id].type;
|
18407
18462
|
}
|
18408
18463
|
|
18409
18464
|
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
|
18465
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18410
18466
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18411
18467
|
return ctx->kv[key_id].value.arr.type;
|
18412
18468
|
}
|
18413
18469
|
|
18414
18470
|
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
|
18471
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18415
18472
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18416
18473
|
return ctx->kv[key_id].value.arr.data;
|
18417
18474
|
}
|
18418
18475
|
|
18419
18476
|
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
18477
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18420
18478
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18421
18479
|
struct gguf_kv * kv = &ctx->kv[key_id];
|
18422
18480
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
@@ -18424,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
|
|
18424
18482
|
}
|
18425
18483
|
|
18426
18484
|
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
|
18485
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18427
18486
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18428
18487
|
return ctx->kv[key_id].value.arr.n;
|
18429
18488
|
}
|
18430
18489
|
|
18431
18490
|
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
|
18491
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18432
18492
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
|
18433
18493
|
return ctx->kv[key_id].value.uint8;
|
18434
18494
|
}
|
18435
18495
|
|
18436
18496
|
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
|
18497
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18437
18498
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
|
18438
18499
|
return ctx->kv[key_id].value.int8;
|
18439
18500
|
}
|
18440
18501
|
|
18441
18502
|
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
|
18503
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18442
18504
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
|
18443
18505
|
return ctx->kv[key_id].value.uint16;
|
18444
18506
|
}
|
18445
18507
|
|
18446
18508
|
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
|
18509
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18447
18510
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
|
18448
18511
|
return ctx->kv[key_id].value.int16;
|
18449
18512
|
}
|
18450
18513
|
|
18451
18514
|
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
|
18515
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18452
18516
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
|
18453
18517
|
return ctx->kv[key_id].value.uint32;
|
18454
18518
|
}
|
18455
18519
|
|
18456
18520
|
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
|
18521
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18457
18522
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
|
18458
18523
|
return ctx->kv[key_id].value.int32;
|
18459
18524
|
}
|
18460
18525
|
|
18461
18526
|
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
|
18527
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18462
18528
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
|
18463
18529
|
return ctx->kv[key_id].value.float32;
|
18464
18530
|
}
|
18465
18531
|
|
18466
18532
|
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
|
18533
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18467
18534
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
|
18468
18535
|
return ctx->kv[key_id].value.uint64;
|
18469
18536
|
}
|
18470
18537
|
|
18471
18538
|
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
|
18539
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18472
18540
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
|
18473
18541
|
return ctx->kv[key_id].value.int64;
|
18474
18542
|
}
|
18475
18543
|
|
18476
18544
|
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
|
18545
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18477
18546
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
|
18478
18547
|
return ctx->kv[key_id].value.float64;
|
18479
18548
|
}
|
18480
18549
|
|
18481
18550
|
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
|
18551
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18482
18552
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
|
18483
18553
|
return ctx->kv[key_id].value.bool_;
|
18484
18554
|
}
|
18485
18555
|
|
18486
18556
|
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
|
18557
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18487
18558
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
|
18488
18559
|
return ctx->kv[key_id].value.str.data;
|
18489
18560
|
}
|
18490
18561
|
|
18562
|
+
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
|
18563
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18564
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
|
18565
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
|
18566
|
+
return &ctx->kv[key_id].value;
|
18567
|
+
}
|
18568
|
+
|
18491
18569
|
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
18492
18570
|
return ctx->header.n_tensors;
|
18493
18571
|
}
|
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -2045,6 +2045,7 @@ extern "C" {
|
|
2045
2045
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
2046
2046
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
2047
2047
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
2048
|
+
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
2048
2049
|
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
2049
2050
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
2050
2051
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|