llama_cpp 0.9.3 → 0.9.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +68 -40
- data/ext/llama_cpp/src/ggml-quants.c +1 -1
- data/ext/llama_cpp/src/ggml.c +86 -8
- data/ext/llama_cpp/src/ggml.h +1 -0
- data/ext/llama_cpp/src/llama.cpp +264 -84
- data/ext/llama_cpp/src/llama.h +71 -3
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9611
9611
|
const int ith = params->ith;
|
9612
9612
|
const int nth = params->nth;
|
9613
9613
|
|
9614
|
+
GGML_ASSERT(ne0 == ne00);
|
9615
|
+
GGML_ASSERT(ne1 == ne10);
|
9616
|
+
GGML_ASSERT(ne2 == ne02);
|
9614
9617
|
GGML_ASSERT(ne02 == ne12);
|
9615
|
-
GGML_ASSERT(ne03 == ne13);
|
9616
|
-
GGML_ASSERT(ne2 == ne12);
|
9617
9618
|
GGML_ASSERT(ne3 == ne13);
|
9619
|
+
GGML_ASSERT(ne03 == ne13);
|
9618
9620
|
|
9619
9621
|
// we don't support permuted src0 or src1
|
9620
9622
|
GGML_ASSERT(nb00 == sizeof(float));
|
@@ -9625,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9625
9627
|
// GGML_ASSERT(nb1 <= nb2);
|
9626
9628
|
// GGML_ASSERT(nb2 <= nb3);
|
9627
9629
|
|
9628
|
-
GGML_ASSERT(ne0 == ne00);
|
9629
|
-
GGML_ASSERT(ne1 == ne10);
|
9630
|
-
GGML_ASSERT(ne2 == ne02);
|
9631
|
-
GGML_ASSERT(ne3 == ne03);
|
9632
|
-
|
9633
9630
|
// nb01 >= nb00 - src0 is not transposed
|
9634
9631
|
// compute by src0 rows
|
9635
9632
|
|
9636
9633
|
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
9637
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
9634
|
+
// TODO: #if defined(GGML_USE_CLBLAST)
|
9635
|
+
|
9636
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9637
|
+
bool use_blas = ggml_is_matrix(src0) &&
|
9638
|
+
ggml_is_matrix(src1) &&
|
9639
|
+
ggml_is_contiguous(src0) &&
|
9640
|
+
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
9641
|
+
#endif
|
9638
9642
|
|
9639
9643
|
if (params->type == GGML_TASK_INIT) {
|
9644
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
9645
|
+
if (use_blas) {
|
9646
|
+
return;
|
9647
|
+
}
|
9648
|
+
#endif
|
9640
9649
|
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
9641
9650
|
return;
|
9642
9651
|
}
|
@@ -9645,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
|
|
9645
9654
|
return;
|
9646
9655
|
}
|
9647
9656
|
|
9657
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
9658
|
+
if (use_blas) {
|
9659
|
+
if (params->ith != 0) { // All threads other than the first do no work.
|
9660
|
+
return;
|
9661
|
+
}
|
9662
|
+
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
9663
|
+
// src0: (k,n)
|
9664
|
+
// src1: (k,m)
|
9665
|
+
// dst: (m,n)
|
9666
|
+
//
|
9667
|
+
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
9668
|
+
// Also expressed as (major,minor)
|
9669
|
+
// a: (m,k): so src1 transposed
|
9670
|
+
// b: (k,n): so src0
|
9671
|
+
// c: (m,n)
|
9672
|
+
//
|
9673
|
+
// However, if ggml_is_transposed(src1) is true, then
|
9674
|
+
// src1->data already contains a transposed version, so sgemm mustn't
|
9675
|
+
// transpose it further.
|
9676
|
+
|
9677
|
+
int n = src0->ne[0];
|
9678
|
+
int k = src0->ne[1];
|
9679
|
+
int m = src1->ne[0];
|
9680
|
+
|
9681
|
+
int transposeA, lda;
|
9682
|
+
|
9683
|
+
if (!ggml_is_transposed(src1)) {
|
9684
|
+
transposeA = CblasTrans;
|
9685
|
+
lda = m;
|
9686
|
+
} else {
|
9687
|
+
transposeA = CblasNoTrans;
|
9688
|
+
lda = k;
|
9689
|
+
}
|
9690
|
+
|
9691
|
+
float * a = (float *) ((char *) src1->data);
|
9692
|
+
float * b = (float *) ((char *) src0->data);
|
9693
|
+
float * c = (float *) ((char *) dst->data);
|
9694
|
+
|
9695
|
+
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
9696
|
+
|
9697
|
+
return;
|
9698
|
+
}
|
9699
|
+
#endif
|
9700
|
+
|
9648
9701
|
// dst[:,:,:,:] = 0
|
9649
9702
|
// for i2,i3:
|
9650
9703
|
// for i1:
|
@@ -18399,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
|
|
18399
18452
|
}
|
18400
18453
|
|
18401
18454
|
const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
|
18455
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18402
18456
|
return ctx->kv[key_id].key.data;
|
18403
18457
|
}
|
18404
18458
|
|
18405
18459
|
enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
|
18460
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18406
18461
|
return ctx->kv[key_id].type;
|
18407
18462
|
}
|
18408
18463
|
|
18409
18464
|
enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
|
18465
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18410
18466
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18411
18467
|
return ctx->kv[key_id].value.arr.type;
|
18412
18468
|
}
|
18413
18469
|
|
18414
18470
|
const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
|
18471
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18415
18472
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18416
18473
|
return ctx->kv[key_id].value.arr.data;
|
18417
18474
|
}
|
18418
18475
|
|
18419
18476
|
const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
|
18477
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18420
18478
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18421
18479
|
struct gguf_kv * kv = &ctx->kv[key_id];
|
18422
18480
|
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
|
@@ -18424,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
|
|
18424
18482
|
}
|
18425
18483
|
|
18426
18484
|
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
|
18485
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18427
18486
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
|
18428
18487
|
return ctx->kv[key_id].value.arr.n;
|
18429
18488
|
}
|
18430
18489
|
|
18431
18490
|
uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
|
18491
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18432
18492
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
|
18433
18493
|
return ctx->kv[key_id].value.uint8;
|
18434
18494
|
}
|
18435
18495
|
|
18436
18496
|
int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
|
18497
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18437
18498
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
|
18438
18499
|
return ctx->kv[key_id].value.int8;
|
18439
18500
|
}
|
18440
18501
|
|
18441
18502
|
uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
|
18503
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18442
18504
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
|
18443
18505
|
return ctx->kv[key_id].value.uint16;
|
18444
18506
|
}
|
18445
18507
|
|
18446
18508
|
int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
|
18509
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18447
18510
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
|
18448
18511
|
return ctx->kv[key_id].value.int16;
|
18449
18512
|
}
|
18450
18513
|
|
18451
18514
|
uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
|
18515
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18452
18516
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
|
18453
18517
|
return ctx->kv[key_id].value.uint32;
|
18454
18518
|
}
|
18455
18519
|
|
18456
18520
|
int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
|
18521
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18457
18522
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
|
18458
18523
|
return ctx->kv[key_id].value.int32;
|
18459
18524
|
}
|
18460
18525
|
|
18461
18526
|
float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
|
18527
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18462
18528
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
|
18463
18529
|
return ctx->kv[key_id].value.float32;
|
18464
18530
|
}
|
18465
18531
|
|
18466
18532
|
uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
|
18533
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18467
18534
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
|
18468
18535
|
return ctx->kv[key_id].value.uint64;
|
18469
18536
|
}
|
18470
18537
|
|
18471
18538
|
int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
|
18539
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18472
18540
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
|
18473
18541
|
return ctx->kv[key_id].value.int64;
|
18474
18542
|
}
|
18475
18543
|
|
18476
18544
|
double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
|
18545
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18477
18546
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
|
18478
18547
|
return ctx->kv[key_id].value.float64;
|
18479
18548
|
}
|
18480
18549
|
|
18481
18550
|
bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
|
18551
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18482
18552
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
|
18483
18553
|
return ctx->kv[key_id].value.bool_;
|
18484
18554
|
}
|
18485
18555
|
|
18486
18556
|
const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
|
18557
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18487
18558
|
GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
|
18488
18559
|
return ctx->kv[key_id].value.str.data;
|
18489
18560
|
}
|
18490
18561
|
|
18562
|
+
const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
|
18563
|
+
GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
|
18564
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
|
18565
|
+
GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
|
18566
|
+
return &ctx->kv[key_id].value;
|
18567
|
+
}
|
18568
|
+
|
18491
18569
|
int gguf_get_n_tensors(const struct gguf_context * ctx) {
|
18492
18570
|
return ctx->header.n_tensors;
|
18493
18571
|
}
|
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -2045,6 +2045,7 @@ extern "C" {
|
|
2045
2045
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
2046
2046
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
2047
2047
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
2048
|
+
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
2048
2049
|
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
2049
2050
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
2050
2051
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|