@fugood/llama.node 0.2.0 → 0.2.1

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (57)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/llama.cpp/CMakeLists.txt +14 -12
  24. package/src/llama.cpp/common/common.cpp +19 -5
  25. package/src/llama.cpp/common/common.h +2 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +9 -0
  27. package/src/llama.cpp/common/sampling.cpp +3 -3
  28. package/src/llama.cpp/common/sampling.h +1 -1
  29. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
  31. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
  32. package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
  33. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
  34. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
  36. package/src/llama.cpp/examples/main/main.cpp +5 -1
  37. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  38. package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
  39. package/src/llama.cpp/examples/server/server.cpp +12 -16
  40. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  41. package/src/llama.cpp/ggml-backend.c +2 -2
  42. package/src/llama.cpp/ggml-kompute.cpp +9 -3
  43. package/src/llama.cpp/ggml-quants.c +6 -0
  44. package/src/llama.cpp/ggml-rpc.cpp +1023 -0
  45. package/src/llama.cpp/ggml-rpc.h +24 -0
  46. package/src/llama.cpp/ggml-sycl.cpp +20 -143
  47. package/src/llama.cpp/ggml-vulkan.cpp +4 -2
  48. package/src/llama.cpp/ggml.c +116 -271
  49. package/src/llama.cpp/ggml.h +12 -15
  50. package/src/llama.cpp/llama.cpp +451 -265
  51. package/src/llama.cpp/llama.h +3 -0
  52. package/src/llama.cpp/requirements.txt +0 -1
  53. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
  55. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  56. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  57. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/src/llama.cpp/ggml.c
@@ -4,7 +4,6 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
 #undef GGML_USE_LLAMAFILE
 #endif
 
+#ifdef GGML_USE_LLAMAFILE
+#include "sgemm.h"
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -1949,6 +1952,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
@@ -2185,7 +2189,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SOFT_MAX_BACK",
     "ROPE",
     "ROPE_BACK",
-    "ALIBI",
     "CLAMP",
     "CONV_TRANSPOSE_1D",
     "IM2COL",
@@ -2227,7 +2230,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2276,7 +2279,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "soft_max_back(x)",
     "rope(x)",
     "rope_back(x)",
-    "alibi(x)",
     "clamp(x)",
     "conv_transpose_1d(x)",
     "im2col(x)",
@@ -2318,7 +2320,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2331,6 +2333,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "TANH",
     "ELU",
     "RELU",
+    "SIGMOID",
     "GELU",
     "GELU_QUICK",
     "SILU",
@@ -2338,7 +2341,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSIGMOID",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
+static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -4563,6 +4566,20 @@ struct ggml_tensor * ggml_leaky_relu(
     return result;
 }
 
+// ggml_sigmoid
+
+struct ggml_tensor * ggml_sigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+struct ggml_tensor * ggml_sigmoid_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
 // ggml_gelu
 
 struct ggml_tensor * ggml_gelu(
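
The two functions above give the new SIGMOID unary op a public graph-building API. A minimal usage sketch, not part of this diff (the context size and tensor shape are illustrative assumptions):

    // sketch: apply the new sigmoid op to an F32 tensor and run the graph
    #include "ggml.h"

    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024, // illustrative size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * y = ggml_sigmoid(ctx, x); // y[i] = 1/(1 + exp(-x[i]))

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    // ... fill x->data, then:
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    ggml_free(ctx);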
@@ -5646,7 +5663,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias,
         bool                  inplace) {
@@ -5660,18 +5676,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
-    if (pos) {
-        GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
-        GGML_ASSERT(pos->ne[0] == a->ne[0]);
-    }
-
-    if (pos && mask) {
-        GGML_ASSERT(pos->type == mask->type);
-    }
-
     if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
+        GGML_ASSERT(mask);
     }
 
     bool is_node = false;
@@ -5689,7 +5695,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
     result->grad   = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
-    result->src[2] = pos;
 
     return result;
 }
@@ -5697,23 +5702,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias) {
-    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
+    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
@@ -5928,37 +5932,6 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }
 
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head,
-        float                 bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 // ggml_clamp
 
 struct ggml_tensor * ggml_clamp(
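
With ggml_alibi removed, graphs that used the standalone op are expected to route ALiBi through the softmax path instead, via the mask plus the max_bias scalar (see the reworked ggml_soft_max_ext above). A hedged before/after sketch; the tensor names are illustrative, not taken from this diff:

    // before (0.2.0-era API): bias added as a separate op on the scaled scores
    // KQ = ggml_alibi(ctx, KQ_scaled, n_past, n_head, max_bias);

    // after: the bias is folded into the masked, scaled softmax itself
    KQ = ggml_soft_max_ext(ctx, KQ, KQ_mask, kq_scale, max_bias);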
@@ -6486,9 +6459,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
         struct ggml_tensor  * k,
         struct ggml_tensor  * v,
         struct ggml_tensor  * mask,
-        float                 scale) {
+        float                 scale,
+        float                 max_bias) {
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
+
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6473,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
     }
 
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
     bool is_node = false;
 
     if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6487,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6507,7 @@ void ggml_flash_attn_ext_set_prec(
 
     const int32_t prec_i32 = (int32_t) prec;
 
-    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+    ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
 }
 
 // ggml_flash_ff
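
Call sites of ggml_flash_attn_ext thus gain a trailing max_bias argument, and the precision flag now lives at op_params[2], after scale and max_bias. An illustrative call under those assumptions (q, k, v, kq_mask, kq_scale as prepared by the caller; the names are assumed):

    struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, /*max_bias =*/ 0.0f);
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); // writes op_params[2] per the change above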
@@ -10892,6 +10871,52 @@ static void ggml_compute_forward_relu(
     }
 }
 
+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *)  dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_gelu
 
 static void ggml_compute_forward_gelu_f32(
@@ -13333,7 +13358,6 @@ static void ggml_compute_forward_soft_max_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
 
     assert(ggml_is_contiguous(dst));
     assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13383,8 @@ static void ggml_compute_forward_soft_max_f32(
 
     // TODO: is this supposed to be ceil instead of floor?
     //   https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head_kv   = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+    const uint32_t n_head      = ne02;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
 
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@@ -13377,13 +13401,13 @@ static void ggml_compute_forward_soft_max_f32(
 
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
-    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
-    float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
+        // ALiBi
+        const uint32_t h = (i1/ne01)%ne02; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
 
@@ -13396,27 +13420,11 @@ static void ggml_compute_forward_soft_max_f32(
         if (mp_f32) {
             if (use_f16) {
                 for (int i = 0; i < nc; ++i) {
-                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += mp_f32[i];
-                }
-            }
-        }
-
-        // ALiBi bias
-        if (max_bias > 0.0f) {
-            const uint32_t h = (i1/ne01)%ne02; // head
-            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
-
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
+                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
                 }
             } else {
                 for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*pos_f32[i];
+                    wp[i] += slope*mp_f32[i];
                 }
             }
         }
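
The per-head slope used above follows the standard ALiBi recipe: the first n_head_log2 heads take successive powers of m0 = 2^(-max_bias/n_head_log2), the remaining heads odd powers of m1 = 2^(-max_bias/2/n_head_log2). Factored out as a standalone helper for reference (a sketch mirroring the inline expression, not code from this diff):

    #include <math.h>
    #include <stdint.h>

    // slope for head h of n_head, matching the inline expression above
    static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
        if (max_bias <= 0.0f) {
            return 1.0f; // bias disabled -> mask values are applied unscaled
        }
        const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
    }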
@@ -13578,178 +13586,6 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n       = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n       = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float       *      pdst =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
@@ -15763,8 +15599,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
 
-    float scale = 1.0f;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15618,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
+        const uint32_t h = iq2; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float S = 0.0f;
         float M = -INFINITY;
 
@@ -15796,7 +15644,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
         for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
@@ -15867,7 +15715,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[1]) {
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -16834,6 +16682,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
        case GGML_UNARY_OP_GELU:
            {
                ggml_compute_forward_gelu(params, dst);
@@ -17630,10 +17482,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
        case GGML_OP_CLAMP:
            {
                ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18500,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
        case GGML_OP_CLAMP:
            {
                GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18670,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
        case GGML_UNARY_OP_GELU:
            {
                GGML_ASSERT(false); // TODO: not implemented
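
The backward case for the new op is stubbed out like its neighbors. For reference, since the derivative of the sigmoid is σ'(x) = σ(x)·(1 − σ(x)), a future implementation could reuse the forward output; a hypothetical kernel sketch (not in this release, name and shape handling assumed):

    // given y = sigmoid(x) from the forward pass: dL/dx = dL/dy * y * (1 - y)
    inline static void ggml_vec_sigmoid_backward_f32(
            const int n, float * dx, const float * y, const float * dy) {
        for (int i = 0; i < n; ++i) dx[i] = dy[i] * y[i] * (1.f - y[i]);
    }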
@@ -19355,6 +19203,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
                case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                    {
@@ -19428,10 +19277,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
        case GGML_OP_CLAMP:
            {
                n_tasks = 1; //TODO