@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
@@ -968,6 +968,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GET_REL_POS",
     "ADD_REL_POS",
     "RWKV_WKV6",
+    "GATED_LINEAR_ATTN",

     "UNARY",

@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1064,6 +1065,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "get_rel_pos(x)",
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
+    "gated_linear_attn(k, v, q, gate, s)",

     "unary(x)",

@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };

-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -1588,15 +1590,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(

     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);

-#ifdef __clang__
-    // temporary until ggml_tensor::backend is removed
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
-        /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
         /*.buffer       =*/ NULL,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -1612,10 +1607,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.padding      =*/ { 0 },
     };

-#ifdef __clang__
-    #pragma clang diagnostic pop
-#endif
-
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
     //GGML_ASSERT_ALIGNED(result->data);

@@ -3459,12 +3450,14 @@ struct ggml_tensor * ggml_soft_max_ext(
     return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }

-// ggml_soft_max_back
+// ggml_soft_max_ext_back

-static struct ggml_tensor * ggml_soft_max_back_impl(
+static struct ggml_tensor * ggml_soft_max_ext_back_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias,
         bool                  inplace) {
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

@@ -3472,21 +3465,28 @@ static struct ggml_tensor * ggml_soft_max_back_impl(
     result->src[0] = a;
     result->src[1] = b;

+    memcpy((float *) result->op_params + 0, &scale,    sizeof(float));
+    memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
+
     return result;
 }

-struct ggml_tensor * ggml_soft_max_back(
+struct ggml_tensor * ggml_soft_max_ext_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, false);
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
 }

-struct ggml_tensor * ggml_soft_max_back_inplace(
+struct ggml_tensor * ggml_soft_max_ext_back_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_soft_max_back_impl(ctx, a, b, true);
+        struct ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
 }

 // ggml_rope
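For callers migrating off the old name, a minimal sketch (not part of the diff) of the renamed backward entry point; the helper, its tensors, and the head-size value are illustrative, and the scale/max_bias must match whatever the forward ggml_soft_max_ext call used:

#include <math.h>
#include "ggml.h"

// Sketch only: grad is the incoming gradient w.r.t. the softmax output,
// out is the forward result of ggml_soft_max_ext(ctx, x, mask, scale, max_bias).
static struct ggml_tensor * softmax_backward_example(
        struct ggml_context * ctx,
        struct ggml_tensor  * grad,
        struct ggml_tensor  * out) {
    const float scale    = 1.0f / sqrtf(64.0f); // must match the forward call
    const float max_bias = 0.0f;                // 0.0f disables the ALiBi bias
    // before this change: ggml_soft_max_back(ctx, grad, out);
    return ggml_soft_max_ext_back(ctx, grad, out, scale, max_bias);
}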
@@ -3704,7 +3704,7 @@ void ggml_rope_yarn_corr_dims(

 // ggml_rope_back

-struct ggml_tensor * ggml_rope_back(
+struct ggml_tensor * ggml_rope_ext_back(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -3718,29 +3718,32 @@ struct ggml_tensor * ggml_rope_back(
         float                 attn_factor,
         float                 beta_fast,
         float                 beta_slow) {
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
-    memcpy(params +  5, &freq_base,   sizeof(float));
-    memcpy(params +  6, &freq_scale,  sizeof(float));
-    memcpy(params +  7, &ext_factor,  sizeof(float));
-    memcpy(params +  8, &attn_factor, sizeof(float));
-    memcpy(params +  9, &beta_fast,   sizeof(float));
-    memcpy(params + 10, &beta_slow,   sizeof(float));
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op     = GGML_OP_ROPE_BACK;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
-
+    struct ggml_tensor * result = ggml_rope_ext(
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    result->op = GGML_OP_ROPE_BACK;
     return result;
 }

+struct ggml_tensor * ggml_rope_multi_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    struct ggml_tensor * result = ggml_rope_multi(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    result->op = GGML_OP_ROPE_BACK;
+    return result;
+}
 // ggml_clamp

 struct ggml_tensor * ggml_clamp(
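The backward RoPE ops are now thin wrappers that build the corresponding forward op and retag it as GGML_OP_ROPE_BACK, and the multi-section (M-RoPE) variant gains its own backward entry point. A hedged sketch of calling it; every argument value below is chosen for illustration only, and the mode flag assumes the GGML_ROPE_TYPE_MROPE constant from ggml.h:

#include "ggml.h"

// Sketch: grad is the gradient flowing back into the RoPE op, pos is the same
// I32 position tensor the forward pass used. The sections split is made up.
static struct ggml_tensor * rope_multi_backward_example(
        struct ggml_context * ctx,
        struct ggml_tensor  * grad,
        struct ggml_tensor  * pos) {
    int sections[4] = {16, 24, 24, 0}; // illustrative per-section dims
    return ggml_rope_multi_back(ctx, grad, pos, /*freq factors*/ NULL,
            /*n_dims*/ 64, sections, /*mode*/ GGML_ROPE_TYPE_MROPE,
            /*n_ctx_orig*/ 0, /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
            /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f,
            /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);
}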
@@ -4640,15 +4643,13 @@ struct ggml_tensor * ggml_rwkv_wkv6(
     GGML_ASSERT(ggml_is_contiguous(state));

     const int64_t S = k->ne[0];
-    const int64_t H = k->ne[2];
-    const int64_t n_tokens = k->ne[3];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
     const int64_t n_seqs = state->ne[1];
     {
-        GGML_ASSERT(k->ne[1] == 1);
-        GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens);
-        GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens);
-        // TODO: RWKV v4 and v5
-        GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens);
+        GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens);
         GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
     }

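The hunk above drops the dummy leading dimension from the WKV6 inputs. A sketch of allocating tensors with the shapes the new asserts expect; ctx, S, H, n_tokens, and n_seqs are assumed to exist, and the [S, H] shape for tf is an assumption, since that assert is not shown in this hunk:

// Sketch only: k/v/r/td are now plain [S, H, n_tokens] tensors.
struct ggml_tensor * k     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
struct ggml_tensor * v     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
struct ggml_tensor * r     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
struct ggml_tensor * tf    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S, H); // assumed shape
struct ggml_tensor * td    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
struct ggml_tensor * state = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S * S * H, n_seqs);
struct ggml_tensor * out   = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);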
@@ -4667,6 +4668,49 @@ struct ggml_tensor * ggml_rwkv_wkv6(
     return result;
 }

+// ggml_gated_linear_attn
+
+struct ggml_tensor * ggml_gated_linear_attn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * state,
+        float scale) {
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S = k->ne[0];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens);
+        GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens);
+        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+    }
+
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+    ggml_set_op_params_f32(result, 0, scale);
+
+    result->op     = GGML_OP_GATED_LINEAR_ATTN;
+    result->src[0] = k;
+    result->src[1] = v;
+    result->src[2] = q;
+    result->src[3] = g;
+    result->src[4] = state;
+
+    return result;
+}
+
 // ggml_unary

 static struct ggml_tensor * ggml_unary_impl(
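Given the asserts in the new op, a minimal sketch of wiring it up; the helper name is illustrative and the scale value is whatever the model's attention scaling calls for:

// Sketch: k/v/q/g are contiguous F32 tensors of shape [S, H, n_tokens]; state
// carries S*S*H elements per sequence. The result concatenates the per-token
// output (S*H x n_tokens) with the updated state (S*H x S*n_seqs), per the
// ne[] computed in the op above.
static struct ggml_tensor * gla_example(
        struct ggml_context * ctx,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * q,
        struct ggml_tensor  * g,
        struct ggml_tensor  * state) {
    const float scale = 1.0f; // illustrative
    return ggml_gated_linear_attn(ctx, k, v, q, g, state, scale);
}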
@@ -5041,10 +5085,10 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
         struct ggml_tensor  * c) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-    GGML_ASSERT(ggml_is_scalar(c));
+    GGML_ASSERT(ggml_is_scalar(a));
+    GGML_ASSERT(ggml_are_same_shape(b, c));

-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, b);

     result->op     = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
     result->src[0] = a;
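The swapped asserts encode the new argument order: the scalar incoming gradient now comes first, followed by the two same-shaped tensors (the backward-pass hunk further down makes the matching swap at the call site). A one-line sketch, with logits/labels as illustrative names for the two same-shaped tensors:

// before: ggml_cross_entropy_loss_back(ctx, logits, labels, grad_scalar);
struct ggml_tensor * dlogits = ggml_cross_entropy_loss_back(ctx, grad_scalar, logits, labels);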
@@ -5223,7 +5267,7 @@ static void ggml_sub_or_set(
 }

 static void ggml_compute_backward(
-        struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) {
+        struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, const bool * grads_needed) {
     struct ggml_tensor * tensor = cgraph->nodes[i];
     struct ggml_tensor * grad   = ggml_graph_get_grad(cgraph, tensor);

@@ -5367,7 +5411,7 @@ static void ggml_compute_backward(
             if (src0_needs_grads) {
                 float eps;
                 memcpy(&eps, tensor->op_params, sizeof(float));
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, grad, src0, eps));
             }
         } break;
         case GGML_OP_MUL_MAT: {
@@ -5550,7 +5594,13 @@ static void ggml_compute_backward(
         } break;
         case GGML_OP_SOFT_MAX: {
             if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor));
+                float scale    = 1.0f;
+                float max_bias = 0.0f;
+
+                memcpy(&scale,    (const float *) tensor->op_params + 0, sizeof(float));
+                memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float));
+
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias));
             }
             GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
         } break;
@@ -5562,6 +5612,7 @@ static void ggml_compute_backward(
             //const int n_ctx       = ((int32_t *) tensor->op_params)[3];
             const int n_ctx_orig    = ((const int32_t *) tensor->op_params)[4];
             float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+            int sections[4] = {0, 0, 0, 0};

             memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
             memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
@@ -5569,10 +5620,14 @@ static void ggml_compute_backward(
             memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
             memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
             memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));
-
-            ggml_add_or_set(ctx, cgraph, isrc0,
-                ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
-                    freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
+            memcpy(&sections, tensor->op_params + 11, sizeof(sections));
+
+            struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
+                ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
+                    mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
+                ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
+                    mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+            ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
         }
         GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
     } break;
@@ -5586,7 +5641,7 @@ static void ggml_compute_backward(
             const int32_t d1    = ggml_get_op_params_i32(tensor, 5);
             const bool is_2D    = ggml_get_op_params_i32(tensor, 6) == 1;

-            ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
+            ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
         }
     } break;
     case GGML_OP_POOL_2D: {
@@ -5629,7 +5684,7 @@ static void ggml_compute_backward(
             } break;
             case GGML_UNARY_OP_SILU: {
                 if (src0_needs_grads) {
-                    ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad));
+                    ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, grad, src0));
                 }
            } break;
            case GGML_UNARY_OP_EXP: {
@@ -5646,7 +5701,7 @@ static void ggml_compute_backward(
        } break;
        case GGML_OP_CROSS_ENTROPY_LOSS: {
            if (src0_needs_grads) {
-                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, grad, src0, src1));
            }
            GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
        } break;
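The final hunk removes the entire in-tree GGUF reader/writer from ggml.c; per the file list it reappears as ggml/src/gguf.cpp behind the new public header ggml/include/gguf.h, so the API itself survives the move. A minimal consumer sketch using only functions visible in the removed code below; the header path and error handling are assumptions:

#include <stdio.h>
#include "gguf.h"

// Sketch: open a GGUF file without loading tensor data and print one KV value.
static int print_alignment(const char * fname) {
    struct gguf_init_params params = { .no_alloc = true, .ctx = NULL };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        return 1;
    }
    const int idx = gguf_find_key(ctx, "general.alignment");
    if (idx != -1) {
        printf("general.alignment: %u\n", gguf_get_val_u32(ctx, idx));
    }
    gguf_free(ctx);
    return 0;
}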
@@ -6417,1271 +6472,6 @@ size_t ggml_quantize_chunk(
6417
6472
 
6418
6473
  ////////////////////////////////////////////////////////////////////////////////
6419
6474
 
6420
- struct gguf_str {
6421
- uint64_t n; // GGUFv2
6422
- char * data;
6423
- };
6424
-
6425
- static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
6426
- [GGUF_TYPE_UINT8] = sizeof(uint8_t),
6427
- [GGUF_TYPE_INT8] = sizeof(int8_t),
6428
- [GGUF_TYPE_UINT16] = sizeof(uint16_t),
6429
- [GGUF_TYPE_INT16] = sizeof(int16_t),
6430
- [GGUF_TYPE_UINT32] = sizeof(uint32_t),
6431
- [GGUF_TYPE_INT32] = sizeof(int32_t),
6432
- [GGUF_TYPE_FLOAT32] = sizeof(float),
6433
- [GGUF_TYPE_BOOL] = sizeof(bool),
6434
- [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
6435
- [GGUF_TYPE_UINT64] = sizeof(uint64_t),
6436
- [GGUF_TYPE_INT64] = sizeof(int64_t),
6437
- [GGUF_TYPE_FLOAT64] = sizeof(double),
6438
- [GGUF_TYPE_ARRAY] = 0, // undefined
6439
- };
6440
- static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
6441
-
6442
- static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
6443
- [GGUF_TYPE_UINT8] = "u8",
6444
- [GGUF_TYPE_INT8] = "i8",
6445
- [GGUF_TYPE_UINT16] = "u16",
6446
- [GGUF_TYPE_INT16] = "i16",
6447
- [GGUF_TYPE_UINT32] = "u32",
6448
- [GGUF_TYPE_INT32] = "i32",
6449
- [GGUF_TYPE_FLOAT32] = "f32",
6450
- [GGUF_TYPE_BOOL] = "bool",
6451
- [GGUF_TYPE_STRING] = "str",
6452
- [GGUF_TYPE_ARRAY] = "arr",
6453
- [GGUF_TYPE_UINT64] = "u64",
6454
- [GGUF_TYPE_INT64] = "i64",
6455
- [GGUF_TYPE_FLOAT64] = "f64",
6456
- };
6457
- static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
6458
-
6459
- union gguf_value {
6460
- uint8_t uint8;
6461
- int8_t int8;
6462
- uint16_t uint16;
6463
- int16_t int16;
6464
- uint32_t uint32;
6465
- int32_t int32;
6466
- float float32;
6467
- uint64_t uint64;
6468
- int64_t int64;
6469
- double float64;
6470
- bool bool_;
6471
-
6472
- struct gguf_str str;
6473
-
6474
- struct {
6475
- enum gguf_type type;
6476
-
6477
- uint64_t n; // GGUFv2
6478
- void * data;
6479
- } arr;
6480
- };
6481
-
6482
- struct gguf_kv {
6483
- struct gguf_str key;
6484
-
6485
- enum gguf_type type;
6486
- union gguf_value value;
6487
- };
6488
-
6489
- struct gguf_header {
6490
- char magic[4];
6491
-
6492
- uint32_t version;
6493
- uint64_t n_tensors; // GGUFv2
6494
- uint64_t n_kv; // GGUFv2
6495
- };
6496
-
6497
- struct gguf_tensor_info {
6498
- struct gguf_str name;
6499
-
6500
- uint32_t n_dims;
6501
- uint64_t ne[GGML_MAX_DIMS];
6502
-
6503
- enum ggml_type type;
6504
-
6505
- uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
6506
-
6507
- // for writing API
6508
- const void * data;
6509
- size_t size;
6510
- };
6511
-
6512
- struct gguf_context {
6513
- struct gguf_header header;
6514
-
6515
- struct gguf_kv * kv;
6516
- struct gguf_tensor_info * infos;
6517
-
6518
- size_t alignment;
6519
- size_t offset; // offset of `data` from beginning of file
6520
- size_t size; // size of `data` in bytes
6521
-
6522
- //uint8_t * padding;
6523
- void * data;
6524
- };
6525
-
6526
- size_t gguf_type_size(enum gguf_type type) {
6527
- GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
6528
- return GGUF_TYPE_SIZE[type];
6529
- }
6530
-
6531
- static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
6532
- if (info->n_dims > GGML_MAX_DIMS) {
6533
- fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
6534
- return false;
6535
- }
6536
-
6537
- if (info->type < 0 || info->type >= GGML_TYPE_COUNT) {
6538
- fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
6539
- return false;
6540
- }
6541
-
6542
- if (strlen(info->name.data) >= GGML_MAX_NAME) {
6543
- fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
6544
- return false;
6545
- }
6546
-
6547
- for (uint32_t i = 0; i < info->n_dims; ++i) {
6548
- if (info->ne[i] <= 0) {
6549
- fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
6550
- return false;
6551
- }
6552
- }
6553
-
6554
- // prevent overflow for total number of elements
6555
- if (INT64_MAX/info->ne[1] <= info->ne[0]) {
6556
- fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
6557
- return false;
6558
- }
6559
-
6560
- if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
6561
- fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
6562
- return false;
6563
- }
6564
-
6565
- if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
6566
- fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
6567
- return false;
6568
- }
6569
-
6570
- return true;
6571
- }
6572
-
6573
- static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
6574
- const size_t n = fread(dst, 1, size, file);
6575
- *offset += n;
6576
- return n == size;
6577
- }
6578
-
6579
- static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
6580
- p->n = 0;
6581
- p->data = NULL;
6582
-
6583
- bool ok = true;
6584
-
6585
- ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
6586
-
6587
- // early exit if string length is invalid, prevents from integer overflow
6588
- if (p->n == SIZE_MAX) {
6589
- fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
6590
- return false;
6591
- }
6592
-
6593
- p->data = calloc(p->n + 1, 1);
6594
- if (!p->data) {
6595
- fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n);
6596
- return false;
6597
- }
6598
-
6599
- ok = ok && gguf_fread_el(file, p->data, p->n, offset);
6600
-
6601
- return ok;
6602
- }
6603
-
6604
- static void gguf_free_kv(struct gguf_kv * kv) {
6605
- if (kv->key.data) {
6606
- GGML_FREE(kv->key.data);
6607
- }
6608
-
6609
- if (kv->type == GGUF_TYPE_STRING) {
6610
- if (kv->value.str.data) {
6611
- GGML_FREE(kv->value.str.data);
6612
- }
6613
- }
6614
-
6615
- if (kv->type == GGUF_TYPE_ARRAY) {
6616
- if (kv->value.arr.data) {
6617
- if (kv->value.arr.type == GGUF_TYPE_STRING) {
6618
- for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
6619
- struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
6620
- if (str->data) {
6621
- GGML_FREE(str->data);
6622
- }
6623
- }
6624
- }
6625
- GGML_FREE(kv->value.arr.data);
6626
- }
6627
- }
6628
- }
6629
-
6630
- struct gguf_context * gguf_init_empty(void) {
6631
- struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
6632
- if (!ctx) {
6633
- fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
6634
- return NULL;
6635
- }
6636
-
6637
- memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
6638
- ctx->header.version = GGUF_VERSION;
6639
- ctx->header.n_tensors = 0;
6640
- ctx->header.n_kv = 0;
6641
-
6642
- ctx->kv = NULL;
6643
- ctx->infos = NULL;
6644
-
6645
- ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
6646
- ctx->offset = 0;
6647
- ctx->size = 0;
6648
-
6649
- ctx->data = NULL;
6650
-
6651
- return ctx;
6652
- }
6653
-
6654
- struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
6655
- // offset from start of file
6656
- size_t offset = 0;
6657
-
6658
- char magic[4];
6659
-
6660
- // check the magic before making allocations
6661
- {
6662
- gguf_fread_el(file, &magic, sizeof(magic), &offset);
6663
-
6664
- for (uint32_t i = 0; i < sizeof(magic); i++) {
6665
- if (magic[i] != GGUF_MAGIC[i]) {
6666
- fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
6667
- return NULL;
6668
- }
6669
- }
6670
- }
6671
-
6672
- bool ok = true;
6673
-
6674
- struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
6675
- if (!ctx) {
6676
- fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
6677
- return NULL;
6678
- }
6679
-
6680
- // read the header
6681
- {
6682
- strncpy(ctx->header.magic, magic, 4);
6683
-
6684
- ctx->kv = NULL;
6685
- ctx->infos = NULL;
6686
- ctx->data = NULL;
6687
-
6688
- ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
6689
- ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
6690
- ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
6691
-
6692
- if (ctx->header.version == 1) {
6693
- fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
6694
- gguf_free(ctx);
6695
- return NULL;
6696
- }
6697
-
6698
- // sanity-checks to prevent from integer/buffer overflows
6699
-
6700
- ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
6701
- ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
6702
- ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
6703
-
6704
- if (!ok) {
6705
- fprintf(stderr, "%s: failed to read header\n", __func__);
6706
- gguf_free(ctx);
6707
- return NULL;
6708
- }
6709
- }
6710
-
6711
- // read the kv pairs
6712
- {
6713
- const uint64_t n_kv = ctx->header.n_kv;
6714
-
6715
- if (n_kv > 0) {
6716
- ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
6717
- if (!ctx->kv) {
6718
- fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
6719
- gguf_free(ctx);
6720
- return NULL;
6721
- }
6722
- }
6723
-
6724
- for (uint64_t i = 0; i < n_kv; ++i) {
6725
- struct gguf_kv * kv = &ctx->kv[i];
6726
-
6727
- //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
6728
-
6729
- ok = ok && gguf_fread_str(file, &kv->key, &offset);
6730
- ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
6731
-
6732
- //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
6733
-
6734
- switch (kv->type) {
6735
- case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
6736
- case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
6737
- case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
6738
- case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
6739
- case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
6740
- case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
6741
- case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
6742
- case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
6743
- case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
6744
- case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
6745
- case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
6746
- case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
6747
- case GGUF_TYPE_ARRAY:
6748
- {
6749
- ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
6750
- ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
6751
-
6752
- switch (kv->value.arr.type) {
6753
- case GGUF_TYPE_UINT8:
6754
- case GGUF_TYPE_INT8:
6755
- case GGUF_TYPE_UINT16:
6756
- case GGUF_TYPE_INT16:
6757
- case GGUF_TYPE_UINT32:
6758
- case GGUF_TYPE_INT32:
6759
- case GGUF_TYPE_FLOAT32:
6760
- case GGUF_TYPE_UINT64:
6761
- case GGUF_TYPE_INT64:
6762
- case GGUF_TYPE_FLOAT64:
6763
- case GGUF_TYPE_BOOL:
6764
- {
6765
- // prevent from integer overflow in the malloc below
6766
- if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
6767
- fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
6768
- gguf_free(ctx);
6769
- return NULL;
6770
- }
6771
-
6772
- kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
6773
- if (!kv->value.arr.data) {
6774
- fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
6775
- gguf_free(ctx);
6776
- return NULL;
6777
- }
6778
-
6779
- ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
6780
- } break;
6781
- case GGUF_TYPE_STRING:
6782
- {
6783
- // prevent from integer overflow in the malloc below
6784
- if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
6785
- fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
6786
- gguf_free(ctx);
6787
- return NULL;
6788
- }
6789
-
6790
- kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
6791
- if (!kv->value.arr.data) {
6792
- fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
6793
- gguf_free(ctx);
6794
- return NULL;
6795
- }
6796
-
6797
- for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
6798
- ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
6799
- }
6800
- } break;
6801
- case GGUF_TYPE_ARRAY:
6802
- default:
6803
- {
6804
- fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type);
6805
- ok = false;
6806
- } break;
6807
- }
6808
- } break;
6809
- default:
6810
- {
6811
- fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type);
6812
- ok = false;
6813
- } break;
6814
- }
6815
-
6816
- if (!ok) {
6817
- break;
6818
- }
6819
- }
6820
-
6821
- if (!ok) {
6822
- fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
6823
- gguf_free(ctx);
6824
- return NULL;
6825
- }
6826
- }
6827
-
6828
- // read the tensor infos
6829
- if (ctx->header.n_tensors > 0) {
6830
- ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
6831
- if (!ctx->infos) {
6832
- fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
6833
- gguf_free(ctx);
6834
- return NULL;
6835
- }
6836
-
6837
- for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
6838
- struct gguf_tensor_info * info = &ctx->infos[i];
6839
-
6840
- for (int j = 0; j < GGML_MAX_DIMS; ++j) {
6841
- info->ne[j] = 1;
6842
- }
6843
-
6844
- ok = ok && gguf_fread_str(file, &info->name, &offset);
6845
- ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
6846
-
6847
- ok = ok && (info->n_dims <= GGML_MAX_DIMS);
6848
-
6849
- for (uint32_t j = 0; j < info->n_dims; ++j) {
6850
- ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
6851
- }
6852
-
6853
- ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
6854
- ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
6855
-
6856
- ok = ok && gguf_tensor_info_sanitize(info);
6857
-
6858
- // make sure there is no duplicated tensor names
6859
- for (uint64_t j = 0; j < i && ok; ++j) {
6860
- if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
6861
- fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
6862
- ok = false;
6863
- }
6864
- }
6865
-
6866
- if (!ok) {
6867
- fprintf(stderr, "%s: failed to read tensor info\n", __func__);
6868
- gguf_free(ctx);
6869
- return NULL;
6870
- }
6871
- }
6872
- }
6873
-
6874
- ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
6875
-
6876
- int alignment_idx = gguf_find_key(ctx, "general.alignment");
6877
- if (alignment_idx != -1) {
6878
- ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
6879
- }
6880
-
6881
- // we require the data section to be aligned, so take into account any padding
6882
- {
6883
- const size_t offset_pad = offset % ctx->alignment;
6884
-
6885
- if (offset_pad != 0) {
6886
- offset += ctx->alignment - offset_pad;
6887
- fseek(file, offset, SEEK_SET);
6888
- }
6889
- }
6890
-
6891
- // store the current file offset - this is where the data section starts
6892
- ctx->offset = offset;
6893
-
6894
- // compute the total size of the data section, taking into account the alignment
6895
- {
6896
- ctx->size = 0;
6897
- for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
6898
- struct gguf_tensor_info * info = &ctx->infos[i];
6899
-
6900
- const int64_t ne =
6901
- (int64_t) info->ne[0] *
6902
- (int64_t) info->ne[1] *
6903
- (int64_t) info->ne[2] *
6904
- (int64_t) info->ne[3];
6905
-
6906
- if (ggml_blck_size(info->type) == 0 ) {
6907
- // this tensor type support have been removed:
6908
- fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
6909
- __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
6910
- gguf_free(ctx);
6911
- return NULL;
6912
- }
6913
-
6914
- if (ne % ggml_blck_size(info->type) != 0) {
6915
- fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
6916
- __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
6917
- gguf_free(ctx);
6918
- return NULL;
6919
- }
6920
-
6921
- const size_t size_cur = ggml_row_size(info->type, ne);
6922
-
6923
- ctx->size += GGML_PAD(size_cur, ctx->alignment);
6924
- }
6925
- }
6926
-
6927
- // load the tensor data only if requested
6928
- if (params.ctx != NULL) {
6929
- // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
6930
- // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
6931
- // the ggml_tensor structs to the appropriate locations in the binary blob
6932
-
6933
- // compute the exact size needed for the new ggml_context
6934
- const size_t mem_size =
6935
- params.no_alloc ?
6936
- (ctx->header.n_tensors )*ggml_tensor_overhead() :
6937
- (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
6938
-
6939
- struct ggml_init_params pdata = {
6940
- .mem_size = mem_size,
6941
- .mem_buffer = NULL,
6942
- .no_alloc = params.no_alloc,
6943
- };
6944
-
6945
- *params.ctx = ggml_init(pdata);
6946
- if (*params.ctx == NULL) {
6947
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
6948
- gguf_free(ctx);
6949
- return NULL;
6950
- }
6951
-
6952
- struct ggml_context * ctx_data = *params.ctx;
6953
-
6954
- struct ggml_tensor * data = NULL;
6955
-
6956
- if (!params.no_alloc) {
6957
- data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
6958
-
6959
- ok = ok && data != NULL;
6960
-
6961
- // read the binary blob with the tensor data
6962
- ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
6963
-
6964
- if (!ok) {
6965
- fprintf(stderr, "%s: failed to read tensor data\n", __func__);
6966
- ggml_free(ctx_data);
6967
- gguf_free(ctx);
6968
- return NULL;
6969
- }
6970
-
6971
- ctx->data = data->data;
6972
- }
6973
-
6974
- ggml_set_no_alloc(ctx_data, true);
6975
-
6976
- // create the tensors
6977
- for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
6978
- const int64_t ne[GGML_MAX_DIMS] = {
6979
- ctx->infos[i].ne[0],
6980
- ctx->infos[i].ne[1],
6981
- ctx->infos[i].ne[2],
6982
- ctx->infos[i].ne[3],
6983
- };
6984
-
6985
- struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
6986
-
6987
- ok = ok && cur != NULL;
6988
-
6989
- if (!ok) {
6990
- break;
6991
- }
6992
-
6993
- ggml_set_name(cur, ctx->infos[i].name.data);
6994
-
6995
- // point the data member to the appropriate location in the binary blob using the tensor infos
6996
- if (!params.no_alloc) {
6997
- //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
6998
- cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
6999
- }
7000
- }
7001
-
7002
- if (!ok) {
7003
- fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
7004
- ggml_free(ctx_data);
7005
- gguf_free(ctx);
7006
- return NULL;
7007
- }
7008
-
7009
- ggml_set_no_alloc(ctx_data, params.no_alloc);
7010
- }
7011
-
7012
- return ctx;
7013
- }
7014
-
7015
- struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
7016
- FILE * file = ggml_fopen(fname, "rb");
7017
- if (!file) {
7018
- fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
7019
- return NULL;
7020
- }
7021
-
7022
- struct gguf_context * result = gguf_init_from_file_impl(file, params);
7023
- fclose(file);
7024
- return result;
7025
- }
7026
-
7027
- void gguf_free(struct gguf_context * ctx) {
7028
- if (ctx == NULL) {
7029
- return;
7030
- }
7031
-
7032
- if (ctx->kv) {
7033
- // free string memory - not great..
7034
- for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
7035
- gguf_free_kv(&ctx->kv[i]);
7036
- }
7037
-
7038
- GGML_FREE(ctx->kv);
7039
- }
7040
-
7041
- if (ctx->infos) {
7042
- for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
7043
- struct gguf_tensor_info * info = &ctx->infos[i];
7044
-
7045
- if (info->name.data) {
7046
- GGML_FREE(info->name.data);
7047
- }
7048
- }
7049
-
7050
- GGML_FREE(ctx->infos);
7051
- }
7052
-
7053
- GGML_FREE(ctx);
7054
- }
7055
-
7056
- const char * gguf_type_name(enum gguf_type type) {
7057
- return GGUF_TYPE_NAME[type];
7058
- }
7059
-
7060
- int gguf_get_version(const struct gguf_context * ctx) {
7061
- return ctx->header.version;
7062
- }
7063
-
7064
- size_t gguf_get_alignment(const struct gguf_context * ctx) {
7065
- return ctx->alignment;
7066
- }
7067
-
7068
- size_t gguf_get_data_offset(const struct gguf_context * ctx) {
7069
- return ctx->offset;
7070
- }
7071
-
7072
- void * gguf_get_data(const struct gguf_context * ctx) {
7073
- return ctx->data;
7074
- }
7075
-
7076
- int gguf_get_n_kv(const struct gguf_context * ctx) {
7077
- return ctx->header.n_kv;
7078
- }
7079
-
7080
- int gguf_find_key(const struct gguf_context * ctx, const char * key) {
7081
- // return -1 if key not found
7082
- int keyfound = -1;
7083
-
7084
- const int n_kv = gguf_get_n_kv(ctx);
7085
-
7086
- for (int i = 0; i < n_kv; ++i) {
7087
- if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
7088
- keyfound = i;
7089
- break;
7090
- }
7091
- }
7092
-
7093
- return keyfound;
7094
- }
7095
-
7096
- const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
7097
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7098
- return ctx->kv[key_id].key.data;
7099
- }
7100
-
7101
- enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
7102
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7103
- return ctx->kv[key_id].type;
7104
- }
7105
-
7106
- enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
7107
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7108
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
7109
- return ctx->kv[key_id].value.arr.type;
7110
- }
7111
-
7112
- const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
7113
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7114
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
7115
- return ctx->kv[key_id].value.arr.data;
7116
- }
7117
-
7118
- const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
7119
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7120
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
7121
- struct gguf_kv * kv = &ctx->kv[key_id];
7122
- struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
7123
- return str->data;
7124
- }
7125
-
7126
- int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
7127
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7128
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
7129
- return ctx->kv[key_id].value.arr.n;
7130
- }
7131
-
7132
- uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
7133
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7134
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
7135
- return ctx->kv[key_id].value.uint8;
7136
- }
7137
-
7138
- int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
7139
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7140
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
7141
- return ctx->kv[key_id].value.int8;
7142
- }
7143
-
7144
- uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
7145
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7146
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
7147
- return ctx->kv[key_id].value.uint16;
7148
- }
7149
-
7150
- int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
7151
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7152
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
7153
- return ctx->kv[key_id].value.int16;
7154
- }
7155
-
7156
- uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
7157
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7158
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
7159
- return ctx->kv[key_id].value.uint32;
7160
- }
7161
-
7162
- int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
7163
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7164
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
7165
- return ctx->kv[key_id].value.int32;
7166
- }
7167
-
7168
- float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
7169
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7170
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
7171
- return ctx->kv[key_id].value.float32;
7172
- }
7173
-
7174
- uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
7175
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7176
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
7177
- return ctx->kv[key_id].value.uint64;
7178
- }
7179
-
7180
- int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
7181
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7182
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
7183
- return ctx->kv[key_id].value.int64;
7184
- }
7185
-
7186
- double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
7187
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7188
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
7189
- return ctx->kv[key_id].value.float64;
7190
- }
7191
-
7192
- bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
7193
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7194
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
7195
- return ctx->kv[key_id].value.bool_;
7196
- }
7197
-
7198
- const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
7199
- GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
7200
- GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
7201
- return ctx->kv[key_id].value.str.data;
7202
- }
7203
-
7204
- const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
-     GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
-     GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
-     GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
-     return &ctx->kv[key_id].value;
- }
-
- int gguf_get_n_tensors(const struct gguf_context * ctx) {
-     return ctx->header.n_tensors;
- }
-
- int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
-     // return -1 if tensor not found
-     int tensorfound = -1;
-
-     const int n_tensors = gguf_get_n_tensors(ctx);
-
-     for (int i = 0; i < n_tensors; ++i) {
-         if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
-             tensorfound = i;
-             break;
-         }
-     }
-
-     return tensorfound;
- }
-
- size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
-     return ctx->infos[i].offset;
- }
-
- char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
-     return ctx->infos[i].name.data;
- }
-
- enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
-     return ctx->infos[i].type;
- }
-
- // returns the index
- static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
-     const int idx = gguf_find_key(ctx, key);
-     if (idx >= 0) {
-         return idx;
-     }
-
-     const int n_kv = gguf_get_n_kv(ctx);
-
-     ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
-     ctx->kv[n_kv].key.n = strlen(key);
-     ctx->kv[n_kv].key.data = strdup(key);
-     ctx->header.n_kv++;
-
-     return n_kv;
- }
-
- void gguf_remove_key(struct gguf_context * ctx, const char * key) {
-     const int idx = gguf_find_key(ctx, key);
-     if (idx >= 0) {
-         const int n_kv = gguf_get_n_kv(ctx);
-         gguf_free_kv(&ctx->kv[idx]);
-         for (int i = idx; i < n_kv-1; ++i) {
-             ctx->kv[i] = ctx->kv[i+1];
-         }
-         ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
-         ctx->header.n_kv--;
-     }
- }
-
- void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_UINT8;
-     ctx->kv[idx].value.uint8 = val;
- }
-
- void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_INT8;
-     ctx->kv[idx].value.int8 = val;
- }
-
- void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_UINT16;
-     ctx->kv[idx].value.uint16 = val;
- }
-
- void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_INT16;
-     ctx->kv[idx].value.int16 = val;
- }
-
- void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_UINT32;
-     ctx->kv[idx].value.uint32 = val;
- }
-
- void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_INT32;
-     ctx->kv[idx].value.int32 = val;
- }
-
- void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
-     ctx->kv[idx].value.float32 = val;
- }
-
- void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_UINT64;
-     ctx->kv[idx].value.uint64 = val;
- }
-
- void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_INT64;
-     ctx->kv[idx].value.int64 = val;
- }
-
- void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
-     ctx->kv[idx].value.float64 = val;
- }
-
- void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_BOOL;
-     ctx->kv[idx].value.bool_ = val;
- }
-
- void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_STRING;
-     ctx->kv[idx].value.str.n = strlen(val);
-     ctx->kv[idx].value.str.data = strdup(val);
- }
-
- void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
-     ctx->kv[idx].value.arr.type = type;
-     ctx->kv[idx].value.arr.n = n;
-     ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
-     memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
- }
-
- void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
-     const int idx = gguf_get_or_add_key(ctx, key);
-
-     ctx->kv[idx].type = GGUF_TYPE_ARRAY;
-     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
-     ctx->kv[idx].value.arr.n = n;
-     ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
-     for (int i = 0; i < n; i++) {
-         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
-         str->n = strlen(data[i]);
-         str->data = strdup(data[i]);
-     }
- }
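All of the gguf_set_val_* and gguf_set_arr_* setters removed above funnel through gguf_get_or_add_key, so writing the same key twice replaces the value rather than duplicating the pair. A minimal sketch of how a caller drives this API, using only the public functions shown in this hunk (the key names are illustrative, not required by the format):

    #include "ggml.h"   // the gguf_* declarations live here in this version

    int main(void) {
        struct gguf_context * gguf = gguf_init_empty();

        // set-or-replace semantics: the second call overwrites the first
        gguf_set_val_u32(gguf, "demo.version", 1);
        gguf_set_val_u32(gguf, "demo.version", 2);

        gguf_set_val_str(gguf, "general.name", "demo-model");

        const char * tags[] = { "en", "de" };
        gguf_set_arr_str(gguf, "demo.tags", tags, 2);   // strings are strdup'ed, tags may go out of scope

        gguf_free(gguf);
        return 0;
    }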
-
- // set or add KV pairs from another context
- void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
-     for (uint32_t i = 0; i < src->header.n_kv; i++) {
-         switch (src->kv[i].type) {
-             case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
-             case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
-             case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
-             case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
-             case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
-             case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
-             case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
-             case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
-             case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
-             case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
-             case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
-             case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
-             case GGUF_TYPE_ARRAY:
-                 {
-                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                         const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
-                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
-                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
-                         }
-                         gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
-                         GGML_FREE((void *)data);
-                     } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
-                         GGML_ABORT("nested arrays not supported");
-                     } else {
-                         gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
-                     }
-                 } break;
-             default: GGML_ABORT("invalid type");
-         }
-     }
- }
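gguf_set_kv deep-copies every KV pair from one context into another, aborting only on nested arrays, which the format does not support. That is the usual way to carry metadata over when rewriting a file. A sketch of the pattern, assuming only the public API visible in this diff; clone_metadata is a hypothetical helper name:

    #include <stddef.h>
    #include "ggml.h"

    // copy all KV metadata of an existing GGUF file into a fresh context
    struct gguf_context * clone_metadata(const char * fname) {
        struct gguf_init_params params = {
            /*.no_alloc =*/ true,   // metadata only, do not load tensor data
            /*.ctx      =*/ NULL,
        };
        struct gguf_context * src = gguf_init_from_file(fname, params);
        if (src == NULL) {
            return NULL;            // file missing or not valid GGUF
        }
        struct gguf_context * dst = gguf_init_empty();
        gguf_set_kv(dst, src);      // deep copy, so src can be freed right away
        gguf_free(src);
        return dst;
    }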
-
- void gguf_add_tensor(
-         struct gguf_context * ctx,
-         const struct ggml_tensor * tensor) {
-     GGML_ASSERT(tensor);
-     if (gguf_find_tensor(ctx, tensor->name) != -1) {
-         GGML_ABORT("duplicated tensor name");
-     }
-
-     const int idx = ctx->header.n_tensors;
-     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
-
-     ctx->infos[idx].name.n = strlen(tensor->name);
-     ctx->infos[idx].name.data = strdup(tensor->name);
-
-     for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-         ctx->infos[idx].ne[i] = 1;
-     }
-
-     ctx->infos[idx].n_dims = ggml_n_dims(tensor);
-     for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
-         ctx->infos[idx].ne[i] = tensor->ne[i];
-     }
-
-     ctx->infos[idx].type = tensor->type;
-     ctx->infos[idx].offset = 0;
-     ctx->infos[idx].data = tensor->data;
-     ctx->infos[idx].size = ggml_nbytes(tensor);
-
-     if (ctx->header.n_tensors > 0) {
-         ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
-     }
-
-     ctx->header.n_tensors++;
- }
-
- void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
-     const int idx = gguf_find_tensor(ctx, name);
-     if (idx < 0) {
-         GGML_ABORT("tensor not found");
-     }
-
-     ctx->infos[idx].type = type;
- }
-
- void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
-     const int idx = gguf_find_tensor(ctx, name);
-     if (idx < 0) {
-         GGML_ABORT("tensor not found");
-     }
-
-     ctx->infos[idx].data = data;
-     ctx->infos[idx].size = size;
-
-     // update offsets
-     for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
-         ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
-     }
- }
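Both gguf_add_tensor and gguf_set_tensor_data maintain the same invariant: each tensor starts at the previous tensor's offset plus its byte size rounded up to ctx->alignment, which is why gguf_set_tensor_data has to rewrite every offset after the one it touches. A small standalone illustration of the rounding, using the same rule GGML_PAD encodes in ggml.h and hypothetical tensor sizes with the format's default 32-byte alignment:

    #include <stdio.h>
    #include <stddef.h>

    // same rounding rule as GGML_PAD: round x up to the next multiple of n
    #define PAD(x, n) (((x) + (n) - 1) / (n) * (n))

    int main(void) {
        const size_t alignment = 32;              // GGUF's default alignment
        const size_t sizes[]   = { 1000, 64, 7 }; // hypothetical tensor byte sizes
        size_t offset = 0;
        for (int i = 0; i < 3; ++i) {
            printf("tensor %d: offset=%zu size=%zu\n", i, offset, sizes[i]);
            offset += PAD(sizes[i], alignment);   // 1000 -> 1024, 64 -> 64, 7 -> 32
        }
        return 0;                                 // offsets printed: 0, 1024, 1088
    }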
-
- //static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
- //    fwrite(&val->n, sizeof(val->n), 1, file);
- //    fwrite(val->data, sizeof(char), val->n, file);
- //}
- //
- //static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
- //    fwrite(val, sizeof(char), size, file);
- //}
-
- struct gguf_buf gguf_buf_init(size_t size) {
-     struct gguf_buf buf = {
-         /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
-         /*buf.size =*/ size,
-         /*buf.offset =*/ 0,
-     };
-
-     return buf;
- }
-
- void gguf_buf_free(struct gguf_buf buf) {
-     if (buf.data) {
-         GGML_FREE(buf.data);
-     }
- }
-
- static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
-     if (buf->offset + size > buf->size) {
-         buf->size = 1.5*(buf->offset + size);
-         if (buf->data) {
-             buf->data = realloc(buf->data, buf->size);
-         }
-     }
- }
-
- static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
-     gguf_buf_grow(buf, sizeof(val->n) + val->n);
-
-     if (buf->data) {
-         memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
-     }
-     buf->offset += sizeof(val->n);
-
-     if (buf->data) {
-         memcpy((char *) buf->data + buf->offset, val->data, val->n);
-     }
-     buf->offset += val->n;
- }
-
- static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
-     gguf_buf_grow(buf, el_size);
-
-     if (buf->data) {
-         memcpy((char *) buf->data + buf->offset, val, el_size);
-     }
-     buf->offset += el_size;
- }
-
- void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
-     // write header
-     gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
-     gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
-     gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-     gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
-
-     // write key-value pairs
-     for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
-         struct gguf_kv * kv = &ctx->kv[i];
-
-         gguf_bwrite_str(buf, &kv->key);
-         gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
-
-         switch (kv->type) {
-             case GGUF_TYPE_UINT8: gguf_bwrite_el (buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
-             case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
-             case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
-             case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
-             case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
-             case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
-             case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
-             case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
-             case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
-             case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
-             case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
-             case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
-             case GGUF_TYPE_ARRAY:
-                 {
-                     gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                     gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
-
-                     switch (kv->value.arr.type) {
-                         case GGUF_TYPE_UINT8:
-                         case GGUF_TYPE_INT8:
-                         case GGUF_TYPE_UINT16:
-                         case GGUF_TYPE_INT16:
-                         case GGUF_TYPE_UINT32:
-                         case GGUF_TYPE_INT32:
-                         case GGUF_TYPE_FLOAT32:
-                         case GGUF_TYPE_UINT64:
-                         case GGUF_TYPE_INT64:
-                         case GGUF_TYPE_FLOAT64:
-                         case GGUF_TYPE_BOOL:
-                             {
-                                 gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
-                             } break;
-                         case GGUF_TYPE_STRING:
-                             {
-                                 for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                     gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
-                                 }
-                             } break;
-                         case GGUF_TYPE_ARRAY:
-                         default: GGML_ABORT("invalid type");
-                     }
-                 } break;
-             default: GGML_ABORT("invalid type");
-         }
-     }
-
-     // write tensor infos
-     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-         struct gguf_tensor_info * info = &ctx->infos[i];
-
-         gguf_bwrite_str(buf, &info->name);
-         gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
-         for (uint32_t j = 0; j < info->n_dims; ++j) {
-             gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
-         }
-         gguf_bwrite_el(buf, &info->type, sizeof(info->type));
-         gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
-     }
-
-     // we require the data section to be aligned, so take into account any padding
-     {
-         const size_t offset = buf->offset;
-         const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
-
-         if (offset_pad != offset) {
-             uint8_t pad = 0;
-             for (size_t i = 0; i < offset_pad - offset; ++i) {
-                 gguf_bwrite_el(buf, &pad, sizeof(pad));
-             }
-         }
-     }
-
-     if (only_meta) {
-         return;
-     }
-
-     size_t offset = 0;
-
-     // write tensor data
-     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-         struct gguf_tensor_info * info = &ctx->infos[i];
-
-         const size_t size = info->size;
-         const size_t size_pad = GGML_PAD(size, ctx->alignment);
-
-         gguf_bwrite_el(buf, info->data, size);
-
-         if (size_pad != size) {
-             uint8_t pad = 0;
-             for (size_t j = 0; j < size_pad - size; ++j) {
-                 gguf_bwrite_el(buf, &pad, sizeof(pad));
-             }
-         }
-
-         GGML_ASSERT(offset == info->offset);
-
-         offset += size_pad;
-     }
- }
-
- void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
-     FILE * file = ggml_fopen(fname, "wb");
-     if (!file) {
-         GGML_ABORT("failed to open file for writing");
-     }
-
-     struct gguf_buf buf = gguf_buf_init(16*1024);
-
-     gguf_write_to_buf(ctx, &buf, only_meta);
-
-     fwrite(buf.data, 1, buf.offset, file);
-
-     gguf_buf_free(buf);
-
-     fclose(file);
- }
-
- size_t gguf_get_meta_size(const struct gguf_context * ctx) {
-     // no allocs - only compute size
-     struct gguf_buf buf = gguf_buf_init(0);
-
-     gguf_write_to_buf(ctx, &buf, true);
-
-     return buf.offset;
- }
-
- void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
-     struct gguf_buf buf = gguf_buf_init(16*1024);
-
-     gguf_write_to_buf(ctx, &buf, true);
-
-     memcpy(data, buf.data, buf.offset);
-
-     gguf_buf_free(buf);
- }
-
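The writer removed above serializes in a fixed order: header, KV pairs, tensor infos, padding up to the alignment boundary, then (unless only_meta is set) the tensor data itself, asserting that each tensor lands exactly at the offset gguf_add_tensor recorded. Judging by the rest of this diff, the implementation moves out of this file rather than out of the library, so the usual write path should still look like the sketch below; write_demo is a hypothetical helper, and the tensor is assumed to be created elsewhere with its data allocated:

    #include "ggml.h"

    // end-to-end GGUF write path (sketch): metadata first, then tensor data
    void write_demo(struct ggml_tensor * t) {
        struct gguf_context * gguf = gguf_init_empty();

        gguf_set_val_str(gguf, "general.name", "demo");
        gguf_add_tensor(gguf, t);                       // records name/shape/type, computes aligned offset

        gguf_write_to_file(gguf, "demo.gguf", false);   // false: write tensor data too, not just metadata
        gguf_free(gguf);
    }

For in-memory serialization, gguf_get_meta_size plus gguf_get_meta_data provide the same metadata bytes without touching the filesystem.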
  void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
      g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
      g_logger_state.log_callback_user_data = user_data;