@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/requirements.txt
@@ -10,3 +10,4 @@
 -r ./requirements/requirements-convert_hf_to_gguf_update.txt
 -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt
 -r ./requirements/requirements-convert_lora_to_gguf.txt
+-r ./requirements/requirements-tool_bench.txt
package/src/llama.cpp/src/CMakeLists.txt
@@ -15,18 +15,21 @@ add_library(llama
             llama-chat.cpp
             llama-context.cpp
             llama-grammar.cpp
+            llama-graph.cpp
             llama-hparams.cpp
             llama-impl.cpp
+            llama-io.cpp
             llama-kv-cache.cpp
+            llama-memory.cpp
             llama-mmap.cpp
             llama-model-loader.cpp
             llama-model.cpp
             llama-quant.cpp
             llama-sampling.cpp
             llama-vocab.cpp
-            unicode.h
-            unicode.cpp
             unicode-data.cpp
+            unicode.cpp
+            unicode.h
             )
 
 target_include_directories(llama PUBLIC . ../include ../common)
package/src/llama.cpp/src/llama-adapter.cpp
@@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 
-#include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 
 // vec
 
-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     return tensors[il];
 }
 
-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
     return true;
 }
 
-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
         // disable the current control vector (but leave allocated for later)
         layer_start = -1;
         layer_end   = -1;
-        return 0;
+        return true;
     }
 
     if (n_embd != (int) hparams.n_embd) {
         LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
     }
 
     if (tensors.empty()) {
        if (!init(model)) {
-            return 1;
+            return false;
        }
     }
 
@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
         }
     }
 
-    return 0;
+    return true;
 }
 
 // lora
 
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
     return nullptr;
 }
 
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
         /* .no_alloc = */ true,
         /* .ctx      = */ &ctx_init,
     };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
@@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (is_token_embd) {
             // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
         }
 
         // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     {
         llama_file gguf_file(path_lora, "rb");
         std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
             size_t size = ggml_nbytes(orig);
             read_buf.resize(size);
@@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+    llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
         llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
     return nullptr;
 }
 
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
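Note on the llama-adapter.cpp hunks above: llama_adapter_cvec::apply now reports success as a bool (true on success) rather than an int32_t error code (0 on success), so any caller written against the old convention must invert its check. A standalone sketch of the migration pattern — this is not code from the package; apply_cvec and n_embd_matches are hypothetical stand-ins:

// Hypothetical sketch of the int32_t -> bool migration implied above.
#include <cstdio>

static bool apply_cvec(bool n_embd_matches) {
    if (!n_embd_matches) {
        std::fprintf(stderr, "control vector n_embd does not match model\n");
        return false; // was: return 1;
    }
    return true;      // was: return 0;
}

int main() {
    // was: if (apply_cvec(...) != 0) return 1;
    if (!apply_cvec(true)) {
        return 1;
    }
    std::puts("control vector applied");
    return 0;
}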
package/src/llama.cpp/src/llama-adapter.h
@@ -15,11 +15,11 @@
 //
 
 struct llama_adapter_cvec {
-    struct ggml_tensor * tensor_for(int il) const;
+    ggml_tensor * tensor_for(int il) const;
 
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;
 
-    int32_t apply(
+    bool apply(
             const llama_model & model,
             const float * data,
             size_t len,
@@ -36,7 +36,7 @@ private:
     std::vector<ggml_context_ptr>        ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_tensor *> tensors; // per layer
 };
 
 //
@@ -44,8 +44,8 @@ private:
 //
 
 struct llama_adapter_lora_weight {
-    struct ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
+    ggml_tensor * a = nullptr;
+    ggml_tensor * b = nullptr;
 
     // get actual scale based on rank and alpha
     float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
     }
 
     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };
 
 struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
-    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
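The new llama_adapter_loras alias at the end of llama-adapter.h maps an adapter pointer to its blend scale. A minimal sketch of how such a map can be populated — only the alias itself comes from the diff; set_adapter_scale is a hypothetical helper:

// Minimal sketch around the new alias; set_adapter_scale is hypothetical.
#include <unordered_map>

struct llama_adapter_lora; // opaque here; the real definition is in llama-adapter.h

using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;

static void set_adapter_scale(llama_adapter_loras & loras,
                              llama_adapter_lora * adapter, float scale) {
    loras[adapter] = scale; // a scale of 0.0f effectively disables the adapter
}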
package/src/llama.cpp/src/llama-arch.cpp
@@ -36,6 +36,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MINICPM3,        "minicpm3"     },
     { LLM_ARCH_GEMMA,           "gemma"        },
     { LLM_ARCH_GEMMA2,          "gemma2"       },
+    { LLM_ARCH_GEMMA3,          "gemma3"       },
     { LLM_ARCH_STARCODER2,      "starcoder2"   },
     { LLM_ARCH_MAMBA,           "mamba"        },
     { LLM_ARCH_XVERSE,          "xverse"       },
@@ -58,6 +59,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_EXAONE,          "exaone"       },
     { LLM_ARCH_RWKV6,           "rwkv6"        },
     { LLM_ARCH_RWKV6QWEN2,      "rwkv6qwen2"   },
+    { LLM_ARCH_RWKV7,           "rwkv7"        },
+    { LLM_ARCH_ARWKV7,          "arwkv7"       },
     { LLM_ARCH_GRANITE,         "granite"      },
     { LLM_ARCH_GRANITE_MOE,     "granitemoe"   },
     { LLM_ARCH_CHAMELEON,       "chameleon"    },
@@ -109,22 +112,26 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE,   "%s.embedding_scale" },
     { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
 
-    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
-    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
-    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,         "%s.attention.max_alibi_bias"         },
-    { LLM_KV_ATTENTION_CLAMP_KQV,              "%s.attention.clamp_kqv"              },
-    { LLM_KV_ATTENTION_KEY_LENGTH,             "%s.attention.key_length"             },
-    { LLM_KV_ATTENTION_VALUE_LENGTH,           "%s.attention.value_length"           },
-    { LLM_KV_ATTENTION_LAYERNORM_EPS,          "%s.attention.layer_norm_epsilon"     },
-    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      "%s.attention.layer_norm_rms_epsilon" },
-    { LLM_KV_ATTENTION_GROUPNORM_EPS,          "%s.attention.group_norm_epsilon"     },
-    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,       "%s.attention.group_norm_groups"      },
-    { LLM_KV_ATTENTION_CAUSAL,                 "%s.attention.causal"                 },
-    { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
-    { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
-    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
-    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
-    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
+    { LLM_KV_ATTENTION_HEAD_COUNT,                   "%s.attention.head_count"                   },
+    { LLM_KV_ATTENTION_HEAD_COUNT_KV,                "%s.attention.head_count_kv"                },
+    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,               "%s.attention.max_alibi_bias"               },
+    { LLM_KV_ATTENTION_CLAMP_KQV,                    "%s.attention.clamp_kqv"                    },
+    { LLM_KV_ATTENTION_KEY_LENGTH,                   "%s.attention.key_length"                   },
+    { LLM_KV_ATTENTION_VALUE_LENGTH,                 "%s.attention.value_length"                 },
+    { LLM_KV_ATTENTION_LAYERNORM_EPS,                "%s.attention.layer_norm_epsilon"           },
+    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,            "%s.attention.layer_norm_rms_epsilon"       },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS,                "%s.attention.group_norm_epsilon"           },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS,             "%s.attention.group_norm_groups"            },
+    { LLM_KV_ATTENTION_CAUSAL,                       "%s.attention.causal"                       },
+    { LLM_KV_ATTENTION_Q_LORA_RANK,                  "%s.attention.q_lora_rank"                  },
+    { LLM_KV_ATTENTION_KV_LORA_RANK,                 "%s.attention.kv_lora_rank"                 },
+    { LLM_KV_ATTENTION_DECAY_LORA_RANK,              "%s.attention.decay_lora_rank"              },
+    { LLM_KV_ATTENTION_ICLR_LORA_RANK,               "%s.attention.iclr_lora_rank"               },
+    { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" },
+    { LLM_KV_ATTENTION_GATE_LORA_RANK,               "%s.attention.gate_lora_rank"               },
+    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
+    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,    "%s.rope.dimension_count"    },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
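The LLM_KV_NAMES entries above are printf-style patterns in which %s is replaced by the architecture name, so the new rwkv7 metadata keys expand to strings such as "rwkv7.attention.decay_lora_rank". A standalone sketch of the expansion (not package code):

// Standalone sketch of how an LLM_KV_NAMES pattern expands for an arch name.
#include <cstdio>

int main() {
    char key[128];
    std::snprintf(key, sizeof(key), "%s.attention.decay_lora_rank", "rwkv7");
    std::printf("%s\n", key); // prints: rwkv7.attention.decay_lora_rank
    return 0;
}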
@@ -766,6 +773,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1217,6 +1244,74 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_RWKV7,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,          "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,     "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,         "output_norm" },
+            { LLM_TENSOR_OUTPUT,              "output" },
+            { LLM_TENSOR_ATTN_NORM,           "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_NORM_2,         "blk.%d.attn_norm_2" },
+            { LLM_TENSOR_TIME_MIX_W0,         "blk.%d.time_mix_w0" },
+            { LLM_TENSOR_TIME_MIX_W1,         "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2,         "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_A0,         "blk.%d.time_mix_a0" },
+            { LLM_TENSOR_TIME_MIX_A1,         "blk.%d.time_mix_a1" },
+            { LLM_TENSOR_TIME_MIX_A2,         "blk.%d.time_mix_a2" },
+            { LLM_TENSOR_TIME_MIX_V0,         "blk.%d.time_mix_v0" },
+            { LLM_TENSOR_TIME_MIX_V1,         "blk.%d.time_mix_v1" },
+            { LLM_TENSOR_TIME_MIX_V2,         "blk.%d.time_mix_v2" },
+            { LLM_TENSOR_TIME_MIX_G1,         "blk.%d.time_mix_g1" },
+            { LLM_TENSOR_TIME_MIX_G2,         "blk.%d.time_mix_g2" },
+            { LLM_TENSOR_TIME_MIX_K_K,        "blk.%d.time_mix_k_k" },
+            { LLM_TENSOR_TIME_MIX_K_A,        "blk.%d.time_mix_k_a" },
+            { LLM_TENSOR_TIME_MIX_R_K,        "blk.%d.time_mix_r_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_KEY,        "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE,      "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_LN,         "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT,     "blk.%d.time_mix_output" },
+            { LLM_TENSOR_CHANNEL_MIX_LERP_K,  "blk.%d.channel_mix_lerp_k" },
+            { LLM_TENSOR_CHANNEL_MIX_KEY,     "blk.%d.channel_mix_key" },
+            { LLM_TENSOR_CHANNEL_MIX_VALUE,   "blk.%d.channel_mix_value" },
+        },
+    },
+    {
+        LLM_ARCH_ARWKV7,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,          "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM,     "token_embd_norm" },
+            { LLM_TENSOR_OUTPUT_NORM,         "output_norm" },
+            { LLM_TENSOR_OUTPUT,              "output" },
+            { LLM_TENSOR_ATTN_NORM,           "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W0,         "blk.%d.time_mix_w0" },
+            { LLM_TENSOR_TIME_MIX_W1,         "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2,         "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_A0,         "blk.%d.time_mix_a0" },
+            { LLM_TENSOR_TIME_MIX_A1,         "blk.%d.time_mix_a1" },
+            { LLM_TENSOR_TIME_MIX_A2,         "blk.%d.time_mix_a2" },
+            { LLM_TENSOR_TIME_MIX_V0,         "blk.%d.time_mix_v0" },
+            { LLM_TENSOR_TIME_MIX_V1,         "blk.%d.time_mix_v1" },
+            { LLM_TENSOR_TIME_MIX_V2,         "blk.%d.time_mix_v2" },
+            { LLM_TENSOR_TIME_MIX_G1,         "blk.%d.time_mix_g1" },
+            { LLM_TENSOR_TIME_MIX_G2,         "blk.%d.time_mix_g2" },
+            { LLM_TENSOR_TIME_MIX_K_K,        "blk.%d.time_mix_k_k" },
+            { LLM_TENSOR_TIME_MIX_K_A,        "blk.%d.time_mix_k_a" },
+            { LLM_TENSOR_TIME_MIX_R_K,        "blk.%d.time_mix_r_k" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_KEY,        "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE,      "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_LN,         "blk.%d.time_mix_ln" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT,     "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM,            "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,            "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,            "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,              "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_GRANITE,
         {
@@ -1376,6 +1471,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_OUT,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_W2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_A1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_A2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_V1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_V2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_G1,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_TIME_MIX_G2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_DECAY_W1,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_DECAY_W2,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_TIME_MIX_KEY,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -1394,6 +1495,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_R,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_K_K,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_K_A,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_TIME_MIX_R_K,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_W,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_K,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_V,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
@@ -1401,6 +1505,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TIME_MIX_LERP_G,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_FUSED,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_W0,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_A0,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_V0,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_FIRST,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ATTN_NORM_2,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
package/src/llama.cpp/src/llama-arch.h
@@ -40,6 +40,7 @@ enum llm_arch {
     LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
+    LLM_ARCH_GEMMA3,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -62,6 +63,8 @@ enum llm_arch {
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
     LLM_ARCH_RWKV6QWEN2,
+    LLM_ARCH_RWKV7,
+    LLM_ARCH_ARWKV7,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
@@ -126,6 +129,10 @@ enum llm_kv {
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
+    LLM_KV_ATTENTION_DECAY_LORA_RANK,
+    LLM_KV_ATTENTION_ICLR_LORA_RANK,
+    LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
+    LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
@@ -249,8 +256,20 @@ enum llm_tensor {
     LLM_TENSOR_SSM_A,
     LLM_TENSOR_SSM_D,
     LLM_TENSOR_SSM_OUT,
+    LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
+    LLM_TENSOR_TIME_MIX_A0,
+    LLM_TENSOR_TIME_MIX_A1,
+    LLM_TENSOR_TIME_MIX_A2,
+    LLM_TENSOR_TIME_MIX_V0,
+    LLM_TENSOR_TIME_MIX_V1,
+    LLM_TENSOR_TIME_MIX_V2,
+    LLM_TENSOR_TIME_MIX_G1,
+    LLM_TENSOR_TIME_MIX_G2,
+    LLM_TENSOR_TIME_MIX_K_K,
+    LLM_TENSOR_TIME_MIX_K_A,
+    LLM_TENSOR_TIME_MIX_R_K,
     LLM_TENSOR_TIME_MIX_LERP_X,
     LLM_TENSOR_TIME_MIX_LERP_W,
     LLM_TENSOR_TIME_MIX_LERP_K,
package/src/llama.cpp/src/llama-batch.h
@@ -42,9 +42,9 @@ struct llama_sbatch {
     bool logits_all; // TODO: remove once lctx.logits_all is removed too
 
     // sorted indices into the batch
-    std::vector<size_t> ids;
+    std::vector<int64_t> ids;
     // batch indices of the output
-    std::vector<size_t> out_ids;
+    std::vector<int64_t> out_ids;
     std::vector<llama_sbatch_seq> seq;
 
     const llama_batch * batch = nullptr;
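On the llama_sbatch change above: the index vectors move from size_t to int64_t, presumably to match the signed 64-bit convention used elsewhere in the batch API. One practical property of signed indices, shown in a standalone sketch (not package code), is that reverse iteration with an `i >= 0` condition terminates, whereas an unsigned index would wrap at zero:

// Standalone sketch: signed 64-bit indices allow `i >= 0` loop conditions;
// with size_t the same condition is always true and the loop never ends.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int64_t> out_ids = {3, 0, 2, 1};
    for (int64_t i = (int64_t) out_ids.size() - 1; i >= 0; --i) {
        std::printf("out_ids[%lld] = %lld\n", (long long) i, (long long) out_ids[i]);
    }
    return 0;
}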
package/src/llama.cpp/src/llama-chat.cpp
@@ -4,6 +4,7 @@
 
 #include <map>
 #include <sstream>
+#include <algorithm>
 
 #if __cplusplus >= 202000L
 #define LU8(x) (const char*)(u8##x)