@fugood/llama.node 1.2.3 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +322 -70
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +154 -13
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  16. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  17. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  18. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  20. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  30. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  31. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
  39. package/src/llama.cpp/include/llama.h +8 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  41. package/src/llama.cpp/src/llama-arch.h +22 -0
  42. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  43. package/src/llama.cpp/src/llama-context.cpp +6 -0
  44. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  45. package/src/llama.cpp/src/llama-graph.h +10 -1
  46. package/src/llama.cpp/src/llama-hparams.h +17 -2
  47. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  48. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  49. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  50. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  51. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  52. package/src/llama.cpp/src/llama-model.cpp +568 -41
  53. package/src/llama.cpp/src/llama-model.h +18 -0
  54. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  55. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  56. package/src/llama.cpp/src/llama-vocab.h +41 -40
  57. package/src/llama.cpp/src/unicode.h +43 -0
package/src/llama.cpp/src/llama-graph.cpp
@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     std::vector<int> target_pos(n_seqs_unq, -1);
     std::vector<int> target_row(n_seqs_unq, -1);
 
-    bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
+    const bool last = (
+        cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+        (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+    );
 
     for (int i = 0; i < n_tokens; ++i) {
         const llama_pos pos = ubatch->pos[i];
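Note (illustrative, not part of the package diff): with this change, RANK pooling on Qwen3 reads the score from each sequence's last token rather than the first. A minimal standalone sketch of a last-token selection over (seq_id, pos) pairs, with hypothetical names and data:

    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    // For each sequence id, remember the row of its highest position (the "last" token),
    // mirroring the behaviour the pooling input uses when `last` is true.
    int main() {
        const std::vector<int> seq_id = { 0, 0, 0, 1, 1 }; // hypothetical batch
        const std::vector<int> pos    = { 0, 1, 2, 0, 1 };

        std::unordered_map<int, int> target_row; // seq_id -> row index of last token
        std::unordered_map<int, int> target_pos; // seq_id -> highest position seen

        for (int i = 0; i < (int) seq_id.size(); ++i) {
            auto it = target_pos.find(seq_id[i]);
            if (it == target_pos.end() || pos[i] >= it->second) {
                target_pos[seq_id[i]] = pos[i];
                target_row[seq_id[i]] = i;
            }
        }

        for (const auto & [s, row] : target_row) {
            std::printf("seq %d -> row %d\n", s, row);
        }
        return 0;
    }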
@@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         selection_probs = logits;
     }
 
+    if (arch == LLM_ARCH_GROVEMOE) {
+        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+        // TODO: Use scalar div instead when/if implemented
+        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+    } else {
+        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+    }
+
+    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
         weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
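Note (illustrative, not part of the package diff): the GroveMoE branch maps each selected expert index back to its owning group by scaling in F32 and casting back to I32, since a scalar integer division op is not available (see the TODO above). A small standalone check of that equivalence, assuming non-negative indices and a power-of-two group size (values here are made up):

    #include <cassert>
    #include <cstdint>

    int main() {
        const int32_t n_group_experts = 4; // hypothetical group size
        for (int32_t id = 0; id < 1024; ++id) {
            // cast -> scale -> truncate back to int, like the F32 round-trip in the diff
            const int32_t via_float = (int32_t)((float) id * (1.0f / (float) n_group_experts));
            assert(via_float == id / n_group_experts); // matches integer division for these values
        }
        return 0;
    }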
@@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(weights, "ffn_moe_weights_scaled", il);
     }
 
+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {
@@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
 
     auto & cur = inp->cls;
 
@@ -1833,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
 
+void llm_graph_context::build_dense_out(
+        ggml_tensor * dense_2,
+        ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
+
+
 void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,
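Note (illustrative, not part of the package diff): build_dense_out chains two extra projections (the sentence-transformers 2_Dense and 3_Dense modules) on top of the pooled embedding. A minimal CPU sketch of the same two matrix-vector products, with hypothetical dimensions and weights:

    #include <cstdio>
    #include <vector>

    // y = W * x for a row-major W of shape [rows x cols]
    static std::vector<float> matvec(const std::vector<float> & W, int rows, int cols,
                                     const std::vector<float> & x) {
        std::vector<float> y(rows, 0.0f);
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                y[r] += W[r * cols + c] * x[c];
            }
        }
        return y;
    }

    int main() {
        const int n_embd = 4, d2_out = 3, d3_out = 2;      // hypothetical sizes
        std::vector<float> pooled(n_embd, 1.0f);            // stand-in for the pooled embedding
        std::vector<float> dense_2(d2_out * n_embd, 0.5f);  // stand-in for the 2_Dense weight
        std::vector<float> dense_3(d3_out * d2_out, 0.25f); // stand-in for the 3_Dense weight

        std::vector<float> h   = matvec(dense_2, d2_out, n_embd, pooled);
        std::vector<float> out = matvec(dense_3, d3_out, d2_out, h);

        std::printf("final embedding dim: %zu\n", out.size());
        return 0;
    }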
@@ -1877,34 +1914,32 @@ void llm_graph_context::build_pooling(
         case LLAMA_POOLING_TYPE_RANK:
             {
                 ggml_tensor * inp_cls = build_inp_cls();
-                inp = ggml_get_rows(ctx0, inp, inp_cls);
+                cur = ggml_get_rows(ctx0, inp, inp_cls);
 
+                // classification head
+                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                 if (cls) {
-                    // classification head
-                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_mul_mat(ctx0, cls, inp);
+                    cur = ggml_mul_mat(ctx0, cls, cur);
                     if (cls_b) {
                         cur = ggml_add(ctx0, cur, cls_b);
                     }
                     cur = ggml_tanh(ctx0, cur);
+                }
 
-                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                    if (cls_out) {
-                        cur = ggml_mul_mat(ctx0, cls_out, cur);
-                        if (cls_out_b) {
-                            cur = ggml_add(ctx0, cur, cls_out_b);
-                        }
-                    }
-                } else if (cls_out) {
-                    // Single layer classification head (direct projection)
-                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    cur = ggml_mul_mat(ctx0, cls_out, inp);
+                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                // Single layer classification head (direct projection)
+                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+                if (cls_out) {
+                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                     if (cls_out_b) {
                         cur = ggml_add(ctx0, cur, cls_out_b);
                     }
-                } else {
-                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
+                }
+
+                // softmax for qwen3 reranker
+                if (arch == LLM_ARCH_QWEN3) {
+                    cur = ggml_soft_max(ctx0, cur);
                 }
             } break;
         default:
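Note (illustrative, not part of the package diff): after this restructuring, the RANK path applies an optional tanh head (cls/cls_b), then an optional output projection (cls_out/cls_out_b), and finally a softmax only for Qwen3. A scalar sketch of that flow for a single pooled vector, with made-up weights and a hypothetical two-way output head:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> cur = { 0.2f, -0.1f, 0.4f }; // pooled token (hypothetical)

        // optional "cls" head: cur = tanh(W * cur + b), reduced here to a dot product
        const std::vector<float> cls = { 0.5f, 1.0f, -0.5f };
        float h = 0.0f;
        for (size_t i = 0; i < cur.size(); ++i) h += cls[i] * cur[i];
        h = std::tanh(h + 0.1f); // + cls_b

        // optional "cls_out" projection to two logits (hypothetical head)
        const float cls_out[2][1] = { { -1.0f }, { 1.0f } };
        float logits[2] = { cls_out[0][0] * h, cls_out[1][0] * h };

        // softmax (applied only on the qwen3 branch in the diff)
        const float m  = std::max(logits[0], logits[1]);
        const float e0 = std::exp(logits[0] - m), e1 = std::exp(logits[1] - m);
        std::printf("relevance score: %f\n", e1 / (e0 + e1));
        return 0;
    }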
package/src/llama.cpp/src/llama-graph.h
@@ -206,7 +206,7 @@ public:
 
 class llm_graph_input_cls : public llm_graph_input_i {
 public:
-    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
     virtual ~llm_graph_input_cls() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
@@ -214,6 +214,7 @@ public:
     ggml_tensor * cls; // I32 [n_batch]
 
     const llama_cparams cparams;
+    const llm_arch arch;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -813,6 +814,14 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };
 
 // TODO: better name
package/src/llama.cpp/src/llama-hparams.h
@@ -42,7 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
-    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -69,10 +69,13 @@ struct llama_hparams {
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
     uint32_t n_ff_shexp = 0;
+    uint32_t n_ff_chexp = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups = 0;
+    uint32_t n_group_experts = 0;
 
-    float expert_weights_scale = 0.0;
+    float expert_group_scale = 0.05f;
+    float expert_weights_scale = 0.0f;
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
@@ -166,6 +169,18 @@ struct llama_hparams {
     uint32_t laurel_rank = 64;
     uint32_t n_embd_altup = 256;
 
+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in = 0; // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+    uint32_t dense_3_feat_in = 0; // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+
+    // xIELU
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
+
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp
@@ -220,7 +220,7 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }
 
 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_write(io, seq_id, flags);
     }
 
@@ -228,7 +228,7 @@ void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id
 }
 
 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
         kv_base->state_read(io, seq_id, flags);
     }
 
package/src/llama.cpp/src/llama-kv-cache.cpp
@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
             throw std::runtime_error("failed to create ggml context for kv cache");
         }
 
-        ggml_tensor * k;
-        ggml_tensor * v;
-
-        k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-        v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
 
         ggml_format_name(k, "cache_k_l%d", il);
         ggml_format_name(v, "cache_v_l%d", il);
package/src/llama.cpp/src/llama-memory-hybrid.cpp
@@ -73,7 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-            ubatch = balloc.split_equal(n_ubatch, false);
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
 
         if (ubatch.n_tokens == 0) {
@@ -175,17 +177,17 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdo
 }
 
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    GGML_UNUSED(flags);
-
-    mem_attn->state_write(io, seq_id);
-    mem_recr->state_write(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_write(io, seq_id, flags);
+    }
+    mem_recr->state_write(io, seq_id, flags);
 }
 
 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    GGML_UNUSED(flags);
-
-    mem_attn->state_read(io, seq_id);
-    mem_recr->state_read(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_read(io, seq_id, flags);
+    }
+    mem_recr->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
package/src/llama.cpp/src/llama-memory-recurrent.cpp
@@ -136,6 +136,7 @@ void llama_memory_recurrent::clear(bool data) {
 }
 
 bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
     uint32_t new_head = size;
 
     if (p0 < 0) {
@@ -156,7 +157,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid
-            if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+            if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }
             // invalidate tails which will be cleared
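Note (illustrative, not part of the package diff): the relaxed check now permits removing a range that starts exactly at the tail cell's position; only ranges that truly cut through the middle of the cached state are still rejected. A tiny standalone sketch of the old vs. new predicate, with an illustrative cell position:

    #include <cstdio>

    // Returns true when [p0, p1) is treated as a partial intersection with a cell at `pos`
    // (old vs. new boundary handling from the diff, reproduced standalone).
    static bool reject_old(int pos, int p0, int p1) { return (0 < p0 && p0 <= pos) || (0 < p1 && p1 <= pos); }
    static bool reject_new(int pos, int p0, int p1) { return (0 < p0 && p0 <  pos) || (0 < p1 && p1 <= pos); }

    int main() {
        const int pos = 5;              // hypothetical tail cell position
        const int p0 = 5, p1 = 1 << 30; // remove everything from position 5 onward
        std::printf("old: %s, new: %s\n",
                    reject_old(pos, p0, p1) ? "rejected" : "allowed",
                    reject_new(pos, p0, p1) ? "rejected" : "allowed");
        return 0;
    }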
@@ -167,6 +169,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     } else {
         // seq_id is negative, then the range should include everything or nothing
         if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+            //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
             return false;
         }
     }
@@ -379,7 +382,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-            ubatch = balloc.split_equal(n_ubatch, false);
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            // for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
 
         if (ubatch.n_tokens == 0) {
@@ -856,9 +861,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
-
         seq_rm(dest_seq_id, -1, -1);
 
+        if (cell_count == 0) {
+            return true;
+        }
+
         llama_batch_allocr balloc(hparams.n_pos_per_embd());
 
         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
package/src/llama.cpp/src/llama-model-loader.cpp
@@ -465,6 +465,8 @@ namespace GGUFMeta {
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+
 
 llama_model_loader::llama_model_loader(
         const std::string & fname,