@fugood/llama.node 1.2.0-rc.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -483,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
483
483
  /**
484
484
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
485
485
  */
486
- inline static float vec_hsum(float32x4_t v) {
486
+ inline static float vec_hsum_f32x4(float32x4_t v) {
487
487
  float32x4_t v_temp = v + vec_reve(v);
488
488
  return v_temp[0] + v_temp[1];
489
489
  }
490
490
 
491
+ inline static int32_t vec_hsum_i32x4(int32x4_t v) {
492
+ int32x4_t v_temp = v + vec_reve(v);
493
+ return v_temp[0] + v_temp[1];
494
+ }
495
+
491
496
  inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
492
497
  const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
493
498
  return acc + (vec_unpackh(p) + vec_unpackl(p));
@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
373
373
  .vec_dot_type = GGML_TYPE_Q8_K,
374
374
  .nrows = 1,
375
375
  },
376
+ [GGML_TYPE_I32] = {
377
+ .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
378
+ },
376
379
  };
377
380
 
378
381
  const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -2696,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
2696
2699
  if (ggml_is_quantized(node->type) ||
2697
2700
  // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
2698
2701
  (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
2699
- (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
2702
+ (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
2703
+ // conversion between F32 and I32
2704
+ (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
2705
+ (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
2700
2706
  cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
2701
2707
  }
2702
2708
  } break;
@@ -3258,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
3258
3264
  }
3259
3265
  }
3260
3266
 
3267
/**
 * Convert an array of 32-bit floats to 32-bit signed integers.
 *
 * In-range values are truncated toward zero (the result of a C cast).
 * Converting a float that is NaN or whose truncated value does not fit in
 * int32_t is undefined behavior in C, so those inputs are handled explicitly:
 *   - NaN                      -> 0
 *   - values >=  2^31          -> INT32_MAX
 *   - values <= -2^31          -> INT32_MIN
 *
 * @param x  source array of n floats
 * @param y  destination array with room for n int32_t values
 * @param n  number of elements to convert; nothing is written when n <= 0
 */
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
    // 2^31 is exactly representable as a float; every float strictly between
    // -2^31 and 2^31 truncates to a valid int32_t.
    const float limit = 2147483648.0f;

    for (int64_t i = 0; i < n; ++i) {
        const float v = x[i];

        if (v != v) {
            // NaN is the only value that compares unequal to itself
            y[i] = 0;
        } else if (v >= limit) {
            y[i] = INT32_MAX;
        } else if (v <= -limit) {
            y[i] = INT32_MIN;
        } else {
            y[i] = (int32_t) v;
        }
    }
}
3273
+
3261
3274
  void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
3262
3275
  int64_t i = 0;
3263
3276
  #if defined(__AVX2__)
@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
190
190
  /* .graph_compute = */ ggml_backend_cpu_graph_compute,
191
191
  /* .event_record = */ NULL,
192
192
  /* .event_wait = */ NULL,
193
+ /* .optimize_graph = */ NULL,
193
194
  };
194
195
 
195
196
  static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -515,9 +515,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
515
515
  op->src[0]->buffer &&
516
516
  (ggml_n_dims(op->src[0]) == 2) &&
517
517
  op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
518
- if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
519
- return false;
520
- }
521
518
  if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
522
519
  return false;
523
520
  }
@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
776
776
  id += ne00 * (ne01 - ir1);
777
777
  }
778
778
  }
779
+ } else if (dst->type == GGML_TYPE_I32) {
780
+ size_t id = 0;
781
+ int32_t * dst_ptr = (int32_t *) dst->data;
782
+
783
+ for (int i03 = 0; i03 < ne03; i03++) {
784
+ for (int i02 = 0; i02 < ne02; i02++) {
785
+ id += ne00 * ir0;
786
+ for (int i01 = ir0; i01 < ir1; i01++) {
787
+ for (int i00 = 0; i00 < ne00; i00++) {
788
+ const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
789
+
790
+ dst_ptr[id] = *src0_ptr;
791
+ id++;
792
+ }
793
+ }
794
+ id += ne00 * (ne01 - ir1);
795
+ }
796
+ }
779
797
  } else {
780
798
  GGML_ABORT("fatal error"); // TODO: implement
781
799
  }
@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
947
965
  }
948
966
  }
949
967
  }
968
+ } else if (dst->type == GGML_TYPE_I32) {
969
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
970
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
971
+ i10 += ne00 * ir0;
972
+ while (i10 >= ne0) {
973
+ i10 -= ne0;
974
+ if (++i11 == ne1) {
975
+ i11 = 0;
976
+ if (++i12 == ne2) {
977
+ i12 = 0;
978
+ if (++i13 == ne3) {
979
+ i13 = 0;
980
+ }
981
+ }
982
+ }
983
+ }
984
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
985
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
986
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
987
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
988
+
989
+ *(int32_t *) dst_ptr = *(const float *) src0_ptr;
990
+
991
+ if (++i10 == ne0) {
992
+ i10 = 0;
993
+ if (++i11 == ne1) {
994
+ i11 = 0;
995
+ if (++i12 == ne2) {
996
+ i12 = 0;
997
+ if (++i13 == ne3) {
998
+ i13 = 0;
999
+ }
1000
+ }
1001
+ }
1002
+ }
1003
+ }
1004
+ }
1005
+ i10 += ne00 * (ne01 - ir1);
1006
+ while (i10 >= ne0) {
1007
+ i10 -= ne0;
1008
+ if (++i11 == ne1) {
1009
+ i11 = 0;
1010
+ if (++i12 == ne2) {
1011
+ i12 = 0;
1012
+ if (++i13 == ne3) {
1013
+ i13 = 0;
1014
+ }
1015
+ }
1016
+ }
1017
+ }
1018
+ }
1019
+ }
1020
+ } else {
1021
+ GGML_ABORT("fatal error"); // TODO: implement
1022
+ }
1023
+ }
1024
+
1025
+ static void ggml_compute_forward_dup_i32(
1026
+ const ggml_compute_params * params,
1027
+ ggml_tensor * dst) {
1028
+
1029
+ const ggml_tensor * src0 = dst->src[0];
1030
+
1031
+ GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
1032
+
1033
+ GGML_TENSOR_UNARY_OP_LOCALS
1034
+
1035
+ const int ith = params->ith; // thread index
1036
+ const int nth = params->nth; // number of threads
1037
+
1038
+ // parallelize by rows
1039
+ const int nr = ne01;
1040
+ // number of rows per thread
1041
+ const int dr = (nr + nth - 1) / nth;
1042
+ // row range for this thread
1043
+ const int ir0 = dr * ith;
1044
+ const int ir1 = MIN(ir0 + dr, nr);
1045
+
1046
+ // dst counters
1047
+
1048
+ int64_t i10 = 0;
1049
+ int64_t i11 = 0;
1050
+ int64_t i12 = 0;
1051
+ int64_t i13 = 0;
1052
+
1053
+ // TODO: not optimal, but works
1054
+ if (dst->type == GGML_TYPE_F32) {
1055
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
1056
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
1057
+ i10 += ne00 * ir0;
1058
+ while (i10 >= ne0) {
1059
+ i10 -= ne0;
1060
+ if (++i11 == ne1) {
1061
+ i11 = 0;
1062
+ if (++i12 == ne2) {
1063
+ i12 = 0;
1064
+ if (++i13 == ne3) {
1065
+ i13 = 0;
1066
+ }
1067
+ }
1068
+ }
1069
+ }
1070
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
1071
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
1072
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
1073
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
1074
+
1075
+ *(float *) dst_ptr = *(const int32_t *) src0_ptr;
1076
+
1077
+ if (++i10 == ne0) {
1078
+ i10 = 0;
1079
+ if (++i11 == ne1) {
1080
+ i11 = 0;
1081
+ if (++i12 == ne2) {
1082
+ i12 = 0;
1083
+ if (++i13 == ne3) {
1084
+ i13 = 0;
1085
+ }
1086
+ }
1087
+ }
1088
+ }
1089
+ }
1090
+ }
1091
+ i10 += ne00 * (ne01 - ir1);
1092
+ while (i10 >= ne0) {
1093
+ i10 -= ne0;
1094
+ if (++i11 == ne1) {
1095
+ i11 = 0;
1096
+ if (++i12 == ne2) {
1097
+ i12 = 0;
1098
+ if (++i13 == ne3) {
1099
+ i13 = 0;
1100
+ }
1101
+ }
1102
+ }
1103
+ }
1104
+ }
1105
+ }
950
1106
  } else {
951
1107
  GGML_ABORT("fatal error"); // TODO: implement
952
1108
  }
@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
1177
1333
  {
1178
1334
  ggml_compute_forward_dup_f32(params, dst);
1179
1335
  } break;
1336
+ case GGML_TYPE_I32:
1337
+ {
1338
+ ggml_compute_forward_dup_i32(params, dst);
1339
+ } break;
1180
1340
  default:
1181
1341
  {
1182
1342
  if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
@@ -8438,6 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32(
8438
8598
  embed_data[j + half] = sinf(arg);
8439
8599
  }
8440
8600
  if (dim % 2 != 0 && ith == 0) {
8601
+ embed_data[2 * half] = 0.f;
8441
8602
  embed_data[dim] = 0.f;
8442
8603
  }
8443
8604
  }
@@ -137,6 +137,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
137
137
  { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
138
138
  { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
139
139
  { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
140
+ { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
140
141
  { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
141
142
  { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
142
143
  { LLM_KV_SWIN_NORM, "%s.swin_norm" },
@@ -141,6 +141,7 @@ enum llm_kv {
141
141
  LLM_KV_POOLING_TYPE,
142
142
  LLM_KV_LOGIT_SCALE,
143
143
  LLM_KV_DECODER_START_TOKEN_ID,
144
+ LLM_KV_DECODER_BLOCK_COUNT,
144
145
  LLM_KV_ATTN_LOGIT_SOFTCAPPING,
145
146
  LLM_KV_FINAL_LOGIT_SOFTCAPPING,
146
147
  LLM_KV_SWIN_NORM,
@@ -285,8 +285,8 @@ llama_context::llama_context(
285
285
  const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
286
286
  const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
287
287
 
288
- // avoid reserving graphs with zero outputs
289
- n_outputs = 1;
288
+ // avoid reserving graphs with zero outputs - assume one output per sequence
289
+ n_outputs = n_seqs;
290
290
 
291
291
  LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
292
292
 
@@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute(
1447
1447
  if (backend_cpu != nullptr) {
1448
1448
  auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
1449
1449
  auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
1450
- set_threadpool_fn(backend_cpu, tp);
1450
+ if (set_threadpool_fn) {
1451
+ set_threadpool_fn(backend_cpu, tp);
1452
+ }
1451
1453
  }
1452
1454
 
1453
1455
  // set the number of threads for all the backends
@@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
1273
1273
  // split the batch into streams if needed
1274
1274
  const auto n_stream = k->ne[3];
1275
1275
 
1276
- q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
1276
+ q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
1277
1277
 
1278
1278
  q = ggml_permute(ctx0, q, 0, 2, 1, 3);
1279
1279
  k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@@ -1431,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(
1431
1431
 
1432
1432
  // [TAG_NO_CACHE_PAD]
1433
1433
  // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
1434
- assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
1434
+ // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
1435
+ //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
1435
1436
 
1436
1437
  ggml_tensor * q = q_cur;
1437
1438
  ggml_tensor * k = k_cur;
@@ -159,6 +159,7 @@ struct llama_hparams {
159
159
  // needed by encoder-decoder models (e.g. T5, FLAN-T5)
160
160
  // ref: https://github.com/ggerganov/llama.cpp/pull/8141
161
161
  llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
162
+ uint32_t dec_n_layer = 0;
162
163
 
163
164
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
164
165
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm
1018
1018
 
1019
1019
  const int32_t ikv = map_layer_ids.at(il);
1020
1020
 
1021
- auto * k = layers[ikv].k;
1021
+ ggml_tensor * k = layers[ikv].k;
1022
+
1023
+ const int64_t n_embd_head = k_cur->ne[0];
1024
+ const int64_t n_head = k_cur->ne[1];
1025
+ const int64_t n_tokens = k_cur->ne[2];
1026
+
1027
+ const int64_t n_embd_gqa = n_embd_head*n_head;
1022
1028
 
1023
- const int64_t n_tokens = k_cur->ne[2];
1029
+ // we can merge dims 0 and 1
1030
+ // TODO: add ggml helper function for this?
1031
+ GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);
1024
1032
 
1025
- k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
1033
+ k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);
1026
1034
 
1027
- if (k->ne[2] > 1) {
1028
- k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
1035
+ const int64_t n_stream = k->ne[2];
1036
+
1037
+ if (n_stream > 1) {
1038
+ const int64_t kv_size = get_size();
1039
+
1040
+ assert(n_embd_gqa == k->ne[0]);
1041
+ assert(kv_size == k->ne[1]);
1042
+
1043
+ // merge the buffer across all streams because the idxs are global
1044
+ k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
1029
1045
  }
1030
1046
 
1047
+ // store the current K values into the cache
1031
1048
  return ggml_set_rows(ctx, k, k_cur, k_idxs);
1032
1049
  }
1033
1050
 
@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
1038
1055
 
1039
1056
  auto * v = layers[ikv].v;
1040
1057
 
1041
- const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
1042
- const int64_t n_tokens = v_cur->ne[2];
1058
+ const int64_t n_embd_head = v_cur->ne[0];
1059
+ const int64_t n_head = v_cur->ne[1];
1060
+ const int64_t n_tokens = v_cur->ne[2];
1061
+
1062
+ const int64_t n_embd_gqa = n_embd_head*n_head;
1043
1063
 
1044
- v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
1064
+ // we can merge dims 0 and 1
1065
+ GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);
1045
1066
 
1067
+ const int64_t n_stream = v->ne[2];
1068
+
1069
+ // take this branch when FA is enabled (the V cache is not transposed)
1046
1070
  if (!v_trans) {
1047
- if (v->ne[2] > 1) {
1048
- v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
1071
+ v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
1072
+
1073
+ if (n_stream > 1) {
1074
+ const int64_t kv_size = get_size();
1075
+
1076
+ assert(n_embd_gqa == v->ne[0]);
1077
+ assert(kv_size == v->ne[1]);
1078
+
1079
+ // merge the buffer across all streams because the idxs are global
1080
+ v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
1049
1081
  }
1050
1082
 
1051
1083
  return ggml_set_rows(ctx, v, v_cur, v_idxs);
1052
1084
  }
1053
1085
 
1086
+ if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
1087
+ // we can merge dims 0, 1 and 2
1088
+ v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
1089
+ } else {
1090
+ // otherwise -> make a copy to get contiguous data
1091
+ v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
1092
+ }
1093
+
1054
1094
  // [TAG_V_CACHE_VARIABLE]
1055
- if (n_embd_v_gqa < v->ne[0]) {
1056
- v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
1095
+ if (n_embd_gqa < v->ne[0]) {
1096
+ v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
1057
1097
  }
1058
1098
 
1059
- // the row becomes a single element
1060
- ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
1099
+ // in this branch the v_idxs are constructed in such a way that each row is a single head element
1100
+ ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));
1061
1101
 
1062
- v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
1102
+ v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));
1063
1103
 
1064
1104
  return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
1065
1105
  }
@@ -317,9 +317,17 @@ public:
317
317
  ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
318
318
 
319
319
  // store k_cur and v_cur in the cache based on the provided head location
320
+ // note: the heads in k_cur and v_cur should be laid out contiguously in memory
321
+ // - k_cur [n_embd_head_k, n_head_k, n_tokens]
322
+ // - k_idxs [n_tokens]
323
+ // - v_cur [n_embd_head_v, n_head_v, n_tokens]
324
+ // - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending on whether the V cache is transposed
320
325
  ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
321
326
  ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
322
327
 
328
+ // create destination indices for each head of the current batch for where it would be written in the KV cache
329
+ // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
330
+ // helps understand the implementation logic of cpy_k and cpy_v
323
331
  ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
324
332
  ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
325
333