@fugood/llama.node 0.3.11 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/lib/index.js +26 -20
  19. package/lib/index.ts +32 -28
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +14 -0
  22. package/src/LlamaContext.cpp +13 -4
  23. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  27. package/src/llama.cpp/common/arg.cpp +180 -3
  28. package/src/llama.cpp/common/chat-template.hpp +21 -7
  29. package/src/llama.cpp/common/chat.cpp +220 -101
  30. package/src/llama.cpp/common/chat.hpp +3 -0
  31. package/src/llama.cpp/common/common.h +15 -7
  32. package/src/llama.cpp/common/llguidance.cpp +3 -3
  33. package/src/llama.cpp/common/log.cpp +1 -0
  34. package/src/llama.cpp/common/log.h +2 -1
  35. package/src/llama.cpp/common/minja.hpp +24 -9
  36. package/src/llama.cpp/common/sampling.cpp +52 -46
  37. package/src/llama.cpp/common/speculative.h +1 -1
  38. package/src/llama.cpp/docs/build.md +2 -2
  39. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  40. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  43. package/src/llama.cpp/examples/run/run.cpp +5 -12
  44. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/server/httplib.h +381 -292
  46. package/src/llama.cpp/examples/server/server.cpp +58 -47
  47. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  48. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  49. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  50. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  51. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  52. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  55. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  57. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  58. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  59. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  60. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  61. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  62. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  63. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  64. package/src/llama.cpp/include/llama.h +14 -10
  65. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  66. package/src/llama.cpp/src/llama-grammar.h +1 -1
  67. package/src/llama.cpp/src/llama-impl.h +6 -6
  68. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  69. package/src/llama.cpp/src/llama-mmap.h +1 -0
  70. package/src/llama.cpp/src/llama-model.cpp +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  72. package/src/llama.cpp/src/llama.cpp +7 -5
  73. package/src/llama.cpp/src/unicode.cpp +9 -2
  74. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  75. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  76. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  77. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c:

@@ -7,10 +7,8 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
-#include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
-#include "amx/amx.h"
 #include "ggml.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -1078,29 +1076,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  8
 
-// F16 arithmetic is not supported by AVX, so we use F32 instead
+// F16 arithmetic is not supported by LASX, so we use F32 instead
 
 #define GGML_F32Cx8         __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-
-    __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }
 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1297,7 +1289,7 @@ struct ggml_threadpool {
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
-    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
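The GGML_CACHE_ALIGN added to current_chunk above gives the shared chunk counter its own cache line, so the frequent atomic increments it receives during mat-mul are less likely to false-share with the barrier counters declared next to it. A minimal standalone sketch of the same layout trick (the 64-byte line size and the field names are illustrative, not taken from the package):

#include <stdatomic.h>

#define CACHE_LINE_SIZE 64
#define CACHE_ALIGN __attribute__((aligned(CACHE_LINE_SIZE)))

struct work_queue {
    atomic_int CACHE_ALIGN n_barrier;      // touched by every thread at sync points
    atomic_int CACHE_ALIGN current_chunk;  // hammered during mat-mul; sits on its own line
};

// workers claim the next chunk without disturbing the cache line holding n_barrier
static int next_chunk(struct work_queue * q) {
    return atomic_fetch_add_explicit(&q->current_chunk, 1, memory_order_relaxed);
}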
@@ -1824,7 +1816,7 @@ inline static float ggml_silu_f32(float x) {
 
 #if __FINITE_MATH_ONLY__
 #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif
 
 #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -7496,6 +7488,7 @@ UseGgmlGemm1:;
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
 
+        const size_t nbw0 = ggml_type_size(vec_dot_type);
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
@@ -7503,6 +7496,7 @@ UseGgmlGemm1:;
         assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
+#if 0
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
             for (int64_t i12 = 0; i12 < ne12; ++i12) {
                 for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
@@ -7512,6 +7506,20 @@ UseGgmlGemm1:;
                 }
             }
         }
+#else
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    size_t bs = ggml_blck_size(vec_dot_type);
+                    int64_t ne10_block_start = (ith * ne10/bs) / nth;
+                    int64_t ne10_block_end   = ((ith + 1) * ne10/bs) / nth;
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
+                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
+                               (ne10_block_end - ne10_block_start) * bs);
+                }
+            }
+        }
+#endif
     }
 
     if (ith == 0) {
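The #else branch above changes how src1 is converted to vec_dot_type: instead of giving each thread whole rows (i11 strided by nth), every thread now converts a contiguous slice of every row, and the slice boundaries are snapped to the quantization block size bs so each thread handles whole blocks. A small sketch of that partitioning arithmetic (hypothetical helper; assumes n is a multiple of bs):

#include <stdint.h>

// split n elements into nth contiguous ranges, cut only at multiples of bs;
// thread ith gets [start, end), and the ranges cover every block exactly once
static void block_range(int64_t n, int64_t bs, int ith, int nth,
                        int64_t * start, int64_t * end) {
    const int64_t nblocks = n / bs;
    *start = (ith       * nblocks) / nth * bs;
    *end   = ((ith + 1) * nblocks) / nth * bs;
}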
@@ -7566,7 +7574,7 @@ UseGgmlGemm2:;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
     // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
-    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
+    // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
     // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
     if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
         // distribute the thread work across the inner or outer loop based on which one is larger
@@ -7599,7 +7607,6 @@ UseGgmlGemm2:;
         if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
             num_rows_per_vec_dot = 1;
         }
-
         ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
 
         if (nth >= nchunk0 * nchunk1) {
@@ -7612,6 +7619,84 @@ UseGgmlGemm2:;
 
 // ggml_compute_forward_mul_mat_id
 
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
+
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+static void ggml_compute_forward_mul_mat_id_one_chunk(
+    struct ggml_tensor * dst,
+    const struct ggml_tensor * src0,
+    const struct ggml_tensor * src1,
+    const struct ggml_tensor * ids,
+    const int64_t cur_a,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end,
+    const char * src0_cur,
+    const struct mmid_row_mapping * matrix_rows,
+    const size_t row_size,
+    const bool src1_cont,
+    const void * wdata) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    ggml_vec_dot_t const vec_dot      = type_traits_cpu[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
+
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    float tmp[16];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
+                const int64_t _i12 = ir1; // logical row index for this expert
+
+                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                const int id = row_mapping.i1; // selected expert index
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = row_mapping.i2; // row index in src1
+
+                const int64_t i1 = id;  // selected expert index
+                const int64_t i2 = i12; // row
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                    ? (i11      + i12*ne11)*row_size
+                    : (i11*nb11 + i12*nb12));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
+                }
+
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
+
+    void * ptr = *p;
+    ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
+    *p = (void *) ((char *) ptr + size);
+    return ptr;
+}
+
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
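incr_ptr_aligned above is a small bump allocator: it rounds the running cursor up to the requested alignment, returns that pointer, and advances the cursor past the region. The mul_mat_id code below uses it to carve several differently-aligned regions out of the one preallocated wdata buffer; a self-contained sketch of the pattern (scratch size and region names are illustrative):

#include <stdint.h>
#include <stddef.h>

// round x up to a multiple of n (n must be a power of two) -- same idea as GGML_PAD
#define PAD(x, n) (((x) + (n) - 1) & ~((uintptr_t)(n) - 1))

static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
    void * ptr = (void *) PAD((uintptr_t) *p, align);
    *p = (char *) ptr + size;
    return ptr;
}

int main(void) {
    char scratch[4096];
    void * cur = scratch;

    int64_t * row_counts = incr_ptr_aligned(&cur, 8 * sizeof(int64_t), sizeof(int64_t));
    float   * tile       = incr_ptr_aligned(&cur, 256 * sizeof(float), 64); // cache-line aligned
    (void) row_counts; (void) tile;

    // bytes actually consumed, to be checked against the scratch size up front
    return ((char *) cur - scratch) <= (ptrdiff_t) sizeof(scratch) ? 0 : 1;
}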
@@ -7629,7 +7714,6 @@ static void ggml_compute_forward_mul_mat_id(
 
     const bool src1_cont = ggml_is_contiguous(src1);
 
-    ggml_vec_dot_t    const vec_dot      = type_traits_cpu[type].vec_dot;
     enum ggml_type    const vec_dot_type = type_traits_cpu[type].vec_dot_type;
     ggml_from_float_t const from_float   = type_traits_cpu[vec_dot_type].from_float;
 
@@ -7647,21 +7731,27 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert
 
-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    void * wdata_cur = params->wdata;
 
-    struct mmid_row_mapping {
-        int32_t i1;
-        int32_t i2;
-    };
+    if (src1->type != vec_dot_type) {
+        incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    }
 
-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
+    int64_t * matrix_row_counts = // [n_as]
+        incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
+
+    struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
+        incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
+
+    char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
+        incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
+
+    GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));
 
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;
 
+        const size_t nbw0 = ggml_type_size(vec_dot_type);
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;
@@ -7669,19 +7759,32 @@ static void ggml_compute_forward_mul_mat_id(
         assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
+#if 0
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+            for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
                     from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
                                (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
                                ne10);
                 }
             }
         }
+#else
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    size_t bs = ggml_blck_size(vec_dot_type);
+                    int64_t ne10_block_start = (ith * ne10/bs) / nth;
+                    int64_t ne10_block_end   = ((ith + 1) * ne10/bs) / nth;
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
+                               (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
+                               (ne10_block_end - ne10_block_start) * bs);
+                }
+            }
+        }
+#endif
     }
 
-#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
-
     if (ith == 0) {
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7699,9 +7802,14 @@ static void ggml_compute_forward_mul_mat_id(
         }
     }
 
+    // reset current_chunk
+    for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
+        atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
+        *current_chunk_ctr = nth;
+    }
+
     ggml_barrier(params->threadpool);
 
-    // compute each matrix multiplication in sequence
     for (int cur_a = 0; cur_a < n_as; ++cur_a) {
         const int64_t cne1 = matrix_row_counts[cur_a];
 
@@ -7709,84 +7817,64 @@ static void ggml_compute_forward_mul_mat_id(
            continue;
        }
 
-        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
-
-        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const char * src0_cur = (const char *) src0->data + cur_a * nb02;
+        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
-        const int64_t nr0 = ne01; // src0 rows
-        const int64_t nr1 = cne1; // src1 rows
-
-        // distribute the thread work across the inner or outer loop based on which one is larger
-
-        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-        const int64_t ith0 = ith % nth0;
-        const int64_t ith1 = ith / nth0;
+        const int64_t nr0 = ne01;
+        const int64_t nr1 = cne1;
 
-        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-        const int64_t ir010 = dr0*ith0;
-        const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-        const int64_t ir110 = dr1*ith1;
-        const int64_t ir111 = MIN(ir110 + dr1, nr1);
+        int chunk_size = 16;
+        if (nr0 == 1 || nr1 == 1) {
+            chunk_size = 64;
+        }
 
-        // threads with no work simply yield (not sure if it helps)
-        //if (ir010 >= ir011 || ir110 >= ir111) {
-        //    sched_yield();
-        //    continue;
-        //}
+#if defined(__aarch64__)
+        // disable for ARM
+        const bool disable_chunking = true;
+#else
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
+#endif // defined(__aarch64__)
 
-        // block-tiling attempt
-        const int64_t blck_0 = 16;
-        const int64_t blck_1 = 16;
+        int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+        int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
-        // attempt to reduce false-sharing (does not seem to make a difference)
-        float tmp[16];
+        if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
+            nchunk0 = nr0 > nr1 ? nth : 1;
+            nchunk1 = nr0 > nr1 ? 1 : nth;
+        }
 
-        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t _i12 = ir1; // logical row index for this expert
+        const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+        const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
 
-                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
-                    const int id = row_mapping.i1; // selected expert index
+        int current_chunk = ith;
 
-                    const int64_t i11 = id % ne11;
-                    const int64_t i12 = row_mapping.i2; // row index in src1
+        atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
 
-                    const int64_t i1 = id;  // selected expert index
-                    const int64_t i2 = i12; // row
+        while (current_chunk < nchunk0 * nchunk1) {
+            const int64_t ith0 = current_chunk % nchunk0;
+            const int64_t ith1 = current_chunk / nchunk0;
 
-                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                    //       the original src1 data pointer, so we should index using the indices directly
-                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                    const char * src1_col = (const char *) wdata +
-                        (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12));
+            const int64_t ir0_start = dr0 * ith0;
+            const int64_t ir0_end   = MIN(ir0_start + dr0, nr0);
 
-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+            const int64_t ir1_start = dr1 * ith1;
+            const int64_t ir1_end   = MIN(ir1_start + dr1, nr1);
 
-                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                    //}
+            ggml_compute_forward_mul_mat_id_one_chunk(
+                dst, src0, src1, ids, cur_a,
+                ir0_start, ir0_end, ir1_start, ir1_end,
+                src0_cur, matrix_rows, row_size, src1_cont, wdata
+            );
 
-                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
-                    }
-
-                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
+            if (nth >= nchunk0 * nchunk1) {
+                break;
             }
+
+            current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
         }
     }
-
-#undef MMID_MATRIX_ROW
 }
 
 // ggml_compute_forward_out_prod
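The rewritten per-expert loop above drops the fixed split of rows across threads and reuses the chunk-stealing scheme from the regular mul_mat path: thread ith starts on chunk ith, and once done it claims the next unprocessed chunk of the nchunk0 x nchunk1 grid with a relaxed atomic_fetch_add on that expert's counter. A self-contained sketch of the pattern (thread count and grid size are made up for illustration):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NTH     4
#define NCHUNK0 8   // chunks along the src0-row dimension
#define NCHUNK1 3   // chunks along the src1-row dimension

static atomic_int current_chunk;

static void * worker(void * arg) {
    const int ith = (int)(intptr_t) arg;

    int chunk = ith;                        // the first NTH chunks are assigned statically
    while (chunk < NCHUNK0 * NCHUNK1) {
        const int i0 = chunk % NCHUNK0;     // chunk coordinates in the 2-D grid
        const int i1 = chunk / NCHUNK0;
        printf("thread %d: chunk (%d, %d)\n", ith, i0, i1);

        // claim the next unprocessed chunk; relaxed ordering suffices for a ticket counter
        chunk = atomic_fetch_add_explicit(&current_chunk, 1, memory_order_relaxed);
    }
    return NULL;
}

int main(void) {
    atomic_store(&current_chunk, NTH);      // chunks 0..NTH-1 are already taken
    pthread_t th[NTH];
    for (int i = 0; i < NTH; ++i) pthread_create(&th[i], NULL, worker, (void *)(intptr_t) i);
    for (int i = 0; i < NTH; ++i) pthread_join(th[i], NULL);
    return 0;
}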
@@ -9080,10 +9168,6 @@ static void ggml_compute_forward_clamp_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    if (params->ith != 0) {
-        return;
-    }
-
     float min;
     float max;
     memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
@@ -13723,14 +13807,19 @@ struct ggml_cplan ggml_graph_plan(
                    cur = 0;
                    const struct ggml_tensor * src0 = node->src[0];
                    const struct ggml_tensor * src1 = node->src[1];
+                    const struct ggml_tensor * ids = node->src[2];
                    const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+                    const int n_as = src0->ne[2];
+                    // src1
                    if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
                    }
-                    const int n_as = src0->ne[2];
-                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    // matrix_row_counts
+                    cur += n_as * sizeof(int64_t) + sizeof(int64_t);
+                    // matrix_rows
+                    cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
+                    // atomic_current_chunk
+                    cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
                } break;
            case GGML_OP_OUT_PROD:
                {
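The ggml_graph_plan change above sizes the MUL_MAT_ID work buffer for the new layout: the converted src1 data, the per-expert row counts, the row-mapping table, and one cache line per expert for the chunk counters, with each term over-allocated by its alignment so the aligned carving done by incr_ptr_aligned can never run past the end. In sketch form (symbolic sizes, hypothetical helper; the 64-byte line size is illustrative):

#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE_SIZE 64

struct mmid_row_mapping { int32_t i1, i2; };

// worst-case scratch size for one MUL_MAT_ID node
static size_t mul_mat_id_wsize(size_t src1_conv_bytes, int n_as, int64_t n_mappings) {
    size_t cur = 0;
    cur += src1_conv_bytes + sizeof(int64_t);                                      // converted src1 (only if conversion is needed)
    cur += n_as * sizeof(int64_t) + sizeof(int64_t);                               // matrix_row_counts
    cur += n_as * n_mappings * sizeof(struct mmid_row_mapping) + sizeof(int64_t);  // matrix_rows
    cur += CACHE_LINE_SIZE * (size_t) n_as + CACHE_LINE_SIZE;                      // per-expert chunk counters
    return cur;
}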
@@ -13862,9 +13951,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            tp->ec = GGML_STATUS_ABORTED;
        }
 
-        ggml_barrier(state->threadpool);
+        if (node_n + 1 < cgraph->n_nodes) {
+            ggml_barrier(state->threadpool);
+        }
    }
 
+    ggml_barrier(state->threadpool);
+
    return 0;
}
 
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp:

@@ -284,14 +284,14 @@ struct ggml_backend_cpu_device_context {
                             &hKey) == ERROR_SUCCESS) {
                DWORD cpu_brand_size = 0;
                if (RegQueryValueExA(hKey,
-                                    TEXT("ProcessorNameString"),
+                                    "ProcessorNameString",
                                     NULL,
                                     NULL,
                                     NULL,
                                     &cpu_brand_size) == ERROR_SUCCESS) {
                    description.resize(cpu_brand_size);
                    if (RegQueryValueExA(hKey,
-                                        TEXT("ProcessorNameString"),
+                                        "ProcessorNameString",
                                         NULL,
                                         NULL,
                                         (LPBYTE)&description[0], // NOLINT
@@ -534,9 +534,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_dotprod()) {
            features.push_back({ "DOTPROD", "1" });
        }
-        if (ggml_cpu_has_matmul_int8()) {
-            features.push_back({ "MATMUL_INT8", "1" });
-        }
        if (ggml_cpu_get_sve_cnt() > 0) {
            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp:

@@ -280,14 +280,6 @@ template <> inline __m256bh load(const float *p) {
 }
 #endif
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// CONSTANTS
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
-#endif
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // FLOATING POINT MATRIX MULTIPLICATION
 
@@ -614,6 +606,14 @@ class tinyBLAS_Q0_AVX {
                   TC *C, int64_t ldc,
                   int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+        const int8_t kvalues_iq4nl[16] = {
+            -127, -104, -83, -65,
+             -49,  -35, -22, -10,
+               1,   13,  25,  38,
+              53,   69,  89, 113
+        };
+
+        iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
    }
 
    void matmul(int64_t m, int64_t n) {
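The iq4nlt lookup vector is now built in the constructor from a function-local table instead of being a namespace-scope static initialized with _mm_loadu_si128, which keeps the SIMD load out of a global dynamic initializer. A stripped-down sketch of the same pattern in plain C (struct and function names are made up):

#include <immintrin.h>
#include <stdint.h>

struct q0_ctx {
    __m128i iq4nlt;   // 16-entry IQ4_NL lookup table kept as a vector-width member
};

static void q0_ctx_init(struct q0_ctx * ctx) {
    const int8_t kvalues_iq4nl[16] = {
        -127, -104, -83, -65,
         -49,  -35, -22, -10,
           1,   13,  25,  38,
          53,   69,  89, 113
    };
    // load the table once when the context is set up, not in a static initializer
    ctx->iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
}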
@@ -1038,6 +1038,7 @@ class tinyBLAS_Q0_AVX {
    const int64_t ldc;
    const int ith;
    const int nth;
+    __m128i iq4nlt;
};
#endif // __AVX__
 
package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt:

@@ -15,9 +15,9 @@ if (CUDAToolkit_FOUND)
    if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
        set(CMAKE_CUDA_ARCHITECTURES "native")
    elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
+        set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75;80")
    else()
-        set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
+        set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75;80")
    endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp:

@@ -143,6 +143,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_rms_norm;
    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
    cl_kernel kernel_soft_max, kernel_soft_max_4;
+    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -614,6 +615,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
    CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
    CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
    CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
    CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
    CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
    CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
@@ -1044,8 +1047,16 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
            return true;
        case GGML_OP_DIAG_MASK_INF:
            return op->ne[3] == 1;
-        case GGML_OP_ROPE:
+        case GGML_OP_ROPE: {
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
            return true;
+        }
        default:
            return false;
    }
@@ -3666,6 +3677,8 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
    // Local size must be wave size. Each workgroup is a wave, working on a row,
    // where a row corresponds to leading dimension.
    int nth = MIN(32, ne00);
@@ -3683,9 +3696,17 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
    cl_kernel kernel;
 
    if (ne00%4 == 0) {
-        kernel = backend_ctx->kernel_soft_max_4;
+        if (use_f16) {
+            kernel = backend_ctx->kernel_soft_max_4_f16;
+        } else {
+            kernel = backend_ctx->kernel_soft_max_4;
+        }
    } else {
-        kernel = backend_ctx->kernel_soft_max;
+        if (use_f16) {
+            kernel = backend_ctx->kernel_soft_max_f16;
+        } else {
+            kernel = backend_ctx->kernel_soft_max;
+        }
    }
 
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
@@ -3766,7 +3787,8 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
    const int nb2 = dst ? dst->nb[2] : 0;
    const int nb3 = dst ? dst->nb[3] : 0;
 
-    GGML_ASSERT(ne10 == ne02);
+    GGML_ASSERT(ne10 % ne02 == 0);
+    GGML_ASSERT(ne10 >= ne02);
 
    int nth = MIN(64, ne00);