@fugood/llama.node 0.3.9 → 0.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.js +2 -2
  18. package/lib/binding.ts +47 -8
  19. package/lib/index.js +21 -1
  20. package/lib/index.ts +31 -1
  21. package/package.json +12 -3
  22. package/src/LlamaCompletionWorker.cpp +33 -6
  23. package/src/LlamaCompletionWorker.h +3 -1
  24. package/src/LlamaContext.cpp +336 -28
  25. package/src/LlamaContext.h +2 -0
  26. package/src/common.hpp +19 -2
  27. package/src/llama.cpp/.github/workflows/build.yml +289 -107
  28. package/src/llama.cpp/.github/workflows/close-issue.yml +1 -1
  29. package/src/llama.cpp/.github/workflows/docker.yml +2 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +25 -2
  31. package/src/llama.cpp/CMakeLists.txt +10 -19
  32. package/src/llama.cpp/cmake/build-info.cmake +1 -1
  33. package/src/llama.cpp/common/CMakeLists.txt +32 -0
  34. package/src/llama.cpp/common/arg.cpp +66 -16
  35. package/src/llama.cpp/common/chat-template.hpp +515 -0
  36. package/src/llama.cpp/common/chat.cpp +966 -0
  37. package/src/llama.cpp/common/chat.hpp +52 -0
  38. package/src/llama.cpp/common/common.cpp +159 -36
  39. package/src/llama.cpp/common/common.h +56 -14
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +46 -66
  41. package/src/llama.cpp/common/json-schema-to-grammar.h +15 -1
  42. package/src/llama.cpp/common/llguidance.cpp +270 -0
  43. package/src/llama.cpp/common/log.cpp +1 -10
  44. package/src/llama.cpp/common/log.h +10 -0
  45. package/src/llama.cpp/common/minja.hpp +2868 -0
  46. package/src/llama.cpp/common/sampling.cpp +22 -1
  47. package/src/llama.cpp/common/sampling.h +3 -0
  48. package/src/llama.cpp/docs/build.md +54 -9
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +12 -2
  50. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +1 -1
  51. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  52. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +59 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +133 -14
  54. package/src/llama.cpp/examples/llava/clip.h +2 -0
  55. package/src/llama.cpp/examples/llava/llava.cpp +22 -8
  56. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +9 -1
  57. package/src/llama.cpp/examples/main/main.cpp +26 -25
  58. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +136 -137
  59. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +18 -4
  60. package/src/llama.cpp/examples/run/run.cpp +224 -69
  61. package/src/llama.cpp/examples/server/server.cpp +252 -81
  62. package/src/llama.cpp/examples/server/utils.hpp +73 -21
  63. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +6 -4
  64. package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +11 -0
  65. package/src/llama.cpp/ggml/CMakeLists.txt +78 -1
  66. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  67. package/src/llama.cpp/ggml/src/CMakeLists.txt +21 -4
  68. package/src/llama.cpp/ggml/src/ggml-alloc.c +1 -13
  69. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +91 -78
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +7 -7
  71. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -1
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
  73. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +46 -0
  74. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +16 -1
  75. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +28 -8
  77. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +5 -7
  78. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +33 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +1 -5
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +323 -121
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  82. package/src/llama.cpp/ggml/src/ggml.c +23 -13
  83. package/src/llama.cpp/include/llama.h +14 -1
  84. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +112 -0
  85. package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +46 -0
  86. package/src/llama.cpp/src/CMakeLists.txt +1 -1
  87. package/src/llama.cpp/src/llama-arch.cpp +7 -2
  88. package/src/llama.cpp/src/llama-arch.h +3 -1
  89. package/src/llama.cpp/src/llama-chat.cpp +11 -2
  90. package/src/llama.cpp/src/llama-chat.h +1 -0
  91. package/src/llama.cpp/src/llama-grammar.cpp +86 -6
  92. package/src/llama.cpp/src/llama-grammar.h +22 -1
  93. package/src/llama.cpp/src/llama-mmap.cpp +1 -0
  94. package/src/llama.cpp/src/llama-model-loader.cpp +1 -1
  95. package/src/llama.cpp/src/llama-model.cpp +76 -6
  96. package/src/llama.cpp/src/llama-sampling.cpp +47 -4
  97. package/src/llama.cpp/src/llama-vocab.cpp +10 -4
  98. package/src/llama.cpp/src/llama.cpp +181 -123
  99. package/src/llama.cpp/tests/CMakeLists.txt +4 -0
  100. package/src/llama.cpp/tests/test-backend-ops.cpp +158 -57
  101. package/src/llama.cpp/tests/test-chat-template.cpp +154 -31
  102. package/src/llama.cpp/tests/test-chat.cpp +607 -0
  103. package/src/llama.cpp/tests/test-grammar-integration.cpp +2 -2
  104. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +1140 -0
  105. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -1
  106. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +0 -32
@@ -1302,6 +1302,59 @@ struct test_repeat : public test_case {
     }
 };
 
+// GGML_OP_REPEAT_BACK
+struct test_repeat_back : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int, 4> nr;
+    const bool v; // whether src is a noncontiguous view
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne, nr, v);
+    }
+
+    size_t op_size(ggml_tensor * t) override {
+        return ggml_nbytes(t) * 2;
+    }
+
+    test_repeat_back(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {8, 6, 4, 2},
+            std::array<int, 4> nr = {2, 2, 2, 2},
+            bool v = false)
+        : type(type), ne(ne), nr(nr), v(v) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
+        ggml_set_name(src, "src");
+
+        if (v) {
+            GGML_ASSERT(ne[0] % 2 == 0);
+            GGML_ASSERT(ne[1] % 2 == 0);
+            GGML_ASSERT(ne[2] % 2 == 0);
+            GGML_ASSERT(ne[3] % 2 == 0);
+            GGML_ASSERT(nr[0] % 2 == 0 || nr[0] == 1);
+            GGML_ASSERT(nr[1] % 2 == 0 || nr[1] == 1);
+            GGML_ASSERT(nr[2] % 2 == 0 || nr[2] == 1);
+            GGML_ASSERT(nr[3] % 2 == 0 || nr[3] == 1);
+
+            const int64_t ne00 = nr[0] == 1 ? src->ne[0] : src->ne[0] / 2;
+            const int64_t ne01 = nr[1] == 1 ? src->ne[1] : src->ne[1] / 2;
+            const int64_t ne02 = nr[2] == 1 ? src->ne[2] : src->ne[2] / 2;
+            const int64_t ne03 = nr[3] == 1 ? src->ne[3] : src->ne[3] / 2;
+
+            src = ggml_view_4d(ctx, src, ne00, ne01, ne02, ne03, src->nb[1], src->nb[2], src->nb[3], 0);
+        }
+
+        ggml_tensor * target = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(target, "target");
+
+        ggml_tensor * out = ggml_repeat_back(ctx, src, target);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_DUP
 struct test_dup : public test_case {
     const ggml_type type;
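Note: ggml_repeat_back is the reduction counterpart of ggml_repeat — it sums a broadcast/tiled tensor back down to the shape of a smaller target, which is what the backward pass of a broadcast needs. A minimal sketch of the relationship the new test exercises (tensor sizes are illustrative and `ctx` is assumed to be an initialized ggml_context):

    // ggml_repeat tiles `small` up to the shape of `big`;
    // ggml_repeat_back sums each tile back into the shape of `small`.
    ggml_tensor * small = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 6);
    ggml_tensor * big   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 12);

    ggml_tensor * rep  = ggml_repeat(ctx, small, big);      // 8x6 -> 16x12 (2x2 tiling)
    ggml_tensor * back = ggml_repeat_back(ctx, rep, small); // 16x12 -> 8x6 (each cell = sum over its tiles)

The `v` flag additionally feeds the op a non-contiguous ggml_view_4d of the source, so backends get coverage for strided inputs, not just tightly packed ones.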
@@ -1621,21 +1674,28 @@ struct test_silu_back : public test_case {
 struct test_norm : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    float eps;
+    const bool v; // whether a is a non-contiguous view
+    const float eps;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, eps);
+        return VARS_TO_STR4(type, ne, v, eps);
     }
 
     test_norm(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {64, 5, 4, 3},
+            bool v = false,
             float eps = 1e-6f)
-        : type(type), ne(ne), eps(eps) {}
+        : type(type), ne(ne), v(v), eps(eps) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_name(a, "a");
 
+        if (v) {
+            a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view of a");
+        }
+
         ggml_tensor * out = ggml_norm(ctx, a, eps);
         ggml_set_name(out, "out");
 
@@ -1647,22 +1707,29 @@ struct test_norm : public test_case {
 struct test_rms_norm : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    float eps;
+    const bool v; // whether a is a non-contiguous view
+    const float eps;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, eps);
+        return VARS_TO_STR4(type, ne, v, eps);
     }
 
     test_rms_norm(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {64, 5, 4, 3},
+            bool v = false,
             float eps = 1e-6f)
-        : type(type), ne(ne), eps(eps) {}
+        : type(type), ne(ne), v(v), eps(eps) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_param(ctx, a);
         ggml_set_name(a, "a");
 
+        if (v) {
+            a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view of a");
+        }
+
         ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
         ggml_set_name(out, "out");
 
@@ -1688,7 +1755,7 @@ struct test_rms_norm : public test_case {
 struct test_rms_norm_back : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
-    float eps;
+    const float eps;
 
     std::string vars() override {
         return VARS_TO_STR3(type, ne, eps);
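The `v` flag added to test_norm and test_rms_norm above drives the op through a non-contiguous input: halving every dimension with ggml_view_4d while keeping the parent's byte strides (nb[1..3]) yields a view for which ggml_is_contiguous() is false, so backends must honor the strides instead of assuming packed data. A minimal sketch of the pattern (assuming an initialized ggml_context `ctx`; sizes illustrative):

    ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 6, 4, 2);

    // Half-size view that reuses the parent's strides: elements are no
    // longer tightly packed, so ggml_is_contiguous(view) returns false.
    ggml_tensor * view = ggml_view_4d(ctx, a,
            a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2,
            a->nb[1], a->nb[2], a->nb[3], 0 /* offset in bytes */);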
@@ -1849,6 +1916,10 @@ struct test_mul_mat : public test_case {
         return 5e-4;
     }
 
+    int64_t grad_nmax() override {
+        return 20000;
+    }
+
     uint64_t op_flops(ggml_tensor * t) override {
         GGML_UNUSED(t);
         return 2 * m * n * k * bs[0] * nr[0] * bs[1] * nr[1];
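grad_nmax() appears to cap how many elements per tensor the numerical gradient check samples (the test_case base class supplies a default); raising it to 20000 for test_mul_mat lets the larger matrices covered below still be checked rather than silently truncated. This reading is inferred from the harness, not stated in the diff.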
@@ -1878,8 +1949,12 @@
 
             a = ggml_new_tensor_4d(ctx, type_a, ne_a[per[0]], ne_a[per[1]], ne_a[per[2]], ne_a[per[3]]);
             b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
-            ggml_set_param(ctx, a);
-            ggml_set_param(ctx, b);
+            if (!ggml_is_quantized(type_a)) {
+                if (bs[1] == 1 && nr[1] == 1) {
+                    ggml_set_param(ctx, a);
+                }
+                ggml_set_param(ctx, b);
+            }
             ggml_set_name(a, "a");
             ggml_set_name(b, "b");
 
@@ -1890,8 +1965,12 @@
         } else {
             a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
             b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-            ggml_set_param(ctx, a);
-            ggml_set_param(ctx, b);
+            if (!ggml_is_quantized(type_a)) {
+                if (bs[1] == 1 && nr[1] == 1) {
+                    ggml_set_param(ctx, a);
+                }
+                ggml_set_param(ctx, b);
+            }
             ggml_set_name(a, "a");
             ggml_set_name(b, "b");
         }
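In both branches, a and b are now only marked as parameters (i.e., included in gradient tests via ggml_set_param) when type_a is not quantized, and a only when there is no batching or broadcast in dim 1 (bs[1] == 1 && nr[1] == 1) — presumably because the MUL_MAT backward pass does not produce gradients for quantized weights or for that broadcast pattern, so marking them would make the gradient checks fail rather than test anything meaningful.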
@@ -2282,11 +2361,12 @@ struct test_soft_max : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const bool mask;
+    const ggml_type m_prec;
     const float scale;
     const float max_bias;
 
     std::string vars() override {
-        return VARS_TO_STR5(type, ne, mask, scale, max_bias);
+        return VARS_TO_STR6(type, ne, mask, m_prec, scale, max_bias);
     }
 
     // the 1024 test with bias occasionally fails:
@@ -2298,9 +2378,10 @@
     test_soft_max(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 5, 4, 3},
             bool mask = false,
+            ggml_type m_prec = GGML_TYPE_F32,
             float scale = 1.0f,
             float max_bias = 0.0f)
-        : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
+        : type(type), ne(ne), mask(mask), m_prec(m_prec), scale(scale), max_bias(max_bias) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
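The new m_prec parameter (used in the next hunk to allocate the mask tensor) means the soft-max tests now cover both F32 and F16 masks. In ggml the masked, scaled soft-max with an optional ALiBi-style bias is exposed as ggml_soft_max_ext; a minimal sketch of the call shape being tested (illustrative sizes, `ctx` assumed initialized):

    ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 2);
    ggml_tensor * mask   = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 32, 2); // F16 mask path, newly covered

    // scale = 0.1f, max_bias = 8.0f mirror the new cases registered below
    ggml_tensor * out = ggml_soft_max_ext(ctx, logits, mask, 0.1f, 8.0f);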
@@ -2309,7 +2390,7 @@
 
         ggml_tensor * mask = nullptr;
         if (this->mask) {
-            mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
+            mask = ggml_new_tensor_2d(ctx, m_prec, ne[0], ne[1]);
            ggml_set_name(mask, "mask");
        }
 
@@ -2852,7 +2933,7 @@ struct test_group_norm : public test_case {
     const float eps;
 
     std::string vars() override {
-        return VARS_TO_STR3(type, ne, num_groups);
+        return VARS_TO_STR4(type, ne, num_groups, eps);
     }
 
     test_group_norm(ggml_type type = GGML_TYPE_F32,
@@ -3798,6 +3879,16 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2}));
     }
 
+    for (bool view : {false, true}) {
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {2, 1, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 2, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 2, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 2}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_I32, {8, 6, 4, 2}, {2, 1, 1, 1}, view));
+        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_I16, {8, 6, 4, 2}, {1, 1, 1, 2}, view));
+    }
+
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
     test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
     test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
@@ -3887,9 +3978,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_scale());
     test_cases.emplace_back(new test_silu_back());
 
-    for (float eps : {0.0f, 1e-7f, 1e-4f, 1e-1f}) {
-        test_cases.emplace_back(new test_norm    (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
-        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
+    for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
+        for (bool v : {false, true}) {
+            test_cases.emplace_back(new test_norm    (GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
+            test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
+        }
         test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
     }
 
@@ -3909,38 +4002,35 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 4));
     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 128, 4));
 
-    for (int i = 1; i < 9; ++i) {
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_1, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_0, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_1, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_K, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q5_K, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q6_K, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
-        test_cases.emplace_back(new test_mul_mat(GGML_TYPE_IQ4_NL, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
+    for (ggml_type type_a : all_types) {
+        for (int i = 1; i < 10; ++i) {
+            test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 16, i, 256, { 1, 1}, {1, 1}));
+        }
     }
 
 #if 1
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             // test cases without permutation
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
-
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
-            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {2, 2}));
+
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 1}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 2}));
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 2}));
 
             // test cases with permutation
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
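Net effect of this hunk: the small-N "decode shape" runs (m = 16, k = 256) now iterate over every type in all_types rather than a hand-picked list of ten, with n extended from 1–8 to 1–9, and the batch/broadcast grid swaps the expensive {10, 10} batch sizes for {3, 2} while adding previously untested broadcast-only combinations such as {1, 1} batches with {2, 1} and {1, 2} repeat factors.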
@@ -4078,17 +4168,28 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
+                        if (mask) {
+                            for (ggml_type m_prec : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                                test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, m_prec, scale, max_bias));
+                                test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, m_prec, scale, max_bias));
+                            }
+                        } else {
+                            /* The precision of mask here doesn't matter as boolean mask is false */
+                            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias));
+                            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias));
+                        }
                     }
                 }
             }
        }
    }
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 8.0f));
 
     for (float max_bias : {0.0f, 8.0f}) {
         for (float scale : {1.0f, 0.1f}) {
@@ -4224,13 +4325,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
 
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
 
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1}));
    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));