@fugood/llama.node 0.0.1-alpha.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CMakeLists.txt +36 -7
  2. package/README.md +9 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/lib/binding.js +1 -1
  14. package/lib/binding.ts +5 -2
  15. package/lib/index.ts +2 -2
  16. package/package.json +15 -3
  17. package/src/LlamaCompletionWorker.cpp +5 -1
  18. package/src/LlamaCompletionWorker.h +4 -0
  19. package/src/LlamaContext.cpp +18 -1
  20. package/src/common.hpp +11 -7
  21. package/src/llama.cpp/CMakeLists.txt +13 -7
  22. package/src/llama.cpp/common/common.cpp +221 -173
  23. package/src/llama.cpp/common/common.h +19 -8
  24. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  25. package/src/llama.cpp/common/log.h +2 -2
  26. package/src/llama.cpp/common/sampling.cpp +17 -1
  27. package/src/llama.cpp/common/sampling.h +28 -20
  28. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  29. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  30. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  31. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  32. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  33. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  34. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  36. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  37. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  38. package/src/llama.cpp/examples/main/main.cpp +10 -8
  39. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  40. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  42. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  43. package/src/llama.cpp/examples/server/server.cpp +97 -86
  44. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  45. package/src/llama.cpp/ggml-backend.c +7 -5
  46. package/src/llama.cpp/ggml-impl.h +339 -4
  47. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  48. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  49. package/src/llama.cpp/ggml-quants.c +302 -293
  50. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  51. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  52. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  53. package/src/llama.cpp/ggml.c +1469 -116
  54. package/src/llama.cpp/ggml.h +37 -7
  55. package/src/llama.cpp/llama.cpp +969 -432
  56. package/src/llama.cpp/llama.h +46 -14
  57. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  58. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  59. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  60. package/src/llama.cpp/requirements.txt +1 -0
  61. package/src/llama.cpp/sgemm.cpp +134 -103
  62. package/src/llama.cpp/sgemm.h +4 -2
  63. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  64. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  65. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  66. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  67. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  68. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  69. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  70. package/src/llama.cpp/unicode-data.cpp +1188 -656
  71. package/src/llama.cpp/unicode-data.h +4 -3
  72. package/src/llama.cpp/unicode.cpp +590 -49
  73. package/src/llama.cpp/unicode.h +6 -3
  74. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  75. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190

package/src/llama.cpp/tests/CMakeLists.txt

@@ -1,10 +1,40 @@
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    set(TEST_TARGET ${target})
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
 # Builds and runs a test source file.
 # Optional args:
 # - NAME: name of the executable & test target (defaults to the source file name without extension)
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_test source)
+function(llama_target_and_test source)
     include(CMakeParseArguments)
     set(options)
     set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -35,41 +65,71 @@ function(llama_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-# llama_test(test-double-float.cpp) # SLOW
-llama_test(test-quantize-fns.cpp)
-llama_test(test-quantize-perf.cpp)
-llama_test(test-sampling.cpp)
-llama_test(test-chat-template.cpp)
-
-llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-
-llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-stablelm-3b-4e1t ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
-
-llama_test(test-grammar-parser.cpp)
-llama_test(test-llama-grammar.cpp)
-llama_test(test-grammar-integration.cpp)
-llama_test(test-grad0.cpp)
-# llama_test(test-opt.cpp) # SLOW
-llama_test(test-backend-ops.cpp)
-
-llama_test(test-rope.cpp)
-
-llama_test(test-model-load-cancel.cpp LABEL "model")
-llama_test(test-autorelease.cpp LABEL "model")
-
-llama_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
+
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
+# TODO: enable when fixed
+# https://github.com/ggerganov/llama.cpp/pull/7036
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+
+# build test-tokenizer-1-bpe target once and add many tests
+add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+install(TARGETS test-tokenizer-1-bpe RUNTIME)
+
+# TODO: disabled due to slowness
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
+
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
+
+llama_target_and_test(test-grammar-parser.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
+llama_target_and_test(test-grammar-integration.cpp)
+llama_target_and_test(test-grad0.cpp)
+# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-backend-ops.cpp)
+
+llama_target_and_test(test-rope.cpp)
+
+llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_target_and_test(test-autorelease.cpp LABEL "model")
+
+llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
 target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
 
 # dummy executable - not installed
package/src/llama.cpp/tests/test-backend-ops.cpp

@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                    size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                    if (t->type == GGML_TYPE_F16) {
                        tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+                   } else if (t->type == GGML_TYPE_BF16) {
+                       tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                    } else if (t->type == GGML_TYPE_F32) {
                        tv.push_back(*(float *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I32) {
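
The two hunks above teach the test harness about the new GGML_TYPE_BF16 tensor type: BF16 tensors are now initialized through the quantize path and read back through ggml_bf16_to_fp32 when comparing against an F32 reference. As a reminder of what that conversion amounts to, bfloat16 keeps only the top 16 bits of an IEEE-754 float32. A minimal standalone sketch (illustrative only; ggml's own converters also handle rounding and NaN cases):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Minimal sketch: bf16 stored as the high 16 bits of a float32.
    static uint16_t fp32_to_bf16_trunc(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        return (uint16_t)(bits >> 16);      // keep sign, exponent, top 7 mantissa bits
    }

    static float bf16_to_fp32(uint16_t h) {
        uint32_t bits = (uint32_t)h << 16;  // pad the low mantissa bits with zeros
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main() {
        float x = 3.14159265f;
        uint16_t b = fp32_to_bf16_trunc(x);
        // round-trip loses the low mantissa bits: 3.141593 -> 0x4049 -> 3.140625
        printf("%f -> 0x%04x -> %f\n", x, b, bf16_to_fp32(b));
        return 0;
    }

This is also why GGML_TYPE_BF16 is appended to all_types further down in the file, so the matrix-multiplication tests cover the new type on every backend.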
@@ -1090,6 +1092,12 @@ struct test_soft_max : public test_case {
         return VARS_TO_STR5(type, ne, mask, scale, max_bias);
     }
 
+    // the 1024 test with bias occasionally fails:
+    // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
+    virtual double max_nmse_err() override {
+        return 1e-6;
+    }
+
     test_soft_max(ggml_type type = GGML_TYPE_F32,
                   std::array<int64_t, 4> ne = {10, 10, 10, 10},
                   bool mask = false,
@@ -1101,7 +1109,7 @@ struct test_soft_max : public test_case {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_tensor * mask = nullptr;
         if (this->mask) {
-            mask = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]);
+            mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
         }
         ggml_tensor * pos = nullptr;
         if (max_bias > 0.0f) {
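
The new max_nmse_err() override relaxes the SOFT_MAX tolerance because the 1024-wide case with ALiBi bias occasionally lands just above the default threshold, as the comment notes. The metric named in that FAIL message is a normalized mean squared error between the backend output and the CPU reference; written out from its usual definition (not copied from the file), it is roughly:

    #include <cstddef>
    #include <cstdio>

    // Normalized mean squared error between a test vector and a reference vector:
    // sum((a[i] - b[i])^2) / sum(b[i]^2). Values near 0 mean the outputs agree.
    static double nmse(const float * a, const float * b, size_t n) {
        double err = 0.0;
        double ref = 0.0;
        for (size_t i = 0; i < n; i++) {
            const double d = a[i] - b[i];
            err += d * d;
            ref += (double)b[i] * b[i];
        }
        return err / ref;
    }

    int main() {
        const float out [4] = {1.0f, 2.0f, 3.0f, 4.001f}; // backend output
        const float refv[4] = {1.0f, 2.0f, 3.0f, 4.000f}; // CPU reference
        printf("NMSE = %.9f\n", nmse(out, refv, 4));      // compared against max_nmse_err()
        return 0;
    }

Per-op overrides such as the 1e-6 here, or the 5e-4 used by the flash-attention test below, trade a little strictness for stability on ops whose accumulation order differs across backends.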
@@ -1475,6 +1483,34 @@ struct test_leaky_relu : public test_case {
     }
 };
 
+// GGML_OP_FLASH_ATTN_EXT
+struct test_flash_attn_ext : public test_case {
+    const int64_t hs; // head size
+    const int64_t nh; // num heads
+    const int64_t kv; // kv size
+    const int64_t nb; // batch size
+
+    std::string vars() override {
+        return VARS_TO_STR4(hs, nh, kv, nb);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
+        : hs(hs), nh(nh), kv(kv), nb(nb) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
+        ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
+        ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
+        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
+        return out;
+    }
+};
+
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
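
test_flash_attn_ext covers the new fused attention operator added to ggml in this release. Numerically the op computes, per head, out = softmax(scale * Q·Kᵀ + mask) · V, which a graph would otherwise express with separate mul_mat and soft_max nodes. A naive single-head reference in plain C++ (illustrative only; it ignores the F16 storage and the GGML_KQ_MASK_PAD padding used in build_graph above):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Reference attention for one head, row-major buffers:
    //   Q: n_q x hs, K: n_kv x hs, V: n_kv x hs, mask: n_q x n_kv (additive), out: n_q x hs
    static void attn_ref(const float * Q, const float * K, const float * V, const float * mask,
                         int n_q, int n_kv, int hs, float scale, float * out) {
        std::vector<float> s(n_kv);
        for (int i = 0; i < n_q; i++) {
            // scores = scale * q . k, plus an additive mask (e.g. -INFINITY for masked positions)
            float maxv = -INFINITY;
            for (int j = 0; j < n_kv; j++) {
                float dot = 0.0f;
                for (int d = 0; d < hs; d++) dot += Q[i*hs + d] * K[j*hs + d];
                s[j] = scale * dot + (mask ? mask[i*n_kv + j] : 0.0f);
                maxv = std::max(maxv, s[j]);
            }
            // softmax over the kv dimension (subtract max for numerical stability)
            float sum = 0.0f;
            for (int j = 0; j < n_kv; j++) { s[j] = std::exp(s[j] - maxv); sum += s[j]; }
            // out = softmax(scores) . V
            for (int d = 0; d < hs; d++) {
                float acc = 0.0f;
                for (int j = 0; j < n_kv; j++) acc += s[j] * V[j*hs + d];
                out[i*hs + d] = acc / sum;
            }
        }
    }

The hs/nh/kv/nb sweep added near the end of the file instantiates this test for several head and batch sizes, and each backend's fused result is checked against the CPU backend within max_nmse_err().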
@@ -1661,7 +1697,7 @@ struct test_llama : public test_llm {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
@@ -1783,7 +1819,7 @@ struct test_falcon : public test_llm {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
@@ -1864,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     std::default_random_engine rng(0);
 
     const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
         GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
         GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
         GGML_TYPE_Q8_0,
@@ -2095,7 +2131,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
+                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0,   ne1,   1, 1}, mask, scale, max_bias));
                         test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
                     }
                 }
@@ -2139,6 +2175,20 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    for (int hs : { 64, 128, }) { // other head sizes not implemented
+#else
+    for (int hs : { 64, 80, 128, 256, }) {
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        for (int nh : { 32, }) {
+            for (int kv : { 512, 1024, }) {
+                for (int nb : { 1, 2, 4, 8, }) {
+                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
+                }
+            }
+        }
+    }
+
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
     test_cases.emplace_back(new test_llama(1));
package/src/llama.cpp/tests/test-chat-template.cpp

@@ -49,6 +49,8 @@ int main(void) {
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
         // Llama-3
         "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+        // Phi-3
+        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -77,6 +79,8 @@ int main(void) {
         "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
         // Llama 3
         "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+        // Phi 3
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
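
Both new entries exercise llama_chat_apply_template(), the llama.h call that renders role/content messages into a single prompt string using either a model's embedded template or, as in this test, one passed in explicitly. A rough usage sketch against the llama.h bundled in this package (the helper function, buffer sizing, and the example conversation are illustrative, not taken from the test):

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Render a chat into a prompt string using an explicit template.
    // Passing nullptr for the model is fine when a template string is supplied directly.
    static std::string apply_template(const char * tmpl, const std::vector<llama_chat_message> & msgs) {
        std::vector<char> buf(1024);
        int32_t n = llama_chat_apply_template(nullptr, tmpl, msgs.data(), msgs.size(),
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n > (int32_t) buf.size()) {   // buffer too small: grow to the reported size and retry
            buf.resize(n);
            n = llama_chat_apply_template(nullptr, tmpl, msgs.data(), msgs.size(),
                                          true, buf.data(), (int32_t) buf.size());
        }
        return n < 0 ? std::string() : std::string(buf.data(), n);
    }

    int main() {
        // Hypothetical conversation; the Phi-3 template above turns each message into
        // "<|role|>\n<content><|end|>\n" and appends "<|assistant|>\n" as the generation prompt.
        std::vector<llama_chat_message> msgs = {
            { "system", "You are a helpful assistant" },
            { "user",   "Hello" },
        };
        const char * phi3_tmpl =
            "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + "
            "message['content'] + '<|end|>\n' }}{% endfor %}"
            "{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}";
        printf("%s\n", apply_template(phi3_tmpl, msgs).c_str());
        return 0;
    }

With the Phi-3 template, the rendered prompt is exactly the "<|system|>\n...<|assistant|>\n" string added to expected_output above.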