@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
@@ -1,3 +1,4 @@
1
+ #include "ggml-impl.h"
1
2
  #include "ggml-blas.h"
2
3
  #include "ggml-backend-impl.h"
3
4
 
@@ -234,25 +235,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
234
235
 
235
236
  // backend interface
236
237
 
237
- GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
238
+ static const char * ggml_backend_blas_name(ggml_backend_t backend) {
238
239
  return "BLAS";
239
240
 
240
241
  GGML_UNUSED(backend);
241
242
  }
242
243
 
243
- GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
244
+ static void ggml_backend_blas_free(ggml_backend_t backend) {
244
245
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
245
246
  delete ctx;
246
247
  delete backend;
247
248
  }
248
249
 
249
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
250
+ static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
250
251
  return ggml_backend_cpu_buffer_type();
251
252
 
252
253
  GGML_UNUSED(backend);
253
254
  }
254
255
 
255
- GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
256
+ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
256
257
  ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
257
258
 
258
259
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -284,7 +285,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
284
285
  GGML_UNUSED(backend);
285
286
  }
286
287
 
287
- GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
288
+ static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
288
289
  const struct ggml_tensor * src0 = op->src[0];
289
290
  const struct ggml_tensor * src1 = op->src[1];
290
291
 
@@ -299,7 +300,7 @@ GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, cons
299
300
  GGML_UNUSED(backend);
300
301
  }
301
302
 
302
- GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
303
+ static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
303
304
  return ggml_backend_buft_is_host(buft);
304
305
 
305
306
  GGML_UNUSED(backend);
@@ -321,11 +322,8 @@ static struct ggml_backend_i blas_backend_i = {
321
322
  /* .supports_op = */ ggml_backend_blas_supports_op,
322
323
  /* .supports_buft = */ ggml_backend_blas_supports_buft,
323
324
  /* .offload_op = */ NULL,
324
- /* .event_new = */ NULL,
325
- /* .event_free = */ NULL,
326
325
  /* .event_record = */ NULL,
327
326
  /* .event_wait = */ NULL,
328
- /* .event_synchronize = */ NULL,
329
327
  };
330
328
 
331
329
  static ggml_guid_t ggml_backend_blas_guid(void) {
@@ -339,6 +337,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
339
337
  ggml_backend_t backend = new ggml_backend {
340
338
  /* .guid = */ ggml_backend_blas_guid(),
341
339
  /* .interface = */ blas_backend_i,
340
+ /* .device = */ nullptr,
342
341
  /* .context = */ ctx,
343
342
  };
344
343
 
@@ -355,7 +354,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
355
354
  return backend;
356
355
  }
357
356
 
358
- GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
357
+ bool ggml_backend_is_blas(ggml_backend_t backend) {
359
358
  return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
360
359
  }
361
360
 
@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
37
37
  return ACL_INT16;
38
38
  case GGML_TYPE_I32:
39
39
  return ACL_INT32;
40
+ case GGML_TYPE_Q4_0:
41
+ return ACL_INT4;
42
+ case GGML_TYPE_Q8_0:
43
+ return ACL_INT8;
40
44
  default:
41
45
  return ACL_DT_UNDEFINED;
42
46
  }
@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
89
93
  return false;
90
94
  }
91
95
 
92
- aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
93
- size_t type_size, int64_t* ne, size_t* nb,
94
- int64_t dims, aclFormat format,
95
- size_t offset) {
96
- int64_t tmp_ne[GGML_MAX_DIMS * 2];
97
- int64_t tmp_stride[GGML_MAX_DIMS * 2];
98
-
99
- memcpy(tmp_ne, ne, dims * sizeof(int64_t));
100
- for (int i = 0; i < dims; i++) {
101
- tmp_stride[i] = nb[i] / type_size;
102
- }
103
-
104
- std::reverse(tmp_ne, tmp_ne + dims);
105
- std::reverse(tmp_stride, tmp_stride + dims);
106
-
107
- int64_t acl_storage_len = 0;
108
- for (int i = 0; i < dims; i++) {
109
- acl_storage_len += (ne[i] - 1) * nb[i];
110
- }
111
-
112
- aclTensor* acl_tensor =
113
- aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
114
- format, &acl_storage_len, 1, data_ptr);
115
-
116
- return acl_tensor;
117
- }
118
-
119
96
  int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
120
97
  const ggml_tensor* src1,
121
98
  int64_t* bcast_src0_ne,
@@ -23,6 +23,9 @@
23
23
  #ifndef CANN_ACL_TENSOR_H
24
24
  #define CANN_ACL_TENSOR_H
25
25
 
26
+ #include <algorithm>
27
+ #include <cstring>
28
+
26
29
  #include <aclnn/aclnn_base.h>
27
30
  #include "common.h"
28
31
 
@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
65
68
  size_t offset = 0);
66
69
 
67
70
  /**
68
- * @brief Creates an ACL tensor from provided parameters.
71
+ * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
72
+ * should be size_t or float.
69
73
  *
70
74
  * @details This function creates an ACL tensor using the provided data pointer,
71
75
  * data type, dimensions, strides, format, offset, and additional parameters.
@@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
83
87
  * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
84
88
  * @return Pointer to the created ACL tensor.
85
89
  */
90
+ template<typename TYPE>
86
91
  aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
87
- size_t type_size, int64_t* ne, size_t* nb,
88
- int64_t dims, aclFormat format = ACL_FORMAT_ND,
89
- size_t offset = 0);
92
+ TYPE type_size, int64_t* ne, TYPE* nb,
93
+ int64_t dims,
94
+ aclFormat format = ACL_FORMAT_ND,
95
+ size_t offset = 0) {
96
+ int64_t tmp_ne[GGML_MAX_DIMS * 2];
97
+ int64_t tmp_stride[GGML_MAX_DIMS * 2];
98
+
99
+ memcpy(tmp_ne, ne, dims * sizeof(int64_t));
100
+ for (int i = 0; i < dims; i++) {
101
+ tmp_stride[i] = nb[i] / type_size;
102
+ }
103
+
104
+ std::reverse(tmp_ne, tmp_ne + dims);
105
+ std::reverse(tmp_stride, tmp_stride + dims);
106
+
107
+ int64_t acl_storage_len = 0;
108
+ for (int i = 0; i < dims; i++) {
109
+ acl_storage_len += (ne[i] - 1) * nb[i];
110
+ }
111
+
112
+ aclTensor* acl_tensor =
113
+ aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
114
+ format, &acl_storage_len, 1, data_ptr);
115
+
116
+ return acl_tensor;
117
+ }
90
118
 
91
119
  /**
92
120
  * @brief Checks if tensors require broadcasting based on their shapes.
@@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
464
464
  aclTensor* acl_src = ggml_cann_create_tensor(src);
465
465
  aclTensor* acl_dst = ggml_cann_create_tensor(dst);
466
466
 
467
- const float eps = 1e-6f; // TODO: make this a parameter
468
467
  int n_groups = dst->op_params[0];
469
468
 
469
+ float eps;
470
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
471
+
470
472
  uint64_t workspaceSize = 0;
471
473
  aclOpExecutor* executor;
472
474
  void* workspaceAddr = nullptr;
@@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
910
912
  ((ggml_tensor*)dst->extra)->ne);
911
913
  return;
912
914
  }
915
+ if (dst->type == GGML_TYPE_Q4_0) {
916
+ aclrtlaunch_ascendc_quantize_f16_to_q4_0(
917
+ 24, ctx.stream(), src->data, dst->data,
918
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
919
+ ((ggml_tensor*)dst->extra)->ne);
920
+ return;
921
+ }
913
922
  if (dst->type == GGML_TYPE_F16) {
914
923
  if (ggml_are_same_shape(src, dst)) {
915
924
  cann_copy(ctx, acl_src, acl_dst);
@@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
971
980
  ((ggml_tensor*)dst->extra)->ne);
972
981
  return;
973
982
  }
983
+ if (dst->type == GGML_TYPE_Q4_0) {
984
+ aclrtlaunch_ascendc_quantize_f32_to_q4_0(
985
+ 24, ctx.stream(), src->data, dst->data,
986
+ ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
987
+ ((ggml_tensor*)dst->extra)->ne);
988
+ return;
989
+ }
974
990
  if (dst->type == GGML_TYPE_F32) {
975
991
  if (ggml_are_same_shape(src, dst)) {
976
992
  cann_copy(ctx, acl_src, acl_dst);
@@ -1312,6 +1328,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
1312
1328
  #ifdef __cplusplus
1313
1329
  }
1314
1330
  #endif
1331
+
1332
+ static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
1333
+ ggml_tensor* dst,
1334
+ ggml_tensor* src1,
1335
+ aclTensor* tmp_cast_tensor,
1336
+ aclTensor* tmp_im2col_tensor) {
1337
+ // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1338
+ int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
1339
+ size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
1340
+ aclTensor* acl_dst =
1341
+ ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1342
+
1343
+ int64_t permute_dim[] = {0, 2, 1};
1344
+ if (src1->type != dst->type) {
1345
+ aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
1346
+ } else {
1347
+ aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
1348
+ }
1349
+
1350
+ // release
1351
+ ACL_CHECK(aclDestroyTensor(acl_dst));
1352
+ }
1353
+
1354
+ static void ggml_cann_im2col_1d_post_process(
1355
+ ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
1356
+ aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
1357
+ const std::vector<int64_t>& im2col_op_params) {
1358
+ // get params
1359
+ const int64_t KH = im2col_op_params[0];
1360
+ const int64_t KW = im2col_op_params[1];
1361
+ const int64_t IW = im2col_op_params[2];
1362
+ const int64_t IC = im2col_op_params[3];
1363
+ const int64_t N = im2col_op_params[4];
1364
+ const int64_t OH = im2col_op_params[5];
1365
+ const int64_t OW = im2col_op_params[6];
1366
+ const int64_t s0 = im2col_op_params[7];
1367
+ const int64_t p0 = im2col_op_params[8];
1368
+ const int64_t d0 = im2col_op_params[9];
1369
+ const int64_t n_bytes_factor = im2col_op_params[10];
1370
+
1371
+ // Permute: [N, IC * KH * KW, OW * OH] ->
1372
+ // [N, OW * OH * n_bytes_factor, IC * KH * KW]
1373
+ aclTensor* tmp_permute_tensor = nullptr;
1374
+ ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
1375
+ tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1376
+ void* tmp_permute_buffer = tmp_permute_allocator.get();
1377
+
1378
+ int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
1379
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
1380
+ tmp_permute_nb[0] = ggml_type_size(dst->type);
1381
+ for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1382
+ tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
1383
+ }
1384
+
1385
+ tmp_permute_tensor = ggml_cann_create_tensor(
1386
+ tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
1387
+ ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
1388
+ GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
1389
+
1390
+ int64_t permute_dim[] = {0, 2, 1};
1391
+ if (src1->type != dst->type) {
1392
+ aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
1393
+ } else {
1394
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
1395
+ 3);
1396
+ }
1397
+
1398
+ // number of times the kernel moves in W dimension
1399
+ const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
1400
+ size_t offset;
1401
+ void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
1402
+
1403
+ // memory copy with offset to restore 1D im2col from 2d
1404
+ if (IC > 1) {
1405
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
1406
+ size_t size_cpy = KH * KW * ggml_type_size(dst->type);
1407
+
1408
+ for (int c = 0; c < IC; c++) {
1409
+ cur_permute_buffer = (char*)tmp_permute_buffer + offset +
1410
+ KH * KW * c * ggml_type_size(dst->type);
1411
+ cur_dst_buffer = (char*)dst->data +
1412
+ c * KH * KW * n_step_w * ggml_type_size(dst->type);
1413
+
1414
+ for (int i = 0; i < n_step_w; i++) {
1415
+ ACL_CHECK(aclrtMemcpyAsync(
1416
+ cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
1417
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1418
+ cur_dst_buffer =
1419
+ (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
1420
+ cur_permute_buffer = (char*)cur_permute_buffer +
1421
+ KH * KW * IC * ggml_type_size(dst->type);
1422
+ }
1423
+ }
1424
+ } else {
1425
+ offset = KH * KW * n_step_w *
1426
+ ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
1427
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
1428
+ (char*)tmp_permute_buffer + offset, offset,
1429
+ ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
1430
+ }
1431
+
1432
+ // release
1433
+ ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
1434
+ }
1435
+
1315
1436
  void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1316
1437
  ggml_tensor* src0 = dst->src[0]; // kernel
1317
1438
  ggml_tensor* src1 = dst->src[1]; // input
@@ -1320,21 +1441,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1320
1441
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
1321
1442
  GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
1322
1443
 
1444
+ GGML_TENSOR_BINARY_OP_LOCALS;
1445
+
1446
+ // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
1447
+ // im2col and do post-processing to restore it to 1D.
1448
+ const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
1323
1449
  const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
1324
- const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
1450
+ const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
1325
1451
  const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
1326
- const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
1452
+ const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
1327
1453
  const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
1328
- const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
1329
- const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
1454
+ const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
1330
1455
 
1331
- GGML_TENSOR_BINARY_OP_LOCALS;
1332
-
1333
- const int64_t N = is_2D ? ne13 : ne12;
1334
- const int64_t IC = is_2D ? ne12 : ne11;
1335
-
1336
- const int64_t KH = is_2D ? ne01 : 1;
1456
+ const int64_t N = ne13;
1457
+ const int64_t IC = ne12;
1458
+ const int64_t KH = ne01;
1337
1459
  const int64_t KW = ne00;
1460
+ const int64_t IW = ne10;
1338
1461
 
1339
1462
  const int64_t OH = is_2D ? ne2 : 1;
1340
1463
  const int64_t OW = ne1;
@@ -1342,9 +1465,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1342
1465
  GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
1343
1466
  GGML_ASSERT(nb10 == sizeof(float));
1344
1467
 
1345
- // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
1468
+ // memory allocated increased to 3x when is_2D == false
1469
+ const int64_t n_bytes_factor = is_2D ? 1 : 3;
1470
+
1471
+ // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
1346
1472
  aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
1347
- int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
1473
+ int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
1348
1474
  size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1349
1475
 
1350
1476
  tmp_im2col_nb[0] = ggml_type_size(src1->type);
@@ -1356,8 +1482,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1356
1482
  // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
1357
1483
  // dst.elemcount.
1358
1484
  ggml_cann_pool_alloc im2col_allocator(
1359
- ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
1485
+ ctx.pool(),
1486
+ ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
1360
1487
  void* tmp_im2col_buffer = im2col_allocator.get();
1488
+
1361
1489
  aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
1362
1490
  tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
1363
1491
  ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@@ -1380,8 +1508,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1380
1508
  paddings, strides, tmp_im2col_tensor,
1381
1509
  &workspaceSize, &executor));
1382
1510
 
1511
+ ggml_cann_pool_alloc workspace_allocator(ctx.pool());
1383
1512
  if (workspaceSize > 0) {
1384
- ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
1513
+ workspace_allocator.alloc(workspaceSize);
1385
1514
  workspaceAddr = workspace_allocator.get();
1386
1515
  }
1387
1516
 
@@ -1391,9 +1520,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1391
1520
  // Cast if dst is f16.
1392
1521
  aclTensor* tmp_cast_tensor = nullptr;
1393
1522
  ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
1523
+ void* tmp_cast_buffer = nullptr;
1394
1524
  if (src1->type != dst->type) {
1395
- tmp_cast_allocator.alloc(ggml_nbytes(dst));
1396
- void* tmp_cast_buffer = tmp_cast_allocator.get();
1525
+ tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
1526
+ tmp_cast_buffer = tmp_cast_allocator.get();
1397
1527
  size_t temp_cast_nb[GGML_MAX_DIMS - 1];
1398
1528
  temp_cast_nb[0] = ggml_type_size(dst->type);
1399
1529
  for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1408,24 +1538,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1408
1538
  ggml_cann_type_mapping(dst->type));
1409
1539
  }
1410
1540
 
1411
- // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1412
- int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
1413
- size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
1414
- aclTensor* acl_dst =
1415
- ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
1416
-
1417
- int64_t permute_dim[] = {0, 2, 1};
1418
- if (src1->type != dst->type) {
1419
- aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
1541
+ // post-processing
1542
+ if (is_2D) {
1543
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
1544
+ tmp_im2col_tensor);
1420
1545
  } else {
1421
- aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
1546
+ std::vector<int64_t> im2col_op_params = {
1547
+ KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
1548
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
1549
+ tmp_im2col_tensor, im2col_op_params);
1422
1550
  }
1423
1551
 
1424
1552
  // release
1425
1553
  ACL_CHECK(aclDestroyTensor(acl_src1));
1426
1554
  ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
1427
1555
  ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
1428
- ACL_CHECK(aclDestroyTensor(acl_dst));
1429
1556
  ACL_CHECK(aclDestroyIntArray(kernel_size));
1430
1557
  ACL_CHECK(aclDestroyIntArray(dilations));
1431
1558
  ACL_CHECK(aclDestroyIntArray(paddings));
@@ -2352,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
2352
2479
  * @param dst The destination tensor where the result of the matrix
2353
2480
  * multiplication will be stored.
2354
2481
  */
2355
- static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
2356
- ggml_tensor* dst) {
2482
+ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2483
+ ggml_tensor* dst,
2484
+ const enum ggml_type type) {
2357
2485
  ggml_tensor* src0 = dst->src[0]; // weight
2358
2486
  ggml_tensor* src1 = dst->src[1]; // input
2359
2487
 
2360
2488
  // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
2361
2489
  // is regarded as batch. weight need transpose.
2362
2490
  int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
2363
- size_t weight_elem_size = sizeof(uint8_t);
2364
- size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
2491
+ float weight_elem_size;
2492
+ if (type == GGML_TYPE_Q4_0) {
2493
+ weight_elem_size = float(sizeof(uint8_t)) / 2;
2494
+ }
2495
+ else if (type == GGML_TYPE_Q8_0) {
2496
+ weight_elem_size = float(sizeof(uint8_t));
2497
+ }
2498
+ else {
2499
+ GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
2500
+ }
2501
+ float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
2502
+
2365
2503
  // size of one matrix is element_size * height * width.
2366
2504
  size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
2367
2505
  size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
2368
2506
 
2369
2507
  // scale stored at the end of weight. Also need transpose.
2508
+ GGML_ASSERT(QK4_0 == QK8_0);
2370
2509
  int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
2371
2510
  size_t scale_elem_size = sizeof(uint16_t);
2372
2511
  size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@@ -2381,10 +2520,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
2381
2520
  size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
2382
2521
  size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
2383
2522
 
2523
+ ggml_cann_pool_alloc input_alloctor(ctx.pool());
2384
2524
  if (src1->type != GGML_TYPE_F16) {
2385
2525
  aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
2386
- ggml_cann_pool_alloc input_alloctor(
2387
- ctx.pool(), ggml_nelements(src1) * input_elem_size);
2526
+ input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
2388
2527
  input_buffer = input_alloctor.get();
2389
2528
 
2390
2529
  int64_t* input_cast_ne = src1->ne;
@@ -2430,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
2430
2569
  (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
2431
2570
  input_elem_size, input_ne, input_nb, 2);
2432
2571
  aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
2433
- (char*)src0->data + batch0 * weight_stride, ACL_INT8,
2434
- weight_elem_size, weight_ne, weight_nb, 2);
2572
+ (char*)src0->data + batch0 * weight_stride,
2573
+ ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
2574
+ weight_nb, 2);
2435
2575
  aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
2436
2576
  scale_offset + batch0 * scale_stride, ACL_FLOAT16,
2437
2577
  scale_elem_size, scale_ne, scale_nb, 2);
@@ -2485,11 +2625,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2485
2625
  case GGML_TYPE_F16:
2486
2626
  ggml_cann_mat_mul_fp(ctx, dst);
2487
2627
  break;
2488
- // case GGML_TYPE_Q4_0:
2489
- // ggml_cann_mul_mat_q4_0(ctx, dst);
2490
- // break;
2628
+ case GGML_TYPE_Q4_0:
2491
2629
  case GGML_TYPE_Q8_0:
2492
- ggml_cann_mul_mat_q8_0(ctx, dst);
2630
+ ggml_cann_mul_mat_quant(ctx, dst, type);
2493
2631
  break;
2494
2632
  default:
2495
2633
  GGML_ABORT("fatal error");
@@ -2743,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
2743
2881
  ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
2744
2882
  beta_slow, corr_dims);
2745
2883
 
2746
- const bool is_neox = mode & 2;
2884
+ const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
2747
2885
 
2748
2886
  // init cos/sin cache
2749
2887
  ggml_cann_pool_alloc sin_allocator(
@@ -227,6 +227,7 @@ struct ggml_backend_cann_context {
227
227
  * @brief Destructor for cleaning up resources.
228
228
  */
229
229
  ~ggml_backend_cann_context() {
230
+ ggml_cann_set_device(device);
230
231
  if (copy_event != nullptr) {
231
232
  ACL_CHECK(aclrtDestroyEvent(copy_event));
232
233
  }
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
9
9
  get_row_q8_0.cpp
10
10
  quantize_f32_q8_0.cpp
11
11
  quantize_f16_q8_0.cpp
12
+ quantize_float_to_q4_0.cpp
12
13
  dup.cpp
13
14
  )
14
15
 
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
29
30
  ${SRC_FILES}
30
31
  )
31
32
 
32
- #ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
33
+ # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
@@ -8,6 +8,8 @@
8
8
 
9
9
  #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
10
10
  #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
11
+ #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
12
+ #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
11
13
 
12
14
  #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
13
15
  #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"