@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/examples/server/utils.hpp:

@@ -3,7 +3,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
-#include "common/base64.hpp"
+#include "base64.hpp"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul
 
 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 
+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type", (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
 //
 // tokenizer and input processing utils
 //
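Usage note (illustrative, not part of the diff): the wrapper gives the server a JSON round-trip for lazy-grammar triggers. A minimal sketch, assuming nlohmann::json and the common_grammar_trigger fields shown above (COMMON_GRAMMAR_TRIGGER_TYPE_WORD is one of the trigger types defined in common.h; the trigger word itself is hypothetical):

    // round-trip a word trigger through JSON
    common_grammar_trigger trig;
    trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
    trig.value = "<tool_call>";            // hypothetical trigger word

    server_grammar_trigger wrapped(trig);
    json j = wrapped.to_json();            // {"type": ..., "value": "<tool_call>"}
    server_grammar_trigger parsed(j);      // reconstructs the same trigger;
                                           // token-type triggers also carry "token"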
@@ -616,9 +642,31 @@ static json oaicompat_completion_params_parse(
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }
 
+    // if the assistant message appears at the end of list, we do not add end-of-turn token
+    // for ex. this can be useful to modify the reasoning process in reasoning models
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    common_chat_msg last_message;
+    if (prefill_assistant_message) {
+        last_message = inputs.messages.back();
+        inputs.messages.pop_back();
+
+        /* sanity check, max one assistant message at the end of the list */
+        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
+            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+        }
+
+        inputs.extract_reasoning = false;
+        inputs.add_generation_prompt = true;
+    }
+
     // Apply chat template to the list of messages
     auto chat_params = common_chat_templates_apply(tmpls, inputs);
 
+    /* Append assistant prefilled message */
+    if (prefill_assistant_message) {
+        chat_params.prompt += last_message.content;
+    }
+
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
     if (!chat_params.grammar.empty()) {
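For context (an illustrative request, not from the diff): with this change, a /v1/chat/completions request whose messages list ends with an assistant turn is treated as a prefill. The template is applied without that message, a generation prompt is added, and the assistant content is appended so the model continues it rather than starting a fresh turn:

    {
      "messages": [
        {"role": "user", "content": "What is 17 * 23?"},
        {"role": "assistant", "content": "Let me work through this step by step:"}
      ]
    }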
@@ -627,7 +675,8 @@ static json oaicompat_completion_params_parse(
     llama_params["grammar_lazy"] = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
-        grammar_triggers.push_back(trigger.to_json<json>());
+        server_grammar_trigger ct(trigger);
+        grammar_triggers.push_back(ct.to_json());
     }
     llama_params["grammar_triggers"] = grammar_triggers;
     llama_params["preserved_tokens"] = chat_params.preserved_tokens;
package/src/llama.cpp/examples/speculative/speculative.cpp:

@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp:

@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
package/src/llama.cpp/examples/sycl/build.sh:

@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
 
 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
 
 #build example/main
 #cmake --build . --config Release --target main
package/src/llama.cpp/examples/sycl/win-build-sycl.bat:

@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
 
 :: for FP16
 :: faster for long-prompt inference
-:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 
 :: for FP32
-cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 :: build example/main only
 :: make main
package/src/llama.cpp/examples/tts/tts.cpp:

@@ -577,12 +577,7 @@ int main(int argc, char ** argv) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
 
-    // TODO: refactor in a common struct
-    params.model     = params.vocoder.model;
-    params.model_url = params.vocoder.model_url;
-    params.hf_repo   = params.vocoder.hf_repo;
-    params.hf_file   = params.vocoder.hf_file;
-
+    params.model = params.vocoder.model;
     params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
@@ -699,11 +694,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const std::string voice_data = audio_data;
 
     auto tmp = common_tokenize(vocab, voice_data, false, true);
-    printf("\n\n");
+
+    std::ostringstream tokens_oss;
     for (size_t i = 0; i < tmp.size(); ++i) {
-        printf("%d, ", tmp[i]);
+        tokens_oss << tmp[i] << ", ";
     }
-    printf("\n\n");
+    LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
     prompt_add(prompt_inp, tmp);
 #else
     prompt_add(prompt_inp, llama_tokens {
package/src/llama.cpp/ggml/CMakeLists.txt:

@@ -100,9 +100,14 @@ else()
     set(INS_ENB ON)
 endif()
 
+message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB : ${INS_ENB}")
+
 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
+option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@@ -123,10 +128,12 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
 if (WIN32)
@@ -164,7 +171,6 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
package/src/llama.cpp/ggml/cmake/GitVars.cmake (new file):

@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_SHA1
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_DATE
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%s
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
package/src/llama.cpp/ggml/include/ggml-cpu.h:

@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif
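These export the CPU backend's element-wise half/bfloat converters. A minimal round-trip sketch, assuming ggml-cpu.h is on the include path:

    #include <vector>
    #include "ggml-cpu.h"

    int main() {
        std::vector<float>       src  = {0.5f, -1.25f, 3.0f};
        std::vector<ggml_fp16_t> half(src.size());
        std::vector<float>       back(src.size());

        ggml_cpu_fp32_to_fp16(src.data(), half.data(), (int64_t) src.size());
        ggml_cpu_fp16_to_fp32(half.data(), back.data(), (int64_t) back.size());
        // back[i] now holds src[i] rounded through half precision
    }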
package/src/llama.cpp/ggml/include/ggml-rpc.h:

@@ -7,6 +7,9 @@
 extern "C" {
 #endif
 
+#define RPC_PROTO_MAJOR_VERSION 2
+#define RPC_PROTO_MINOR_VERSION 0
+#define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16
 
 // backend API
@@ -17,7 +20,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
 
-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+                                                    const char * cache_dir,
+                                                    size_t free_mem, size_t total_mem);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
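Callers of ggml_backend_rpc_start_server now pass a cache directory, used by the updated rpc-server to cache tensor data locally (passing NULL presumably disables caching; this is an assumption, see the rpc-server.cpp changes listed above). A hedged sketch with a CPU backend and hypothetical memory figures:

    ggml_backend_t backend = ggml_backend_cpu_init();
    size_t free_mem  =  8ull * 1024 * 1024 * 1024;  // hypothetical
    size_t total_mem = 16ull * 1024 * 1024 * 1024;  // hypothetical
    ggml_backend_rpc_start_server(backend, "0.0.0.0:50052",
                                  /*cache_dir=*/NULL,  // assumed: NULL = no cache
                                  free_mem, total_mem);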
package/src/llama.cpp/ggml/include/ggml.h:

@@ -393,8 +393,8 @@
 
     // precision
     enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
+        GGML_PREC_DEFAULT =  0, // stored as ggml_tensor.op_params, 0 by default
+        GGML_PREC_F32     = 10,
     };
 
     // model file types
@@ -481,6 +481,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
@@ -507,17 +508,12 @@
 
         GGML_OP_UNARY,
 
-        GGML_OP_MAP_UNARY,
-        GGML_OP_MAP_BINARY,
-
-        GGML_OP_MAP_CUSTOM1_F32,
-        GGML_OP_MAP_CUSTOM2_F32,
-        GGML_OP_MAP_CUSTOM3_F32,
-
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,
 
+        GGML_OP_CUSTOM,
+
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
@@ -682,6 +678,9 @@ extern "C" {
     GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
     GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -1665,7 +1664,7 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    // depthwise
+    // depthwise (via im2col and mul_mat)
     GGML_API struct ggml_tensor * ggml_conv_2d_dw(
             struct ggml_context * ctx,
             struct ggml_tensor * a, // convolution kernel
@@ -1677,6 +1676,22 @@
             int d0, // dilation dimension 0
             int d1); // dilation dimension 1
 
+    // Depthwise 2D convolution
+    // may be faster than ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1  C  convolution kernel
+    // b:   W     H     C  N  input data
+    // res: W_out H_out C  N
+    GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            int stride0,
+            int stride1,
+            int pad0,
+            int pad1,
+            int dilation0,
+            int dilation1);
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
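For orientation (a sketch with hypothetical sizes): the direct variant takes the kernel as [KW, KH, 1, C] and the input as [W, H, C, N], skipping the im2col + mul_mat path used by ggml_conv_2d_dw:

    // 3x3 depthwise convolution over a 32x32, 16-channel input;
    // stride 1, padding 1, dilation 1 keeps the spatial size at 32x32
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,  3,  3,  1, 16);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 16,  1);
    struct ggml_tensor * out    = ggml_conv_2d_dw_direct(ctx, kernel, input,
                                                         1, 1,   // stride0, stride1
                                                         1, 1,   // pad0, pad1
                                                         1, 1);  // dilation0, dilation1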
@@ -1722,24 +1737,29 @@
             float p0,
             float p1);
 
-    // nearest interpolate
+    enum ggml_scale_mode {
+        GGML_SCALE_MODE_NEAREST  = 0,
+        GGML_SCALE_MODE_BILINEAR = 1,
+    };
+
+    // interpolate
     // multiplies ne0 and ne1 by scale factor
-    // used in stable-diffusion
     GGML_API struct ggml_tensor * ggml_upscale(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            int scale_factor);
+            int scale_factor,
+            enum ggml_scale_mode mode);
 
-    // nearest interpolate
-    // nearest interpolate to specified dimensions
-    // used in tortoise.cpp
+    // interpolate
+    // interpolate scale to specified dimensions
     GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int ne0,
             int ne1,
             int ne2,
-            int ne3);
+            int ne3,
+            enum ggml_scale_mode mode);
 
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
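This is an API break for callers: a mode argument is now required, and GGML_SCALE_MODE_NEAREST reproduces the old behavior. A sketch (tensor a assumed to exist):

    // 2x nearest upscale, equivalent to the previous ggml_upscale(ctx, a, 2)
    struct ggml_tensor * up2 = ggml_upscale(ctx, a, 2, GGML_SCALE_MODE_NEAREST);

    // bilinear resize of a [W, H, C, N] tensor to 64x64 via the _ext variant
    struct ggml_tensor * up  = ggml_upscale_ext(ctx, a, 64, 64,
                                                (int) a->ne[2], (int) a->ne[3],
                                                GGML_SCALE_MODE_BILINEAR);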
@@ -1791,11 +1811,11 @@
 
 #define GGML_KQ_MASK_PAD 64
 
-    // q:    [n_embd, n_batch,     n_head,    1]
-    // k:    [n_embd, n_kv,        n_head_kv, 1]
-    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    1]
+    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
+    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1916,83 +1936,6 @@
 
     // custom operators
 
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            ggml_unary_op_f32_t fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            ggml_unary_op_f32_t fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            ggml_binary_op_f32_t fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            ggml_binary_op_f32_t fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            ggml_custom1_op_f32_t fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            ggml_custom1_op_f32_t fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            ggml_custom2_op_f32_t fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            ggml_custom2_op_f32_t fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            struct ggml_tensor * c,
-            ggml_custom3_op_f32_t fun),
-        "use ggml_map_custom3 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
-            struct ggml_context * ctx,
-            struct ggml_tensor * a,
-            struct ggml_tensor * b,
-            struct ggml_tensor * c,
-            ggml_custom3_op_f32_t fun),
-        "use ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2048,6 +1991,30 @@
             int n_tasks,
             void * userdata);
 
+    typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_custom_4d(
+            struct ggml_context * ctx,
+            enum ggml_type type,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            struct ggml_tensor ** args,
+            int n_args,
+            ggml_custom_op_t fun,
+            int n_tasks,
+            void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor ** args,
+            int n_args,
+            ggml_custom_op_t fun,
+            int n_tasks,
+            void * userdata);
+
     // loss function
 
     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
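The new ggml_custom_4d generalizes the fixed-arity map_custom ops removed above: inputs travel in an explicit args array and the callback receives only dst. This sketch assumes the args are reachable via dst->src in the callback (the usual ggml convention) and that ctx, a, and b already exist:

    // element-wise a+b as a custom op; F32, contiguous tensors assumed
    static void my_add(struct ggml_tensor * dst, int ith, int nth, void * userdata) {
        const struct ggml_tensor * a = dst->src[0];  // assumed: args land in dst->src
        const struct ggml_tensor * b = dst->src[1];
        const int64_t n = ggml_nelements(dst);
        for (int64_t i = ith; i < n; i += nth) {     // split work across nth threads
            ((float *) dst->data)[i] = ((float *) a->data)[i] + ((float *) b->data)[i];
        }
        (void) userdata;
    }

    // usage:
    struct ggml_tensor * args[2] = { a, b };
    struct ggml_tensor * out = ggml_custom_4d(ctx, GGML_TYPE_F32,
                                              a->ne[0], a->ne[1], a->ne[2], a->ne[3],
                                              args, 2, my_add, GGML_N_TASKS_MAX, NULL);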
package/src/llama.cpp/ggml/src/CMakeLists.txt:

@@ -65,7 +65,7 @@ if (GGML_LTO)
     endif()
 endif()
 
-if (GGML_CCACHE)
+if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
     find_program(GGML_CCACHE_FOUND ccache)
     find_program(GGML_SCCACHE_FOUND sccache)
 
@@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
     foreach (feat NATIVE
+                  SSE42
                   AVX AVX2 BMI2 AVX_VNNI FMA F16C
                   AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
                   AMX_TILE AMX_INT8 AMX_BF16)
@@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
     endif()
-    ggml_add_cpu_backend_variant(sandybridge AVX)
-    ggml_add_cpu_backend_variant(haswell     AVX F16C AVX2 BMI2 FMA)
-    ggml_add_cpu_backend_variant(skylakex    AVX F16C AVX2 BMI2 FMA AVX512)
-    ggml_add_cpu_backend_variant(icelake     AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-    ggml_add_cpu_backend_variant(alderlake   AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+    ggml_add_cpu_backend_variant(x64)
+    ggml_add_cpu_backend_variant(sse42       SSE42)
+    ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+    ggml_add_cpu_backend_variant(haswell     SSE42 AVX F16C AVX2 BMI2 FMA)
+    ggml_add_cpu_backend_variant(skylakex    SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
+    ggml_add_cpu_backend_variant(icelake     SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+    ggml_add_cpu_backend_variant(alderlake   SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
     if (NOT MSVC)
         # MSVC doesn't support AMX
-        ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+        ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt:

@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
         ${CANN_INSTALL_DIR}/acllib/include
     )
 
-    add_subdirectory(kernels)
     list(APPEND CANN_LIBRARIES
         ascendcl
         nnopbase
         opapi
         acl_op_compiler
-        ascendc_kernels
     )
 
     file(GLOB GGML_SOURCES_CANN "*.cpp")
package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp:

@@ -41,6 +41,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_INT4;
         case GGML_TYPE_Q8_0:
             return ACL_INT8;
+        case GGML_TYPE_I64:
+            return ACL_INT64;
         default:
             return ACL_DT_UNDEFINED;
     }
@@ -54,9 +56,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     // added.
     int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
 
-    int64_t acl_storage_len = 0;
     if (ne == nullptr) {
-        acl_storage_len = ggml_nbytes(tensor);
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
             acl_ne[i] = tensor->ne[i];
             // The step size of acl is in elements.
@@ -65,14 +65,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     } else {
         // With bcast
         for (int i = 0; i < dims; i++) {
-            acl_storage_len += (ne[i] - 1) * nb[i];
             acl_ne[i] = ne[i];
             acl_stride[i] = nb[i] / ggml_element_size(tensor);
         }
     }
 
-    // Reverse ne and stride.
     int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < final_dims; i++) {
+        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
+    }
+
+    // Reverse ne and stride.
     std::reverse(acl_ne, acl_ne + final_dims);
     std::reverse(acl_stride, acl_stride + final_dims);
 
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
         tmp_stride[i] = nb[i] / type_size;
     }
 
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
+    int64_t acl_storage_len = 1;
     for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
+        acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
    }
 
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
     aclTensor* acl_tensor =
         aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
                         format, &acl_storage_len, 1, data_ptr);
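Why this fix matters (illustrative): the old code accumulated (ne[i] - 1) * nb[i] in bytes while ACL expects an extent in elements, and it started from 0 rather than counting the last element itself. The new formula computes one past the furthest element actually addressed:

    // storage_len = 1 + sum_i (ne[i] - 1) * stride_in_elements[i]
    // worked example: a contiguous F32 tensor with ne = {4, 3}, element strides {1, 4}
    int64_t ne[2]     = {4, 3};
    int64_t stride[2] = {1, 4};
    int64_t acl_storage_len = 1;
    for (int i = 0; i < 2; i++) {
        acl_storage_len += (ne[i] - 1) * stride[i];   // 1 + 3*1 + 2*4 = 12
    }
    // 12 == 4 * 3, the full extent of the contiguous tensor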