@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
@@ -0,0 +1,202 @@
1
+ // This file contains functionality related to "GGUF" files, the binary file format used by ggml.
2
+ // GGUF files have the following structure:
3
+ //
4
+ // 1. File magic "GGUF" (4 bytes).
5
+ // 2. File version (uint32_t).
6
+ // 3. Number of ggml tensors in file (int64_t).
7
+ // 4. Number of key-value-pairs in file (int64_t).
8
+ // 5. For each KV pair:
9
+ // 1. The key (string).
10
+ // 2. The value type (gguf_type).
11
+ // 3a. If the value type is GGUF_TYPE_ARRAY:
12
+ // 1. The type of the array (gguf_type).
13
+ // 2. The number of elements in the array (uint64_t).
14
+ // 3. The binary representation of each element in the array.
15
+ // 3b. Otherwise:
16
+ // 1. The binary representation of the value.
17
+ // 6. For each ggml tensor:
18
+ // 1. The tensor name (string).
19
+ // 2. The number of dimensions of the tensor (uint32_t).
20
+ // 3. For each dimension:
21
+ // 1. The size of the tensor in the dimension (int64_t).
22
+ // 4. The tensor data type (ggml_type).
23
+ // 5. The tensor data offset in the tensor data binary blob (uint64_t).
24
+ // 7. The tensor data binary blob (optional, aligned).
25
+ //
26
+ // Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator.
27
+ // All enums are stored as int32_t.
28
+ // All bool values are stored as int8_t.
29
+ // If the special key "general.alignment" (uint32_t) is defined it is used for alignment,
30
+ // otherwise GGUF_DEFAULT_ALIGNMENT is used.
31
+ //
32
+ // Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
33
+
34
+ #pragma once
35
+
36
+ #include "ggml.h"
37
+
38
+ #include <stdbool.h>
39
+ #include <stdint.h>
40
+
41
+ #define GGUF_MAGIC "GGUF"
42
+ #define GGUF_VERSION 3
43
+
44
+ #define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment"
45
+
46
+ #define GGUF_DEFAULT_ALIGNMENT 32
47
+
48
+ #ifdef __cplusplus
49
+ extern "C" {
50
+ #endif
51
+
52
+ // types that can be stored as GGUF KV data
53
+ enum gguf_type {
54
+ GGUF_TYPE_UINT8 = 0,
55
+ GGUF_TYPE_INT8 = 1,
56
+ GGUF_TYPE_UINT16 = 2,
57
+ GGUF_TYPE_INT16 = 3,
58
+ GGUF_TYPE_UINT32 = 4,
59
+ GGUF_TYPE_INT32 = 5,
60
+ GGUF_TYPE_FLOAT32 = 6,
61
+ GGUF_TYPE_BOOL = 7,
62
+ GGUF_TYPE_STRING = 8,
63
+ GGUF_TYPE_ARRAY = 9,
64
+ GGUF_TYPE_UINT64 = 10,
65
+ GGUF_TYPE_INT64 = 11,
66
+ GGUF_TYPE_FLOAT64 = 12,
67
+ GGUF_TYPE_COUNT, // marks the end of the enum
68
+ };
69
+
70
+ struct gguf_context;
71
+
72
+ struct gguf_init_params {
73
+ bool no_alloc;
74
+
75
+ // if not NULL, create a ggml_context and allocate the tensor data in it
76
+ struct ggml_context ** ctx;
77
+ };
78
+
79
+ GGML_API struct gguf_context * gguf_init_empty(void);
80
+ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
81
+ //GGML_API struct gguf_context * gguf_init_from_buffer(..);
82
+
83
+ GGML_API void gguf_free(struct gguf_context * ctx);
84
+
85
+ GGML_API const char * gguf_type_name(enum gguf_type type);
86
+
87
+ GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
88
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
89
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
90
+
91
+ GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
92
+ GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
93
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id);
94
+
95
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id);
96
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id);
97
+
98
+ // will abort if the wrong type is used for the key
99
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id);
100
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id);
101
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id);
102
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id);
103
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id);
104
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id);
105
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id);
106
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id);
107
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id);
108
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id);
109
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id);
110
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id);
111
+ GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id);
112
+ GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id);
113
+
114
+ // get raw pointer to the first element of the array with the given key_id
115
+ // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
116
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id);
117
+
118
+ // get ith C string from array with given key_id
119
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int64_t key_id, size_t i);
120
+
121
+ GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx);
122
+ GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found
123
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id);
124
+ GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id);
125
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id);
126
+ GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id);
127
+
128
+ // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist)
129
+ GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key);
130
+
131
+ // overrides an existing KV pair or adds a new one, the new KV pair is always at the back
132
+ GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
133
+ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
134
+ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
135
+ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
136
+ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
137
+ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
138
+ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
139
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
140
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
141
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
142
+ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
143
+ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
144
+
145
+ // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
146
+ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n);
147
+
148
+ // creates a new array with n strings and copies the corresponding strings from data
149
+ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n);
150
+
151
+ // set or add KV pairs from another context
152
+ GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
153
+
154
+ // add tensor to GGUF context, tensor name must be unique
155
+ GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
156
+
157
+ // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated
158
+ // in such a way that the tensor data remains as one contiguous block (except for padding)
159
+ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
160
+
161
+ // assumes that at least gguf_get_tensor_size bytes can be read from data
162
+ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
163
+
164
+ // writing gguf files can be done in 3 ways:
165
+ //
166
+ // - write the entire gguf_context to a binary file in a single pass:
167
+ //
168
+ // gguf_write_to_file(ctx, fname, /*only_meta =*/ false);
169
+ //
170
+ // - write only the meta data to a file, then re-open the file and append the tensor data:
171
+ //
172
+ // gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
173
+ // FILE * f = fopen(fname, "ab");
174
+ // fwrite(f, ...); // write tensor data
175
+ // fclose(f);
176
+ //
177
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
178
+ //
179
+ // FILE * f = fopen(fname, "wb");
180
+ // const size_t size_meta = gguf_get_meta_size(ctx);
181
+ // fseek(f, size_meta, SEEK_SET);
182
+ // fwrite(f, ...); // write tensor data
183
+ // void * data = malloc(size_meta);
184
+ // gguf_get_meta_data(ctx, data);
185
+ // rewind(f);
186
+ // fwrite(data, 1, size_meta, f);
187
+ // free(data);
188
+ // fclose(f);
189
+ //
190
+
191
+ // write the entire context to a binary file
192
+ GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
193
+
194
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
195
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
196
+
197
+ // writes the meta data to pointer "data"
198
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
199
+
200
+ #ifdef __cplusplus
201
+ }
202
+ #endif
@@ -208,6 +208,7 @@ add_library(ggml-base
208
208
  ../include/ggml-backend.h
209
209
  ../include/ggml-cpp.h
210
210
  ../include/ggml-opt.h
211
+ ../include/gguf.h
211
212
  ggml.c
212
213
  ggml-alloc.c
213
214
  ggml-backend.cpp
@@ -215,7 +216,8 @@ add_library(ggml-base
215
216
  ggml-threading.cpp
216
217
  ggml-threading.h
217
218
  ggml-quants.c
218
- ggml-quants.h)
219
+ ggml-quants.h
220
+ gguf.cpp)
219
221
 
220
222
  target_include_directories(ggml-base PRIVATE .)
221
223
 
@@ -234,6 +236,7 @@ function(ggml_add_backend_library backend)
234
236
  # write the shared library to the output directory
235
237
  set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
236
238
  target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
239
+ add_dependencies(ggml ${backend})
237
240
  else()
238
241
  add_library(${backend} ${ARGN})
239
242
  target_link_libraries(ggml PUBLIC ${backend})
@@ -289,9 +292,9 @@ if (GGML_CPU_ALL_VARIANTS)
289
292
  ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
290
293
  ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
291
294
  ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
295
+ ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
292
296
  if (NOT MSVC)
293
- # MSVC doesn't support AVX-VNNI or AMX
294
- ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
297
+ # MSVC doesn't support AMX
295
298
  ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
296
299
  endif()
297
300
  else ()
@@ -37,6 +37,7 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
37
37
  return true;
38
38
  }
39
39
 
40
+ // ops that return true for this function must not use restrict pointers for their backend implementations
40
41
  static bool ggml_op_can_inplace(enum ggml_op op) {
41
42
  switch (op) {
42
43
  case GGML_OP_SCALE:
@@ -52,8 +53,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
52
53
  case GGML_OP_LOG:
53
54
  case GGML_OP_UNARY:
54
55
  case GGML_OP_ROPE:
56
+ case GGML_OP_ROPE_BACK:
57
+ case GGML_OP_SILU_BACK:
55
58
  case GGML_OP_RMS_NORM:
59
+ case GGML_OP_RMS_NORM_BACK:
56
60
  case GGML_OP_SOFT_MAX:
61
+ case GGML_OP_SOFT_MAX_BACK:
57
62
  return true;
58
63
 
59
64
  default:
@@ -208,7 +208,6 @@ extern "C" {
208
208
 
209
209
  // Internal backend registry API
210
210
  GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
211
- GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
212
211
 
213
212
  // Add backend dynamic loading support to the backend
214
213
 
@@ -66,6 +66,26 @@
66
66
  #include "ggml-kompute.h"
67
67
  #endif
68
68
 
69
+ // disable C++17 deprecation warning for std::codecvt_utf8
70
+ #if defined(__clang__)
71
+ # pragma clang diagnostic push
72
+ # pragma clang diagnostic ignored "-Wdeprecated-declarations"
73
+ #endif
74
+
75
+ static std::wstring utf8_to_utf16(const std::string & str) {
76
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
77
+ return converter.from_bytes(str);
78
+ }
79
+
80
+ static std::string utf16_to_utf8(const std::wstring & str) {
81
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
82
+ return converter.to_bytes(str);
83
+ }
84
+
85
+ #if defined(__clang__)
86
+ # pragma clang diagnostic pop
87
+ #endif
88
+
69
89
  #ifdef _WIN32
70
90
 
71
91
  using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
88
108
  return handle;
89
109
  }
90
110
 
91
- static dl_handle * dl_load_library(const std::string & path) {
92
- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
93
- return dl_load_library(converter.from_bytes(path));
94
- }
95
-
96
111
  static void * dl_get_sym(dl_handle * handle, const char * name) {
97
112
  DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
98
113
  SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -114,8 +129,8 @@ struct dl_handle_deleter {
114
129
  }
115
130
  };
116
131
 
117
- static void * dl_load_library(const std::string & path) {
118
- dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
132
+ static void * dl_load_library(const std::wstring & path) {
133
+ dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
119
134
 
120
135
  return handle;
121
136
  }
@@ -202,11 +217,11 @@ struct ggml_backend_registry {
202
217
  devices.push_back(device);
203
218
  }
204
219
 
205
- ggml_backend_reg_t load_backend(const char * path, bool silent) {
220
+ ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
206
221
  dl_handle_ptr handle { dl_load_library(path) };
207
222
  if (!handle) {
208
223
  if (!silent) {
209
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
224
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
210
225
  }
211
226
  return nullptr;
212
227
  }
@@ -214,7 +229,7 @@ struct ggml_backend_registry {
214
229
  auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
215
230
  if (score_fn && score_fn() == 0) {
216
231
  if (!silent) {
217
- GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
232
+ GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
218
233
  }
219
234
  return nullptr;
220
235
  }
@@ -222,7 +237,7 @@ struct ggml_backend_registry {
222
237
  auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
223
238
  if (!backend_init_fn) {
224
239
  if (!silent) {
225
- GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
240
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
226
241
  }
227
242
  return nullptr;
228
243
  }
@@ -231,16 +246,16 @@ struct ggml_backend_registry {
231
246
  if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
232
247
  if (!silent) {
233
248
  if (!reg) {
234
- GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
249
+ GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
235
250
  } else {
236
251
  GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
237
- __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
252
+ __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
238
253
  }
239
254
  }
240
255
  return nullptr;
241
256
  }
242
257
 
243
- GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
258
+ GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
244
259
 
245
260
  register_backend(reg, std::move(handle));
246
261
 
@@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
376
391
 
377
392
  // Dynamic loading
378
393
  ggml_backend_reg_t ggml_backend_load(const char * path) {
379
- return get_reg().load_backend(path, false);
394
+ return get_reg().load_backend(utf8_to_utf16(path), false);
380
395
  }
381
396
 
382
397
  void ggml_backend_unload(ggml_backend_reg_t reg) {
383
398
  get_reg().unload_backend(reg, true);
384
399
  }
385
400
 
386
- static std::string get_executable_path() {
401
+ static std::wstring get_executable_path() {
387
402
  #if defined(__APPLE__)
388
403
  // get executable path
389
404
  std::vector<char> path;
@@ -401,13 +416,17 @@ static std::string get_executable_path() {
401
416
  if (last_slash != std::string::npos) {
402
417
  base_path = base_path.substr(0, last_slash);
403
418
  }
404
- return base_path + "/";
405
- #elif defined(__linux__)
419
+ return utf8_to_utf16(base_path + "/");
420
+ #elif defined(__linux__) || defined(__FreeBSD__)
406
421
  std::string base_path = ".";
407
422
  std::vector<char> path(1024);
408
423
  while (true) {
409
424
  // get executable path
425
+ # if defined(__linux__)
410
426
  ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
427
+ # elif defined(__FreeBSD__)
428
+ ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
429
+ # endif
411
430
  if (len == -1) {
412
431
  break;
413
432
  }
@@ -423,57 +442,63 @@ static std::string get_executable_path() {
423
442
  path.resize(path.size() * 2);
424
443
  }
425
444
 
426
- return base_path + "/";
445
+ return utf8_to_utf16(base_path + "/");
427
446
  #elif defined(_WIN32)
428
- std::vector<char> path(MAX_PATH);
429
- DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
447
+ std::vector<wchar_t> path(MAX_PATH);
448
+ DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
430
449
  if (len == 0) {
431
- return "";
450
+ return {};
432
451
  }
433
- std::string base_path(path.data(), len);
452
+ std::wstring base_path(path.data(), len);
434
453
  // remove executable name
435
454
  auto last_slash = base_path.find_last_of('\\');
436
455
  if (last_slash != std::string::npos) {
437
456
  base_path = base_path.substr(0, last_slash);
438
457
  }
439
- return base_path + "\\";
458
+ return base_path + L"\\";
459
+ #else
460
+ return {};
440
461
  #endif
441
462
  }
442
463
 
443
- static std::string backend_filename_prefix() {
464
+ static std::wstring backend_filename_prefix() {
444
465
  #ifdef _WIN32
445
- return "ggml-";
466
+ return L"ggml-";
446
467
  #else
447
- return "libggml-";
468
+ return L"libggml-";
448
469
  #endif
449
470
  }
450
471
 
451
- static std::string backend_filename_suffix() {
472
+ static std::wstring backend_filename_suffix() {
452
473
  #ifdef _WIN32
453
- return ".dll";
474
+ return L".dll";
454
475
  #else
455
- return ".so";
476
+ return L".so";
477
+ #endif
478
+ }
479
+
480
+ static std::wstring path_separator() {
481
+ #ifdef _WIN32
482
+ return L"\\";
483
+ #else
484
+ return L"/";
456
485
  #endif
457
486
  }
458
487
 
459
488
  static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
460
489
  // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
461
490
  // TODO: search system paths
462
- std::string file_prefix = backend_filename_prefix() + name + "-";
463
- std::vector<std::string> search_paths;
491
+ std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
492
+ std::vector<std::wstring> search_paths;
464
493
  if (user_search_path == nullptr) {
465
- search_paths.push_back("./");
494
+ search_paths.push_back(L"." + path_separator());
466
495
  search_paths.push_back(get_executable_path());
467
496
  } else {
468
- #if defined(_WIN32)
469
- search_paths.push_back(std::string(user_search_path) + "\\");
470
- #else
471
- search_paths.push_back(std::string(user_search_path) + "/");
472
- #endif
497
+ search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
473
498
  }
474
499
 
475
500
  int best_score = 0;
476
- std::string best_path;
501
+ std::wstring best_path;
477
502
 
478
503
  namespace fs = std::filesystem;
479
504
  for (const auto & search_path : search_paths) {
@@ -483,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
483
508
  fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
484
509
  for (const auto & entry : dir_it) {
485
510
  if (entry.is_regular_file()) {
486
- std::string filename = entry.path().filename().string();
487
- std::string ext = entry.path().extension().string();
511
+ std::wstring filename = entry.path().filename().wstring();
512
+ std::wstring ext = entry.path().extension().wstring();
488
513
  if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
489
- dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
514
+ dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
490
515
  if (!handle && !silent) {
491
- GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
516
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
492
517
  }
493
518
  if (handle) {
494
519
  auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
495
520
  if (score_fn) {
496
521
  int s = score_fn();
497
522
  #ifndef NDEBUG
498
- GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
523
+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
499
524
  #endif
500
525
  if (s > best_score) {
501
526
  best_score = s;
502
- best_path = entry.path().string();
527
+ best_path = entry.path().wstring();
503
528
  }
504
529
  } else {
505
530
  if (!silent) {
506
- GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
531
+ GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
507
532
  }
508
533
  }
509
534
  }
@@ -515,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
515
540
  if (best_score == 0) {
516
541
  // try to load the base backend
517
542
  for (const auto & search_path : search_paths) {
518
- std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
543
+ std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
519
544
  if (fs::exists(path)) {
520
- return get_reg().load_backend(path.c_str(), silent);
545
+ return get_reg().load_backend(path, silent);
521
546
  }
522
547
  }
523
548
  return nullptr;
524
549
  }
525
550
 
526
- return get_reg().load_backend(best_path.c_str(), silent);
551
+ return get_reg().load_backend(best_path, silent);
527
552
  }
528
553
 
529
554
  void ggml_backend_load_all() {
@@ -549,4 +574,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
549
574
  ggml_backend_load_best("opencl", silent, dir_path);
550
575
  ggml_backend_load_best("musa", silent, dir_path);
551
576
  ggml_backend_load_best("cpu", silent, dir_path);
577
+ // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
578
+ const char * backend_path = std::getenv("GGML_BACKEND_PATH");
579
+ if (backend_path) {
580
+ ggml_backend_load(backend_path);
581
+ }
552
582
  }
@@ -764,7 +764,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
764
764
  if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
765
765
  int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
766
766
  // check if a backend with higher prio wants to offload the op
767
- if (src_backend_id == sched->n_backends - 1) {
767
+ if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
768
768
  for (int b = 0; b < src_backend_id; b++) {
769
769
  if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
770
770
  SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
795
795
  for (int i = 0; i < graph->n_nodes; i++) {
796
796
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
797
797
  ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
798
- GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
798
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
799
799
  sched->splits[cur_split].n_inputs);
800
800
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
801
+ if (j == 0) {
802
+ GGML_LOG_DEBUG(": ");
803
+ }
801
804
  GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
802
805
  fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
803
806
  }