@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama.cpp

@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {
 
 bool llama_supports_gpu_offload(void) {
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
            llama_supports_rpc();
 }
 
@@ -83,7 +84,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }
 
@@ -182,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
             model->devices.push_back(*dev);
         }
     } else {
+        // default device selection
+
+        // build list of available devices
+        std::vector<ggml_backend_dev_t> gpus;
+        std::vector<ggml_backend_dev_t> igpus;
         std::vector<ggml_backend_dev_t> rpc_servers;
-        // use all available devices
+
         for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
             ggml_backend_dev_t dev = ggml_backend_dev_get(i);
             switch (ggml_backend_dev_type(dev)) {
@@ -192,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
                     // skip CPU backends since they are handled separately
                     break;
 
-                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                case GGML_BACKEND_DEVICE_TYPE_GPU: {
                     ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                     if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                         rpc_servers.push_back(dev);
                     } else {
-                        model->devices.push_back(dev);
+                        // check if there is already a GPU with the same device id
+                        ggml_backend_dev_props props;
+                        ggml_backend_dev_get_props(dev, &props);
+                        auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
+                            ggml_backend_dev_props d_props;
+                            ggml_backend_dev_get_props(d, &d_props);
+                            if (props.device_id && d_props.device_id) {
+                                return strcmp(props.device_id, d_props.device_id) == 0;
+                            }
+                            return false;
+                        });
+
+                        if (it != gpus.end()) {
+                            LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
+                                    __func__,
+                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                                    props.device_id ? props.device_id : "unknown id",
+                                    ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
+                        } else {
+                            gpus.push_back(dev);
+                        }
                     }
                     break;
+                }
+
+                case GGML_BACKEND_DEVICE_TYPE_IGPU:
+                    igpus.push_back(dev);
+                    break;
             }
         }
-        // add RPC servers at the front of the list
-        if (!rpc_servers.empty()) {
-            model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add RPC servers at the front of the list to minimize network transfers
+        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
+
+        // add GPUs
+        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
+
+        // add integrated GPUs only if no other devices were found
+        if (model->devices.empty()) {
+            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
         }
     }
 
@@ -225,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
     }
 
     for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+        ggml_backend_dev_props props;
+        ggml_backend_dev_get_props(dev, &props);
+        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
+                       ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
+                       props.device_id ? props.device_id : "unknown id",
+                       props.memory_free/1024/1024);
     }
 
     const int status = llama_model_load(path_model, splits, *model, params);