@fugood/llama.node 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +60 -7
- package/src/llama.cpp/common/chat.cpp +6 -6
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +14 -5
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/include/llama.h +8 -4
- package/src/llama.cpp/src/llama-arch.cpp +40 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +20 -1
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +11 -2
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -139
- package/src/llama.cpp/src/llama-graph.h +31 -32
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +400 -21
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
--- a/package/src/llama.cpp/common/speculative.h
+++ b/package/src/llama.cpp/common/speculative.h
@@ -12,7 +12,10 @@ struct common_speculative_params {
     float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
-struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_tgt,
+        struct llama_context * ctx_dft
+);
 
 void common_speculative_free(struct common_speculative * spec);
 
@@ -20,6 +23,10 @@ bool common_speculative_are_compatible(
         const struct llama_context * ctx_tgt,
         const struct llama_context * ctx_dft);
 
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec,
+        const char *source, const char *dest);
+
 // sample up to n_draft tokens and add them to the batch using the draft model
 llama_tokens common_speculative_gen_draft(
         struct common_speculative * spec,
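This is a breaking header change for consumers of the vendored common library: common_speculative_init now takes the target context alongside the draft context, and the new common_speculative_add_replacement_tgt_dft hook registers string replacements so a draft model with a different vocabulary can still be used. Below is a minimal caller-side sketch against the 1.1.5 headers; setup_spec, the replacement string pair, and the include path are illustrative assumptions, and both contexts are assumed to have been created elsewhere.

    #include "speculative.h"  // path depends on how the vendored common/ dir is included

    // Sketch only: ctx_tgt and ctx_dft are llama_context handles created
    // elsewhere (one for the target model, one for the draft model).
    common_speculative * setup_spec(llama_context * ctx_tgt, llama_context * ctx_dft) {
        // 1.1.5 signature: the target context is passed alongside the draft one.
        common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);

        // Optional: map target-vocab strings to draft-vocab strings when the
        // two models tokenize differently. This pair is a made-up example.
        common_speculative_add_replacement_tgt_dft(spec, "<|assistant|>", "<|bot|>");

        return spec;
    }

Code built against 1.1.4, where the initializer took only the draft context, needs the extra argument once these headers are picked up; the replacement call is optional and can be skipped when the two vocabularies match.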
--- a/package/src/llama.cpp/ggml/CMakeLists.txt
+++ b/package/src/llama.cpp/ggml/CMakeLists.txt
@@ -174,6 +174,7 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
 option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)