@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
|
@@ -1,143 +0,0 @@
|
|
|
1
|
-
#include <sycl/sycl.hpp>
|
|
2
|
-
#include "wkv6.hpp"
|
|
3
|
-
|
|
4
|
-
// Head size the WKV6 kernel in this file is written for: it sizes the kernel's
// per-work-item state array, and the launcher asserts that C / H equals it.
constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE
|
|
5
|
-
|
|
6
|
-
// Helper function for the main kernel
|
|
7
|
-
static void rwkv_wkv_f32_kernel(
|
|
8
|
-
const int B, const int T, const int C, const int H,
|
|
9
|
-
const float* k, const float* v, const float* r,
|
|
10
|
-
const float* tf, const float* td, const float* s,
|
|
11
|
-
float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) {
|
|
12
|
-
|
|
13
|
-
const int tid = item_ct1.get_local_id(2);
|
|
14
|
-
const int bid = item_ct1.get_group(2);
|
|
15
|
-
|
|
16
|
-
const int head_size = WKV_BLOCK_SIZE;
|
|
17
|
-
const int batch_i = bid / H;
|
|
18
|
-
const int head_i = bid % H;
|
|
19
|
-
const int state_size = C * head_size;
|
|
20
|
-
const int n_seq_tokens = T / B;
|
|
21
|
-
|
|
22
|
-
// Set up shared memory pointers
|
|
23
|
-
float* _k = shared_mem;
|
|
24
|
-
float* _r = _k + head_size;
|
|
25
|
-
float* _tf = _r + head_size;
|
|
26
|
-
float* _td = _tf + head_size;
|
|
27
|
-
|
|
28
|
-
// Local state array
|
|
29
|
-
float state[WKV_BLOCK_SIZE];
|
|
30
|
-
|
|
31
|
-
// Load initial state
|
|
32
|
-
#pragma unroll
|
|
33
|
-
for (int i = 0; i < head_size; i++) {
|
|
34
|
-
state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
// Sync threads before shared memory operations
|
|
38
|
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
39
|
-
|
|
40
|
-
// Load time-mixing parameters
|
|
41
|
-
_tf[tid] = tf[head_i * head_size + tid];
|
|
42
|
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
43
|
-
|
|
44
|
-
// Main sequence processing loop
|
|
45
|
-
for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
|
|
46
|
-
t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid;
|
|
47
|
-
t += C) {
|
|
48
|
-
|
|
49
|
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
50
|
-
|
|
51
|
-
// Load current timestep data to shared memory
|
|
52
|
-
_k[tid] = k[t];
|
|
53
|
-
_r[tid] = r[t];
|
|
54
|
-
_td[tid] = td[t];
|
|
55
|
-
|
|
56
|
-
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
57
|
-
|
|
58
|
-
const float _v = v[t];
|
|
59
|
-
float y = 0;
|
|
60
|
-
|
|
61
|
-
// Process in chunks of 4 for better vectorization
|
|
62
|
-
sycl::float4 k4, r4, tf4, td4, s4;
|
|
63
|
-
#pragma unroll
|
|
64
|
-
for (int j = 0; j < head_size; j += 4) {
|
|
65
|
-
// Load data in vec4 chunks
|
|
66
|
-
k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
|
|
67
|
-
r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
|
|
68
|
-
tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
|
|
69
|
-
td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
|
|
70
|
-
s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]);
|
|
71
|
-
|
|
72
|
-
// Compute key-value product
|
|
73
|
-
sycl::float4 kv4 = k4 * _v;
|
|
74
|
-
|
|
75
|
-
// Accumulate weighted sum
|
|
76
|
-
y += sycl::dot(r4, tf4 * kv4 + s4);
|
|
77
|
-
|
|
78
|
-
// Update state
|
|
79
|
-
s4 = s4 * td4 + kv4;
|
|
80
|
-
|
|
81
|
-
// Store updated state
|
|
82
|
-
state[j] = s4.x();
|
|
83
|
-
state[j+1] = s4.y();
|
|
84
|
-
state[j+2] = s4.z();
|
|
85
|
-
state[j+3] = s4.w();
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
dst[t] = y;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
// Save final state
|
|
92
|
-
#pragma unroll
|
|
93
|
-
for (int i = 0; i < head_size; i++) {
|
|
94
|
-
dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
// Launches the RWKV6 WKV kernel for `dst` on the SYCL stream of `ctx`.
//
// The six operands are taken from dst->src[0..5] and passed to the kernel as
// float buffers in the order k, v, r, tf, td, s; the result is written to
// dst->data. Only src[5] is type-checked here (must be F32).
//
// Launch shape: one work-group per (batch, head) pair, B * H groups in total,
// each with C / H work-items. The kernel is written for a head size of
// WKV_BLOCK_SIZE, which is asserted below.
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

    const float* k_d  = static_cast<const float*>(dst->src[0]->data);
    const float* v_d  = static_cast<const float*>(dst->src[1]->data);
    const float* r_d  = static_cast<const float*>(dst->src[2]->data);
    const float* tf_d = static_cast<const float*>(dst->src[3]->data);
    const float* td_d = static_cast<const float*>(dst->src[4]->data);
    const float* s_d  = static_cast<const float*>(dst->src[5]->data);
    float* dst_d      = static_cast<float*>(dst->data);

    const int64_t B = dst->src[5]->ne[1];
    const int64_t T = dst->src[0]->ne[2];
    const int64_t C = dst->ne[0];
    const int64_t H = dst->src[0]->ne[1];

    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
    GGML_ASSERT(C % H == 0);
    GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64

    dpct::queue_ptr stream = ctx.stream();

    // Local scratch for the k, r, tf and td slices. local_accessor is sized in
    // ELEMENTS, not bytes, so this is a float count (a byte count here would
    // reserve sizeof(float) times more local memory than the kernel touches).
    constexpr size_t shared_mem_floats = static_cast<size_t>(WKV_BLOCK_SIZE) * 4;

    const sycl::range<3> block_dims(1, 1, C / H);
    const sycl::range<3> grid_dims(1, 1, B * H);

    stream->submit([&](sycl::handler& cgh) {
        sycl::local_accessor<float, 1> shared_mem_acc(sycl::range<1>(shared_mem_floats), cgh);

        cgh.parallel_for(
            sycl::nd_range<3>(grid_dims * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) {
                // The kernel takes its sizes as int; the narrowing is made explicit here.
                rwkv_wkv_f32_kernel(
                    static_cast<int>(B), static_cast<int>(T),
                    static_cast<int>(C), static_cast<int>(H),
                    k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
                    item_ct1,
                    static_cast<float*>(shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get())
                );
            });
    });
}
|
|
File without changes
|