@fugood/llama.node 0.0.1-alpha.4 → 0.1.0

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (75)
  1. package/CMakeLists.txt +36 -7
  2. package/README.md +9 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/lib/binding.js +1 -1
  14. package/lib/binding.ts +5 -2
  15. package/lib/index.ts +2 -2
  16. package/package.json +15 -3
  17. package/src/LlamaCompletionWorker.cpp +5 -1
  18. package/src/LlamaCompletionWorker.h +4 -0
  19. package/src/LlamaContext.cpp +18 -1
  20. package/src/common.hpp +11 -7
  21. package/src/llama.cpp/CMakeLists.txt +13 -7
  22. package/src/llama.cpp/common/common.cpp +221 -173
  23. package/src/llama.cpp/common/common.h +19 -8
  24. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  25. package/src/llama.cpp/common/log.h +2 -2
  26. package/src/llama.cpp/common/sampling.cpp +17 -1
  27. package/src/llama.cpp/common/sampling.h +28 -20
  28. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  29. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  30. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  31. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  32. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  33. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  34. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  36. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  37. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  38. package/src/llama.cpp/examples/main/main.cpp +10 -8
  39. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  40. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  42. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  43. package/src/llama.cpp/examples/server/server.cpp +97 -86
  44. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  45. package/src/llama.cpp/ggml-backend.c +7 -5
  46. package/src/llama.cpp/ggml-impl.h +339 -4
  47. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  48. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  49. package/src/llama.cpp/ggml-quants.c +302 -293
  50. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  51. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  52. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  53. package/src/llama.cpp/ggml.c +1469 -116
  54. package/src/llama.cpp/ggml.h +37 -7
  55. package/src/llama.cpp/llama.cpp +969 -432
  56. package/src/llama.cpp/llama.h +46 -14
  57. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  58. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  59. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  60. package/src/llama.cpp/requirements.txt +1 -0
  61. package/src/llama.cpp/sgemm.cpp +134 -103
  62. package/src/llama.cpp/sgemm.h +4 -2
  63. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  64. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  65. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  66. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  67. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  68. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  69. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  70. package/src/llama.cpp/unicode-data.cpp +1188 -656
  71. package/src/llama.cpp/unicode-data.h +4 -3
  72. package/src/llama.cpp/unicode.cpp +590 -49
  73. package/src/llama.cpp/unicode.h +6 -3
  74. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  75. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/ggml-impl.h
@@ -11,6 +11,89 @@
 #include <string.h> // memcpy
 #include <math.h> // fabsf
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
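
Editor's note: the two helpers above treat bf16 as the top 16 bits of an IEEE binary32 value. Widening is a pure shift; narrowing rounds to nearest-even, forces NaNs quiet, and flushes subnormals to signed zero. A minimal standalone sketch of the same scheme (local stand-in names, not the ggml API) shows the round-trip:

// bf16_roundtrip.c -- illustrative sketch of the conversion scheme added above;
// the names below are local stand-ins, not part of @fugood/llama.node or ggml.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint16_t bits; } bf16_t; // stand-in for ggml_bf16_t

static float bf16_to_fp32(bf16_t h) {
    uint32_t i = (uint32_t)h.bits << 16; // bf16 is the high half of binary32
    float f;
    memcpy(&f, &i, sizeof(f));
    return f;
}

static bf16_t fp32_to_bf16(float s) {
    uint32_t i;
    memcpy(&i, &s, sizeof(i));
    bf16_t h;
    if ((i & 0x7fffffff) > 0x7f800000) {          // NaN: keep it, but force quiet
        h.bits = (uint16_t)((i >> 16) | 64);
    } else if (!(i & 0x7f800000)) {               // subnormal: flush to signed zero
        h.bits = (uint16_t)((i & 0x80000000) >> 16);
    } else {                                      // round to nearest, ties to even
        h.bits = (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16);
    }
    return h;
}

int main(void) {
    const float samples[] = { 1.0f, 3.14159265f, 1e-40f, -0.0f };
    for (int n = 0; n < (int)(sizeof(samples) / sizeof(samples[0])); ++n) {
        bf16_t h = fp32_to_bf16(samples[n]);
        printf("%-12g -> 0x%04x -> %g\n", samples[n], (unsigned)h.bits, bf16_to_fp32(h));
    }
    return 0;
}

Compiled with any C99 compiler, this shows 3.14159265 narrowing to 3.140625 and the subnormal 1e-40 flushing to zero, which is the precision trade the new GGML_FP32_TO_BF16 / GGML_BF16_TO_FP32 macros make.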
@@ -45,7 +128,7 @@ extern "C" {
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
+#if defined(__ARM_NEON)
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
@@ -53,8 +136,262 @@ extern "C" {
 //
 #include <arm_neon.h>
 
+#ifdef _MSC_VER
+
+typedef uint16_t ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
 typedef __fp16 ggml_fp16_internal_t;
 
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+    return vcombine_s32(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8
+#define ggml_vqtbl1q_u8   vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
@@ -75,8 +412,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 
 #else
 
-typedef uint16_t ggml_fp16_internal_t;
-
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
@@ -221,7 +556,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 
 #endif // __F16C__
 
-#endif // __ARM_NEON
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
 
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
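
Editor's note: when __ARM_FEATURE_DOTPROD is unavailable, the ggml_vdotq_s32 fallback added above widens the int8 lanes with vmull_s8 and pairwise-accumulates them. Its per-lane grouping differs from the native vdotq_s32 instruction, but once the four accumulator lanes are horizontally summed (as the quantized dot-product kernels ultimately do), both reduce to the same plain 16-element signed dot product. A scalar restatement of that reduction, for reference only and not part of the package:

// dot_s8_ref.c -- scalar reference for what ggml_vdotq_s32 computes once its
// four lanes are summed (e.g. with vaddvq_s32); illustrative only.
#include <stdint.h>
#include <stdio.h>

static int32_t dot_s8x16(const int8_t a[16], const int8_t b[16], int32_t acc) {
    for (int k = 0; k < 16; ++k) {
        acc += (int32_t)a[k] * (int32_t)b[k]; // widen to 32-bit before accumulating
    }
    return acc;
}

int main(void) {
    int8_t a[16], b[16];
    for (int k = 0; k < 16; ++k) {
        a[k] = (int8_t)(k - 8);
        b[k] = (int8_t)(3 * k - 20);
    }
    printf("dot = %d\n", dot_s8x16(a, b, 0));
    return 0;
}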
package/src/llama.cpp/ggml-kompute.cpp
@@ -1427,6 +1427,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
         for (int i = node_start; i < node_end; ++i) {
             struct ggml_tensor * src0 = gf->nodes[i]->src[0];
             struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
             struct ggml_tensor * dst = gf->nodes[i];
             GGML_ASSERT(dst->data != nullptr);
 
@@ -1559,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 {
                     float scale;
                     memcpy(&scale, dst->op_params, sizeof(float));
+
+#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
+                    GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
+                    GGML_ASSERT(src2 == nullptr);
+
                     ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                 } break;
             case GGML_OP_DIAG_MASK_INF:
package/src/llama.cpp/ggml-opencl.cpp
@@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
     if (alignment == (cl_uint)-1) {
         ggml_cl_init();
         clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+        alignment /= 8; // bits to bytes
     }
     return alignment;
 }
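
Editor's note: the one-line change above exists because CL_DEVICE_MEM_BASE_ADDR_ALIGN is defined by the OpenCL spec in bits, while the backend's alignment hook is expected to return bytes. A small standalone query (illustrative only, not part of the package; include <OpenCL/opencl.h> instead on macOS) makes the unit conversion explicit:

// cl_align.c -- query the base-address alignment and convert bits to bytes,
// mirroring the fix above; illustrative only.
#include <stdio.h>
#include <CL/cl.h>

int main(void) {
    cl_platform_id platform;
    cl_device_id   device;
    if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL) != CL_SUCCESS) {
        fprintf(stderr, "no OpenCL device available\n");
        return 1;
    }
    cl_uint align_bits = 0;
    clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align_bits), &align_bits, NULL);
    printf("base address alignment: %u bits = %u bytes\n", align_bits, align_bits / 8);
    return 0;
}

A device reporting 1024 bits, for example, yields the 128-byte alignment the buffer-type hook is meant to return; without the division the backend would report an alignment eight times larger than intended.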