cui-llama.rn 1.4.6 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +52 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1779
  14. package/cpp/chat.h +9 -1
  15. package/cpp/common.cpp +20 -522
  16. package/cpp/common.h +13 -36
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-common.h +12 -6
  19. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  20. package/cpp/ggml-cpu-impl.h +2 -21
  21. package/cpp/ggml-cpu-quants.c +904 -405
  22. package/cpp/ggml-cpu.c +909 -13237
  23. package/cpp/ggml-impl.h +50 -23
  24. package/cpp/ggml-metal-impl.h +77 -3
  25. package/cpp/ggml-metal.m +794 -580
  26. package/cpp/ggml.c +92 -3
  27. package/cpp/ggml.h +29 -5
  28. package/cpp/gguf.cpp +1 -0
  29. package/cpp/llama-adapter.cpp +55 -20
  30. package/cpp/llama-adapter.h +11 -9
  31. package/cpp/llama-arch.cpp +217 -16
  32. package/cpp/llama-arch.h +25 -0
  33. package/cpp/llama-batch.h +2 -2
  34. package/cpp/llama-chat.cpp +54 -2
  35. package/cpp/llama-chat.h +3 -0
  36. package/cpp/llama-context.cpp +2294 -1238
  37. package/cpp/llama-context.h +214 -77
  38. package/cpp/llama-cparams.h +1 -0
  39. package/cpp/llama-graph.cpp +1695 -0
  40. package/cpp/llama-graph.h +592 -0
  41. package/cpp/llama-hparams.cpp +8 -0
  42. package/cpp/llama-hparams.h +17 -0
  43. package/cpp/llama-io.cpp +15 -0
  44. package/cpp/llama-io.h +35 -0
  45. package/cpp/llama-kv-cache.cpp +965 -303
  46. package/cpp/llama-kv-cache.h +145 -151
  47. package/cpp/llama-memory.cpp +1 -0
  48. package/cpp/llama-memory.h +21 -0
  49. package/cpp/llama-mmap.cpp +1 -1
  50. package/cpp/llama-model-loader.cpp +10 -5
  51. package/cpp/llama-model-loader.h +5 -3
  52. package/cpp/llama-model.cpp +9194 -201
  53. package/cpp/llama-model.h +40 -1
  54. package/cpp/llama-sampling.cpp +5 -0
  55. package/cpp/llama-vocab.cpp +36 -5
  56. package/cpp/llama.cpp +51 -9984
  57. package/cpp/llama.h +102 -22
  58. package/cpp/log.cpp +34 -0
  59. package/cpp/minja/chat-template.hpp +15 -7
  60. package/cpp/minja/minja.hpp +120 -94
  61. package/cpp/ops.cpp +8723 -0
  62. package/cpp/ops.h +128 -0
  63. package/cpp/rn-llama.cpp +44 -53
  64. package/cpp/rn-llama.h +2 -12
  65. package/cpp/sampling.cpp +3 -0
  66. package/cpp/sgemm.cpp +533 -88
  67. package/cpp/simd-mappings.h +888 -0
  68. package/cpp/speculative.cpp +4 -4
  69. package/cpp/unary-ops.cpp +186 -0
  70. package/cpp/unary-ops.h +28 -0
  71. package/cpp/vec.cpp +258 -0
  72. package/cpp/vec.h +802 -0
  73. package/ios/CMakeLists.txt +5 -2
  74. package/ios/RNLlama.mm +2 -2
  75. package/ios/RNLlamaContext.mm +40 -24
  76. package/package.json +1 -1
  77. package/src/NativeRNLlama.ts +6 -4
  78. package/src/index.ts +3 -1
  79. package/cpp/chat-template.hpp +0 -529
  80. package/cpp/minja.hpp +0 -2915
package/cpp/ggml-impl.h CHANGED
@@ -16,14 +16,6 @@
16
16
  #include <arm_sve.h>
17
17
  #endif // __ARM_FEATURE_SVE
18
18
 
19
- #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
20
- // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
21
- //
22
- // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
23
- //
24
- #include <arm_neon.h>
25
- #endif
26
-
27
19
  #if defined(__F16C__)
28
20
  #include <immintrin.h>
29
21
  #endif
@@ -311,29 +303,35 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
311
303
 
312
304
  // FP16 to FP32 conversion
313
305
 
314
- #if defined(__ARM_NEON)
315
- #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
316
- typedef uint16_t lm_ggml_fp16_internal_t;
317
- #else
318
- typedef __fp16 lm_ggml_fp16_internal_t;
319
- #endif
320
- #endif
306
+ // 16-bit float
307
+ // on Arm, we use __fp16
308
+ // on x86, we use uint16_t
309
+ //
310
+ // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
311
+ // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
312
+ //
313
+ #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
314
+
315
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
316
+ //
317
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
318
+ //
319
+ #include <arm_neon.h>
321
320
 
322
- #if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
323
321
  #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
324
322
  #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
325
323
 
326
324
  #define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
327
325
 
328
326
  static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
329
- lm_ggml_fp16_internal_t tmp;
327
+ __fp16 tmp;
330
328
  memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
331
329
  return (float)tmp;
332
330
  }
333
331
 
334
332
  static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
335
333
  lm_ggml_fp16_t res;
336
- lm_ggml_fp16_internal_t tmp = f;
334
+ __fp16 tmp = f;
337
335
  memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
338
336
  return res;
339
337
  }
@@ -357,8 +355,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
357
355
  #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
358
356
 
359
357
  static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
360
- register float f;
361
- register double d;
358
+ float f;
359
+ double d;
362
360
  __asm__(
363
361
  "mtfprd %0,%2\n"
364
362
  "xscvhpdp %0,%0\n"
@@ -370,8 +368,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
370
368
  }
371
369
 
372
370
  static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
373
- register double d;
374
- register lm_ggml_fp16_t r;
371
+ double d;
372
+ lm_ggml_fp16_t r;
375
373
  __asm__( /* xscvdphp can work on double or single precision */
376
374
  "xscvdphp %0,%2\n"
377
375
  "mffprd %1,%0\n" :
@@ -381,6 +379,35 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
381
379
  return r;
382
380
  }
383
381
 
382
+ #elif defined(__riscv) && defined(LM_GGML_RV_ZFH)
383
+
384
+ static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
385
+ float f;
386
+ __asm__(
387
+ "fmv.h.x %[f], %[h]\n\t"
388
+ "fcvt.s.h %[f], %[f]"
389
+ : [f] "=&f" (f)
390
+ : [h] "r" (h)
391
+ );
392
+ return f;
393
+ }
394
+
395
+ static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
396
+ lm_ggml_fp16_t res;
397
+ __asm__(
398
+ "fcvt.h.s %[f], %[f]\n\t"
399
+ "fmv.x.h %[h], %[f]"
400
+ : [h] "=&r" (res)
401
+ : [f] "f" (f)
402
+ );
403
+ return res;
404
+ }
405
+
406
+ #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
407
+ #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
408
+ #define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
409
+ #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
410
+
384
411
  #else
385
412
 
386
413
  // FP16 <-> FP32
@@ -456,7 +483,7 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
456
483
  #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
457
484
  #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
458
485
 
459
- #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
486
+ #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
460
487
 
461
488
  // precomputed f32 table for f16 (256 KB)
462
489
  // defined in ggml.c, initialized in lm_ggml_init()
package/cpp/ggml-metal-impl.h CHANGED
@@ -1,6 +1,70 @@
1
1
  #ifndef GGML_METAL_IMPL
2
2
  #define GGML_METAL_IMPL
3
3
 
4
+ // kernel parameters for mat-vec threadgroups
5
+ //
6
+ // N_R0: number of src0 rows to process per simdgroup
7
+ // N_SG: number of simdgroups per threadgroup
8
+ //
9
+ // TODO: for optimal performance, become function of the device and work size
10
+
11
+ #define N_R0_Q4_0 4
12
+ #define N_SG_Q4_0 2
13
+
14
+ #define N_R0_Q4_1 4
15
+ #define N_SG_Q4_1 2
16
+
17
+ #define N_R0_Q5_0 4
18
+ #define N_SG_Q5_0 2
19
+
20
+ #define N_R0_Q5_1 4
21
+ #define N_SG_Q5_1 2
22
+
23
+ #define N_R0_Q8_0 4
24
+ #define N_SG_Q8_0 2
25
+
26
+ #define N_R0_Q2_K 4
27
+ #define N_SG_Q2_K 2
28
+
29
+ #define N_R0_Q3_K 2
30
+ #define N_SG_Q3_K 2
31
+
32
+ #define N_R0_Q4_K 4
33
+ #define N_SG_Q4_K 2
34
+
35
+ #define N_R0_Q5_K 2
36
+ #define N_SG_Q5_K 2
37
+
38
+ #define N_R0_Q6_K 1
39
+ #define N_SG_Q6_K 2
40
+
41
+ #define N_R0_IQ1_S 4
42
+ #define N_SG_IQ1_S 2
43
+
44
+ #define N_R0_IQ1_M 4
45
+ #define N_SG_IQ1_M 2
46
+
47
+ #define N_R0_IQ2_XXS 4
48
+ #define N_SG_IQ2_XXS 2
49
+
50
+ #define N_R0_IQ2_XS 4
51
+ #define N_SG_IQ2_XS 2
52
+
53
+ #define N_R0_IQ2_S 4
54
+ #define N_SG_IQ2_S 2
55
+
56
+ #define N_R0_IQ3_XXS 4
57
+ #define N_SG_IQ3_XXS 2
58
+
59
+ #define N_R0_IQ3_S 4
60
+ #define N_SG_IQ3_S 2
61
+
62
+ #define N_R0_IQ4_NL 2
63
+ #define N_SG_IQ4_NL 2
64
+
65
+ #define N_R0_IQ4_XS 2
66
+ #define N_SG_IQ4_XS 2
67
+
4
68
  // kernel argument structs
5
69
  //
6
70
  // - element counters (e.g. ne00) typically use int32_t to reduce register usage
@@ -155,9 +219,12 @@ typedef struct {
155
219
  int32_t ne11;
156
220
  int32_t ne_12_2; // assume K and V are same shape
157
221
  int32_t ne_12_3;
158
- uint64_t nb_12_1;
159
- uint64_t nb_12_2;
160
- uint64_t nb_12_3;
222
+ uint64_t nb11;
223
+ uint64_t nb12;
224
+ uint64_t nb13;
225
+ uint64_t nb21;
226
+ uint64_t nb22;
227
+ uint64_t nb23;
161
228
  uint64_t nb31;
162
229
  int32_t ne1;
163
230
  int32_t ne2;
@@ -285,6 +352,13 @@ typedef struct {
285
352
  float eps;
286
353
  } ggml_metal_kargs_rms_norm;
287
354
 
355
+ typedef struct {
356
+ int32_t ne00;
357
+ int32_t ne00_4;
358
+ uint64_t nb01;
359
+ float eps;
360
+ } ggml_metal_kargs_l2_norm;
361
+
288
362
  typedef struct {
289
363
  int64_t ne00;
290
364
  int64_t ne01;