cui-llama.rn 1.3.0 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76):
  1. package/android/src/main/CMakeLists.txt +6 -1
  2. package/android/src/main/jni.cpp +6 -6
  3. package/cpp/amx/amx.cpp +196 -0
  4. package/cpp/amx/amx.h +20 -0
  5. package/cpp/amx/common.h +101 -0
  6. package/cpp/amx/mmq.cpp +2524 -0
  7. package/cpp/amx/mmq.h +16 -0
  8. package/cpp/common.cpp +1981 -1682
  9. package/cpp/common.h +636 -600
  10. package/cpp/ggml-aarch64.c +129 -129
  11. package/cpp/ggml-aarch64.h +19 -19
  12. package/cpp/ggml-alloc.c +1038 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +238 -216
  15. package/cpp/ggml-backend-reg.cpp +423 -195
  16. package/cpp/ggml-backend.cpp +1999 -1997
  17. package/cpp/ggml-backend.h +351 -328
  18. package/cpp/ggml-common.h +1859 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3823 -3560
  21. package/cpp/ggml-cpu-aarch64.h +32 -30
  22. package/cpp/ggml-cpu-impl.h +386 -371
  23. package/cpp/ggml-cpu-quants.c +10835 -10822
  24. package/cpp/ggml-cpu-quants.h +63 -63
  25. package/cpp/ggml-cpu.c +99 -103
  26. package/cpp/ggml-cpu.cpp +69 -17
  27. package/cpp/ggml-cpu.h +152 -177
  28. package/cpp/ggml-impl.h +556 -550
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4426 -4294
  31. package/cpp/ggml-quants.c +5247 -5247
  32. package/cpp/ggml-quants.h +100 -100
  33. package/cpp/ggml-threading.cpp +12 -12
  34. package/cpp/ggml-threading.h +12 -12
  35. package/cpp/ggml.c +7618 -8180
  36. package/cpp/ggml.h +2255 -2411
  37. package/cpp/json-schema-to-grammar.cpp +1045 -0
  38. package/cpp/json-schema-to-grammar.h +8 -0
  39. package/cpp/json.hpp +24766 -0
  40. package/cpp/llama-grammar.cpp +1138 -1138
  41. package/cpp/llama-grammar.h +144 -144
  42. package/cpp/llama-impl.h +181 -181
  43. package/cpp/llama-sampling.cpp +2348 -2348
  44. package/cpp/llama-sampling.h +48 -48
  45. package/cpp/llama-vocab.cpp +1984 -1984
  46. package/cpp/llama-vocab.h +170 -170
  47. package/cpp/llama.cpp +22332 -22132
  48. package/cpp/llama.h +1259 -1253
  49. package/cpp/log.cpp +401 -401
  50. package/cpp/log.h +121 -121
  51. package/cpp/rn-llama.hpp +6 -6
  52. package/cpp/sampling.cpp +505 -466
  53. package/cpp/sampling.h +22 -1
  54. package/cpp/sgemm.cpp +1884 -1884
  55. package/cpp/speculative.cpp +270 -0
  56. package/cpp/speculative.h +28 -0
  57. package/cpp/unicode.cpp +11 -0
  58. package/ios/RNLlamaContext.mm +13 -0
  59. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  60. package/lib/commonjs/grammar.js +4 -2
  61. package/lib/commonjs/grammar.js.map +1 -1
  62. package/lib/commonjs/index.js.map +1 -1
  63. package/lib/module/NativeRNLlama.js.map +1 -1
  64. package/lib/module/grammar.js +2 -1
  65. package/lib/module/grammar.js.map +1 -1
  66. package/lib/module/index.js.map +1 -1
  67. package/lib/typescript/NativeRNLlama.d.ts +94 -4
  68. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  69. package/lib/typescript/grammar.d.ts +5 -6
  70. package/lib/typescript/grammar.d.ts.map +1 -1
  71. package/lib/typescript/index.d.ts +4 -2
  72. package/lib/typescript/index.d.ts.map +1 -1
  73. package/package.json +2 -1
  74. package/src/NativeRNLlama.ts +97 -10
  75. package/src/grammar.ts +10 -8
  76. package/src/index.ts +22 -1
@@ -1,129 +1,129 @@
1
- #define LM_GGML_COMMON_DECL_C
2
- #include "ggml-common.h"
3
-
4
- #include "ggml-aarch64.h"
5
- #include "ggml-impl.h"
6
- #include "ggml-quants.h"
7
- #include <assert.h>
8
-
9
- #define UNUSED LM_GGML_UNUSED
10
-
11
- static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
12
- block_q4_0x4 out;
13
-
14
- for (int i = 0; i < 4; i++) {
15
- out.d[i] = in[i].d;
16
- }
17
-
18
- const int end = QK4_0 * 2 / blck_size_interleave;
19
-
20
- if (blck_size_interleave == 8) {
21
- const uint64_t xor_mask = 0x8888888888888888ULL;
22
- for (int i = 0; i < end; ++i) {
23
- int src_id = i % 4;
24
- int src_offset = (i / 4) * blck_size_interleave;
25
- int dst_offset = i * blck_size_interleave;
26
-
27
- uint64_t elems;
28
- // Using memcpy to avoid unaligned memory accesses
29
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
30
- elems ^= xor_mask;
31
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
32
- }
33
- } else if (blck_size_interleave == 4) {
34
- const uint32_t xor_mask = 0x88888888;
35
- for (int i = 0; i < end; ++i) {
36
- int src_id = i % 4;
37
- int src_offset = (i / 4) * blck_size_interleave;
38
- int dst_offset = i * blck_size_interleave;
39
-
40
- uint32_t elems;
41
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
42
- elems ^= xor_mask;
43
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
44
- }
45
- } else {
46
- LM_GGML_ASSERT(false);
47
- }
48
-
49
- return out;
50
- }
51
-
52
- // interleave 8 block_q4_0s in blocks of blck_size_interleave
53
- // returns an interleaved block_q4_0x8
54
- // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
55
- // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
56
- static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
57
- block_q4_0x8 out;
58
-
59
- for (int i = 0; i < 8; i++) {
60
- out.d[i] = in[i].d;
61
- }
62
-
63
- const int end = QK4_0 * 4 / blck_size_interleave;
64
- const uint64_t xor_mask = 0x8888888888888888ULL;
65
-
66
- for (int i = 0; i < end; ++i) {
67
- int src_id = i % 8;
68
- int src_offset = (i / 8) * blck_size_interleave;
69
- int dst_offset = i * blck_size_interleave;
70
-
71
- uint64_t elems;
72
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
73
- elems ^= xor_mask;
74
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
75
- }
76
-
77
- return out;
78
- }
79
-
80
- static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
81
- assert(n_per_row % QK4_0 == 0);
82
- const int nb = n_per_row / QK4_0;
83
-
84
- void * out_ptr = NULL;
85
- if (nrows_interleaved == 8) {
86
- out_ptr = (block_q4_0x8 *) dst;
87
- }
88
- else if (nrows_interleaved == 4) {
89
- out_ptr = (block_q4_0x4 *) dst;
90
- }
91
- assert(nrows_interleaved <= 8);
92
- block_q4_0 dst_tmp[8];
93
-
94
- for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
95
-
96
- for (int64_t x = 0; x < nb; x++) {
97
-
98
- for (int i = 0; i < nrows_interleaved; i++ ) {
99
- quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
100
- }
101
-
102
- if (nrows_interleaved == 8) {
103
- *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
104
- out_ptr = (block_q4_0x8 *) out_ptr + 1;
105
- }
106
- else if (nrows_interleaved == 4) {
107
- *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
108
- out_ptr = (block_q4_0x4 *) out_ptr + 1;
109
- }
110
- }
111
- }
112
-
113
- return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
114
- }
115
-
116
- size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
117
- UNUSED(quant_weights);
118
- return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
119
- }
120
-
121
- size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
122
- UNUSED(quant_weights);
123
- return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
124
- }
125
-
126
- size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
127
- UNUSED(quant_weights);
128
- return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
129
- }
1
+ #define LM_GGML_COMMON_DECL_C
2
+ #include "ggml-common.h"
3
+
4
+ #include "ggml-aarch64.h"
5
+ #include "ggml-impl.h"
6
+ #include "ggml-quants.h"
7
+ #include <assert.h>
8
+
9
+ #define UNUSED LM_GGML_UNUSED
10
+
11
+ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
12
+ block_q4_0x4 out;
13
+
14
+ for (int i = 0; i < 4; i++) {
15
+ out.d[i] = in[i].d;
16
+ }
17
+
18
+ const int end = QK4_0 * 2 / blck_size_interleave;
19
+
20
+ if (blck_size_interleave == 8) {
21
+ const uint64_t xor_mask = 0x8888888888888888ULL;
22
+ for (int i = 0; i < end; ++i) {
23
+ int src_id = i % 4;
24
+ int src_offset = (i / 4) * blck_size_interleave;
25
+ int dst_offset = i * blck_size_interleave;
26
+
27
+ uint64_t elems;
28
+ // Using memcpy to avoid unaligned memory accesses
29
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
30
+ elems ^= xor_mask;
31
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
32
+ }
33
+ } else if (blck_size_interleave == 4) {
34
+ const uint32_t xor_mask = 0x88888888;
35
+ for (int i = 0; i < end; ++i) {
36
+ int src_id = i % 4;
37
+ int src_offset = (i / 4) * blck_size_interleave;
38
+ int dst_offset = i * blck_size_interleave;
39
+
40
+ uint32_t elems;
41
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
42
+ elems ^= xor_mask;
43
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
44
+ }
45
+ } else {
46
+ LM_GGML_ASSERT(false);
47
+ }
48
+
49
+ return out;
50
+ }
51
+
52
+ // interleave 8 block_q4_0s in blocks of blck_size_interleave
53
+ // returns an interleaved block_q4_0x8
54
+ // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
55
+ // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
56
+ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
57
+ block_q4_0x8 out;
58
+
59
+ for (int i = 0; i < 8; i++) {
60
+ out.d[i] = in[i].d;
61
+ }
62
+
63
+ const int end = QK4_0 * 4 / blck_size_interleave;
64
+ const uint64_t xor_mask = 0x8888888888888888ULL;
65
+
66
+ for (int i = 0; i < end; ++i) {
67
+ int src_id = i % 8;
68
+ int src_offset = (i / 8) * blck_size_interleave;
69
+ int dst_offset = i * blck_size_interleave;
70
+
71
+ uint64_t elems;
72
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
73
+ elems ^= xor_mask;
74
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
75
+ }
76
+
77
+ return out;
78
+ }
79
+
80
+ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
81
+ assert(n_per_row % QK4_0 == 0);
82
+ const int nb = n_per_row / QK4_0;
83
+
84
+ void * out_ptr = NULL;
85
+ if (nrows_interleaved == 8) {
86
+ out_ptr = (block_q4_0x8 *) dst;
87
+ }
88
+ else if (nrows_interleaved == 4) {
89
+ out_ptr = (block_q4_0x4 *) dst;
90
+ }
91
+ assert(nrows_interleaved <= 8);
92
+ block_q4_0 dst_tmp[8];
93
+
94
+ for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {
95
+
96
+ for (int64_t x = 0; x < nb; x++) {
97
+
98
+ for (int i = 0; i < nrows_interleaved; i++ ) {
99
+ quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
100
+ }
101
+
102
+ if (nrows_interleaved == 8) {
103
+ *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
104
+ out_ptr = (block_q4_0x8 *) out_ptr + 1;
105
+ }
106
+ else if (nrows_interleaved == 4) {
107
+ *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
108
+ out_ptr = (block_q4_0x4 *) out_ptr + 1;
109
+ }
110
+ }
111
+ }
112
+
113
+ return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
114
+ }
115
+
116
// Quantize src to q4_0 with 4 rows interleaved in 4-byte chunks.
// quant_weights is not used. Returns the value reported by
// quantize_q4_0_nr_bl for the same src/dst/nrow/n_per_row.
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    enum { N_ROWS_INTERLEAVED = 4, INTERLEAVE_CHUNK = 4 };

    UNUSED(quant_weights);

    const size_t n_bytes = quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, N_ROWS_INTERLEAVED, INTERLEAVE_CHUNK);
    return n_bytes;
}
120
+
121
// Quantize src to q4_0 with 4 rows interleaved in 8-byte chunks.
// quant_weights is not used. Returns the value reported by
// quantize_q4_0_nr_bl for the same src/dst/nrow/n_per_row.
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    enum { N_ROWS_INTERLEAVED = 4, INTERLEAVE_CHUNK = 8 };

    UNUSED(quant_weights);

    const size_t n_bytes = quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, N_ROWS_INTERLEAVED, INTERLEAVE_CHUNK);
    return n_bytes;
}
125
+
126
// Quantize src to q4_0 with 8 rows interleaved in 8-byte chunks.
// quant_weights is not used. Returns the value reported by
// quantize_q4_0_nr_bl for the same src/dst/nrow/n_per_row.
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    enum { N_ROWS_INTERLEAVED = 8, INTERLEAVE_CHUNK = 8 };

    UNUSED(quant_weights);

    const size_t n_bytes = quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, N_ROWS_INTERLEAVED, INTERLEAVE_CHUNK);
    return n_bytes;
}
@@ -1,19 +1,19 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
-
5
- // GGML internal header
6
-
7
- #ifdef __cplusplus
8
- extern "C" {
9
- #endif
10
-
11
- // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
12
- size_t quantize_q4_0_4x4(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
13
- size_t quantize_q4_0_4x8(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
14
- size_t quantize_q4_0_8x8(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
15
-
16
- #ifdef __cplusplus
17
- }
18
- #endif
19
-
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ // GGML internal header
6
+
7
+ #ifdef __cplusplus
8
+ extern "C" {
9
+ #endif
10
+
11
+ // Row-interleaved q4_0 quantization (NxM = N rows interleaved in M-byte chunks). The imatrix argument is accepted but ignored by these functions.
12
+ size_t quantize_q4_0_4x4(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
13
+ size_t quantize_q4_0_4x8(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
14
+ size_t quantize_q4_0_8x8(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
15
+
16
+ #ifdef __cplusplus
17
+ }
18
+ #endif
19
+