cui-llama.rn 1.3.0 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +6 -1
- package/android/src/main/jni.cpp +6 -6
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +1981 -1682
- package/cpp/common.h +636 -600
- package/cpp/ggml-aarch64.c +129 -129
- package/cpp/ggml-aarch64.h +19 -19
- package/cpp/ggml-alloc.c +1038 -1040
- package/cpp/ggml-alloc.h +76 -76
- package/cpp/ggml-backend-impl.h +238 -216
- package/cpp/ggml-backend-reg.cpp +423 -195
- package/cpp/ggml-backend.cpp +1999 -1997
- package/cpp/ggml-backend.h +351 -328
- package/cpp/ggml-common.h +1859 -1853
- package/cpp/ggml-cpp.h +38 -38
- package/cpp/ggml-cpu-aarch64.c +3823 -3560
- package/cpp/ggml-cpu-aarch64.h +32 -30
- package/cpp/ggml-cpu-impl.h +386 -371
- package/cpp/ggml-cpu-quants.c +10835 -10822
- package/cpp/ggml-cpu-quants.h +63 -63
- package/cpp/ggml-cpu.c +99 -103
- package/cpp/ggml-cpu.cpp +69 -17
- package/cpp/ggml-cpu.h +152 -177
- package/cpp/ggml-impl.h +556 -550
- package/cpp/ggml-metal.h +66 -66
- package/cpp/ggml-metal.m +4426 -4294
- package/cpp/ggml-quants.c +5247 -5247
- package/cpp/ggml-quants.h +100 -100
- package/cpp/ggml-threading.cpp +12 -12
- package/cpp/ggml-threading.h +12 -12
- package/cpp/ggml.c +7618 -8180
- package/cpp/ggml.h +2255 -2411
- package/cpp/json-schema-to-grammar.cpp +1045 -0
- package/cpp/json-schema-to-grammar.h +8 -0
- package/cpp/json.hpp +24766 -0
- package/cpp/llama-grammar.cpp +1138 -1138
- package/cpp/llama-grammar.h +144 -144
- package/cpp/llama-impl.h +181 -181
- package/cpp/llama-sampling.cpp +2348 -2348
- package/cpp/llama-sampling.h +48 -48
- package/cpp/llama-vocab.cpp +1984 -1984
- package/cpp/llama-vocab.h +170 -170
- package/cpp/llama.cpp +22332 -22132
- package/cpp/llama.h +1259 -1253
- package/cpp/log.cpp +401 -401
- package/cpp/log.h +121 -121
- package/cpp/rn-llama.hpp +6 -6
- package/cpp/sampling.cpp +505 -466
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +1884 -1884
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlamaContext.mm +13 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +94 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +4 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +97 -10
- package/src/grammar.ts +10 -8
- package/src/index.ts +22 -1
package/cpp/ggml-aarch64.c
CHANGED
@@ -1,129 +1,129 @@
Every line is marked removed and re-added, but the removed and added text is line-for-line identical, so the file content appears once below:

#define LM_GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include <assert.h>

#define UNUSED LM_GGML_UNUSED

static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

    const int end = QK4_0 * 2 / blck_size_interleave;

    if (blck_size_interleave == 8) {
        const uint64_t xor_mask = 0x8888888888888888ULL;
        for (int i = 0; i < end; ++i) {
            int src_id = i % 4;
            int src_offset = (i / 4) * blck_size_interleave;
            int dst_offset = i * blck_size_interleave;

            uint64_t elems;
            // Using memcpy to avoid unaligned memory accesses
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
            elems ^= xor_mask;
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
        }
    } else if (blck_size_interleave == 4) {
        const uint32_t xor_mask = 0x88888888;
        for (int i = 0; i < end; ++i) {
            int src_id = i % 4;
            int src_offset = (i / 4) * blck_size_interleave;
            int dst_offset = i * blck_size_interleave;

            uint32_t elems;
            memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
            elems ^= xor_mask;
            memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
        }
    } else {
        LM_GGML_ASSERT(false);
    }

    return out;
}

// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x8 out;

    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].d;
    }

    const int end = QK4_0 * 4 / blck_size_interleave;
    const uint64_t xor_mask = 0x8888888888888888ULL;

    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        elems ^= xor_mask;
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    return out;
}

static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
    assert(n_per_row % QK4_0 == 0);
    const int nb = n_per_row / QK4_0;

    void * out_ptr = NULL;
    if (nrows_interleaved == 8) {
        out_ptr = (block_q4_0x8 *) dst;
    }
    else if (nrows_interleaved == 4) {
        out_ptr = (block_q4_0x4 *) dst;
    }
    assert(nrows_interleaved <= 8);
    block_q4_0 dst_tmp[8];

    for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {

        for (int64_t x = 0; x < nb; x++) {

            for (int i = 0; i < nrows_interleaved; i++ ) {
                quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
            }

            if (nrows_interleaved == 8) {
                *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
                out_ptr = (block_q4_0x8 *) out_ptr + 1;
            }
            else if (nrows_interleaved == 4) {
                *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
                out_ptr = (block_q4_0x4 *) out_ptr + 1;
            }
        }
    }

    return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}

size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}

size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}

size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    UNUSED(quant_weights);
    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}
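The comments in make_block_q4_0x4/make_block_q4_0x8 above describe a round-robin layout: the per-block deltas are stored first, then fixed-size chunks of quant bytes from the 4 (or 8) source blocks are interleaved. The standalone sketch below is not part of the package; it only reproduces the index arithmetic for the 4-row, 8-byte case to make the mapping concrete:

// Standalone sketch of the chunk mapping performed by make_block_q4_0x4
// with blck_size_interleave == 8. Illustrative only, not part of this package.
#include <stdio.h>

#define QK4_0 32   // elements per q4_0 block; qs holds QK4_0/2 = 16 bytes per block

int main(void) {
    const int nrows_interleaved    = 4;
    const int blck_size_interleave = 8;                   // bytes moved per step
    const int end = QK4_0 * 2 / blck_size_interleave;     // 4 blocks * 16 qs bytes = 64 bytes -> 8 steps

    for (int i = 0; i < end; ++i) {
        int src_id     = i % nrows_interleaved;           // source block, chosen round-robin
        int src_offset = (i / nrows_interleaved) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;
        printf("dst qs[%2d..%2d] <- block %d qs[%2d..%2d]\n",
               dst_offset, dst_offset + blck_size_interleave - 1,
               src_id, src_offset, src_offset + blck_size_interleave - 1);
    }
    return 0;
}

The XOR with 0x8888888888888888ULL flips the high bit of every 4-bit nibble as the bytes are copied; this appears to convert the stored unsigned quants into signed 4-bit values for the interleaved path, though that interpretation is not stated in the diff itself.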
package/cpp/ggml-aarch64.h
CHANGED
@@ -1,19 +1,19 @@
Every line is marked removed and re-added, but the removed and added text is line-for-line identical, so the file content appears once below:

#pragma once

#include "ggml.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#endif

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * LM_GGML_RESTRICT src, void * LM_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

#ifdef __cplusplus
}
#endif
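Taken together with ggml-aarch64.c above, these entry points quantize a row-major float matrix into the interleaved Q4_0 layouts. A hedged usage sketch follows; the wrapper name is illustrative, the output-size formula is taken from the return value of quantize_q4_0_nr_bl shown above, and block_q4_0/QK4_0 are assumed to come from ggml-common.h exactly as they do in ggml-aarch64.c:

// Illustrative wrapper, not part of the package: quantize a whole matrix with the
// 4-row / 4-byte interleaved Q4_0 layout declared in ggml-aarch64.h.
#define LM_GGML_COMMON_DECL_C
#include "ggml-common.h"     // block_q4_0, QK4_0 (mirrors the includes in ggml-aarch64.c)
#include "ggml-aarch64.h"

#include <stdlib.h>

void * quantize_matrix_q4_0_4x4(const float * src, int64_t nrows, int64_t n_per_row) {
    // n_per_row must be a multiple of QK4_0; nrows is expected to be a multiple of 4
    // so that complete groups of rows can be interleaved (assumption based on the loop above).
    size_t dst_size = (size_t)(nrows * n_per_row) / QK4_0 * sizeof(block_q4_0);  // matches the return value above
    void * dst = malloc(dst_size);
    if (dst == NULL) {
        return NULL;
    }
    quantize_q4_0_4x4(src, dst, nrows, n_per_row, /*imatrix=*/NULL);  // imatrix is unused by this path
    return dst;
}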