cui-llama.rn 1.4.6 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +9 -2
- package/android/src/main/jni.cpp +52 -34
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/cpp/binary-ops.cpp +158 -0
- package/cpp/binary-ops.h +16 -0
- package/cpp/chat.cpp +1769 -1779
- package/cpp/chat.h +9 -1
- package/cpp/common.cpp +20 -522
- package/cpp/common.h +13 -36
- package/cpp/cpu-common.h +72 -0
- package/cpp/ggml-common.h +12 -6
- package/cpp/ggml-cpu-aarch64.cpp +1557 -80
- package/cpp/ggml-cpu-impl.h +2 -21
- package/cpp/ggml-cpu-quants.c +904 -405
- package/cpp/ggml-cpu.c +909 -13237
- package/cpp/ggml-impl.h +50 -23
- package/cpp/ggml-metal-impl.h +77 -3
- package/cpp/ggml-metal.m +794 -580
- package/cpp/ggml.c +92 -3
- package/cpp/ggml.h +29 -5
- package/cpp/gguf.cpp +1 -0
- package/cpp/llama-adapter.cpp +55 -20
- package/cpp/llama-adapter.h +11 -9
- package/cpp/llama-arch.cpp +217 -16
- package/cpp/llama-arch.h +25 -0
- package/cpp/llama-batch.h +2 -2
- package/cpp/llama-chat.cpp +54 -2
- package/cpp/llama-chat.h +3 -0
- package/cpp/llama-context.cpp +2294 -1238
- package/cpp/llama-context.h +214 -77
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +1695 -0
- package/cpp/llama-graph.h +592 -0
- package/cpp/llama-hparams.cpp +8 -0
- package/cpp/llama-hparams.h +17 -0
- package/cpp/llama-io.cpp +15 -0
- package/cpp/llama-io.h +35 -0
- package/cpp/llama-kv-cache.cpp +965 -303
- package/cpp/llama-kv-cache.h +145 -151
- package/cpp/llama-memory.cpp +1 -0
- package/cpp/llama-memory.h +21 -0
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +10 -5
- package/cpp/llama-model-loader.h +5 -3
- package/cpp/llama-model.cpp +9194 -201
- package/cpp/llama-model.h +40 -1
- package/cpp/llama-sampling.cpp +5 -0
- package/cpp/llama-vocab.cpp +36 -5
- package/cpp/llama.cpp +51 -9984
- package/cpp/llama.h +102 -22
- package/cpp/log.cpp +34 -0
- package/cpp/minja/chat-template.hpp +15 -7
- package/cpp/minja/minja.hpp +120 -94
- package/cpp/ops.cpp +8723 -0
- package/cpp/ops.h +128 -0
- package/cpp/rn-llama.cpp +44 -53
- package/cpp/rn-llama.h +2 -12
- package/cpp/sampling.cpp +3 -0
- package/cpp/sgemm.cpp +533 -88
- package/cpp/simd-mappings.h +888 -0
- package/cpp/speculative.cpp +4 -4
- package/cpp/unary-ops.cpp +186 -0
- package/cpp/unary-ops.h +28 -0
- package/cpp/vec.cpp +258 -0
- package/cpp/vec.h +802 -0
- package/ios/CMakeLists.txt +5 -2
- package/ios/RNLlama.mm +2 -2
- package/ios/RNLlamaContext.mm +40 -24
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +6 -4
- package/src/index.ts +3 -1
- package/cpp/chat-template.hpp +0 -529
- package/cpp/minja.hpp +0 -2915
package/cpp/ggml-impl.h
CHANGED
@@ -16,14 +16,6 @@
|
|
16
16
|
#include <arm_sve.h>
|
17
17
|
#endif // __ARM_FEATURE_SVE
|
18
18
|
|
19
|
-
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
|
20
|
-
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
21
|
-
//
|
22
|
-
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
23
|
-
//
|
24
|
-
#include <arm_neon.h>
|
25
|
-
#endif
|
26
|
-
|
27
19
|
#if defined(__F16C__)
|
28
20
|
#include <immintrin.h>
|
29
21
|
#endif
|
@@ -311,29 +303,35 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
|
|
311
303
|
|
312
304
|
// FP16 to FP32 conversion
|
313
305
|
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
306
|
+
// 16-bit float
|
307
|
+
// on Arm, we use __fp16
|
308
|
+
// on x86, we use uint16_t
|
309
|
+
//
|
310
|
+
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
|
311
|
+
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
|
312
|
+
//
|
313
|
+
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
314
|
+
|
315
|
+
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
316
|
+
//
|
317
|
+
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
318
|
+
//
|
319
|
+
#include <arm_neon.h>
|
321
320
|
|
322
|
-
#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
|
323
321
|
#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
|
324
322
|
#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
|
325
323
|
|
326
324
|
#define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
|
327
325
|
|
328
326
|
static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
|
329
|
-
|
327
|
+
__fp16 tmp;
|
330
328
|
memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
|
331
329
|
return (float)tmp;
|
332
330
|
}
|
333
331
|
|
334
332
|
static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
|
335
333
|
lm_ggml_fp16_t res;
|
336
|
-
|
334
|
+
__fp16 tmp = f;
|
337
335
|
memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
|
338
336
|
return res;
|
339
337
|
}
|
@@ -357,8 +355,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
|
|
357
355
|
#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
|
358
356
|
|
359
357
|
static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
|
360
|
-
|
361
|
-
|
358
|
+
float f;
|
359
|
+
double d;
|
362
360
|
__asm__(
|
363
361
|
"mtfprd %0,%2\n"
|
364
362
|
"xscvhpdp %0,%0\n"
|
@@ -370,8 +368,8 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
|
|
370
368
|
}
|
371
369
|
|
372
370
|
static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
|
373
|
-
|
374
|
-
|
371
|
+
double d;
|
372
|
+
lm_ggml_fp16_t r;
|
375
373
|
__asm__( /* xscvdphp can work on double or single precision */
|
376
374
|
"xscvdphp %0,%2\n"
|
377
375
|
"mffprd %1,%0\n" :
|
@@ -381,6 +379,35 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
|
|
381
379
|
return r;
|
382
380
|
}
|
383
381
|
|
382
|
+
#elif defined(__riscv) && defined(LM_GGML_RV_ZFH)
|
383
|
+
|
384
|
+
static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
|
385
|
+
float f;
|
386
|
+
__asm__(
|
387
|
+
"fmv.h.x %[f], %[h]\n\t"
|
388
|
+
"fcvt.s.h %[f], %[f]"
|
389
|
+
: [f] "=&f" (f)
|
390
|
+
: [h] "r" (h)
|
391
|
+
);
|
392
|
+
return f;
|
393
|
+
}
|
394
|
+
|
395
|
+
static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
|
396
|
+
lm_ggml_fp16_t res;
|
397
|
+
__asm__(
|
398
|
+
"fcvt.h.s %[f], %[f]\n\t"
|
399
|
+
"fmv.x.h %[h], %[f]"
|
400
|
+
: [h] "=&r" (res)
|
401
|
+
: [f] "f" (f)
|
402
|
+
);
|
403
|
+
return res;
|
404
|
+
}
|
405
|
+
|
406
|
+
#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
|
407
|
+
#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
|
408
|
+
#define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
|
409
|
+
#define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
|
410
|
+
|
384
411
|
#else
|
385
412
|
|
386
413
|
// FP16 <-> FP32
|
@@ -456,7 +483,7 @@ LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
|
|
456
483
|
#define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
|
457
484
|
#define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
|
458
485
|
|
459
|
-
#endif // defined(__ARM_NEON) && (!defined(
|
486
|
+
#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
460
487
|
|
461
488
|
// precomputed f32 table for f16 (256 KB)
|
462
489
|
// defined in ggml.c, initialized in lm_ggml_init()
|
package/cpp/ggml-metal-impl.h
CHANGED
@@ -1,6 +1,70 @@
|
|
1
1
|
#ifndef GGML_METAL_IMPL
|
2
2
|
#define GGML_METAL_IMPL
|
3
3
|
|
4
|
+
// kernel parameters for mat-vec threadgroups
|
5
|
+
//
|
6
|
+
// N_R0: number of src0 rows to process per simdgroup
|
7
|
+
// N_SG: number of simdgroups per threadgroup
|
8
|
+
//
|
9
|
+
// TODO: for optimal performance, become function of the device and work size
|
10
|
+
|
11
|
+
#define N_R0_Q4_0 4
|
12
|
+
#define N_SG_Q4_0 2
|
13
|
+
|
14
|
+
#define N_R0_Q4_1 4
|
15
|
+
#define N_SG_Q4_1 2
|
16
|
+
|
17
|
+
#define N_R0_Q5_0 4
|
18
|
+
#define N_SG_Q5_0 2
|
19
|
+
|
20
|
+
#define N_R0_Q5_1 4
|
21
|
+
#define N_SG_Q5_1 2
|
22
|
+
|
23
|
+
#define N_R0_Q8_0 4
|
24
|
+
#define N_SG_Q8_0 2
|
25
|
+
|
26
|
+
#define N_R0_Q2_K 4
|
27
|
+
#define N_SG_Q2_K 2
|
28
|
+
|
29
|
+
#define N_R0_Q3_K 2
|
30
|
+
#define N_SG_Q3_K 2
|
31
|
+
|
32
|
+
#define N_R0_Q4_K 4
|
33
|
+
#define N_SG_Q4_K 2
|
34
|
+
|
35
|
+
#define N_R0_Q5_K 2
|
36
|
+
#define N_SG_Q5_K 2
|
37
|
+
|
38
|
+
#define N_R0_Q6_K 1
|
39
|
+
#define N_SG_Q6_K 2
|
40
|
+
|
41
|
+
#define N_R0_IQ1_S 4
|
42
|
+
#define N_SG_IQ1_S 2
|
43
|
+
|
44
|
+
#define N_R0_IQ1_M 4
|
45
|
+
#define N_SG_IQ1_M 2
|
46
|
+
|
47
|
+
#define N_R0_IQ2_XXS 4
|
48
|
+
#define N_SG_IQ2_XXS 2
|
49
|
+
|
50
|
+
#define N_R0_IQ2_XS 4
|
51
|
+
#define N_SG_IQ2_XS 2
|
52
|
+
|
53
|
+
#define N_R0_IQ2_S 4
|
54
|
+
#define N_SG_IQ2_S 2
|
55
|
+
|
56
|
+
#define N_R0_IQ3_XXS 4
|
57
|
+
#define N_SG_IQ3_XXS 2
|
58
|
+
|
59
|
+
#define N_R0_IQ3_S 4
|
60
|
+
#define N_SG_IQ3_S 2
|
61
|
+
|
62
|
+
#define N_R0_IQ4_NL 2
|
63
|
+
#define N_SG_IQ4_NL 2
|
64
|
+
|
65
|
+
#define N_R0_IQ4_XS 2
|
66
|
+
#define N_SG_IQ4_XS 2
|
67
|
+
|
4
68
|
// kernel argument structs
|
5
69
|
//
|
6
70
|
// - element counters (e.g. ne00) typically use int32_t to reduce register usage
|
@@ -155,9 +219,12 @@ typedef struct {
|
|
155
219
|
int32_t ne11;
|
156
220
|
int32_t ne_12_2; // assume K and V are same shape
|
157
221
|
int32_t ne_12_3;
|
158
|
-
uint64_t
|
159
|
-
uint64_t
|
160
|
-
uint64_t
|
222
|
+
uint64_t nb11;
|
223
|
+
uint64_t nb12;
|
224
|
+
uint64_t nb13;
|
225
|
+
uint64_t nb21;
|
226
|
+
uint64_t nb22;
|
227
|
+
uint64_t nb23;
|
161
228
|
uint64_t nb31;
|
162
229
|
int32_t ne1;
|
163
230
|
int32_t ne2;
|
@@ -285,6 +352,13 @@ typedef struct {
|
|
285
352
|
float eps;
|
286
353
|
} ggml_metal_kargs_rms_norm;
|
287
354
|
|
355
|
+
typedef struct {
|
356
|
+
int32_t ne00;
|
357
|
+
int32_t ne00_4;
|
358
|
+
uint64_t nb01;
|
359
|
+
float eps;
|
360
|
+
} ggml_metal_kargs_l2_norm;
|
361
|
+
|
288
362
|
typedef struct {
|
289
363
|
int64_t ne00;
|
290
364
|
int64_t ne01;
|