@fugood/llama.node 1.1.9 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +7 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +20 -2
- package/src/llama.cpp/common/arg.cpp +29 -19
- package/src/llama.cpp/common/chat.cpp +153 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
- package/src/llama.cpp/include/llama.h +27 -1
- package/src/llama.cpp/src/llama-adapter.cpp +68 -4
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +46 -2
- package/src/llama.cpp/src/llama-arch.h +4 -0
- package/src/llama.cpp/src/llama-context.cpp +80 -39
- package/src/llama.cpp/src/llama-context.h +0 -4
- package/src/llama.cpp/src/llama-graph.cpp +20 -10
- package/src/llama.cpp/src/llama-graph.h +2 -1
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
- package/src/llama.cpp/src/llama-kv-cache.h +16 -28
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
- package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
- package/src/llama.cpp/src/llama-memory.h +8 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +302 -31
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
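The bulk of this release is a sync of the vendored llama.cpp sources. Reading the hunks below, the changes group into four themes: RISC-V Vector (RVV) kernels and fp16 conversion cleanups in the ggml CPU backend, replacement of the `bool flash_attn` context flag with a tri-state `llama_flash_attn_type` enum, a new C API for reading LoRA-adapter GGUF metadata, and registration of the `jina-bert-v3` and `nemotron_h` architectures.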
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

```diff
@@ -18,6 +18,10 @@
 #include <immintrin.h>
 #endif
 
+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,24 +98,15 @@ extern "C" {
     }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
     static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
-        float f;
-        __asm__(
-            "fmv.h.x %[f], %[h]\n\t"
-            "fcvt.s.h %[f], %[f]"
-            : [f] "=&f" (f)
-            : [h] "r" (h)
-        );
-        return f;
+        _Float16 hf;
+        memcpy(&hf, &h, sizeof(ggml_fp16_t));
+        return hf;
     }
 
     static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
         ggml_fp16_t res;
-        __asm__(
-            "fcvt.h.s %[f], %[f]\n\t"
-            "fmv.x.h %[h], %[f]"
-            : [h] "=&r" (res)
-            : [f] "f" (f)
-        );
+        _Float16 hf = (_Float16)f;
+        memcpy(&res, &hf, sizeof(ggml_fp16_t));
         return res;
     }
 
@@ -1170,6 +1165,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
 #define GGML_F16_VEC_MUL            GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
 
+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define GGML_SIMD
+
+// F32
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vfloat32m1_t
+#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
```
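The fp16 change above swaps hand-written `fmv`/`fcvt` inline assembly for `_Float16`, letting the compiler pick the conversion instructions under `zfhmin`. A minimal standalone sketch of the same bit-cast round-trip (hypothetical test code, assuming a toolchain where `_Float16` is available; the helper names are ours, not the package's):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t ggml_fp16_t; // same raw storage type ggml uses

static inline float fp16_to_fp32(ggml_fp16_t h) {
    _Float16 hf;
    memcpy(&hf, &h, sizeof(h)); // reinterpret the 16 raw bits as _Float16
    return (float)hf;           // compiler emits fcvt.s.h under zfhmin
}

static inline ggml_fp16_t fp32_to_fp16(float f) {
    _Float16 hf = (_Float16)f;  // compiler emits fcvt.h.s
    ggml_fp16_t res;
    memcpy(&res, &hf, sizeof(res));
    return res;
}

int main(void) {
    ggml_fp16_t h = fp32_to_fp16(3.5f);
    printf("0x%04x -> %f\n", h, fp16_to_fp32(h)); // expect 0x4300 -> 3.500000
    return 0;
}
```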
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

```diff
@@ -84,6 +84,16 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     }
     // reduce sum1,sum2 to sum1
     GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
+#elif defined(__riscv_v_intrinsic)
+    vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
+        vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
+    }
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -197,7 +207,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
 
     ggml_float sumf = 0.0;
 
-#if defined(GGML_SIMD)
+#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
@@ -325,6 +335,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
         vst1q_f32(y + i, val);
         sum += (ggml_float)vaddvq_f32(val);
     }
+#elif defined(__riscv_v_intrinsic)
+    vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1);
+    for (int avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl);
+        __riscv_vse32_v_f32m2(&y[i], val, avl);
+        vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl);
+    }
+    return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum);
 #endif
     for (; i < n; ++i) {
         float val = expf(x[i] - max);
```
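All of the new RVV paths in this file share one idiom: `vsetvl` strip-mining. Each iteration asks the hardware how many e32 elements fit in an m8 register group (`avl`), so the same binary runs on any VLEN >= 128 and the final pass absorbs the remainder with no scalar tail loop. A reduced sketch of the pattern outside ggml (hypothetical element-wise add; build with something like `-march=rv64gcv`):

```c
#include <riscv_vector.h>

// Strip-mined y[i] += x[i]: vsetvl returns the element count for this pass,
// shrinking automatically on the last iteration.
void vec_add_f32(int n, float *y, const float *x) {
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        __riscv_vse32_v_f32m8(&y[i], __riscv_vfadd_vv_f32m8(ax, ay, avl), avl);
    }
}
```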
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

```diff
@@ -119,6 +119,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
 
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -149,6 +157,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -243,6 +252,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
         svst1_f32(pg, y + np2, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -276,6 +293,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -297,6 +321,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -324,6 +349,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
             y[i] += x[k][i]*v[k][0];
         }
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+        }
+        __riscv_vse32_v_f32m8(&y[i], ay, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -375,6 +410,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < n; ++i) {
         y[i] = x[i]*s + b;
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -436,6 +479,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
         ay1 = svmul_f32_m(pg, ay1, vx);
         svst1_f32(pg, y + np, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -467,6 +517,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -486,6 +543,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -928,7 +986,51 @@ inline static __m128 ggml_v_silu(__m128 x) {
     return _mm_div_ps(x, one_plus_exp_neg_x);
 }
 
-#endif // __ARM_NEON / __AVX2__ / __SSE2__
+#elif defined(__riscv_v_intrinsic)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
+#ifdef __riscv_xtheadvector
+    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
+    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
+    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
+#else
+    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
+#endif
+    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
+    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
+                                                    0x1.7f7d1cp-20f, n, vl);
+    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
+    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
+    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
+    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
+    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
+        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
+        __riscv_vfmacc_vv_f32m2(
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
+            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
+            u, vl), u, vl);
+    if (!__riscv_vcpop_m_b16(c, vl))
+        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
+    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
+    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
+    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
+    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
+    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
+        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
+        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
+        c, vl);
+    return __riscv_vmerge_vvm_f32m2(
+        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
+        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
+        vl);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
 
 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
```
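`ggml_v_expf_m2` is the familiar "shift by 0x1.8p23" exponential: adding a large constant rounds x*log2(e) to the nearest integer n in the mantissa bits, 2^n is assembled directly in the float's exponent field, and a small polynomial covers the remainder. A scalar sketch of that range reduction (our illustration only, with hypothetical names; the real routine uses a degree-5 polynomial instead of `expf` and handles over/underflow via the `s1`/`s2` split, which this sketch omits, so it is only valid for roughly |x| < 87):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float expf_sketch(float x) {
    const float shift = 0x1.8p23f;         // 1.5 * 2^23: adding it rounds to nearest int
    float z = x * 0x1.715476p+0f + shift;  // low mantissa bits of z now hold n = round(x*log2(e))
    float n = z - shift;
    float r = x - n * 0x1.62e4p-1f - n * 0x1.7f7d1cp-20f; // x - n*ln2, hi/lo split
    uint32_t zi;
    memcpy(&zi, &z, sizeof(zi));
    uint32_t ki = (zi << 23) + 0x3f800000u; // slide n into the exponent field: bits of 2^n
    float two_n;
    memcpy(&two_n, &ki, sizeof(two_n));
    return two_n * expf(r);                 // exp(x) = 2^n * exp(r); polynomial elided
}

int main(void) {
    for (float x = -80.0f; x <= 80.0f; x += 13.7f) {
        printf("x=%8.2f  sketch=%12.5e  expf=%12.5e\n", x, expf_sketch(x), expf(x));
    }
    return 0;
}
```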
package/src/llama.cpp/include/llama.h

```diff
@@ -179,6 +179,14 @@ extern "C" {
         LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
     };
 
+    enum llama_flash_attn_type {
+        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
+        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+        LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
+    };
+
+    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -303,6 +311,7 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
+        enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention
 
         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -329,7 +338,6 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
```
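For consumers of the C API this is a breaking struct change: the `flash_attn` boolean is gone and `flash_attn_type` takes its place, with `AUTO` deferring the decision to llama.cpp. A sketch of the migration at a call site (field and enum names come from the hunks above; the surrounding program is hypothetical):

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();

    // 1.1.9 vendored API (field removed in this release):
    //     params.flash_attn = true;

    // 1.1.11: tri-state enum; AUTO lets llama.cpp decide per backend/model
    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
    printf("flash attention: %s\n", llama_flash_attn_type_name(params.flash_attn_type));
    return 0;
}
```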
```diff
@@ -553,6 +561,24 @@ extern "C" {
         struct llama_model * model,
         const char * path_lora);
 
+    // Functions to access the adapter's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
```
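A usage sketch for the new accessors, following the documented contract (length on success, -1 on failure, output always null-terminated); the enumeration helper below is ours, not part of the package:

```c
#include <stdio.h>
#include "llama.h"

// Enumerate all scalar metadata of a loaded LoRA adapter. Note the
// snprintf-style contract: the return value is the full string length,
// which may exceed the buffer; the fixed buffers here accept truncation.
static void dump_adapter_meta(const struct llama_adapter_lora * adapter) {
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; i++) {
        char key[128];
        char val[256];
        if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) < 0) {
            continue;
        }
        if (llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) < 0) {
            continue;
        }
        printf("%s = %s\n", key, val);
    }
}
```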
package/src/llama.cpp/src/llama-adapter.cpp

```diff
@@ -163,13 +163,38 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
 
     // check metadata
     {
+        const gguf_context * gguf_ctx = ctx_gguf.get();
+
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);
+
+        // get metadata as string
+        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
+            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
+                : gguf_type_name(type);
+            const char * name       = gguf_get_key(gguf_ctx, i);
+            const std::string value = gguf_kv_to_str(gguf_ctx, i);
+
+            if (type != GGUF_TYPE_ARRAY) {
+                adapter.gguf_kv.emplace(name, value);
+            }
+
+            const size_t MAX_VALUE_LEN = 40;
+            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
+            replace_all(print_value, "\n", "\\n");
+
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
+        }
+
         auto get_kv_str = [&](const std::string & key) -> std::string {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
+            int id = gguf_find_key(gguf_ctx, key.c_str());
+            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
         };
         auto get_kv_f32 = [&](const std::string & key) -> float {
-            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
-            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
+            int id = gguf_find_key(gguf_ctx, key.c_str());
+            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
         };
         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
@@ -383,6 +408,45 @@ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * p
         return nullptr;
     }
 
+int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
+    const auto & it = adapter->gguf_kv.find(key);
+    if (it == adapter->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
+    return (int)adapter->gguf_kv.size();
+}
+
+int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = adapter->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = adapter->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
```
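One detail of the implementation above worth flagging: the by-index accessors walk `gguf_kv` with `std::advance` from `begin()`, so if `gguf_kv` is an ordered associative container (as the `find`/`begin` usage suggests), indices enumerate keys in key-sorted order rather than file order. Array-typed GGUF values are skipped at load time, matching the "arrays not supported" note in the header.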
package/src/llama.cpp/src/llama-arch.cpp

```diff
@@ -22,6 +22,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_NEO_BERT,       "neo-bert"       },
     { LLM_ARCH_JINA_BERT_V2,   "jina-bert-v2"   },
+    { LLM_ARCH_JINA_BERT_V3,   "jina-bert-v3"   },
     { LLM_ARCH_BLOOM,          "bloom"          },
     { LLM_ARCH_STABLELM,       "stablelm"       },
     { LLM_ARCH_QWEN,           "qwen"           },
@@ -68,6 +69,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_T5ENCODER,      "t5encoder"      },
     { LLM_ARCH_JAIS,           "jais"           },
     { LLM_ARCH_NEMOTRON,       "nemotron"       },
+    { LLM_ARCH_NEMOTRON_H,     "nemotron_h"     },
     { LLM_ARCH_EXAONE,         "exaone"         },
     { LLM_ARCH_EXAONE4,        "exaone4"        },
     { LLM_ARCH_RWKV6,          "rwkv6"          },
@@ -234,8 +236,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
     { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
 
-    { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
-    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+    { LLM_KV_ADAPTER_TYPE,               "adapter.type"               },
+    { LLM_KV_ADAPTER_LORA_ALPHA,         "adapter.lora.alpha"         },
+    { LLM_KV_ADAPTER_LORA_TASK_NAME,     "adapter.lora.task_name"     },
+    { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
 
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@@ -575,6 +579,20 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_CLS, "cls" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -1533,6 +1551,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NEMOTRON_H,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            // mamba(2) ssm layers
+            { LLM_TENSOR_SSM_IN,      "blk.%d.ssm_in" },
+            { LLM_TENSOR_SSM_CONV1D,  "blk.%d.ssm_conv1d" },
+            { LLM_TENSOR_SSM_DT,      "blk.%d.ssm_dt" },
+            { LLM_TENSOR_SSM_A,       "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_D,       "blk.%d.ssm_d" },
+            { LLM_TENSOR_SSM_NORM,    "blk.%d.ssm_norm" },
+            { LLM_TENSOR_SSM_OUT,     "blk.%d.ssm_out" },
+            // attention layers
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            // dense FFN
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_EXAONE,
         {
@@ -2338,6 +2381,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_PLAMO2:
         case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_LFM2:
+        case LLM_ARCH_NEMOTRON_H:
             return true;
         default:
             return false;
```
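Tying the NEMOTRON_H hunks together: the tensor table mixes mamba-2 SSM blocks (`ssm_*`) with interleaved attention layers (`attn_*`), and registering the arch in `llm_arch_is_hybrid` routes it through the hybrid memory path, consistent with the `llama-memory-hybrid.*` changes in the file list above.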
package/src/llama.cpp/src/llama-arch.h

```diff
@@ -26,6 +26,7 @@ enum llm_arch {
     LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
+    LLM_ARCH_JINA_BERT_V3,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -72,6 +73,7 @@ enum llm_arch {
     LLM_ARCH_T5ENCODER,
     LLM_ARCH_JAIS,
     LLM_ARCH_NEMOTRON,
+    LLM_ARCH_NEMOTRON_H,
     LLM_ARCH_EXAONE,
     LLM_ARCH_EXAONE4,
     LLM_ARCH_RWKV6,
@@ -230,6 +232,8 @@ enum llm_kv {
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
+    LLM_KV_ADAPTER_LORA_TASK_NAME,
+    LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
 
     LLM_KV_POSNET_EMBEDDING_LENGTH,
     LLM_KV_POSNET_BLOCK_COUNT,
```