@fugood/llama.node 1.4.7 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +23 -24
- package/src/LlamaContext.cpp +4 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +470 -223
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +44 -17
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +67 -54
- package/src/llama.cpp/common/sampling.h +8 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +110 -49
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +665 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/ggml/src/ggml-cpu/repack.h:

```diff
@@ -98,6 +98,10 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -120,6 +124,10 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
 } // extern "C"
```
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp:

```diff
@@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
     sumf += (ggml_float)_mm_cvtss_f32(g);
 
 #undef LOAD
-#
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
+        vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
+        vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
 
+    // accumulate in 1 register
+    vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
+
+    // leftovers
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m4();
+    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
+
+#endif
     for (; i < n; ++i) {
         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
                              GGML_BF16_TO_FP32(y[i]));
```
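For reference, the new RVV branch computes the same reduction as the scalar tail loop visible in the context lines: each bf16 element is widened to f32 and the pairwise products are accumulated. A self-contained scalar sketch of that semantics; the helper names below are illustrative and not part of the patch:

```cpp
#include <cstdint>
#include <cstring>

// bf16 is the upper 16 bits of an IEEE-754 binary32 value, so widening is a shift.
static inline float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Scalar equivalent of what the __riscv_zvfbfwma branch computes:
// widen, multiply pairwise, accumulate in higher precision, return as float.
static float vec_dot_bf16_ref(int n, const uint16_t * x, const uint16_t * y) {
    double sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (double) (bf16_to_f32(x[i]) * bf16_to_f32(y[i]));
    }
    return (float) sum;
}
```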
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h:

```diff
@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
     GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-
-
-
-
-
-
-
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2 along the row dimension
+    for (int i = 0; i < np; i += step) {
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+    }
+
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+    // leftovers
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+    vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+    vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+    sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
```
```diff
@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     }
     np = n;
 #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
-const
-_Float16
-
-
-
-
-
-
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
+    np = n;
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
```
```diff
@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-
-
-
-
-
-
-
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
```
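The two vec.h kernels patched above have simple scalar semantics; the RVV paths only change how the loops are vectorized (convert `v` to `_Float16` once, process two vector-length chunks per iteration, then strip-mine the leftovers). A plain-float sketch of the arithmetic, with illustrative names; the real kernels operate on `ggml_fp16_t` storage:

```cpp
// Scalar semantics of the two fp16 kernels patched above.
// ggml_vec_mad_f16:   y[i] += x[i] * v   (fused multiply-add with a scalar)
// ggml_vec_scale_f16: y[i] *= v          (in-place scale)
static void vec_mad_f16_ref(int n, float * y, const float * x, float v) {
    for (int i = 0; i < n; ++i) {
        y[i] += x[i] * v;
    }
}

static void vec_scale_f16_ref(int n, float * y, float v) {
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
}
```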
package/src/llama.cpp/include/llama.h:

```diff
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
+        bool no_alloc;        // only load metadata and simulate memory allocations
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
```
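A minimal sketch of how the new `no_alloc` flag might be used, assuming the existing `llama_model_default_params()` / `llama_model_load_from_file()` entry points; the model path is a placeholder:

```cpp
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.no_alloc = true; // new in this diff: only load metadata and simulate allocations

    // With no_alloc set, the load should not commit full tensor buffers,
    // e.g. to inspect a model or estimate memory needs before a real load.
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model != nullptr) {
        llama_model_free(model);
    }

    llama_backend_free();
    return 0;
}
```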
```diff
@@ -466,10 +467,24 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    // returns true if the parameters could be successfully modified to fit device memory
+    // this function is NOT thread safe because it modifies the global llama logger state
+    LLAMA_API bool llama_params_fit(
+            const char * path_model,
+            struct llama_model_params * mparams,
+            struct llama_context_params * cparams,
+            float * tensor_split,            // writable buffer for tensor split, needs at least llama_max_devices elements
+            struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+            size_t margin,                   // margin of memory to leave per device in bytes
+            uint32_t n_ctx_min,              // minimum context size to set when trying to reduce memory use
+            enum ggml_log_level log_level);  // minimum log level to print during fitting, lower levels go to debug log
+
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
     LLAMA_API size_t llama_max_parallel_sequences(void);
+    LLAMA_API size_t llama_max_tensor_buft_overrides(void);
 
     LLAMA_API bool llama_supports_mmap (void);
     LLAMA_API bool llama_supports_mlock (void);
```
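A sketch of how `llama_params_fit()` and the new `llama_max_tensor_buft_overrides()` might be called together, with the buffers sized according to the comments in the declaration above; the model path, margin, and context floor are illustrative values, not library defaults:

```cpp
#include <vector>
#include "llama.h"

// Try to shrink mparams/cparams until the model fits in free device memory.
bool fit_params(llama_model_params & mparams, llama_context_params & cparams) {
    // Writable scratch buffers, sized as documented in llama.h.
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> overrides(llama_max_tensor_buft_overrides());

    const size_t   margin    = 1024u * 1024u * 1024u; // leave ~1 GiB free per device (example value)
    const uint32_t n_ctx_min = 4096;                  // do not shrink the context below this (example value)

    // Not thread safe: it modifies the global llama logger state while fitting.
    return llama_params_fit("model.gguf", &mparams, &cparams,
                            tensor_split.data(), overrides.data(),
                            margin, n_ctx_min, GGML_LOG_LEVEL_INFO);
}
```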
```diff
@@ -1354,7 +1369,9 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-
+    // The logger state is global so these functions are NOT thread safe.
+    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
     //
     // Performance utils
```
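A small usage sketch for the new `llama_log_get()` alongside the existing `llama_log_set()`, assuming the `ggml_log_callback` signature from ggml.h (`level`, `text`, `user_data`); as the header comment notes, the logger state is global and these calls are not thread safe:

```cpp
#include <cstdio>
#include "llama.h"

// Only forward warnings and errors from llama.cpp to stderr.
static void my_logger(enum ggml_log_level level, const char * text, void * /*user_data*/) {
    if (level >= GGML_LOG_LEVEL_WARN) {
        std::fputs(text, stderr);
    }
}

void with_custom_logging() {
    ggml_log_callback prev_cb   = nullptr;
    void *            prev_data = nullptr;
    llama_log_get(&prev_cb, &prev_data); // save the current logger

    llama_log_set(my_logger, nullptr);   // install ours
    // ... do work that produces llama.cpp log output ...

    llama_log_set(prev_cb, prev_data);   // restore the previous logger
}
```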