@fugood/llama.node 1.1.10 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +20 -2
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +174 -388
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +67 -37
- package/src/llama.cpp/common/chat.cpp +263 -2
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +5 -2
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
- package/src/llama.cpp/include/llama.h +32 -7
- package/src/llama.cpp/src/llama-adapter.cpp +101 -4
- package/src/llama.cpp/src/llama-adapter.h +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +69 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-context.cpp +92 -45
- package/src/llama.cpp/src/llama-context.h +1 -5
- package/src/llama.cpp/src/llama-graph.cpp +74 -19
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
- package/src/llama.cpp/src/llama-kv-cache.h +4 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +434 -21
- package/src/llama.cpp/src/llama-model.h +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h:

@@ -119,36 +119,149 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }

 #if defined(GGML_SIMD)
+#if defined(__ARM_FEATURE_SVE)
+
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr  = sve_register_length / 16; // 16-bit elements per register
+    const int ggml_f16_step = 8 * ggml_f16_epr;         // choose 8 SVE registers
+
+    const int np = (n & ~(ggml_f16_step - 1));
+
+    svfloat16_t sum_00 = svdup_n_f16(0.0f);
+    svfloat16_t sum_01 = svdup_n_f16(0.0f);
+    svfloat16_t sum_02 = svdup_n_f16(0.0f);
+    svfloat16_t sum_03 = svdup_n_f16(0.0f);
+
+    svfloat16_t sum_10 = svdup_n_f16(0.0f);
+    svfloat16_t sum_11 = svdup_n_f16(0.0f);
+    svfloat16_t sum_12 = svdup_n_f16(0.0f);
+    svfloat16_t sum_13 = svdup_n_f16(0.0f);
+
+    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); // 8 elements
+
+        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
+        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00 + ax1*ay1
+        ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
+        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
+
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); // next 8 elements
+
+        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
+        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
+        ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
+        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
+
+        ay3 = GGML_F16x_VEC_LOAD(y + i + 2*ggml_f16_epr, 2);
+
+        ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
+        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
+        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
+
+        ay4 = GGML_F16x_VEC_LOAD(y + i + 3*ggml_f16_epr, 3);
+
+        ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
+        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
+        ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
+        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
+
+        ay5 = GGML_F16x_VEC_LOAD(y + i + 4*ggml_f16_epr, 4);
+
+        ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
+
+        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
+        ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
+        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
+
+        ay6 = GGML_F16x_VEC_LOAD(y + i + 5*ggml_f16_epr, 5);
+
+        ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
+
+        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
+        ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
+        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
+
+        ay7 = GGML_F16x_VEC_LOAD(y + i + 6*ggml_f16_epr, 6);
+
+        ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
+
+        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
+        ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
+        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
+
+        ay8 = GGML_F16x_VEC_LOAD(y + i + 7*ggml_f16_epr, 7);
+
+        ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
+
+        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
+        ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
+        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
+    }
+
+    const int np2 = (n & ~(ggml_f16_epr - 1));
+    for (int k = np; k < np2; k += ggml_f16_epr) {
+        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+
+        svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
+        sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
+        rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
+        sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
+    }
+
+    if (np2 < n) {
+        svbool_t pg = svwhilelt_b16(np2, n);
+        svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
+        svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
+        svfloat16_t hy   = svld1_f16(pg, (const __fp16 *)(y + np2));
+
+        sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
+        sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
+    }
+    GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
+    GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
+#elif defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

     GGML_F16_VEC ax[GGML_F16_ARR];
     GGML_F16_VEC ay[GGML_F16_ARR];

     for (int i = 0; i < np; i += GGML_F16_STEP) {
         for (int j = 0; j < GGML_F16_ARR; j++) {
             ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

             for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
                 ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);

                 sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
             }
         }
     }

     // reduce sum0..sum3 to sum0
     for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
         GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
     }

     // leftovers
     for (int i = np; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+#endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
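The SVE path is vector-length-agnostic: `svcntb()` reads the register width at run time, the main loop consumes eight registers per iteration, a second loop (from `np` to `np2`) retires one register at a time, and `svwhilelt` builds a predicate so the final partial register is handled without a scalar tail. A minimal sketch of the same pattern, reduced to a plain f32 dot product; the function name and structure here are illustrative, not part of the package:

```c
#include <arm_sve.h>

// Minimal vector-length-agnostic sketch in the style of the SVE kernels above:
// a predicated f32 dot product that works for any n and any SVE register size.
static float dot_f32_sve_sketch(int n, const float * x, const float * y) {
    svfloat32_t acc = svdup_n_f32(0.0f);
    int i = 0;
    svbool_t pg = svwhilelt_b32(i, n);           // lane l is active while i + l < n
    while (svptest_any(svptrue_b32(), pg)) {
        svfloat32_t ax = svld1_f32(pg, x + i);   // inactive lanes read as zero
        svfloat32_t ay = svld1_f32(pg, y + i);
        acc = svmla_f32_m(pg, acc, ax, ay);      // acc += ax*ay on active lanes
        i  += (int) svcntw();                    // f32 lanes per register
        pg  = svwhilelt_b32(i, n);
    }
    return svaddv_f32(svptrue_b32(), acc);       // horizontal sum
}
```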
@@ -243,6 +356,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

         svst1_f32(pg, y + np2, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

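All of the RVV additions use the same strip-mining idiom: `__riscv_vsetvl_e32m8(n - i)` asks the hardware how many 32-bit lanes it will process this trip (with LMUL=8 register grouping), and the loop advances by that count, so no separate scalar tail is needed. Semantically the loop above is just the following scalar code:

```c
// Scalar reference for the RVV ggml_vec_mad_f32 path above.
// The vector loop computes exactly this, `avl` elements per iteration.
static void vec_mad_f32_ref(const int n, float * y, const float * x, const float v) {
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
}
```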
@@ -276,27 +397,112 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr  = sve_register_length / 16;
+    const int ggml_f16_step = 8 * ggml_f16_epr;
+
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+
+    const int np = (n & ~(ggml_f16_step - 1));
+
+    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ax1 = GGML_F16x_VEC_LOAD(x + i + 0*ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
+
+        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+        ax2 = GGML_F16x_VEC_LOAD(x + i + 1*ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
+
+        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+
+        ax3 = GGML_F16x_VEC_LOAD(x + i + 2*ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_LOAD(y + i + 2*ggml_f16_epr, 2);
+        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
+
+        GGML_F16x_VEC_STORE(y + i + 2*ggml_f16_epr, ay3, 2);
+
+        ax4 = GGML_F16x_VEC_LOAD(x + i + 3*ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_LOAD(y + i + 3*ggml_f16_epr, 3);
+        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
+
+        GGML_F16x_VEC_STORE(y + i + 3*ggml_f16_epr, ay4, 3);
+
+        ax5 = GGML_F16x_VEC_LOAD(x + i + 4*ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_LOAD(y + i + 4*ggml_f16_epr, 4);
+        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
+
+        GGML_F16x_VEC_STORE(y + i + 4*ggml_f16_epr, ay5, 4);
+
+        ax6 = GGML_F16x_VEC_LOAD(x + i + 5*ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_LOAD(y + i + 5*ggml_f16_epr, 5);
+        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
+
+        GGML_F16x_VEC_STORE(y + i + 5*ggml_f16_epr, ay6, 5);

+        ax7 = GGML_F16x_VEC_LOAD(x + i + 6*ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_LOAD(y + i + 6*ggml_f16_epr, 6);
+        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);

+        GGML_F16x_VEC_STORE(y + i + 6*ggml_f16_epr, ay7, 6);

+        ax8 = GGML_F16x_VEC_LOAD(x + i + 7*ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_LOAD(y + i + 7*ggml_f16_epr, 7);
+        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);

+        GGML_F16x_VEC_STORE(y + i + 7*ggml_f16_epr, ay8, 7);
     }
+    const int np2 = (n & ~(ggml_f16_epr - 1));
+    for (int k = np; k < np2; k += ggml_f16_epr) {
+        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+        ry = GGML_F16x_VEC_FMA(ry, rx, vx);

+        GGML_F16x_VEC_STORE(y + k, ry, 0);
+    }
+
+    if (np2 < n) {
+        svbool_t pg = svwhilelt_b16(np2, n);
+        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+        hy = svmad_f16_x(pg, hx, vx, hy);
+        svst1_f16(pg, (__fp16 *)(y + np2), hy);
+    }
+
+#elif defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

     GGML_F16_VEC ax[GGML_F16_ARR];
     GGML_F16_VEC ay[GGML_F16_ARR];

     for (int i = 0; i < np; i += GGML_F16_STEP) {
         for (int j = 0; j < GGML_F16_ARR; j++) {
             ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
             ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
             ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

             GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
     }

     // leftovers
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -324,6 +530,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
             y[i] += x[k][i]*v[k][0];
         }
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+        }
+        __riscv_vse32_v_f32m8(&y[i], ay, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -375,6 +591,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
     for (int i = 0; i < n; ++i) {
         y[i] = x[i]*s + b;
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -436,6 +660,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
         ay1 = svmul_f32_m(pg, ay1, vx);
         svst1_f32(pg, y + np, ay1);
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int i = 0, avl; i < n; i += avl) {
+        avl = __riscv_vsetvl_e32m8(n - i);
+        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+        __riscv_vse32_v_f32m8(&y[i], ny, avl);
+    }
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -467,25 +698,59 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {

 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+#if defined(__ARM_FEATURE_SVE)
+    const int sve_register_length = svcntb() * 8;
+    const int ggml_f16_epr  = sve_register_length / 16;
+    const int ggml_f16_step = 2 * ggml_f16_epr;
+
+    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+    const int np = (n & ~(ggml_f16_step - 1));
+    svfloat16_t ay1, ay2;
+
+    for (int i = 0; i < np; i += ggml_f16_step) {
+        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+    }
+    // leftovers
+    // at most ggml_f16_epr elements remain; apply a predicated svmul to just those
+    if (np < n) {
+        svbool_t pg = svwhilelt_b16(np, n);
+        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+        svfloat16_t out = svmul_f16_m(pg, hy, vx);
+        svst1_f16(pg, (__fp16 *)(y + np), out);
+    }
+#elif defined(__riscv_v_intrinsic)
+    // todo: RVV impl
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+#else
     const int np = (n & ~(GGML_F16_STEP - 1));

     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

     GGML_F16_VEC ay[GGML_F16_ARR];

     for (int i = 0; i < np; i += GGML_F16_STEP) {
         for (int j = 0; j < GGML_F16_ARR; j++) {
             ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
             ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

             GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
         }
     }

     // leftovers
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+#endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -737,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
 }
 #endif

-#if defined(__ARM_NEON) && defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+
+inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
+    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
+    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
+    const svfloat32_t n = svsub_f32_x(pg, z, r);
+    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
+    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
+    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
+    const svbool_t c = svacgt_n_f32(pg, n, 126);
+    const svfloat32_t u = svmul_f32_x(pg, b, b);
+    const svfloat32_t j = svmla_f32_x(pg,
+            svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
+            svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
+                        svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
+    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
+    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
+    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
+    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
+           svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
+    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
+    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
+    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
+    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
+    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
+    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
+}
+
+#elif defined(__ARM_NEON) && defined(__aarch64__)

 // adapted from arm limited optimized routine
 // the maximum error is 1.45358 plus 0.5 ulps
|
@@ -928,7 +1225,59 @@ inline static __m128 ggml_v_silu(__m128 x) {
|
|
|
928
1225
|
return _mm_div_ps(x, one_plus_exp_neg_x);
|
|
929
1226
|
}
|
|
930
1227
|
|
|
931
|
-
#
|
|
1228
|
+
#elif defined(__riscv_v_intrinsic)
|
|
1229
|
+
|
|
1230
|
+
// adapted from arm limited optimized routine
|
|
1231
|
+
// the maximum error is 1.45358 plus 0.5 ulps
|
|
1232
|
+
// numbers above 88.38 will flush to infinity
|
|
1233
|
+
// numbers beneath -103.97 will flush to zero
|
|
1234
|
+
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
|
|
1235
|
+
const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
|
|
1236
|
+
#ifdef __riscv_xtheadvector
|
|
1237
|
+
// workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
|
|
1238
|
+
vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
|
|
1239
|
+
z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
|
|
1240
|
+
#else
|
|
1241
|
+
const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
|
|
1242
|
+
#endif
|
|
1243
|
+
const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
|
|
1244
|
+
const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
|
|
1245
|
+
0x1.7f7d1cp-20f, n, vl);
|
|
1246
|
+
const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
|
|
1247
|
+
const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
|
|
1248
|
+
const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
|
|
1249
|
+
const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
|
|
1250
|
+
const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
|
|
1251
|
+
__riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
|
|
1252
|
+
__riscv_vfmacc_vv_f32m2(
|
|
1253
|
+
__riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
|
|
1254
|
+
__riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
|
|
1255
|
+
u, vl), u, vl);
|
|
1256
|
+
if (!__riscv_vcpop_m_b16(c, vl))
|
|
1257
|
+
return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
|
|
1258
|
+
const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
|
|
1259
|
+
const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
|
|
1260
|
+
const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
|
|
1261
|
+
const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
|
|
1262
|
+
const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
|
|
1263
|
+
__riscv_vfmacc_vv_f32m2(k, k, j, vl),
|
|
1264
|
+
__riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
|
|
1265
|
+
c, vl);
|
|
1266
|
+
return __riscv_vmerge_vvm_f32m2(
|
|
1267
|
+
r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
|
|
1268
|
+
__riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
|
|
1269
|
+
vl);
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
// computes silu x/(1+exp(-x)) in single precision vector
|
|
1273
|
+
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
|
|
1274
|
+
const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
|
|
1275
|
+
const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
|
|
1276
|
+
const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
|
|
1277
|
+
return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
|
|
932
1281
|
|
|
933
1282
|
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
|
934
1283
|
for (int i = 0; i < n; ++i) {
|
|
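Both new `ggml_v_expf` variants implement the same algorithm, adapted from Arm's optimized routines: write x = n·ln2 + b, recover n = round(x·log2 e) via the 0x1.8p23f shifter constant, build 2^n by sliding n into the float exponent field, and approximate exp(b) with a short polynomial. A scalar sketch of the fast path (the |n| > 126 over/underflow handling is omitted; this is an illustration, not code from the package):

```c
#include <stdint.h>
#include <string.h>

// Scalar sketch of the exp trick used by the SVE and RVV kernels above:
// exp(x) = 2^n * exp(b), with n = round(x*log2(e)) and b = x - n*ln2.
static float expf_sketch(float x) {
    const float r = 0x1.8p23f;               // 1.5*2^23: adding it rounds to int
    float z = r + x * 0x1.715476p+0f;        // low mantissa bits of z now hold n
    float n = z - r;                         // n = round(x*log2(e))
    float b = x - n * 0x1.62e4p-1f - n * 0x1.7f7d1cp-20f; // ln2 split in two parts

    uint32_t zi;
    memcpy(&zi, &z, sizeof(zi));
    uint32_t ki = (zi << 23) + 0x3f800000u;  // place n in the exponent bits: 2^n
    float k;
    memcpy(&k, &ki, sizeof(k));

    // short polynomial for exp(b) - 1, folded into k + k*j
    float u = b * b;
    float j = b * 0x1.ffffecp-1f +
              ((0x1.fffdb6p-2f + 0x1.555e66p-3f * b) +
               (0x1.573e2ep-5f + 0x1.0e4020p-7f * b) * u) * u;
    return k + k * j;
}
```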
package/src/llama.cpp/include/llama.h:

@@ -179,6 +179,14 @@ extern "C" {
         LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
     };

+    enum llama_flash_attn_type {
+        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
+        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
+        LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
+    };
+
+    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
+
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
@@ -198,7 +206,7 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted;      // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;

     typedef bool (*llama_progress_callback)(float progress, void * user_data);
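The expanded comment makes the contract explicit: a sampler may leave the candidate array unsorted, so consumers must branch on the flag rather than assume descending logit order. A small sketch under that contract (illustrative, not from the package):

```c
#include "llama.h"

// Pick the highest-logit token from a llama_token_data_array, honoring the
// `sorted` flag instead of assuming descending order.
static llama_token pick_top(const llama_token_data_array * cur_p) {
    size_t best = 0;
    if (!cur_p->sorted) {
        for (size_t i = 1; i < cur_p->size; ++i) {
            if (cur_p->data[i].logit > cur_p->data[best].logit) {
                best = i;
            }
        }
    }
    return cur_p->data[best].id; // if sorted, index 0 is already the max
}
```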
@@ -303,6 +311,7 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
+        enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention

         // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency, 0 = from model
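Together with the enum above, this field replaces the removed `bool flash_attn` (see the next hunk), turning a hard on/off switch into a three-way choice. A hypothetical migration sketch:

```c
#include <stdio.h>
#include "llama.h"

// Sketch: replacing the old `cparams.flash_attn = true` with the new field.
// LLAMA_FLASH_ATTN_TYPE_AUTO defers the decision to the runtime.
static struct llama_context_params make_ctx_params(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
    printf("flash attention: %s\n", llama_flash_attn_type_name(cparams.flash_attn_type));
    return cparams;
}
```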
@@ -329,7 +338,6 @@ extern "C" {
         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // use flash attention [EXPERIMENTAL]
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@@ -553,10 +561,32 @@ extern "C" {
             struct llama_model * model,
             const char * path_lora);

+    // Functions to access the adapter's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be freed when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);

+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens(const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...

     // Add a loaded LoRA adapter to given context
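These accessors mirror the existing `llama_model_meta_*` family, but read from a LoRA adapter's GGUF header. A sketch that dumps an adapter's metadata (buffer sizes are arbitrary choices for the example):

```c
#include <stdio.h>
#include "llama.h"

// Sketch: enumerate a loaded adapter's GGUF metadata via the new API.
// Per the comments above, each call returns the string length or -1,
// and output buffers need one extra byte for the null terminator.
static void dump_adapter_meta(const struct llama_adapter_lora * adapter) {
    const int32_t n = llama_adapter_meta_count(adapter);
    for (int32_t i = 0; i < n; ++i) {
        char key[128];
        char val[256];
        if (llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key)) >= 0 &&
            llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val)) >= 0) {
            printf("%s = %s\n", key, val);
        }
    }
}
```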
@@ -1130,11 +1160,6 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);

-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax(void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);