@fugood/llama.node 1.1.11 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +111 -1
- package/src/llama.cpp/common/chat.h +3 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +14 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +0 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +218 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +27 -4
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +62 -56
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +54 -9
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +1 -23
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +159 -1
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -119,45 +119,149 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
 
 #if defined(GGML_SIMD)
-    #if defined(
-
-
-
-
+    #if defined(__ARM_FEATURE_SVE)
+
+        const int sve_register_length = svcntb() * 8;
+        const int ggml_f16_epr = sve_register_length / 16; // running when 16
+        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+        const int np = (n & ~(ggml_f16_step - 1));
+
+        svfloat16_t sum_00 = svdup_n_f16(0.0f);
+        svfloat16_t sum_01 = svdup_n_f16(0.0f);
+        svfloat16_t sum_02 = svdup_n_f16(0.0f);
+        svfloat16_t sum_03 = svdup_n_f16(0.0f);
+
+        svfloat16_t sum_10 = svdup_n_f16(0.0f);
+        svfloat16_t sum_11 = svdup_n_f16(0.0f);
+        svfloat16_t sum_12 = svdup_n_f16(0.0f);
+        svfloat16_t sum_13 = svdup_n_f16(0.0f);
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
+
+            ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
+            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
+            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
+            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
+
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
+
+            ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
+            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
+            ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
+            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
+
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+
+            ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
+            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
+            ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
+            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
+
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+
+            ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
+            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
+            ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
+            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
+
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+
+            ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
+
+            sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
+            ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
+            sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
+
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+
+            ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
+
+            sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
+            ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
+            sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
+
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+
+            ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
+
+            sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
+            ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
+            sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
+
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+
+            ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
+
+            sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
+            ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
+            sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
         }
-    }
-    #else
-    const int np = (n & ~(GGML_F16_STEP - 1));
 
-
+        const int np2 = (n & ~(ggml_f16_epr - 1));
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
 
-
-
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
+            sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
+            rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
+            sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
+        }
 
-
-
-
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
+            svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
 
-
-
+            sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
+            sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
+        }
+        GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
+        GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
+    #elif defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        }
+    #else
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
 
-
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+            }
         }
     }
-    }
 
-
-
-
-
+    // reduce sum0..sum3 to sum0
+    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+    }
 
-
-
-
-
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
     }
-
-    #endif
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
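The SVE branch above keeps eight fp16 accumulators live (sum_00..sum_13), drains the bulk of the vector eight registers per iteration, then finishes with a one-register loop and a predicated tail before reducing. What the kernel computes is simply GGML_VEC_DOT_UNROLL fp16 dot products against a shared vector y; a minimal scalar sketch of those semantics (the helper name is illustrative and the fp32 accumulation mirrors the scalar fallback in the diff, not ggml's own code):

    // rows = GGML_VEC_DOT_UNROLL in the real kernel
    static void vec_dot_f16_unroll_ref(int n, int rows, float * sumf,
                                       const ggml_fp16_t * const * x,
                                       const ggml_fp16_t * y) {
        for (int j = 0; j < rows; ++j) {
            float acc = 0.0f;
            for (int i = 0; i < n; ++i) {
                acc += GGML_FP16_TO_FP32(x[j][i]) * GGML_FP16_TO_FP32(y[i]);
            }
            sumf[j] = acc;
        }
    }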
@@ -293,35 +397,112 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
-    #if defined(
-
-
-
-
-
-
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8;
+        const int ggml_f16_epr = sve_register_length / 16;
+        const int ggml_f16_step = 8 * ggml_f16_epr;
+
+        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+
+        const int np= (n & ~(ggml_f16_step - 1));
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
+
+            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
+
+            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
 
-
+            GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
 
-
-
+            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
 
-
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+            GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
 
-
+            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
+
+            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
+
+            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
+
+            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
+
+            GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
         }
-
+        const int np2 = (n & ~(ggml_f16_epr - 1));
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+            ry = GGML_F16x_VEC_FMA(ry, rx, vx);
 
-
-
-
-
-
+            GGML_F16x_VEC_STORE(y + k, ry, 0);
+        }
+
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
+            hy = svmad_f16_x(pg, hx, vx, hy);
+            svst1_f16(pg, (__fp16 *)(y + np2), hy);
+        }
+
+    #elif defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+        }
+    #else
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+    }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
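The operation vectorized here is an fp16 axpy: y[i] += x[i]*v, with the arithmetic routed through fp32 exactly as in the scalar fallback shown above. A self-contained sketch of the semantics (illustrative helper, not the ggml code):

    static void vec_mad_f16_ref(int n, ggml_fp16_t * y, const ggml_fp16_t * x, float v) {
        for (int i = 0; i < n; ++i) {
            // convert fp16 operands to fp32, multiply-add, convert back
            y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i]) * v);
        }
    }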
@@ -517,33 +698,59 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
-    #if defined(
-
-
-
-
-
-
-
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8;
+        const int ggml_f16_epr = sve_register_length / 16;
+        const int ggml_f16_step = 2 * ggml_f16_epr;
+
+        GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
+        const int np = (n & ~(ggml_f16_step - 1));
+        svfloat16_t ay1, ay2;
+
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_MUL(ay1, vx);
+            GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
+
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_MUL(ay2, vx);
+            GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
+        }
+        // leftovers
+        // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
+        if (np < n) {
+            svbool_t pg = svwhilelt_b16(np, n);
+            svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
+            svfloat16_t out = svmul_f16_m(pg, hy, vx);
+            svst1_f16(pg, (__fp16 *)(y + np), out);
+        }
+    #elif defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+        }
+    #else
+    const int np = (n & ~(GGML_F16_STEP - 1));
 
-
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
 
-
+    GGML_F16_VEC ay[GGML_F16_ARR];
 
-
-
-
-
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
 
-
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
     }
-    }
 
-
-
-
-
-    #endif
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+    }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
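ggml_vec_scale_f16 is the simplest of the three kernels: an in-place elementwise scale y[i] *= v, done two SVE registers per iteration with a predicated svmul for the tail. Scalar equivalent (illustrative, not the ggml code):

    static void vec_scale_f16_ref(int n, ggml_fp16_t * y, float v) {
        for (int i = 0; i < n; ++i) {
            y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) * v);
        }
    }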
@@ -795,7 +1002,39 @@ https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/sr
 }
 #endif
 
-#if defined(
+#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+
+inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
+    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
+    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
+    const svfloat32_t n = svsub_f32_x(pg, z, r);
+    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
+    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
+    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
+    const svbool_t c = svacgt_n_f32(pg, n, 126);
+    const svfloat32_t u = svmul_f32_x(pg, b, b);
+    const svfloat32_t j = svmla_f32_x(pg,
+        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
+        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
+            svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
+    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
+    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
+    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
+    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
+        svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
+    const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
+    const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
+    const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
+    const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
+    const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
+    return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
+}
+
+#elif defined(__ARM_NEON) && defined(__aarch64__)
 
 // adapted from arm limited optimized routine
 // the maximum error is 1.45358 plus 0.5 ulps
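The new SVE ggml_v_expf follows the familiar exponent-bits scheme: n ≈ round(x·log2 e) (0x1.715476p+0f is log2 e), the reduced argument b = x − n·ln 2 is formed with a hi/lo split of ln 2 (0x1.62e4p-1f and 0x1.7f7d1cp-20f), a short polynomial in b is evaluated, and 2^n is rebuilt through the float exponent field, with the svsel branches guarding large |n|. ggml_v_silu then applies the sigmoid weighting per lane. A scalar sketch of the per-lane semantics (illustrative, not the ggml code):

    #include <math.h>

    // silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
    static inline float silu_ref(float x) {
        return x / (1.0f + expf(-x));
    }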
@@ -1030,6 +1269,14 @@ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
         vl);
 }
 
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
+    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
+    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
+    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
+    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
+}
+
 #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
 
 inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
package/src/llama.cpp/include/llama.h

@@ -206,7 +206,7 @@ extern "C" {
         llama_token_data * data;
         size_t size;
         int64_t selected; // this is the index in the data array (i.e. not the token id)
-        bool sorted;
+        bool sorted; // note: do not assume the data is sorted - always check this flag
     } llama_token_data_array;
 
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
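The strengthened comment makes the contract explicit: samplers may leave the candidate array unsorted, so consumers must branch on the flag rather than assume descending-logit order. A sketch of a conforming consumer (hypothetical helper, not a llama.cpp API):

    #include "llama.h"

    // Pick the highest-logit candidate, honoring the `sorted` flag.
    static llama_token top_token(const llama_token_data_array * arr) {
        if (arr->sorted) {
            return arr->data[0].id; // sorted by logit, descending
        }
        size_t best = 0;
        for (size_t i = 1; i < arr->size; ++i) {
            if (arr->data[i].logit > arr->data[best].logit) {
                best = i;
            }
        }
        return arr->data[best].id;
    }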
@@ -583,6 +583,10 @@
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
+    // Get the invocation tokens if the current lora is an alora
+    LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
+    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
+
     // The following functions operate on a llama_context, hence the naming: llama_verb_...
 
     // Add a loaded LoRA adapter to given context
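A possible call pattern for the new aLoRA accessors (a sketch; assumes a model already loaded, using the existing llama_adapter_lora_init API, and per the implementation later in this diff a count of 0 means the adapter carries no invocation sequence):

    #include <stdio.h>
    #include "llama.h"

    static void print_alora_info(struct llama_model * model, const char * path) {
        struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
        if (adapter == NULL) {
            return;
        }
        const uint64_t n = llama_adapter_get_alora_n_invocation_tokens(adapter);
        if (n == 0) {
            printf("not an aLoRA adapter\n");
        } else {
            const llama_token * toks = llama_adapter_get_alora_invocation_tokens(adapter);
            printf("aLoRA invocation sequence (%llu tokens):", (unsigned long long) n);
            for (uint64_t i = 0; i < n; ++i) {
                printf(" %d", toks[i]);
            }
            printf("\n");
        }
        llama_adapter_lora_free(adapter);
    }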
@@ -1156,11 +1160,6 @@
     LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
     LLAMA_API struct llama_sampler * llama_sampler_init_dist  (uint32_t seed);
 
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
-    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
-        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
-
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
package/src/llama.cpp/src/llama-adapter.cpp

@@ -6,6 +6,7 @@
 
 #include <map>
 #include <cassert>
+#include <sstream>
 #include <stdexcept>
 
 // vec
@@ -215,6 +216,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
         }
 
         adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
+
+        // parse alora invocation sequence vector
+        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
+        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
+        if (kid >= 0) {
+            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
+                throw std::runtime_error("invalid gguf type for " + key);
+            }
+            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
+            if (arr_type != GGUF_TYPE_UINT32) {
+                throw std::runtime_error("invalid gguf element type for " + key);
+            }
+            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
+            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
+            adapter.alora_invocation_tokens.resize(seq_len);
+            std::copy(
+                (const llama_token *)data,
+                (const llama_token *)data + seq_len,
+                adapter.alora_invocation_tokens.begin());
+        }
     }
 
     int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
@@ -450,3 +471,15 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
 void llama_adapter_lora_free(llama_adapter_lora * adapter) {
     delete adapter;
 }
+
+uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
+    if (!adapter) {
+        return 0;
+    }
+    return adapter->alora_invocation_tokens.size();
+}
+
+const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
+    GGML_ASSERT(adapter);
+    return adapter->alora_invocation_tokens.data();
+}
package/src/llama.cpp/src/llama-adapter.h

@@ -70,6 +70,9 @@ struct llama_adapter_lora {
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
+    // activated lora (aLoRA)
+    std::vector<llama_token> alora_invocation_tokens;
+
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
package/src/llama.cpp/src/llama-arch.cpp

@@ -45,6 +45,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA2, "gemma2" },
     { LLM_ARCH_GEMMA3, "gemma3" },
     { LLM_ARCH_GEMMA3N, "gemma3n" },
+    { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_MAMBA2, "mamba2" },
@@ -236,10 +237,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
     { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
 
-    { LLM_KV_ADAPTER_TYPE,
-    { LLM_KV_ADAPTER_LORA_ALPHA,
-    { LLM_KV_ADAPTER_LORA_TASK_NAME,
-    { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
+    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+    { LLM_KV_ADAPTER_LORA_TASK_NAME, "adapter.lora.task_name" },
+    { LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, "adapter.lora.prompt_prefix" },
+    { LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, "adapter.alora.invocation_tokens" },
 
     // deprecated
     { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
@@ -1038,6 +1040,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" },
         },
     },
+    {
+        LLM_ARCH_GEMMA_EMBEDDING,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
package/src/llama.cpp/src/llama-arch.h

@@ -49,6 +49,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA2,
     LLM_ARCH_GEMMA3,
    LLM_ARCH_GEMMA3N,
+    LLM_ARCH_GEMMA_EMBEDDING,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_MAMBA2,
@@ -234,6 +235,7 @@ enum llm_kv {
     LLM_KV_ADAPTER_LORA_ALPHA,
     LLM_KV_ADAPTER_LORA_TASK_NAME,
     LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,
+    LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS,
 
     LLM_KV_POSNET_EMBEDDING_LENGTH,
     LLM_KV_POSNET_BLOCK_COUNT,