@fugood/llama.node 1.0.2 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +44 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +15 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +104 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
- package/src/llama.cpp/include/llama.h +13 -47
- package/src/llama.cpp/src/llama-arch.cpp +298 -3
- package/src/llama.cpp/src/llama-arch.h +22 -1
- package/src/llama.cpp/src/llama-batch.cpp +103 -71
- package/src/llama.cpp/src/llama-batch.h +31 -18
- package/src/llama.cpp/src/llama-chat.cpp +59 -1
- package/src/llama.cpp/src/llama-chat.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +279 -180
- package/src/llama.cpp/src/llama-graph.h +183 -122
- package/src/llama.cpp/src/llama-hparams.cpp +47 -1
- package/src/llama.cpp/src/llama-hparams.h +12 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
- package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +3373 -743
- package/src/llama.cpp/src/llama-model.h +20 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +376 -10
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h

@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
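The new `GGML_IM2COL_WORK_SIZE` cap (16 MiB) bounds the scratch buffer used when the new `ggml_compute_forward_conv_2d` path lowers a convolution to im2col plus matrix multiplication. A minimal sketch of the idea, assuming a chunked loop; the function and variable names below are illustrative, not the actual ops.cpp code:

```c
// Illustrative only: keep the im2col scratch under the 16 MiB cap by
// expanding the input in chunks of whole patches.
#include <stddef.h>

#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)

static void conv2d_chunked(int n_patches, int patch_len /* KW*KH*C */) {
    const size_t patch_bytes = (size_t) patch_len * sizeof(float);
    int chunk = (int) (GGML_IM2COL_WORK_SIZE / patch_bytes);
    if (chunk < 1) chunk = 1; // a single patch may exceed the cap
    for (int p0 = 0; p0 < n_patches; p0 += chunk) {
        // 1) im2col: expand patches [p0, p0 + chunk) into the work buffer
        // 2) GEMM: multiply the kernel matrix by the expanded patches
    }
}
```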
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h

@@ -189,7 +189,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
 #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
-#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
+#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
 #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
 #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
 #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
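The operand-order change makes the SVE macro follow the accumulator-first convention of the other `GGML_F32_VEC_FMA` backends: `svmad_f32_m(pg, op1, op2, op3)` computes `op1*op2 + op3`, so with `(pg, b, c, a)` the macro returns `a + b*c`. The vec.cpp and vec.h call sites below are updated to match. A scalar model of the corrected contract:

```c
// Scalar equivalent of GGML_F32_VEC_FMA after this change:
// FMA(acc, x, y) == acc + x*y
static inline float fma_contract(float acc, float x, float y) {
    return acc + x * y;
}
```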
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
     for (int i = 0; i < np; i += ggml_f32_step) {
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
 
         ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
         ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-        sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
+        sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
 
         ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
         ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-        sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
+        sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
 
         ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
         ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-        sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
+        sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
 
         ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
         ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-        sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
+        sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
 
         ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
         ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-        sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
+        sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
 
         ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
         ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-        sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
+        sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
 
         ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
         ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-        sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
+        sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
     }
     // leftovers
     // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
     for (int i = np; i < np2; i += ggml_f32_epr) {
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
+        sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
     }
     // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
     if (np2 < n) {
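Whatever the SIMD path, the unrolled blocks above all compute the same reduction. A scalar model of `ggml_vec_dot_f32`'s semantics (ggml's scalar fallback accumulates in `ggml_float`, i.e. double, as sketched here):

```c
// Reference: *s = sum over i in [0, n) of x[i]*y[i]
static void vec_dot_f32_ref(int n, float * s, const float * x, const float * y) {
    double sum = 0.0; // higher-precision accumulator
    for (int i = 0; i < n; ++i) {
        sum += (double) x[i] * (double) y[i];
    }
    *s = (float) sum;
}
```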
@@ -221,6 +221,9 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
     for (int i = np; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
+
+    // if you hit this, you are likely running outside the FP range
+    assert(!isnan(sumf) && !isinf(sumf));
 #else
     for (int i = 0; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
 
         GGML_F32_VEC_STORE(y + i, ay1);
 
         ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
         ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
-        ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
+        ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
 
         GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
 
         ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
         ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
-        ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
+        ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
 
         GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
 
         ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
         ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
-        ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
+        ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
 
         GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
 
         ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
         ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
-        ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
+        ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
 
         GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
 
         ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
         ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
-        ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
+        ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
 
         GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
 
         ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
         ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
-        ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
+        ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
 
         GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
 
         ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
         ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
-        ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
+        ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
 
         GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
     }
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
     for (int i = np; i < np2; i += ggml_f32_epr) {
         ax1 = GGML_F32_VEC_LOAD(x + i);
         ay1 = GGML_F32_VEC_LOAD(y + i);
-        ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
+        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
 
         GGML_F32_VEC_STORE(y + i, ay1);
     }
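The `ggml_vec_mad_f32` hunks above apply the same accumulator-first reordering; the operation itself is unchanged. Its scalar semantics, for reference:

```c
// Reference: y[i] += x[i] * v for i in [0, n)
static void vec_mad_f32_ref(int n, float * y, const float * x, float v) {
    for (int i = 0; i < n; ++i) {
        y[i] += x[i] * v;
    }
}
```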
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int ys, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
 #endif
 }
 
+inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
+#elif defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        // scalar ; TODO: Write SVE code
+        for (int i = 0; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #else
+        const int np = (n & ~(GGML_F32_STEP - 1));
+
+        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
+        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
+
+        GGML_F32_VEC ay[GGML_F32_ARR];
+
+        for (int i = 0; i < np; i += GGML_F32_STEP) {
+            for (int j = 0; j < GGML_F32_ARR; j++) {
+                ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
+
+                GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+            }
+        }
+
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            y[i] = x[i]*s + b;
+        }
+    #endif
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = x[i]*s + b;
+    }
+#endif
+}
+
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_USE_ACCELERATE)
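The new `ggml_vec_mad1_f32` computes `y[i] = x[i]*s + b` elementwise (the Accelerate path maps directly onto `vDSP_vsmsa`). A quick self-contained check of the contract, using a scalar stand-in rather than the header itself:

```c
#include <stdio.h>

// scalar stand-in with the same contract as ggml_vec_mad1_f32
static void vec_mad1_ref(int n, float * y, const float * x, float s, float b) {
    for (int i = 0; i < n; ++i) y[i] = x[i]*s + b;
}

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, y[4];
    vec_mad1_ref(4, y, x, /*s=*/2.0f, /*b=*/1.0f);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // prints: 3 5 7 9
    return 0;
}
```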
@@ -959,6 +998,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     }
 }
 
+inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+    }
+}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
+    }
+}
+#else
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
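These kernels implement gated GELU (GEGLU) variants: `y[i] = GELU(x[i]) * g[i]`, where the erf variant uses the exact GELU `0.5*v*(1 + erf(v/sqrt(2)))` (`SQRT_2_INV` is `1/sqrt(2)`) and the quick variant uses ggml's sigmoid-based approximation. A standalone sanity check of the erf variant's math:

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    float x = 1.0f, g = 2.0f;
    // exact GELU(1.0) ~= 0.841345, gated by g = 2
    float y = 0.5f * x * (1.0f + erff(x / sqrtf(2.0f))) * g;
    printf("%f\n", y); // ~1.682689
    return 0;
}
```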
@@ -71,52 +71,13 @@ extern "C" {
|
|
|
71
71
|
typedef int32_t llama_seq_id;
|
|
72
72
|
|
|
73
73
|
enum llama_vocab_type {
|
|
74
|
-
LLAMA_VOCAB_TYPE_NONE
|
|
75
|
-
LLAMA_VOCAB_TYPE_SPM
|
|
76
|
-
LLAMA_VOCAB_TYPE_BPE
|
|
77
|
-
LLAMA_VOCAB_TYPE_WPM
|
|
78
|
-
LLAMA_VOCAB_TYPE_UGM
|
|
79
|
-
LLAMA_VOCAB_TYPE_RWKV
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
// pre-tokenization types
|
|
83
|
-
enum llama_vocab_pre_type {
|
|
84
|
-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
|
85
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
|
86
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
|
87
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
|
88
|
-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
|
89
|
-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
|
90
|
-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
|
91
|
-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
|
92
|
-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
|
93
|
-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
|
94
|
-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
|
95
|
-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
|
96
|
-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
|
97
|
-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
|
98
|
-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
|
99
|
-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
|
100
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
|
101
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
|
102
|
-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
|
103
|
-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
|
104
|
-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
|
105
|
-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
|
106
|
-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
|
107
|
-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
|
108
|
-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
|
109
|
-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
|
110
|
-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
|
111
|
-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
|
112
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
|
113
|
-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
|
114
|
-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
|
115
|
-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
|
116
|
-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
|
117
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
|
118
|
-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
|
119
|
-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
|
74
|
+
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
|
75
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
|
76
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
|
77
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
|
78
|
+
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
|
79
|
+
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
|
80
|
+
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
|
|
120
81
|
};
|
|
121
82
|
|
|
122
83
|
enum llama_rope_type {
|
|
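`LLAMA_VOCAB_TYPE_PLAMO2` is the only addition here; the removed `llama_vocab_pre_type` enum appears to move out of the public header into the library internals (llama-vocab.h grows by 43 lines in this diff). A hedged usage sketch built on existing llama.h APIs, with `model` assumed to be already loaded:

```c
#include "llama.h"

// returns non-zero if the loaded model uses the new PLaMo-2 tokenizer
static int uses_plamo2(const struct llama_model * model) {
    const struct llama_vocab * vocab = llama_model_get_vocab(model);
    return llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_PLAMO2;
}
```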
@@ -374,6 +335,9 @@
         bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                          // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+                         // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
     };
 
     // model quantization parameters
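A sketch of opting out of the unified KV buffer when serving several unrelated sequences, per the comment above and PR #14363. `llama_context_default_params` and `llama_init_from_model` are existing llama.h APIs; `model` is assumed to be already loaded:

```c
#include "llama.h"

static struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_seq_max  = 8;     // several independent sequences
    cparams.kv_unified = false; // per-sequence KV buffers: can be faster when
                                // the sequences do not share a large prefix
    return llama_init_from_model(model, cparams);
}
```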
@@ -764,7 +728,7 @@
     // - lazily on next llama_decode()
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
-    DEPRECATED(void llama_kv_self_seq_div(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
@@ -1044,6 +1008,7 @@
     LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
     LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
     LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
 
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
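The new getter rounds out the special-token accessors; like the others, it presumably returns `LLAMA_TOKEN_NULL` when the vocabulary defines no such token (an assumption, mirrored in the comment below):

```c
#include "llama.h"

static int has_mask_token(const struct llama_vocab * vocab) {
    // assumed: LLAMA_TOKEN_NULL signals "no mask token", as with the
    // other special-token getters (e.g. llama_vocab_pad)
    return llama_vocab_mask(vocab) != LLAMA_TOKEN_NULL;
}
```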
@@ -1429,6 +1394,7 @@
 
         int32_t n_p_eval;
         int32_t n_eval;
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
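The new counter surfaces graph reuse in the performance stats and can be read through the existing perf API (`ctx` assumed to be a live context):

```c
#include <stdio.h>
#include "llama.h"

static void print_graph_reuse(const struct llama_context * ctx) {
    const struct llama_perf_context_data pd = llama_perf_context(ctx);
    fprintf(stderr, "n_p_eval=%d n_eval=%d n_reused=%d\n",
            pd.n_p_eval, pd.n_eval, pd.n_reused);
}
```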