cui-llama.rn 1.0.6 → 1.0.9
This diff shows the contents of publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +1 -0
- package/android/src/main/jni.cpp +2 -2
- package/cpp/common.cpp +68 -29
- package/cpp/common.h +23 -4
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +54 -21
- package/cpp/ggml-quants.c +8 -8
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +81 -12
- package/cpp/ggml.h +6 -4
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-vocab.cpp +10 -16
- package/cpp/llama-vocab.h +2 -0
- package/cpp/llama.cpp +434 -265
- package/cpp/llama.h +4 -1
- package/cpp/rn-llama.hpp +7 -6
- package/ios/RNLlamaContext.mm +1 -1
- package/jest/mock.js +3 -0
- package/package.json +1 -1
package/cpp/ggml-metal.m
CHANGED
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_COUNT
 };
 
-struct lm_ggml_metal_context {
+struct lm_ggml_backend_metal_context {
     int n_cb;
 
     id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
     bool support_simdgroup_mm;
 
     bool should_capture_next_compute;
+
+    // abort lm_ggml_metal_graph_compute if callback returns true
+    lm_ggml_abort_callback abort_callback;
+    void *                 abort_callback_data;
 };
 
 // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
+static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context));
+    struct lm_ggml_backend_metal_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     return ctx;
 }
 
-static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) {
+static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
     LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
     return nil;
 }
 
-static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) {
+static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
     for (size_t i = 0, n = 3; i < n; ++i) {
         if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
             return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 }
 
 static enum lm_ggml_status lm_ggml_metal_graph_compute(
-        struct lm_ggml_metal_context * ctx,
+        struct lm_ggml_backend_metal_context * ctx,
                struct lm_ggml_cgraph * gf) {
 
     @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
         id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
         command_buffer_builder[cb_idx] = command_buffer;
 
-        // enqueue the command buffers
-        [command_buffer enqueue];
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
     }
 
     const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 LM_GGML_ASSERT(ne00 % 4 == 0);
                 LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
 
-                //float eps;
-                //memcpy(&eps, dst->op_params, sizeof(float));
-
-                const float eps = 1e-6f; // TODO: temporarily hardcoded
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
 
                 const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
             [encoder endEncoding];
 
-            [command_buffer commit];
+            if (cb_idx < 2 || ctx->abort_callback == NULL) {
+                [command_buffer commit];
+            }
         });
 
     // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
             return LM_GGML_STATUS_FAILED;
         }
+
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+        if (!next_buffer) {
+            continue;
+        }
+
+        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+        if (next_queued) {
+            continue;
+        }
+
+        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+            LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            return LM_GGML_STATUS_ABORTED;
+        }
+
+        [next_buffer commit];
     }
 
     if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) {
 }
 
 LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     lm_ggml_metal_free(ctx);
     free(backend);
 }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_default_buffer_type(lm_ggml_backend_t backend) {
 }
 
 LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
 LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_supports_op(metal_ctx, op);
 }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
 }
 
 lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
-    struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
-
+    struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
     if (ctx == NULL) {
+        LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
 void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
 }
 
+void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = user_data;
+}
+
 bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
 void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     ctx->should_capture_next_compute = true;
 }
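Taken together, the ggml-metal.m changes add a cooperative cancellation point between command buffers: only the first two buffers are pre-enqueued when a callback is set, and each subsequent buffer is committed only after the callback declines to abort. A minimal usage sketch follows; the `lm_ggml_backend_metal_*` calls and the `LM_GGML_STATUS_ABORTED` result come from this diff, while the atomic flag and wiring are illustrative.

```cpp
// Hedged sketch: cancelling a long Metal graph compute from another thread.
// Assumes the new setter is declared in ggml-metal.h (that header gains two
// lines in this diff) and that lm_ggml_abort_callback is bool (*)(void *).
#include <atomic>
#include "ggml-metal.h"

static std::atomic<bool> g_abort_requested{false};

static bool metal_should_abort(void * /*user_data*/) {
    // returning true makes lm_ggml_metal_graph_compute stop committing
    // command buffers and return LM_GGML_STATUS_ABORTED
    return g_abort_requested.load(std::memory_order_relaxed);
}

lm_ggml_backend_t make_metal_backend_with_abort(void) {
    lm_ggml_backend_t backend = lm_ggml_backend_metal_init();
    if (backend != NULL) {
        lm_ggml_backend_metal_set_abort_callback(backend, metal_should_abort, NULL);
    }
    return backend;
}
```

Because the first two buffers are always enqueued up front, an abort takes effect from the third buffer onward; this keeps the GPU fed while bounding how much work is committed past a cancellation request.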
package/cpp/ggml-quants.c
CHANGED
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
@@ -6449,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
             // compute mask for subtraction
             vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, q3_0, 0x4, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, q3_1, 0x4, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, q3_2, 0x4, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, q3_3, 0x4, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
             m <<= 1;
 
             // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
             vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, q5_a, 16, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
             m <<= 1;
 
             vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, q5_l, 16, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
             m <<= 1;
 
             vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
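The `_m` to `_mu` renames above move these kernels to the RVV intrinsics mask-undisturbed policy: with `_mu`, inactive lanes keep the passthrough operand (passed as the second argument, `q3_0`/`q5_a` above) instead of being left undefined. A scalar model of the q3_K subtraction, with illustrative values:

```cpp
// Scalar model of __riscv_vsub_vx_i8m1_mu(mask, passthrough, src, 0x4, vl):
// active lanes become src - 0x4, inactive lanes keep the passthrough value.
#include <array>
#include <cassert>

int main() {
    std::array<signed char, 4> q3   = {1, 2, 3, 4};     // src == passthrough
    std::array<bool, 4>        mask = {true, false, true, false};

    std::array<signed char, 4> out{};
    for (int i = 0; i < 4; ++i) {
        out[i] = mask[i] ? static_cast<signed char>(q3[i] - 0x4) : q3[i];
    }
    assert((out == std::array<signed char, 4>{-3, 2, -1, 4}));
    return 0;
}
```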
package/cpp/ggml-quants.h
CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
package/cpp/ggml.c
CHANGED
@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+int lm_ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -53,6 +56,9 @@
 // disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after LM_GGML_ABORT
+#pragma warning(disable: 4702)
 #endif
 
 #if defined(_WIN32)
@@ -141,7 +147,51 @@ typedef pthread_t lm_ggml_thread_t;
 
 #include <sys/wait.h>
 
-#if defined(__linux__)
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+struct backtrace_state {
+    void ** current;
+    void ** end;
+};
+
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+    struct backtrace_state * state = (struct backtrace_state *)arg;
+    uintptr_t pc = _Unwind_GetIP(context);
+    if (pc) {
+        if (state->current == state->end) {
+            return _URC_END_OF_STACK;
+        } else {
+            *state->current++ = (void*)pc;
+        }
+    }
+    return _URC_NO_REASON;
+}
+
+static void lm_ggml_print_backtrace_symbols(void) {
+    const int max = 100;
+    void* buffer[max];
+
+    struct backtrace_state state = {buffer, buffer + max};
+    _Unwind_Backtrace(unwind_callback, &state);
+
+    int count = state.current - buffer;
+
+    for (int idx = 0; idx < count; ++idx) {
+        const void * addr = buffer[idx];
+        const char * symbol = "";
+
+        Dl_info info;
+        if (dladdr(addr, &info) && info.dli_sname) {
+            symbol = info.dli_sname;
+        }
+
+        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+    }
+}
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
@@ -436,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
 
+void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
+    // subnormals are flushed to zero on this platform
     for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -918,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
         .is_quantized             = false,
         .to_float                 = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float               = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
-        .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
         .vec_dot                  = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
         .vec_dot_type             = LM_GGML_TYPE_BF16,
         .nrows                    = 1,
@@ -2258,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
 inline static void lm_ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void lm_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void lm_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3507,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!lm_ggml_sve_cnt_b) {
+        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     lm_ggml_critical_section_end();
@@ -3661,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
         struct lm_ggml_tensor  * view_src,
         size_t                   view_offs) {
 
-    assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
+    LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+    LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
 
     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
@@ -5314,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
         struct lm_ggml_context * ctx,
         struct lm_ggml_tensor  * a,
         int                      n_groups,
+        float                    eps,
         bool                     inplace) {
 
     bool is_node = false;
@@ -5324,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    lm_ggml_set_op_params_i32(result, 0, n_groups);
+    lm_ggml_set_op_params_f32(result, 1, eps);
 
     result->op = LM_GGML_OP_GROUP_NORM;
     result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5336,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 struct lm_ggml_tensor * lm_ggml_group_norm(
         struct lm_ggml_context * ctx,
         struct lm_ggml_tensor  * a,
-        int                      n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+        int                      n_groups,
+        float                    eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
         struct lm_ggml_context * ctx,
        struct lm_ggml_tensor  * a,
-        int                      n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+        int                      n_groups,
+        float                    eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // lm_ggml_mul_mat
@@ -12035,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
 
     LM_GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
    int n_channels = src0->ne[2];
    int n_groups = dst->op_params[0];
    int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20606,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(lm_ggml_bf16_t);
-                lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+                lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case LM_GGML_TYPE_F32:
package/cpp/ggml.h
CHANGED
@@ -349,6 +349,7 @@ extern "C" {
     LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
     LM_GGML_API float          lm_ggml_bf16_to_fp32(lm_ggml_bf16_t);  // consider just doing << 16
     LM_GGML_API void           lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+    LM_GGML_API void           lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
     LM_GGML_API void           lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
 
     struct lm_ggml_object;
@@ -1139,16 +1140,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                      n_groups);
+            int                      n_groups,
+            float                    eps);
 
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                      n_groups);
+            int                      n_groups,
+            float                    eps);
 
     // a - x
     // b - dy
@@ -1455,7 +1457,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
@@ -1472,6 +1473,7 @@ extern "C" {
             int                   mode);
 
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
package/cpp/llama-impl.h
CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data);
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
package/cpp/llama-vocab.cpp
CHANGED
@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
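The `output_size` bookkeeping matters because the UGM tokenizer appends tokens while walking the input back to front, so only the tokens added by this call are in reversed order; reversing the whole vector, as before, would also scramble any tokens the caller had already placed in `output`. A scalar model of the fix:

```cpp
// Model of the fix: reverse only the suffix appended by this tokenize call.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    std::vector<int> output = {1, 2};              // pre-existing tokens
    const auto output_size = output.size();        // remembered before appending
    output.insert(output.end(), {30, 20, 10});     // appended back-to-front

    std::reverse(output.begin() + output_size, output.end());
    assert((output == std::vector<int>{1, 2, 10, 20, 30}));
    return 0;
}
```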
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) {
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
         const char * text,
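Together with the llama-vocab.h hunks below, this registers an end-of-message token (`special_eom_id`) and teaches the end-of-generation predicate to stop on it alongside EOS and EOT. A reduced model of the updated predicate, with illustrative token ids:

```cpp
// Reduced model of llama_token_is_eog_impl after this change.
#include <cassert>
#include <cstdint>

using llama_token = int32_t;

static bool is_eog(llama_token token, llama_token eos, llama_token eot, llama_token eom) {
    return token != -1 && (token == eos || token == eot || token == eom);
}

int main() {
    const llama_token eos = 2, eot = 107, eom = 108;  // illustrative ids
    assert(is_eog(108, eos, eot, eom));   // EOM now ends generation
    assert(!is_eog(-1, eos, eot, eom));   // unset special ids never match
    return 0;
}
```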
package/cpp/llama-vocab.h
CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,