cui-llama.rn 1.0.7 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +67 -34
- package/cpp/common.h +23 -8
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +55 -22
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +40 -15
- package/cpp/ggml.h +10 -6
- package/cpp/grammar-parser.cpp +3 -0
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +14 -18
- package/cpp/llama-vocab.h +4 -2
- package/cpp/llama.cpp +466 -280
- package/cpp/llama.h +10 -11
- package/cpp/rn-llama.hpp +23 -10
- package/package.json +1 -1
package/cpp/ggml-metal.m
CHANGED
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_COUNT
 };
 
-struct lm_ggml_metal_context {
+struct lm_ggml_backend_metal_context {
     int n_cb;
 
     id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
     bool support_simdgroup_mm;
 
     bool should_capture_next_compute;
+
+    // abort lm_ggml_metal_graph_compute if callback returns true
+    lm_ggml_abort_callback abort_callback;
+    void *                 abort_callback_data;
 };
 
 // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
+static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct lm_ggml_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_metal_context));
+    struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     return ctx;
 }
 
-static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) {
+static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
     LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
     return nil;
 }
 
-static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) {
+static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
     for (size_t i = 0, n = 3; i < n; ++i) {
         if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
             return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 }
 
 static enum lm_ggml_status lm_ggml_metal_graph_compute(
-        struct lm_ggml_metal_context * ctx,
+        struct lm_ggml_backend_metal_context * ctx,
         struct lm_ggml_cgraph * gf) {
 
     @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
         id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
         command_buffer_builder[cb_idx] = command_buffer;
 
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffer enqueue];
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
     }
 
     const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 LM_GGML_ASSERT(ne00 % 4 == 0);
                 LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
 
-                //float eps;
-                //memcpy(&eps, dst->op_params, sizeof(float));
-
-                const float eps = 1e-6f; // TODO: temporarily hardcoded
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
 
                 const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -2308,7 +2313,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 memcpy(&beta_fast, (int32_t *) dst->op_params +  9, sizeof(float));
                 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
 
-                const bool is_neox = mode & 2;
+                const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;
 
                 id<MTLComputePipelineState> pipeline = nil;
 
@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
         [encoder endEncoding];
 
-        [command_buffer commit];
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
     });
 
     // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
             return LM_GGML_STATUS_FAILED;
         }
+
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+        if (!next_buffer) {
+            continue;
+        }
+
+        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+        if (next_queued) {
+            continue;
+        }
+
+        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+            LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            return LM_GGML_STATUS_ABORTED;
+        }
+
+        [next_buffer commit];
     }
 
     if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t ba
 }
 
 LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     lm_ggml_metal_free(ctx);
     free(backend);
 }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa
 }
 
 LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
 LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_supports_op(metal_ctx, op);
 }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
 }
 
 lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
-    struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
-
+    struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
     if (ctx == NULL) {
+        LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
 void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
 }
 
+void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = user_data;
+}
+
 bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
 void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     ctx->should_capture_next_compute = true;
 }
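The hunks above rename the context struct and thread an abort callback through lm_ggml_metal_graph_compute: when a callback is installed, only the first two command buffers are pre-enqueued, and each later buffer is committed only after the callback declines to abort. A minimal caller-side sketch, not part of the package (my_abort_cb and g_should_abort are hypothetical names; lm_ggml_abort_callback is assumed to be the bool (*)(void *) typedef from ggml.h):

#include <atomic>
#include "ggml-metal.h" // assumed to declare lm_ggml_backend_metal_set_abort_callback (added in this release)

static std::atomic<bool> g_should_abort{false};

// Returning true makes lm_ggml_metal_graph_compute stop committing further
// command buffers and return LM_GGML_STATUS_ABORTED.
static bool my_abort_cb(void * data) {
    return static_cast<std::atomic<bool> *>(data)->load();
}

void install_abort_handler(lm_ggml_backend_t backend) {
    lm_ggml_backend_metal_set_abort_callback(backend, my_abort_cb, &g_should_abort);
}

// Another thread can then cancel an in-flight graph with:
//     g_should_abort.store(true);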
package/cpp/ggml-quants.c
CHANGED
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
package/cpp/ggml-quants.h
CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
package/cpp/ggml.c
CHANGED
@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+int lm_ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -53,6 +56,9 @@
 // disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after LM_GGML_ABORT
+#pragma warning(disable: 4702)
 #endif
 
 #if defined(_WIN32)
@@ -185,7 +191,7 @@ static void lm_ggml_print_backtrace_symbols(void) {
         fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
     }
 }
-#elif defined(__linux__)
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
@@ -480,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
 
+void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
+    // subnormals are flushed to zero on this platform
     for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -962,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
         .is_quantized             = false,
         .to_float                 = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float               = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
-        .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
         .vec_dot                  = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
         .vec_dot_type             = LM_GGML_TYPE_BF16,
         .nrows                    = 1,
@@ -2302,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
 inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void lm_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void lm_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3551,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!lm_ggml_sve_cnt_b) {
+        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     lm_ggml_critical_section_end();
@@ -3705,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
         struct lm_ggml_tensor * view_src,
         size_t                  view_offs) {
 
-    assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
+    LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+    LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
 
     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
@@ -5358,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
 
     bool is_node = false;
@@ -5368,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    lm_ggml_set_op_params_i32(result, 0, n_groups);
+    lm_ggml_set_op_params_f32(result, 1, eps);
 
     result->op = LM_GGML_OP_GROUP_NORM;
     result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5380,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 struct lm_ggml_tensor * lm_ggml_group_norm(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // lm_ggml_mul_mat
@@ -12079,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
 
     LM_GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -14069,7 +14094,7 @@ static void lm_ggml_compute_forward_rope_f32(
     float corr_dims[2];
     lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -14194,7 +14219,7 @@ static void lm_ggml_compute_forward_rope_f16(
     float corr_dims[2];
     lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -20650,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(lm_ggml_bf16_t);
-                lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+                lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case LM_GGML_TYPE_F32:
@@ -21104,7 +21129,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
             (int64_t) info->ne[2] *
             (int64_t) info->ne[3];
 
-        if (ne % lm_ggml_blck_size(info->type) != 0) {
+        if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
             fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                     __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
             fclose(file);
package/cpp/ggml.h
CHANGED
@@ -244,6 +244,8 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1
 
+#define LM_GGML_ROPE_TYPE_NEOX 2
+
 #define LM_GGUF_MAGIC "GGUF"
 
 #define LM_GGUF_VERSION 3
@@ -349,6 +351,7 @@ extern "C" {
     LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
     LM_GGML_API float          lm_ggml_bf16_to_fp32(lm_ggml_bf16_t);  // consider just doing << 16
     LM_GGML_API void           lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+    LM_GGML_API void           lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
    LM_GGML_API void           lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
 
     struct lm_ggml_object;
@@ -1139,16 +1142,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                      n_groups);
+            int                      n_groups,
+            float                    eps);
 
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
-            int                      n_groups);
+            int                      n_groups,
+            float                    eps);
 
     // a - x
     // b - dy
@@ -1451,11 +1455,10 @@ extern "C" {
             struct lm_ggml_tensor  * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
-    // if mode & 2 == 1, GPT-NeoX style
+    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & LM_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
@@ -1472,6 +1475,7 @@ extern "C" {
             int                      mode);
 
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
package/cpp/grammar-parser.cpp
CHANGED
@@ -369,6 +369,9 @@ namespace grammar_parser {
         }
         // Validate the state to ensure that all rules are defined
         for (const auto & rule : state.rules) {
+            if (rule.empty()) {
+                throw std::runtime_error("Undefined rule");
+            }
             for (const auto & elem : rule) {
                 if (elem.type == LLAMA_GRETYPE_RULE_REF) {
                     // Ensure that the rule at that location exists
package/cpp/llama-impl.h
CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
package/cpp/llama-sampling.cpp
CHANGED
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
     constexpr float bucket_low   = -10.0f;
     constexpr float bucket_high  =  10.0f;
     constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-    constexpr float bucker_inter = -bucket_low * bucket_scale;
+    constexpr float bucket_inter = -bucket_low * bucket_scale;
 
     std::vector<int> bucket_idx(candidates->size);
     std::vector<int> histo(nbuckets, 0);
 
     for (int i = 0; i < (int)candidates->size; ++i) {
         const float val = candidates->data[i].logit;
-        int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+        int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
         ib = std::max(0, std::min(nbuckets-1, ib));
         bucket_idx[i] = ib;
         ++histo[ib];
package/cpp/llama-vocab.cpp
CHANGED
@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -424,6 +410,8 @@ struct llm_tokenizer_bpe {
             };
             break;
         case LLAMA_VOCAB_PRE_TYPE_PORO:
+        case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+        case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
             regex_exprs = {
                 " ?[^(\\s|.,!?…。,、।۔،)]+",
             };
@@ -816,6 +804,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -895,7 +886,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
@@ -1444,7 +1435,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1476,11 +1468,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
     return vocab.special_pad_id;
 }
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_bos;
 }
 
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
     return vocab.tokenizer_add_eos;
 }
 
@@ -1500,6 +1492,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
         const char * text,
package/cpp/llama-vocab.h
CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -94,13 +95,14 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
 
-int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
-int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
 
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,