llama_cpp 0.3.7 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -207,7 +207,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4
 
@@ -215,6 +215,11 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
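GGUF_MAGIC is the ASCII string "GGUF" packed into a little-endian uint32 (0x47 = 'G', 0x55 = 'U', 0x46 = 'F'), so a reader can cheaply sniff a file before parsing it. A minimal sketch — the helper name is hypothetical, not part of the header:

    #include <stdint.h>
    #include <stdio.h>

    // Hypothetical helper: returns 1 if fname starts with the GGUF magic.
    // 0x46554747 is GGUF_MAGIC, i.e. the bytes 'G','G','U','F' on a
    // little-endian machine.
    static int is_gguf_file(const char * fname) {
        FILE * f = fopen(fname, "rb");
        if (!f) return 0;
        uint32_t magic = 0;
        size_t n = fread(&magic, sizeof(magic), 1, f);
        fclose(f);
        return n == 1 && magic == 0x46554747; // GGUF_MAGIC
    }

    int main(int argc, char ** argv) {
        if (argc > 1) printf("%s\n", is_gguf_file(argv[1]) ? "gguf" : "not gguf");
        return 0;
    }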
@@ -255,8 +260,9 @@
 extern "C" {
 #endif
 
-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -340,10 +346,12 @@ extern "C" {
         GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
+        GGML_OP_CONCAT,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,
 
         GGML_OP_MUL_MAT,
         GGML_OP_OUT_PROD,
@@ -369,14 +377,19 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
+        GGML_OP_UPSCALE, // nearest interpolate
+
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,
 
         GGML_OP_UNARY,
 
@@ -562,6 +575,7 @@ extern "C" {
     GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows      (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes     (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int     ggml_blck_size (enum ggml_type type);
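The comment on ggml_nbytes_pad() says it is ggml_nbytes() padded to GGML_MEM_ALIGN; the GGML_PAD macro from the earlier hunk implements exactly that round-up. A self-contained check of the arithmetic:

    #include <stdio.h>

    // GGML_PAD as defined in this header: round x up to a multiple of n
    // (n must be a power of two for the mask trick to work).
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    int main(void) {
        printf("%d\n", GGML_PAD(13, 16)); // 16: 13 bytes padded to a 16-byte boundary
        printf("%d\n", GGML_PAD(32, 16)); // 32: already aligned, unchanged
        return 0;
    }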
@@ -799,6 +813,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
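ggml_concat() joins on dim 2, so the remaining dims must agree: e.g. 16x16x320 concatenated with 16x16x640 yields 16x16x960, the usual shape pattern for U-Net skip connections in stable-diffusion. A minimal sketch, assuming the bundled ggml.h is on the include path (shapes are illustrative):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // concatenation is on dim 2: 16x16x320 ++ 16x16x640 -> 16x16x960
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 16, 320);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 16, 16, 640);
        struct ggml_tensor * c = ggml_concat(ctx, a, b); // c->ne = {16, 16, 960, 1}

        (void) c;
        ggml_free(ctx);
        return 0;
    }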
@@ -888,14 +909,15 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
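This is a small API break: ggml_norm() and ggml_norm_inplace() now take an explicit eps. Callers can keep the old behavior by passing the 1e-5 that the removed TODO said was hardcoded; a migration sketch:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { 16*1024*1024, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_tensor  * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 128);

        // was: ggml_norm(ctx, a); the removed TODO said eps was hardcoded to 1e-5
        struct ggml_tensor  * n   = ggml_norm(ctx, a, 1e-5f);

        (void) n;
        ggml_free(ctx);
        return 0;
    }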
@@ -907,6 +929,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 eps);
 
+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
     // a - x
     // b - dy
     // TODO: update with configurable eps
@@ -1207,6 +1242,15 @@ extern "C" {
             float                 freq_base,
             float                 freq_scale);
 
+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1215,7 +1259,11 @@ extern "C" {
             int                   n_past,
             int                   n_dims,
             int                   mode,
-            int                   n_ctx);
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 xpos_base,
+            bool                  xpos_down);
 
     // alibi position embedding
     // in-place, returns view(a)
@@ -1242,6 +1290,15 @@ extern "C" {
             int                   p0,  // padding
             int                   d0); // dilation
 
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
     GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1253,14 +1310,38 @@ extern "C" {
             int                   d0,
             int                   d1);
 
-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:     3    3   256  256
+    // b:    64   64   256    1
+    // res:  64   64   256    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            int                   stride);
 
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
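The shape comments above follow directly from stride == kernel size with zero padding: each spatial dim of b is divided by the matching kernel dim of a, giving the 1024/16 = 64 patch grid of the SAM image encoder. A self-contained check of that arithmetic:

    #include <stdio.h>

    int main(void) {
        int kw = 16,   kh = 16;    // kernel: a->ne[0], a->ne[1]
        int iw = 1024, ih = 1024;  // input:  b->ne[0], b->ne[1]
        // stride == kernel, padding == 0  =>  out = in / kernel
        printf("res: %d x %d\n", iw / kw, ih / kh); // res: 64 x 64
        return 0;
    }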
@@ -1287,6 +1368,13 @@ extern "C" {
             int                   p0,
             int                   p1);
 
+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
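ggml_upscale() is nearest-neighbor interpolation: every source element is replicated into a scale_factor x scale_factor block, so a WxH map becomes (W*sf)x(H*sf). A standalone CPU sketch of the replication rule (not the ggml kernel itself):

    #include <stdio.h>

    int main(void) {
        const int w = 2, h = 2, sf = 2;
        const float src[4] = { 1, 2, 3, 4 };
        for (int y = 0; y < h*sf; ++y) {
            for (int x = 0; x < w*sf; ++x) {
                printf("%.0f ", src[(y/sf)*w + (x/sf)]); // nearest source element
            }
            printf("\n");
        }
        return 0; // prints 1 1 2 2 / 1 1 2 2 / 3 3 4 4 / 3 3 4 4
    }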
@@ -1340,6 +1428,27 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_unary_op    op);
 
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
     // custom operators
 
     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1703,6 +1812,118 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT,  // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
     //
     // system info
     //
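Putting the reader side of the new gguf API together: with no_alloc = true and ctx = NULL, gguf_init_from_file() parses only the metadata, which is enough to enumerate KV pairs and tensors. A minimal sketch — "model.gguf" is a placeholder path:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (!ctx) {
            fprintf(stderr, "failed to open gguf file\n");
            return 1;
        }

        printf("version: %d, kv pairs: %d, tensors: %d\n",
               gguf_get_version(ctx), gguf_get_n_kv(ctx), gguf_get_n_tensors(ctx));

        for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
            printf("kv[%d]: %s (%s)\n", i,
                   gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
        }

        gguf_free(ctx);
        return 0;
    }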
@@ -1740,6 +1961,10 @@ extern "C" {
     typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
@@ -1747,7 +1972,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
 }
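With the new type_name/blck_size/type_size/is_quantized fields, per-type storage costs can be read straight from the traits table instead of being hardcoded. A small sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_K);
        // e.g. bytes per value = type_size / blck_size for a quantized block type
        printf("%s: blocks of %d values in %zu bytes (quantized: %d)\n",
               tt.type_name, tt.blck_size, tt.type_size, (int) tt.is_quantized);
        return 0;
    }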
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         }
         return 1/iscale;
     }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
     int weight_type = rmse_type%2;
     float sumlx = 0;
     float suml2 = 0;
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         suml2 += w*l*l;
     }
     float scale = sumlx/suml2;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
     float best = scale * sumlx;
-    for (int itry = 0; itry < 3; ++itry) {
-        iscale = 1/scale;
-        float slx = 0;
-        float sl2 = 0;
-        bool changed = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            if (l + nmax != L[i]) { changed = true; }
-            float w = weight_type == 1 ? x[i] * x[i] : 1.f;
-            slx += w*x[i]*l;
-            sl2 += w*l*l;
-        }
-        if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        sumlx = slx; suml2 = sl2;
-        scale = sumlx/suml2;
-        best = scale * sumlx;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = weight_type == 1 ? x[i]*x[i] : 1;
-            int l = L[i] - nmax;
-            float slx = sumlx - w*x[i]*l;
-            if (slx > 0) {
-                float sl2 = suml2 - w*l*l;
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                if (new_l != l) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
-                        scale = sumlx / suml2; best = scale * sumlx;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) { break; }
-    }
-    if (rmse_type < 3) {
-        return scale;
-    }
-    for (int is = -4; is <= 4; ++is) {
+    for (int is = -9; is <= 9; ++is) {
         if (is == 0) {
             continue;
         }
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
         return 1/iscale;
     }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
+    float sum_x = 0;
+    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
+        sum_x += x[i];
+        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         for (int i = 0; i < n; ++i) {
             sum += x[i] - scale*L[i];
         }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
         if (min > 0) min = 0;
         iscale = 1/scale;
         if (!did_change) break;
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
     return scale;
 }
 
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
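The heart of make_qkx2_quants() is a weighted least-squares fit of x[i] ≈ scale*l[i] + min over candidate level grids, solved through the 2x2 normal equations (the D / this_scale / this_min expressions above). A standalone sketch of just that solve, on made-up data:

    #include <stdio.h>

    int main(void) {
        const float x[4] = { 0.1f, 0.9f, 2.1f, 2.9f }; // values to approximate
        const int   l[4] = { 0, 1, 2, 3 };             // quantized levels
        const float w[4] = { 1, 1, 1, 1 };             // importance weights

        float sum_w = 0, sum_x = 0, sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int i = 0; i < 4; ++i) {
            sum_w  += w[i];
            sum_x  += w[i]*x[i];
            sum_l  += w[i]*l[i];
            sum_l2 += w[i]*l[i]*l[i];
            sum_xl += w[i]*l[i]*x[i];
        }
        float D     = sum_w*sum_l2 - sum_l*sum_l;      // 20
        float scale = (sum_w*sum_xl - sum_x*sum_l)/D;  // 0.96
        float min   = (sum_l2*sum_x - sum_l*sum_xl)/D; // 0.06
        printf("x[i] ~ %.2f * l[i] + %.2f\n", scale, min);
        return 0;
    }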
@@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float   weights[16];
     float mins[QK_K/16];
     float scales[QK_K/16];
 
@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/16; ++j) {
-        scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+        for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+        scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float   weights[32];
     float mins[QK_K/32];
     float scales[QK_K/32];
 
@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/32; ++j) {
-        scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+        //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+        float sum_x2 = 0;
+        for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+        float av_x = sqrtf(sum_x2/32);
+        for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+        scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     uint8_t L[QK_K];
     float mins[QK_K/32];
     float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
 #else
     int8_t L[QK_K];
     float scales[QK_K/16];
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/32; ++j) {
-        scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+        //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+        float sum_x2 = 0;
+        for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+        float av_x = sqrtf(sum_x2/32);
+        for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+        scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
data/ext/llama_cpp/src/llama-util.h
CHANGED
@@ -273,14 +273,16 @@ struct llama_mmap {
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
-
-
-
-
-
-
-
-
+            // Advise the kernel to preload the mapped memory
+
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
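The Windows prefetch above has a POSIX sibling in the same mmap wrapper (a madvise/posix_madvise WILLNEED hint); both are advisory readahead requests, not guarantees. A standalone POSIX sketch of the same idea:

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char ** argv) {
        if (argc < 2) return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0) return 1;
        struct stat st;
        if (fstat(fd, &st) != 0) { close(fd); return 1; }

        void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr != MAP_FAILED) {
            // advisory: ask the kernel to start reading the mapping ahead of use
            posix_madvise(addr, (size_t) st.st_size, POSIX_MADV_WILLNEED);
            munmap(addr, (size_t) st.st_size);
        }
        close(fd);
        return 0;
    }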