llama_cpp 0.3.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -207,7 +207,7 @@
|
|
207
207
|
#define GGML_MAX_PARAMS 256
|
208
208
|
#define GGML_MAX_CONTEXTS 64
|
209
209
|
#define GGML_MAX_SRC 6
|
210
|
-
#define GGML_MAX_NAME
|
210
|
+
#define GGML_MAX_NAME 64
|
211
211
|
#define GGML_MAX_OP_PARAMS 32
|
212
212
|
#define GGML_DEFAULT_N_THREADS 4
|
213
213
|
|
@@ -215,6 +215,11 @@
|
|
215
215
|
#define GGML_EXIT_SUCCESS 0
|
216
216
|
#define GGML_EXIT_ABORTED 1
|
217
217
|
|
218
|
+
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
219
|
+
#define GGUF_VERSION 1
|
220
|
+
|
221
|
+
#define GGUF_DEFAULT_ALIGNMENT 32
|
222
|
+
|
218
223
|
#define GGML_UNUSED(x) (void)(x)
|
219
224
|
|
220
225
|
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
@@ -255,8 +260,9 @@
|
|
255
260
|
extern "C" {
|
256
261
|
#endif
|
257
262
|
|
258
|
-
#
|
259
|
-
|
263
|
+
#if defined(__ARM_NEON) && defined(__CUDACC__)
|
264
|
+
typedef half ggml_fp16_t;
|
265
|
+
#elif defined(__ARM_NEON)
|
260
266
|
typedef __fp16 ggml_fp16_t;
|
261
267
|
#else
|
262
268
|
typedef uint16_t ggml_fp16_t;
|
@@ -340,10 +346,12 @@ extern "C" {
|
|
340
346
|
GGML_OP_ARGMAX,
|
341
347
|
GGML_OP_REPEAT,
|
342
348
|
GGML_OP_REPEAT_BACK,
|
349
|
+
GGML_OP_CONCAT,
|
343
350
|
GGML_OP_SILU_BACK,
|
344
351
|
GGML_OP_NORM, // normalize
|
345
352
|
GGML_OP_RMS_NORM,
|
346
353
|
GGML_OP_RMS_NORM_BACK,
|
354
|
+
GGML_OP_GROUP_NORM,
|
347
355
|
|
348
356
|
GGML_OP_MUL_MAT,
|
349
357
|
GGML_OP_OUT_PROD,
|
@@ -369,14 +377,19 @@ extern "C" {
|
|
369
377
|
GGML_OP_CLAMP,
|
370
378
|
GGML_OP_CONV_1D,
|
371
379
|
GGML_OP_CONV_2D,
|
380
|
+
GGML_OP_CONV_TRANSPOSE_2D,
|
372
381
|
GGML_OP_POOL_1D,
|
373
382
|
GGML_OP_POOL_2D,
|
374
383
|
|
384
|
+
GGML_OP_UPSCALE, // nearest interpolate
|
385
|
+
|
375
386
|
GGML_OP_FLASH_ATTN,
|
376
387
|
GGML_OP_FLASH_FF,
|
377
388
|
GGML_OP_FLASH_ATTN_BACK,
|
378
389
|
GGML_OP_WIN_PART,
|
379
390
|
GGML_OP_WIN_UNPART,
|
391
|
+
GGML_OP_GET_REL_POS,
|
392
|
+
GGML_OP_ADD_REL_POS,
|
380
393
|
|
381
394
|
GGML_OP_UNARY,
|
382
395
|
|
@@ -562,6 +575,7 @@ extern "C" {
|
|
562
575
|
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
563
576
|
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
564
577
|
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
578
|
+
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
565
579
|
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
|
566
580
|
|
567
581
|
GGML_API int ggml_blck_size (enum ggml_type type);
|
@@ -799,6 +813,13 @@ extern "C" {
|
|
799
813
|
struct ggml_tensor * a,
|
800
814
|
struct ggml_tensor * b);
|
801
815
|
|
816
|
+
// concat a and b on dim 2
|
817
|
+
// used in stable-diffusion
|
818
|
+
GGML_API struct ggml_tensor * ggml_concat(
|
819
|
+
struct ggml_context * ctx,
|
820
|
+
struct ggml_tensor * a,
|
821
|
+
struct ggml_tensor * b);
|
822
|
+
|
802
823
|
GGML_API struct ggml_tensor * ggml_abs(
|
803
824
|
struct ggml_context * ctx,
|
804
825
|
struct ggml_tensor * a);
|
@@ -888,14 +909,15 @@ extern "C" {
|
|
888
909
|
struct ggml_tensor * b);
|
889
910
|
|
890
911
|
// normalize along rows
|
891
|
-
// TODO: eps is hardcoded to 1e-5 for now
|
892
912
|
GGML_API struct ggml_tensor * ggml_norm(
|
893
913
|
struct ggml_context * ctx,
|
894
|
-
struct ggml_tensor * a
|
914
|
+
struct ggml_tensor * a,
|
915
|
+
float eps);
|
895
916
|
|
896
917
|
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
897
918
|
struct ggml_context * ctx,
|
898
|
-
struct ggml_tensor * a
|
919
|
+
struct ggml_tensor * a,
|
920
|
+
float eps);
|
899
921
|
|
900
922
|
GGML_API struct ggml_tensor * ggml_rms_norm(
|
901
923
|
struct ggml_context * ctx,
|
@@ -907,6 +929,19 @@ extern "C" {
|
|
907
929
|
struct ggml_tensor * a,
|
908
930
|
float eps);
|
909
931
|
|
932
|
+
// group normalize along ne0*ne1*n_groups
|
933
|
+
// used in stable-diffusion
|
934
|
+
// TODO: eps is hardcoded to 1e-6 for now
|
935
|
+
GGML_API struct ggml_tensor * ggml_group_norm(
|
936
|
+
struct ggml_context * ctx,
|
937
|
+
struct ggml_tensor * a,
|
938
|
+
int n_groups);
|
939
|
+
|
940
|
+
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
|
941
|
+
struct ggml_context * ctx,
|
942
|
+
struct ggml_tensor * a,
|
943
|
+
int n_groups);
|
944
|
+
|
910
945
|
// a - x
|
911
946
|
// b - dy
|
912
947
|
// TODO: update with configurable eps
|
@@ -1207,6 +1242,15 @@ extern "C" {
|
|
1207
1242
|
float freq_base,
|
1208
1243
|
float freq_scale);
|
1209
1244
|
|
1245
|
+
// xPos RoPE, in-place, returns view(a)
|
1246
|
+
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
1247
|
+
struct ggml_context * ctx,
|
1248
|
+
struct ggml_tensor * a,
|
1249
|
+
int n_past,
|
1250
|
+
int n_dims,
|
1251
|
+
float base,
|
1252
|
+
bool down);
|
1253
|
+
|
1210
1254
|
// rotary position embedding backward, i.e compute dx from dy
|
1211
1255
|
// a - dy
|
1212
1256
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
@@ -1215,7 +1259,11 @@ extern "C" {
|
|
1215
1259
|
int n_past,
|
1216
1260
|
int n_dims,
|
1217
1261
|
int mode,
|
1218
|
-
int n_ctx
|
1262
|
+
int n_ctx,
|
1263
|
+
float freq_base,
|
1264
|
+
float freq_scale,
|
1265
|
+
float xpos_base,
|
1266
|
+
bool xpos_down);
|
1219
1267
|
|
1220
1268
|
// alibi position embedding
|
1221
1269
|
// in-place, returns view(a)
|
@@ -1242,6 +1290,15 @@ extern "C" {
|
|
1242
1290
|
int p0, // padding
|
1243
1291
|
int d0); // dilation
|
1244
1292
|
|
1293
|
+
// conv_1d with padding = half
|
1294
|
+
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
1295
|
+
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
1296
|
+
struct ggml_context * ctx,
|
1297
|
+
struct ggml_tensor * a,
|
1298
|
+
struct ggml_tensor * b,
|
1299
|
+
int s,
|
1300
|
+
int d);
|
1301
|
+
|
1245
1302
|
GGML_API struct ggml_tensor * ggml_conv_2d(
|
1246
1303
|
struct ggml_context * ctx,
|
1247
1304
|
struct ggml_tensor * a,
|
@@ -1253,14 +1310,38 @@ extern "C" {
|
|
1253
1310
|
int d0,
|
1254
1311
|
int d1);
|
1255
1312
|
|
1256
|
-
|
1257
|
-
//
|
1258
|
-
|
1313
|
+
|
1314
|
+
// kernel size is a->ne[0] x a->ne[1]
|
1315
|
+
// stride is equal to kernel size
|
1316
|
+
// padding is zero
|
1317
|
+
// example:
|
1318
|
+
// a: 16 16 3 768
|
1319
|
+
// b: 1024 1024 3 1
|
1320
|
+
// res: 64 64 768 1
|
1321
|
+
// used in sam
|
1322
|
+
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
1323
|
+
struct ggml_context * ctx,
|
1324
|
+
struct ggml_tensor * a,
|
1325
|
+
struct ggml_tensor * b);
|
1326
|
+
|
1327
|
+
// kernel size is a->ne[0] x a->ne[1]
|
1328
|
+
// stride is 1
|
1329
|
+
// padding is half
|
1330
|
+
// example:
|
1331
|
+
// a: 3 3 256 256
|
1332
|
+
// b: 64 64 256 1
|
1333
|
+
// res: 64 64 256 1
|
1334
|
+
// used in sam
|
1335
|
+
GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
|
1336
|
+
struct ggml_context * ctx,
|
1337
|
+
struct ggml_tensor * a,
|
1338
|
+
struct ggml_tensor * b);
|
1339
|
+
|
1340
|
+
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
1259
1341
|
struct ggml_context * ctx,
|
1260
1342
|
struct ggml_tensor * a,
|
1261
1343
|
struct ggml_tensor * b,
|
1262
|
-
int
|
1263
|
-
int d);
|
1344
|
+
int stride);
|
1264
1345
|
|
1265
1346
|
enum ggml_op_pool {
|
1266
1347
|
GGML_OP_POOL_MAX,
|
@@ -1287,6 +1368,13 @@ extern "C" {
|
|
1287
1368
|
int p0,
|
1288
1369
|
int p1);
|
1289
1370
|
|
1371
|
+
// nearest interpolate
|
1372
|
+
// used in stable-diffusion
|
1373
|
+
GGML_API struct ggml_tensor * ggml_upscale(
|
1374
|
+
struct ggml_context * ctx,
|
1375
|
+
struct ggml_tensor * a,
|
1376
|
+
int scale_factor);
|
1377
|
+
|
1290
1378
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
1291
1379
|
struct ggml_context * ctx,
|
1292
1380
|
struct ggml_tensor * q,
|
@@ -1340,6 +1428,27 @@ extern "C" {
|
|
1340
1428
|
struct ggml_tensor * a,
|
1341
1429
|
enum ggml_unary_op op);
|
1342
1430
|
|
1431
|
+
// used in sam
|
1432
|
+
GGML_API struct ggml_tensor * ggml_get_rel_pos(
|
1433
|
+
struct ggml_context * ctx,
|
1434
|
+
struct ggml_tensor * a,
|
1435
|
+
int qh,
|
1436
|
+
int kh);
|
1437
|
+
|
1438
|
+
// used in sam
|
1439
|
+
|
1440
|
+
GGML_API struct ggml_tensor * ggml_add_rel_pos(
|
1441
|
+
struct ggml_context * ctx,
|
1442
|
+
struct ggml_tensor * a,
|
1443
|
+
struct ggml_tensor * pw,
|
1444
|
+
struct ggml_tensor * ph);
|
1445
|
+
|
1446
|
+
GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
|
1447
|
+
struct ggml_context * ctx,
|
1448
|
+
struct ggml_tensor * a,
|
1449
|
+
struct ggml_tensor * pw,
|
1450
|
+
struct ggml_tensor * ph);
|
1451
|
+
|
1343
1452
|
// custom operators
|
1344
1453
|
|
1345
1454
|
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
@@ -1703,6 +1812,118 @@ extern "C" {
|
|
1703
1812
|
|
1704
1813
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
1705
1814
|
|
1815
|
+
//
|
1816
|
+
// gguf
|
1817
|
+
//
|
1818
|
+
|
1819
|
+
enum gguf_type {
|
1820
|
+
GGUF_TYPE_UINT8 = 0,
|
1821
|
+
GGUF_TYPE_INT8 = 1,
|
1822
|
+
GGUF_TYPE_UINT16 = 2,
|
1823
|
+
GGUF_TYPE_INT16 = 3,
|
1824
|
+
GGUF_TYPE_UINT32 = 4,
|
1825
|
+
GGUF_TYPE_INT32 = 5,
|
1826
|
+
GGUF_TYPE_FLOAT32 = 6,
|
1827
|
+
GGUF_TYPE_BOOL = 7,
|
1828
|
+
GGUF_TYPE_STRING = 8,
|
1829
|
+
GGUF_TYPE_ARRAY = 9,
|
1830
|
+
GGUF_TYPE_COUNT, // marks the end of the enum
|
1831
|
+
};
|
1832
|
+
|
1833
|
+
struct gguf_context;
|
1834
|
+
|
1835
|
+
struct gguf_init_params {
|
1836
|
+
bool no_alloc;
|
1837
|
+
|
1838
|
+
// if not NULL, create a ggml_context and allocate the tensor data in it
|
1839
|
+
struct ggml_context ** ctx;
|
1840
|
+
};
|
1841
|
+
|
1842
|
+
GGML_API struct gguf_context * gguf_init_empty(void);
|
1843
|
+
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
1844
|
+
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
1845
|
+
|
1846
|
+
GGML_API void gguf_free(struct gguf_context * ctx);
|
1847
|
+
|
1848
|
+
GGML_API const char * gguf_type_name(enum gguf_type type);
|
1849
|
+
|
1850
|
+
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
1851
|
+
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
1852
|
+
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
1853
|
+
GGML_API void * gguf_get_data (struct gguf_context * ctx);
|
1854
|
+
|
1855
|
+
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
1856
|
+
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
1857
|
+
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
1858
|
+
|
1859
|
+
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
1860
|
+
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
1861
|
+
|
1862
|
+
// results are undefined if the wrong type is used for the key
|
1863
|
+
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
1864
|
+
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
1865
|
+
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
1866
|
+
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
|
1867
|
+
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
1868
|
+
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
1869
|
+
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
1870
|
+
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
1871
|
+
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
1872
|
+
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
1873
|
+
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
1874
|
+
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
1875
|
+
|
1876
|
+
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
1877
|
+
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
1878
|
+
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
1879
|
+
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
1880
|
+
|
1881
|
+
// overrides existing values or adds a new one
|
1882
|
+
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
1883
|
+
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
1884
|
+
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
1885
|
+
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
1886
|
+
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
1887
|
+
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
1888
|
+
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
1889
|
+
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
1890
|
+
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
1891
|
+
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
1892
|
+
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
|
1893
|
+
|
1894
|
+
// set or add KV pairs from another context
|
1895
|
+
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
|
1896
|
+
|
1897
|
+
// manage tensor info
|
1898
|
+
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
1899
|
+
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
1900
|
+
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
|
1901
|
+
|
1902
|
+
// writing gguf files can be done in 2 ways:
|
1903
|
+
//
|
1904
|
+
// - write the entire gguf_context to a binary file in a single pass:
|
1905
|
+
//
|
1906
|
+
// gguf_write_to_file(ctx, fname);
|
1907
|
+
//
|
1908
|
+
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
1909
|
+
//
|
1910
|
+
// FILE * f = fopen(fname, "wb");
|
1911
|
+
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
|
1912
|
+
// fwrite(f, ...);
|
1913
|
+
// void * data = gguf_meta_get_meta_data(ctx);
|
1914
|
+
// fseek(f, 0, SEEK_SET);
|
1915
|
+
// fwrite(f, data, gguf_get_meta_size(ctx));
|
1916
|
+
// free(data);
|
1917
|
+
// fclose(f);
|
1918
|
+
//
|
1919
|
+
|
1920
|
+
// write the entire context to a binary file
|
1921
|
+
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
1922
|
+
|
1923
|
+
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
1924
|
+
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
1925
|
+
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
1926
|
+
|
1706
1927
|
//
|
1707
1928
|
// system info
|
1708
1929
|
//
|
@@ -1740,6 +1961,10 @@ extern "C" {
|
|
1740
1961
|
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
1741
1962
|
|
1742
1963
|
typedef struct {
|
1964
|
+
const char * type_name;
|
1965
|
+
int blck_size;
|
1966
|
+
size_t type_size;
|
1967
|
+
bool is_quantized;
|
1743
1968
|
ggml_to_float_t to_float;
|
1744
1969
|
ggml_from_float_t from_float;
|
1745
1970
|
ggml_from_float_t from_float_reference;
|
@@ -1747,7 +1972,7 @@ extern "C" {
|
|
1747
1972
|
enum ggml_type vec_dot_type;
|
1748
1973
|
} ggml_type_traits_t;
|
1749
1974
|
|
1750
|
-
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type
|
1975
|
+
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
1751
1976
|
|
1752
1977
|
#ifdef __cplusplus
|
1753
1978
|
}
|
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
77
77
|
}
|
78
78
|
return 1/iscale;
|
79
79
|
}
|
80
|
+
bool return_early = false;
|
81
|
+
if (rmse_type < 0) {
|
82
|
+
rmse_type = -rmse_type;
|
83
|
+
return_early = true;
|
84
|
+
}
|
80
85
|
int weight_type = rmse_type%2;
|
81
86
|
float sumlx = 0;
|
82
87
|
float suml2 = 0;
|
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
89
94
|
suml2 += w*l*l;
|
90
95
|
}
|
91
96
|
float scale = sumlx/suml2;
|
97
|
+
if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
|
92
98
|
float best = scale * sumlx;
|
93
|
-
for (int
|
94
|
-
iscale = 1/scale;
|
95
|
-
float slx = 0;
|
96
|
-
float sl2 = 0;
|
97
|
-
bool changed = false;
|
98
|
-
for (int i = 0; i < n; ++i) {
|
99
|
-
int l = nearest_int(iscale * x[i]);
|
100
|
-
l = MAX(-nmax, MIN(nmax-1, l));
|
101
|
-
if (l + nmax != L[i]) { changed = true; }
|
102
|
-
float w = weight_type == 1 ? x[i] * x[i] : 1.f;
|
103
|
-
slx += w*x[i]*l;
|
104
|
-
sl2 += w*l*l;
|
105
|
-
}
|
106
|
-
if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
|
107
|
-
for (int i = 0; i < n; ++i) {
|
108
|
-
int l = nearest_int(iscale * x[i]);
|
109
|
-
L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
|
110
|
-
}
|
111
|
-
sumlx = slx; suml2 = sl2;
|
112
|
-
scale = sumlx/suml2;
|
113
|
-
best = scale * sumlx;
|
114
|
-
}
|
115
|
-
for (int itry = 0; itry < 5; ++itry) {
|
116
|
-
int n_changed = 0;
|
117
|
-
for (int i = 0; i < n; ++i) {
|
118
|
-
float w = weight_type == 1 ? x[i]*x[i] : 1;
|
119
|
-
int l = L[i] - nmax;
|
120
|
-
float slx = sumlx - w*x[i]*l;
|
121
|
-
if (slx > 0) {
|
122
|
-
float sl2 = suml2 - w*l*l;
|
123
|
-
int new_l = nearest_int(x[i] * sl2 / slx);
|
124
|
-
new_l = MAX(-nmax, MIN(nmax-1, new_l));
|
125
|
-
if (new_l != l) {
|
126
|
-
slx += w*x[i]*new_l;
|
127
|
-
sl2 += w*new_l*new_l;
|
128
|
-
if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
|
129
|
-
L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
|
130
|
-
scale = sumlx / suml2; best = scale * sumlx;
|
131
|
-
++n_changed;
|
132
|
-
}
|
133
|
-
}
|
134
|
-
}
|
135
|
-
}
|
136
|
-
if (!n_changed) { break; }
|
137
|
-
}
|
138
|
-
if (rmse_type < 3) {
|
139
|
-
return scale;
|
140
|
-
}
|
141
|
-
for (int is = -4; is <= 4; ++is) {
|
99
|
+
for (int is = -9; is <= 9; ++is) {
|
142
100
|
if (is == 0) {
|
143
101
|
continue;
|
144
102
|
}
|
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
221
179
|
return 1/iscale;
|
222
180
|
}
|
223
181
|
|
224
|
-
static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
|
182
|
+
static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
|
183
|
+
int ntry, float alpha) {
|
225
184
|
float min = x[0];
|
226
185
|
float max = x[0];
|
186
|
+
float sum_x = 0;
|
187
|
+
float sum_x2 = 0;
|
227
188
|
for (int i = 1; i < n; ++i) {
|
228
189
|
if (x[i] < min) min = x[i];
|
229
190
|
if (x[i] > max) max = x[i];
|
191
|
+
sum_x += x[i];
|
192
|
+
sum_x2 += x[i]*x[i];
|
230
193
|
}
|
231
194
|
if (max == min) {
|
232
195
|
for (int i = 0; i < n; ++i) L[i] = 0;
|
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
|
254
217
|
for (int i = 0; i < n; ++i) {
|
255
218
|
sum += x[i] - scale*L[i];
|
256
219
|
}
|
257
|
-
min = sum/n;
|
220
|
+
min = alpha*min + (1 - alpha)*sum/n;
|
258
221
|
if (min > 0) min = 0;
|
259
222
|
iscale = 1/scale;
|
260
223
|
if (!did_change) break;
|
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
|
263
226
|
return scale;
|
264
227
|
}
|
265
228
|
|
229
|
+
static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
230
|
+
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
231
|
+
float rmin, float rdelta, int nstep, bool use_mad) {
|
232
|
+
float min = x[0];
|
233
|
+
float max = x[0];
|
234
|
+
float sum_w = weights[0];
|
235
|
+
float sum_x = sum_w * x[0];
|
236
|
+
for (int i = 1; i < n; ++i) {
|
237
|
+
if (x[i] < min) min = x[i];
|
238
|
+
if (x[i] > max) max = x[i];
|
239
|
+
float w = weights[i];
|
240
|
+
sum_w += w;
|
241
|
+
sum_x += w * x[i];
|
242
|
+
}
|
243
|
+
if (min > 0) min = 0;
|
244
|
+
if (max == min) {
|
245
|
+
for (int i = 0; i < n; ++i) L[i] = 0;
|
246
|
+
*the_min = -min;
|
247
|
+
return 0.f;
|
248
|
+
}
|
249
|
+
float iscale = nmax/(max - min);
|
250
|
+
float scale = 1/iscale;
|
251
|
+
float best_mad = 0;
|
252
|
+
for (int i = 0; i < n; ++i) {
|
253
|
+
int l = nearest_int(iscale*(x[i] - min));
|
254
|
+
L[i] = MAX(0, MIN(nmax, l));
|
255
|
+
float diff = scale * L[i] + min - x[i];
|
256
|
+
diff = use_mad ? fabsf(diff) : diff * diff;
|
257
|
+
float w = weights[i];
|
258
|
+
best_mad += w * diff;
|
259
|
+
}
|
260
|
+
if (nstep < 1) {
|
261
|
+
*the_min = -min;
|
262
|
+
return scale;
|
263
|
+
}
|
264
|
+
for (int is = 0; is <= nstep; ++is) {
|
265
|
+
iscale = (rmin + rdelta*is + nmax)/(max - min);
|
266
|
+
float sum_l = 0, sum_l2 = 0, sum_xl = 0;
|
267
|
+
for (int i = 0; i < n; ++i) {
|
268
|
+
int l = nearest_int(iscale*(x[i] - min));
|
269
|
+
l = MAX(0, MIN(nmax, l));
|
270
|
+
Laux[i] = l;
|
271
|
+
float w = weights[i];
|
272
|
+
sum_l += w*l;
|
273
|
+
sum_l2 += w*l*l;
|
274
|
+
sum_xl += w*l*x[i];
|
275
|
+
}
|
276
|
+
float D = sum_w * sum_l2 - sum_l * sum_l;
|
277
|
+
if (D > 0) {
|
278
|
+
float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
|
279
|
+
float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D;
|
280
|
+
if (this_min > 0) {
|
281
|
+
this_min = 0;
|
282
|
+
this_scale = sum_xl / sum_l2;
|
283
|
+
}
|
284
|
+
float mad = 0;
|
285
|
+
for (int i = 0; i < n; ++i) {
|
286
|
+
float diff = this_scale * Laux[i] + this_min - x[i];
|
287
|
+
diff = use_mad ? fabsf(diff) : diff * diff;
|
288
|
+
float w = weights[i];
|
289
|
+
mad += w * diff;
|
290
|
+
}
|
291
|
+
if (mad < best_mad) {
|
292
|
+
for (int i = 0; i < n; ++i) {
|
293
|
+
L[i] = Laux[i];
|
294
|
+
}
|
295
|
+
best_mad = mad;
|
296
|
+
scale = this_scale;
|
297
|
+
min = this_min;
|
298
|
+
}
|
299
|
+
}
|
300
|
+
}
|
301
|
+
*the_min = -min;
|
302
|
+
return scale;
|
303
|
+
}
|
304
|
+
|
266
305
|
#if QK_K == 256
|
267
306
|
static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
|
268
307
|
if (j < 4) {
|
@@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
|
281
320
|
const int nb = k / QK_K;
|
282
321
|
|
283
322
|
uint8_t L[QK_K];
|
323
|
+
uint8_t Laux[16];
|
324
|
+
float weights[16];
|
284
325
|
float mins[QK_K/16];
|
285
326
|
float scales[QK_K/16];
|
286
327
|
|
@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
|
|
291
332
|
float max_scale = 0; // as we are deducting the min, scales are always positive
|
292
333
|
float max_min = 0;
|
293
334
|
for (int j = 0; j < QK_K/16; ++j) {
|
294
|
-
|
335
|
+
for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
|
336
|
+
scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
|
295
337
|
float scale = scales[j];
|
296
338
|
if (scale > max_scale) {
|
297
339
|
max_scale = scale;
|
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
|
637
679
|
const int nb = k / QK_K;
|
638
680
|
|
639
681
|
uint8_t L[QK_K];
|
682
|
+
uint8_t Laux[32];
|
683
|
+
float weights[32];
|
640
684
|
float mins[QK_K/32];
|
641
685
|
float scales[QK_K/32];
|
642
686
|
|
@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
|
645
689
|
float max_scale = 0; // as we are deducting the min, scales are always positive
|
646
690
|
float max_min = 0;
|
647
691
|
for (int j = 0; j < QK_K/32; ++j) {
|
648
|
-
scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j],
|
692
|
+
//scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
|
693
|
+
float sum_x2 = 0;
|
694
|
+
for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
|
695
|
+
float av_x = sqrtf(sum_x2/32);
|
696
|
+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
697
|
+
scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
|
649
698
|
float scale = scales[j];
|
650
699
|
if (scale > max_scale) {
|
651
700
|
max_scale = scale;
|
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
798
847
|
uint8_t L[QK_K];
|
799
848
|
float mins[QK_K/32];
|
800
849
|
float scales[QK_K/32];
|
850
|
+
float weights[32];
|
851
|
+
uint8_t Laux[32];
|
801
852
|
#else
|
802
853
|
int8_t L[QK_K];
|
803
854
|
float scales[QK_K/16];
|
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
|
810
861
|
float max_scale = 0; // as we are deducting the min, scales are always positive
|
811
862
|
float max_min = 0;
|
812
863
|
for (int j = 0; j < QK_K/32; ++j) {
|
813
|
-
scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j],
|
864
|
+
//scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
|
865
|
+
float sum_x2 = 0;
|
866
|
+
for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
|
867
|
+
float av_x = sqrtf(sum_x2/32);
|
868
|
+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
869
|
+
scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
|
814
870
|
float scale = scales[j];
|
815
871
|
if (scale > max_scale) {
|
816
872
|
max_scale = scale;
|