llama_cpp 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

At a glance, the bundled ggml sources change in four main areas: `GGML_MAX_NAME` grows from 48 to 64, a new GGUF container API is introduced, `ggml_norm`/`ggml_norm_inplace` gain an explicit `eps` parameter (a breaking signature change), and the k-quants reference quantizers replace the `make_qkx1_quants` search with a weighted `make_qkx2_quants`. Several new operators (`concat`, `group_norm`, `conv_transpose_2d`, `upscale`, `get_rel_pos`/`add_rel_pos`) support stable-diffusion and SAM graphs.
@@ -207,7 +207,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4
 
@@ -215,6 +215,11 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
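The new GGUF constants are enough to sniff a file on disk. A minimal sketch (our illustration, not upstream code; `is_gguf_file` is a hypothetical helper, and a little-endian host is assumed, since 0x46554747 is the bytes "GGUF" read as a little-endian u32):

```c
#include <stdint.h>
#include <stdio.h>

// Hypothetical helper: returns 1 if fname begins with the GGUF magic.
static int is_gguf_file(const char * fname) {
    FILE * f = fopen(fname, "rb");
    if (!f) {
        return 0;
    }
    uint32_t magic = 0;
    const size_t n = fread(&magic, sizeof(magic), 1, f);
    fclose(f);
    return n == 1 && magic == 0x46554747; // GGUF_MAGIC: 'G','G','U','F'
}
```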
@@ -255,8 +260,9 @@
 extern "C" {
 #endif
 
-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -340,10 +346,12 @@ extern "C" {
         GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
+        GGML_OP_CONCAT,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,
 
         GGML_OP_MUL_MAT,
         GGML_OP_OUT_PROD,
@@ -369,14 +377,19 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
+        GGML_OP_UPSCALE, // nearest interpolate
+
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,
 
         GGML_OP_UNARY,
 
@@ -562,6 +575,7 @@ extern "C" {
     GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -799,6 +813,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
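For illustration, a hedged sketch of the new op (our example, not upstream docs): both tensors must agree on every dimension except dim 2, so a channel-wise concatenation looks like this, with `concat_channels` a hypothetical helper and shapes picked to resemble a UNet skip connection:

```c
#include "ggml.h"

// Sketch: concatenate two F32 feature maps along dim 2 (channels).
struct ggml_tensor * concat_channels(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 320);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 320);
    return ggml_concat(ctx, a, b); // result: 64 x 64 x 640
}
```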
@@ -888,14 +909,15 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
@@ -907,6 +929,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 eps);
 
+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
     // a - x
     // b - dy
     // TODO: update with configurable eps
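Note that the `ggml_norm` change above is a breaking one: the epsilon that 0.3.x hardcoded to 1e-5 must now be supplied by the caller. A migration sketch (our example; `norm_migrated` is hypothetical):

```c
#include "ggml.h"

// 0.3.x:  ggml_norm(ctx, cur);   // eps fixed at 1e-5 inside ggml
// 0.4.0:  the caller passes the model's layer-norm epsilon explicitly.
struct ggml_tensor * norm_migrated(struct ggml_context * ctx, struct ggml_tensor * cur) {
    return ggml_norm(ctx, cur, 1e-5f);
}
```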
@@ -1207,6 +1242,15 @@ extern "C" {
             float                 freq_base,
             float                 freq_scale);
 
+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1215,7 +1259,11 @@ extern "C" {
             int                   n_past,
             int                   n_dims,
             int                   mode,
-            int                   n_ctx);
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 xpos_base,
+            bool                  xpos_down);
 
     // alibi position embedding
     // in-place, returns view(a)
@@ -1242,6 +1290,15 @@ extern "C" {
             int                   p0,  // padding
             int                   d0); // dilation
 
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
     GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1253,14 +1310,38 @@ extern "C" {
             int                   d0,
             int                   d1);
 
-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16    16     3   768
+    // b:   1024  1024     3     1
+    // res:   64    64   768     1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:      3     3   256   256
+    // b:     64    64   256     1
+    // res:   64    64   256     1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            int                   stride);
 
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
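A quick check of the shape comments above (our annotation): with kernel size $k$, stride $s$, and padding $p$, each spatial output dimension of a convolution is $\lfloor (n + 2p - k)/s \rfloor + 1$. For ggml_conv_2d_sk_p0, $s = k$ and $p = 0$, so

$$
n_{\text{out}} = \left\lfloor \frac{n - k}{k} \right\rfloor + 1 = \frac{n}{k} \quad \text{when } k \mid n,
$$

hence the 16x16 kernel over a 1024x1024 input gives 1024/16 = 64 per side, matching the `res: 64 64 768 1` example. For ggml_conv_2d_s1_ph, $s = 1$ with half padding preserves the spatial size, as in the `64 64 256 1` example.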
@@ -1287,6 +1368,13 @@ extern "C" {
             int                   p0,
             int                   p1);
 
+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
@@ -1340,6 +1428,27 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_unary_op    op);
 
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
     // custom operators
 
     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1703,6 +1812,118 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT, // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
 //
 // system info
 //
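To make the read path concrete, a hedged sketch (our example, not upstream code; `dump_gguf_metadata` is a hypothetical helper) that dumps a model file's metadata using only the functions declared above:

```c
#include <stdio.h>
#include "ggml.h"

// Sketch: open a GGUF file without loading tensor data, then list
// its KV metadata and tensor directory. Error handling is minimal.
void dump_gguf_metadata(const char * fname) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,  // metadata only; do not allocate tensor data
        /*.ctx      =*/ NULL,
    };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", fname);
        return;
    }
    printf("gguf version: %d\n", gguf_get_version(ctx));
    const int n_kv = gguf_get_n_kv(ctx);
    for (int i = 0; i < n_kv; ++i) {
        printf("%-40s : %s\n", gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
    }
    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        printf("tensor %3d: %s @ offset %zu\n", i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
    }
    gguf_free(ctx);
}
```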
@@ -1740,6 +1961,10 @@ extern "C" {
     typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
+        const char * type_name;
+        int          blck_size;
+        size_t       type_size;
+        bool         is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
@@ -1747,7 +1972,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
 }
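The type-traits struct now carries the block geometry directly. A hedged sketch (our example; `row_size_example` is hypothetical) of using the new fields:

```c
#include "ggml.h"

// Sketch: compute how many bytes a row of n elements occupies for a
// given type. For quantized types, n should be a multiple of blck_size.
size_t row_size_example(enum ggml_type type, int n) {
    ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
    return (size_t)(n / traits.blck_size) * traits.type_size;
}
```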
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         }
         return 1/iscale;
     }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
     int weight_type = rmse_type%2;
     float sumlx = 0;
     float suml2 = 0;
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         suml2 += w*l*l;
     }
     float scale = sumlx/suml2;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
     float best = scale * sumlx;
-    for (int itry = 0; itry < 3; ++itry) {
-        iscale = 1/scale;
-        float slx = 0;
-        float sl2 = 0;
-        bool changed = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            if (l + nmax != L[i]) { changed = true; }
-            float w = weight_type == 1 ? x[i] * x[i] : 1.f;
-            slx += w*x[i]*l;
-            sl2 += w*l*l;
-        }
-        if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        sumlx = slx; suml2 = sl2;
-        scale = sumlx/suml2;
-        best = scale * sumlx;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = weight_type == 1 ? x[i]*x[i] : 1;
-            int l = L[i] - nmax;
-            float slx = sumlx - w*x[i]*l;
-            if (slx > 0) {
-                float sl2 = suml2 - w*l*l;
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                if (new_l != l) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
-                        scale = sumlx / suml2; best = scale * sumlx;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) { break; }
-    }
-    if (rmse_type < 3) {
-        return scale;
-    }
-    for (int is = -4; is <= 4; ++is) {
+    for (int is = -9; is <= 9; ++is) {
         if (is == 0) {
             continue;
         }
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+                              int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
+    float sum_x = 0;
+    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
+        sum_x += x[i];
+        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         for (int i = 0; i < n; ++i) {
             sum += x[i] - scale*L[i];
         }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
         if (min > 0) min = 0;
         iscale = 1/scale;
         if (!did_change) break;
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
     return scale;
 }
 
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+                              uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+                              float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l  += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min   = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
320
  const int nb = k / QK_K;
282
321
 
283
322
  uint8_t L[QK_K];
323
+ uint8_t Laux[16];
324
+ float weights[16];
284
325
  float mins[QK_K/16];
285
326
  float scales[QK_K/16];
286
327
 
@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
291
332
  float max_scale = 0; // as we are deducting the min, scales are always positive
292
333
  float max_min = 0;
293
334
  for (int j = 0; j < QK_K/16; ++j) {
294
- scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
335
+ for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
336
+ scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
295
337
  float scale = scales[j];
296
338
  if (scale > max_scale) {
297
339
  max_scale = scale;
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
637
679
  const int nb = k / QK_K;
638
680
 
639
681
  uint8_t L[QK_K];
682
+ uint8_t Laux[32];
683
+ float weights[32];
640
684
  float mins[QK_K/32];
641
685
  float scales[QK_K/32];
642
686
 
@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
645
689
  float max_scale = 0; // as we are deducting the min, scales are always positive
646
690
  float max_min = 0;
647
691
  for (int j = 0; j < QK_K/32; ++j) {
648
- scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
692
+ //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
693
+ float sum_x2 = 0;
694
+ for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
695
+ float av_x = sqrtf(sum_x2/32);
696
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
697
+ scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
649
698
  float scale = scales[j];
650
699
  if (scale > max_scale) {
651
700
  max_scale = scale;
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
798
847
  uint8_t L[QK_K];
799
848
  float mins[QK_K/32];
800
849
  float scales[QK_K/32];
850
+ float weights[32];
851
+ uint8_t Laux[32];
801
852
  #else
802
853
  int8_t L[QK_K];
803
854
  float scales[QK_K/16];
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
810
861
  float max_scale = 0; // as we are deducting the min, scales are always positive
811
862
  float max_min = 0;
812
863
  for (int j = 0; j < QK_K/32; ++j) {
813
- scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
864
+ //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
865
+ float sum_x2 = 0;
866
+ for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
867
+ float av_x = sqrtf(sum_x2/32);
868
+ for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
869
+ scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
814
870
  float scale = scales[j];
815
871
  if (scale > max_scale) {
816
872
  max_scale = scale;