llama_cpp 0.3.8 → 0.4.0

--- ggml.h (llama_cpp 0.3.8)
+++ ggml.h (llama_cpp 0.4.0)
@@ -207,7 +207,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4
 
@@ -215,6 +215,11 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
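
The new GGUF_DEFAULT_ALIGNMENT works together with the existing GGML_PAD macro, which rounds a size up to the next multiple of n (n must be a power of two). A small illustration, not part of the diff:

    #include <stdio.h>
    #include <stddef.h>

    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
    #define GGUF_DEFAULT_ALIGNMENT 32

    int main(void) {
        size_t offset = 100;
        // (100 + 31) & ~31 rounds up to the next 32-byte boundary
        printf("%zu\n", (size_t) GGML_PAD(offset, GGUF_DEFAULT_ALIGNMENT)); // prints 128
        return 0;
    }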
@@ -255,8 +260,9 @@
 extern "C" {
 #endif
 
-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -340,10 +346,12 @@ extern "C" {
         GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
+        GGML_OP_CONCAT,
         GGML_OP_SILU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,
 
         GGML_OP_MUL_MAT,
         GGML_OP_OUT_PROD,
@@ -369,14 +377,19 @@ extern "C" {
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
+        GGML_OP_UPSCALE, // nearest interpolate
+
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,
 
         GGML_OP_UNARY,
 
@@ -562,6 +575,7 @@ extern "C" {
     GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -799,6 +813,13 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     GGML_API struct ggml_tensor * ggml_abs(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
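
A minimal usage sketch for the new ggml_concat; the context setup and tensor shapes are illustrative, not taken from the diff. Per the comment, the inputs are stacked along dim 2, so they must match on the other dimensions:

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 4);
    struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 8);

    // result is 64 x 64 x 12: a and b stacked along dim 2
    struct ggml_tensor * c = ggml_concat(ctx, a, b);

    ggml_free(ctx);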
@@ -888,14 +909,15 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
 
     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,
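
Since eps is no longer hardcoded, existing callers of ggml_norm need one extra argument; a migration sketch (1e-5f reproduces the value the removed TODO comment said was hardcoded):

    // 0.3.8: struct ggml_tensor * n = ggml_norm(ctx, x);
    // 0.4.0: eps is now explicit
    struct ggml_tensor * n = ggml_norm(ctx, x, 1e-5f);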
@@ -907,6 +929,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 eps);
 
+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
     // a - x
     // b - dy
     // TODO: update with configurable eps
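
A short usage sketch for the new ggml_group_norm; the shape and group count are illustrative (per the comment above, eps is fixed at 1e-6):

    struct ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 64, 256);
    struct ggml_tensor * g = ggml_group_norm(ctx, x, 32); // normalize in 32 channel groups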
@@ -1207,6 +1242,15 @@ extern "C" {
             float                 freq_base,
             float                 freq_scale);
 
+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1215,7 +1259,11 @@ extern "C" {
             int                   n_past,
             int                   n_dims,
             int                   mode,
-            int                   n_ctx);
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 xpos_base,
+            bool                  xpos_down);
 
     // alibi position embedding
     // in-place, returns view(a)
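
Callers of ggml_rope_back must now supply the four new arguments. A hedged migration sketch: 10000.0f and 1.0f are the conventional RoPE frequency defaults (an assumption, not stated in the diff), and xpos_base = 0.0f with xpos_down = false leaves the new xPos path disabled:

    struct ggml_tensor * dx = ggml_rope_back(ctx, dy, n_past, n_dims, mode, n_ctx,
                                             10000.0f, 1.0f, 0.0f, false);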
@@ -1242,6 +1290,15 @@ extern "C" {
             int                   p0, // padding
             int                   d0); // dilation
 
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
     GGML_API struct ggml_tensor * ggml_conv_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1253,14 +1310,38 @@ extern "C" {
             int                   d0,
             int                   d1);
 
-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:      3    3   256  256
+    // b:     64   64   256    1
+    // res:   64   64   256    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            int                   stride);
 
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
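
ggml_conv_1d_ph moved within the header but its documented meaning is unchanged; spelled out using the names from its declaration, the alias comment says these two calls are equivalent:

    // per the header comment: padding = half the kernel width
    struct ggml_tensor * y1 = ggml_conv_1d_ph(ctx, a, b, s, d);
    struct ggml_tensor * y2 = ggml_conv_1d   (ctx, a, b, s, a->ne[0]/2, d);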
@@ -1287,6 +1368,13 @@ extern "C" {
             int                   p0,
             int                   p1);
 
+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
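
A minimal sketch of the new ggml_upscale (nearest-neighbor interpolation); the shape is illustrative, not taken from the diff:

    struct ggml_tensor * x  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 32, 32, 64);
    struct ggml_tensor * x2 = ggml_upscale(ctx, x, 2); // spatial dims doubled: 64 x 64 x 64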
@@ -1340,6 +1428,27 @@ extern "C" {
             struct ggml_tensor  * a,
             enum ggml_unary_op    op);
 
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
     // custom operators
 
     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1703,6 +1812,118 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT, // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
     //
     // system info
     //
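
A minimal sketch of reading a GGUF file with the new API; the file name and key are illustrative, not taken from the diff, and error handling is minimal:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_context * meta = NULL;
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ &meta };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (!ctx) return 1;

        printf("version: %d, tensors: %d\n", gguf_get_version(ctx), gguf_get_n_tensors(ctx));

        // look up a key/value pair by name (the key is hypothetical)
        int i = gguf_find_key(ctx, "general.name");
        if (i >= 0 && gguf_get_kv_type(ctx, i) == GGUF_TYPE_STRING) {
            printf("name: %s\n", gguf_get_val_str(ctx, i));
        }

        gguf_free(ctx);
        ggml_free(meta);
        return 0;
    }

Note that the two-pass write in the header comment above is pseudocode: its fwrite argument order does not match the C standard library signature, and gguf_meta_get_meta_data is not among the declared functions (gguf_get_meta_data fills a caller-provided buffer). A compilable sketch against the API actually declared above, with error handling omitted and the tensor-data write elided:

    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    void write_gguf_two_pass(struct gguf_context * ctx, const char * fname) {
        FILE * f = fopen(fname, "wb");

        // pass 1: leave room for the meta data, then write the tensor data
        fseek(f, (long) gguf_get_meta_size(ctx), SEEK_SET);
        // ... fwrite() each tensor's data here, padded per gguf_get_alignment() ...

        // pass 2: go back and fill in the meta data (header, kv pairs, tensor info)
        size_t meta_size = gguf_get_meta_size(ctx);
        void * meta = malloc(meta_size);
        gguf_get_meta_data(ctx, meta);
        fseek(f, 0, SEEK_SET);
        fwrite(meta, 1, meta_size, f);

        free(meta);
        fclose(f);
    }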
@@ -1740,6 +1961,10 @@ extern "C" {
     typedef void (*ggml_vec_dot_t)  (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
@@ -1747,7 +1972,7 @@ extern "C" {
         enum ggml_type    vec_dot_type;
     } ggml_type_traits_t;
 
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
 }
--- k_quants.c (llama_cpp 0.3.8)
+++ k_quants.c (llama_cpp 0.4.0)
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         }
         return 1/iscale;
     }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
     int weight_type = rmse_type%2;
     float sumlx = 0;
     float suml2 = 0;
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         suml2 += w*l*l;
     }
     float scale = sumlx/suml2;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
     float best = scale * sumlx;
-    for (int itry = 0; itry < 3; ++itry) {
-        iscale = 1/scale;
-        float slx = 0;
-        float sl2 = 0;
-        bool changed = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            if (l + nmax != L[i]) { changed = true; }
-            float w = weight_type == 1 ? x[i] * x[i] : 1.f;
-            slx += w*x[i]*l;
-            sl2 += w*l*l;
-        }
-        if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        sumlx = slx; suml2 = sl2;
-        scale = sumlx/suml2;
-        best = scale * sumlx;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = weight_type == 1 ? x[i]*x[i] : 1;
-            int l = L[i] - nmax;
-            float slx = sumlx - w*x[i]*l;
-            if (slx > 0) {
-                float sl2 = suml2 - w*l*l;
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                if (new_l != l) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
-                        scale = sumlx / suml2; best = scale * sumlx;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) { break; }
-    }
-    if (rmse_type < 3) {
-        return scale;
-    }
-    for (int is = -4; is <= 4; ++is) {
+    for (int is = -9; is <= 9; ++is) {
         if (is == 0) {
             continue;
         }
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
+    float sum_x = 0;
+    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
+        sum_x += x[i];
+        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         for (int i = 0; i < n; ++i) {
            sum += x[i] - scale*L[i];
         }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
         if (min > 0) min = 0;
         iscale = 1/scale;
         if (!did_change) break;
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
     return scale;
 }
 
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min   = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
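
For readers of the new make_qkx2_quants: this_scale and this_min are the closed-form minimizers of the weighted least-squares fit x_i \approx s l_i + m (a derivation for context, not part of the diff). Minimizing

    E(s, m) = \sum_i w_i (s l_i + m - x_i)^2

gives the normal equations

    s \sum_i w_i l_i^2 + m \sum_i w_i l_i = \sum_i w_i x_i l_i
    s \sum_i w_i l_i   + m \sum_i w_i     = \sum_i w_i x_i

and, with D = \sum_i w_i \cdot \sum_i w_i l_i^2 - (\sum_i w_i l_i)^2, Cramer's rule yields

    s = (\sum w_i \cdot \sum w_i x_i l_i - \sum w_i x_i \cdot \sum w_i l_i) / D
    m = (\sum w_i l_i^2 \cdot \sum w_i x_i - \sum w_i l_i \cdot \sum w_i x_i l_i) / D

which are exactly the sum_w, sum_l, sum_l2, sum_x, sum_xl combinations in the code; the D > 0 check skips degenerate grid steps, and the this_min > 0 branch clamps the offset to be non-positive and refits the scale alone.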
@@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float   weights[16];
     float mins[QK_K/16];
     float scales[QK_K/16];
 
@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
             float scale = scales[j];
             if (scale > max_scale) {
                 max_scale = scale;
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float   weights[32];
     float mins[QK_K/32];
     float scales[QK_K/32];
 
@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
             float scale = scales[j];
             if (scale > max_scale) {
                 max_scale = scale;
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     uint8_t L[QK_K];
     float mins[QK_K/32];
     float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
 #else
     int8_t L[QK_K];
     float scales[QK_K/16];
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
             float scale = scales[j];
             if (scale > max_scale) {
                 max_scale = scale;