@fugood/llama.node 0.0.1-alpha.4 → 0.2.0

This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (84)
  1. package/CMakeLists.txt +42 -7
  2. package/README.md +10 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/lib/binding.js +1 -1
  12. package/lib/binding.ts +16 -2
  13. package/lib/index.ts +2 -2
  14. package/package.json +15 -3
  15. package/src/DetokenizeWorker.cpp +22 -0
  16. package/src/DetokenizeWorker.h +19 -0
  17. package/src/EmbeddingWorker.cpp +46 -0
  18. package/src/EmbeddingWorker.h +23 -0
  19. package/src/LlamaCompletionWorker.cpp +5 -1
  20. package/src/LlamaCompletionWorker.h +4 -0
  21. package/src/LlamaContext.cpp +80 -1
  22. package/src/LlamaContext.h +3 -0
  23. package/src/TokenizeWorker.cpp +26 -0
  24. package/src/TokenizeWorker.h +23 -0
  25. package/src/common.hpp +12 -7
  26. package/src/llama.cpp/CMakeLists.txt +13 -7
  27. package/src/llama.cpp/common/common.cpp +221 -173
  28. package/src/llama.cpp/common/common.h +19 -8
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/log.h +2 -2
  31. package/src/llama.cpp/common/sampling.cpp +17 -1
  32. package/src/llama.cpp/common/sampling.h +28 -20
  33. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  36. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  39. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  41. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  42. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  43. package/src/llama.cpp/examples/main/main.cpp +10 -8
  44. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  45. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  47. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  48. package/src/llama.cpp/examples/server/server.cpp +97 -86
  49. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  50. package/src/llama.cpp/ggml-backend.c +7 -5
  51. package/src/llama.cpp/ggml-impl.h +339 -4
  52. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  53. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  54. package/src/llama.cpp/ggml-quants.c +302 -293
  55. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  56. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  57. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  58. package/src/llama.cpp/ggml.c +1469 -116
  59. package/src/llama.cpp/ggml.h +37 -7
  60. package/src/llama.cpp/llama.cpp +969 -432
  61. package/src/llama.cpp/llama.h +46 -14
  62. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  63. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  64. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  65. package/src/llama.cpp/requirements.txt +1 -0
  66. package/src/llama.cpp/sgemm.cpp +134 -103
  67. package/src/llama.cpp/sgemm.h +4 -2
  68. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  70. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  71. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  72. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  73. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  74. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  75. package/src/llama.cpp/unicode-data.cpp +1188 -656
  76. package/src/llama.cpp/unicode-data.h +4 -3
  77. package/src/llama.cpp/unicode.cpp +590 -49
  78. package/src/llama.cpp/unicode.h +6 -3
  79. package/bin/win32/arm64/llama-node.node +0 -0
  80. package/bin/win32/arm64/node.lib +0 -0
  81. package/bin/win32/x64/llama-node.node +0 -0
  82. package/bin/win32/x64/node.lib +0 -0
  83. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  84. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/examples/perplexity/perplexity.cpp
@@ -216,17 +216,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
 }
 
 struct kl_divergence_result {
-    double sum_nll = 0;
-    double sum_nll2 = 0;
-    double sum_kld = 0;
-    double sum_kld2 = 0;
-    double sum_nll_diff = 0;
-    double sum_nll_diff2 = 0;
-    size_t n_same_top = 0;
-    size_t count = 0;
+    double sum_nll = 0.0;
+    double sum_nll2 = 0.0;
+    double sum_nll_base = 0.0;
+    double sum_nll_base2 = 0.0;
+    double sum_nll_nll_base = 0.0;
+    double sum_kld = 0.0;
+    double sum_kld2 = 0.0;
+    double sum_p_diff = 0.0;
+    double sum_p_diff2 = 0.0;
+    double sum_p_diff4 = 0.0;
+    float max_p_diff = 0.0f;
+    size_t n_same_top = 0.0;
+    size_t count = 0.0;
 };
 
-static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
     float max_logit = logits[0];
     int imax = 0;
     for (int i = 1; i < n_vocab; ++i) {
@@ -244,12 +249,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
     const float scale = d[0];
     const float min_log_prob = d[1];
     base_log_prob += 4;
-    float nll = max_logit + log_sum_exp - logits[tok];
+
+    const float nll = max_logit + log_sum_exp - logits[tok];
     kld.sum_nll += nll;
     kld.sum_nll2 += nll*nll;
-    nll += (scale*base_log_prob[tok] + min_log_prob);
-    kld.sum_nll_diff += nll;
-    kld.sum_nll_diff2 += nll*nll;
+
+    const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
+    kld.sum_nll_base += nll_base;
+    kld.sum_nll_base2 += nll_base*nll_base;
+
+    kld.sum_nll_nll_base += nll*nll_base;
+
     max_logit += log_sum_exp;
     double sum = 0;
     int imax_base = -1;
@@ -269,34 +279,50 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
     kld.sum_kld2 += sum*sum;
     ++kld.count;
     if (imax == imax_base) ++kld.n_same_top;
-    return sum;
+
+    const float p_base = expf(-nll_base);
+    const float p = expf(-nll);
+    const float p_diff = p - p_base;
+    kld.sum_p_diff += p_diff;
+    const double p_diff2 = p_diff*p_diff;
+    kld.sum_p_diff2 += p_diff2;
+    kld.sum_p_diff4 += p_diff2*p_diff2;
+    kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
+
+    return std::make_pair(sum, p_diff);
 }
 
 static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
                            std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
-                           float * kld_values) {
+                           float * kld_values, float * p_diff_values) {
     std::mutex mutex;
     const int nv = 2*((n_vocab + 1)/2) + 4;
     int counter = 0;
-    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
         kl_divergence_result local_kld;
         while (true) {
             std::unique_lock<std::mutex> lock(mutex);
             int i = counter++;
             if (i >= n_token) {
-                kld.sum_nll += local_kld.sum_nll;
-                kld.sum_nll2 += local_kld.sum_nll2;
-                kld.sum_kld += local_kld.sum_kld;
-                kld.sum_kld2 += local_kld.sum_kld2;
-                kld.sum_nll_diff += local_kld.sum_nll_diff;
-                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
-                kld.n_same_top += local_kld.n_same_top;
-                kld.count += local_kld.count;
+                kld.sum_nll += local_kld.sum_nll;
+                kld.sum_nll2 += local_kld.sum_nll2;
+                kld.sum_nll_base += local_kld.sum_nll_base;
+                kld.sum_nll_base2 += local_kld.sum_nll_base2;
+                kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
+                kld.sum_kld += local_kld.sum_kld;
+                kld.sum_kld2 += local_kld.sum_kld2;
+                kld.sum_p_diff += local_kld.sum_p_diff;
+                kld.sum_p_diff2 += local_kld.sum_p_diff2;
+                kld.sum_p_diff4 += local_kld.sum_p_diff4;
+                kld.n_same_top += local_kld.n_same_top;
+                kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff);
+                kld.count += local_kld.count;
                 break;
             }
             lock.unlock();
-            double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
-            kld_values[i] = (float)v;
+            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            kld_values[i] = (float)v.first;
+            p_diff_values[i] = v.second;
         }
     };
     for (auto & w : workers) {
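The reworked log_softmax above now returns a pair: the per-token KL divergence contribution (as before) plus Δp, the change in probability of the observed token between the base and quantized models. As a minimal self-contained sketch of the same two quantities computed from raw logit vectors (the helper name and std::vector interface are illustrative, not code from the package):

    #include <algorithm>
    #include <cmath>
    #include <utility>
    #include <vector>

    // KL(base || quantized) for one position, plus Δp for the observed token tok.
    static std::pair<double, float> kl_and_p_diff(const std::vector<float> & logits_base,
                                                  const std::vector<float> & logits_q, int tok) {
        auto softmax = [](const std::vector<float> & l) {
            const float maxl = *std::max_element(l.begin(), l.end()); // subtract max for numerical stability
            std::vector<double> p(l.size());
            double sum = 0.0;
            for (size_t i = 0; i < l.size(); ++i) { p[i] = std::exp(l[i] - maxl); sum += p[i]; }
            for (double & v : p) { v /= sum; }
            return p;
        };
        const std::vector<double> p_base = softmax(logits_base);
        const std::vector<double> p_q    = softmax(logits_q);
        double kld = 0.0; // this is what gets accumulated into kld.sum_kld
        for (size_t i = 0; i < p_base.size(); ++i) {
            kld += p_base[i] * std::log(p_base[i] / p_q[i]);
        }
        const float p_diff = (float)(p_q[tok] - p_base[tok]); // stored into p_diff_values[i]
        return std::make_pair(kld, p_diff);
    }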
@@ -1711,7 +1737,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
 
     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
-    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
     std::vector<float> logits;
     if (num_batches > 1) {
         logits.reserve(n_ctx * n_vocab);
@@ -1728,9 +1755,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
         return std::make_pair(f, df);
     };
+    auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
+        if (count < 10) {
+            return 0.0;
+        }
+        double var = sumab/count - (suma/count)*(sumb/count);
+        var /= count - 1;
+        return var;
+    };
 
     kl_divergence_result kld;
-    auto kld_ptr = kld_values.data();
+    auto kld_ptr = kld_values.data();
+    auto p_diff_ptr = p_diff_values.data();
 
     for (int i = 0; i < n_chunk; ++i) {
         const int start = i * n_ctx;
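The covariance lambda added here is the standard estimator for the covariance of two sample means, and it is what makes the uncertainty of the log-PPL ratio meaningful: the per-token NLLs of the quantized and base models are strongly correlated, so treating them as independent would badly overestimate the error. In the usual notation (my summary of the textbook formulas, not text from the diff):

    Cov(mean(a), mean(b)) ≈ ( (Σ a_i·b_i)/n − (Σ a_i/n)·(Σ b_i/n) ) / (n − 1)
    Var(mean(a) − mean(b)) = Var(mean(a)) + Var(mean(b)) − 2·Cov(mean(a), mean(b))

and for PPL = exp(ln PPL) the delta method gives σ(PPL) ≈ PPL·σ(ln PPL), which is the `ppl_unc = ppl_val * log_ppl.second` pattern used in the reporting code below.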
@@ -1785,24 +1821,42 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
             }
             fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
 
-            printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
+            printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
         }
 
         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
         process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, log_probs_uint16, kld, kld_ptr);
-        kld_ptr += n_ctx - 1 - first;
+                       workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
+        p_diff_ptr += n_ctx - 1 - first;
+        kld_ptr += n_ctx - 1 - first;
+
+        printf("%4d", i+1);
+
+        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+        const double ppl_val = exp(log_ppl.first);
+        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
+        printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+
+        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+        printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+
+        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+        printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
 
-        auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
-        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        auto p_top = 1.*kld.n_same_top/kld.count;
-        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
+        auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+        const double p_diff_rms_val = sqrt(p_diff_mse.first);
+        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
+        printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
 
-        printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first),
-               log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
-               p_top, d_p_top);
+        double p_top_val = 1.*kld.n_same_top/kld.count;
+        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
+        printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+
+        printf("\n");
 
         fflush(stdout);
 
@@ -1813,31 +1867,97 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     if (kld.count < 100) return; // we do not wish to do statistics on so few values
 
     std::sort(kld_values.begin(), kld_values.end());
+    std::sort(p_diff_values.begin(), p_diff_values.end());
+
+    printf("====== Perplexity statistics ======\n");
+
+    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+    const double ppl_val = exp(log_ppl.first);
+    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
+    printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+
+    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+    const double ppl_base_val = exp(log_ppl_base.first);
+    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
+    printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+
+    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
+    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+
+    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+    printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
 
-    printf("===== KL-divergence statistics\n");
+    const double ppl_ratio_val = exp(log_ppl_ratio_val);
+    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
+    printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+
+    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
+    const double ppl_diff_val = ppl_val - ppl_base_val;
+    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
+    printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+
+    printf("\n");
+
+    printf("====== KL divergence statistics ======\n");
     auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
+    printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
     auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                                : kld_values[kld_values.size()/2];
-    printf("Median : %10.6f\n", kld_median);
 
-    auto percentile = [&kld_values] (float fraction) {
-        if (fraction <= 0) return kld_values.front();
-        if (fraction >= 1) return kld_values.back();
-        float p = fraction*(kld_values.size() - 1);
+    auto percentile = [] (std::vector<float> values, float fraction) {
+        if (fraction <= 0) return values.front();
+        if (fraction >= 1) return values.back();
+        float p = fraction*(values.size() - 1);
         size_t ip = size_t(p); p -= ip;
-        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
+        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
     };
 
-    printf("Maximum: %10.6f\n", kld_values.back());
-    printf("KLD_99 : %10.6f\n", percentile(0.99f));
-    printf("KLD_95 : %10.6f\n", percentile(0.95f));
-    printf("KLD_90 : %10.6f\n", percentile(0.90f));
+    printf("Maximum KLD: %10.6f\n", kld_values.back());
+    printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    printf("Median KLD: %10.6f\n", kld_median);
+    printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    printf("Minimum KLD: %10.6f\n", kld_values.front());
+
+    printf("\n");
 
-    printf("Minimum: %10.6f\n", kld_values.front());
-    printf("KLD_01 : %10.6f\n", percentile(0.01f));
-    printf("KLD_05 : %10.6f\n", percentile(0.05f));
-    printf("KLD_10 : %10.6f\n", percentile(0.10f));
+    printf("====== Token probability statistics ======\n");
+
+    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
+    printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
+
+    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
+                                                     : p_diff_values[p_diff_values.size()/2];
+
+    printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
+    printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
+    printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
+
+    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+    // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+
+    const double p_diff_rms_val = sqrt(p_diff_mse.first);
+    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
+    printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+
+    const double same_top_p = 1.0*kld.n_same_top/kld.count;
+    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
 
 }
 
package/src/llama.cpp/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/quantize/quantize.cpp
@@ -8,7 +8,6 @@
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
-#include <algorithm>
 
 struct quant_option {
     std::string name;
@@ -47,12 +46,17 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
+    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
 };
 
+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
 
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
@@ -97,6 +101,7 @@ static void usage(const char * executable) {
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --keep-split: will generate quatized model in the same shards as input");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -112,7 +117,7 @@ static void usage(const char * executable) {
     exit(1);
 }
 
-static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
         printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -159,18 +164,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
             printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
         }
     }
-    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
 }
 
-static void prepare_imatrix(const std::string & imatrix_file,
+static int prepare_imatrix(const std::string & imatrix_file,
+                            std::string & imatrix_dataset,
                             const std::vector<std::string> & included_weights,
                             const std::vector<std::string> & excluded_weights,
                             std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
     if (!imatrix_file.empty()) {
-        load_imatrix(imatrix_file, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
     }
     if (imatrix_data.empty()) {
-        return;
+        return m_last_call;
     }
     if (!excluded_weights.empty()) {
         for (auto& name : excluded_weights) {
@@ -196,6 +216,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
     if (!imatrix_data.empty()) {
         printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
     }
+    return m_last_call;
 }
 
 static ggml_type parse_ggml_type(const char * arg) {
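Judging from the reader in load_imatrix() above, the trailer that newer imatrix files append after the per-tensor data is: an int32 chunk count, an int32 filename length, then the raw filename bytes with no terminator. A sketch of a matching writer (an assumption derived from the reader, not the imatrix tool's actual code):

    #include <fstream>
    #include <string>

    static void write_imatrix_trailer(std::ofstream & out, int last_call, const std::string & dataset) {
        out.write((const char *)&last_call, sizeof(last_call)); // chunks the statistics were computed on
        const int len = (int)dataset.size();
        out.write((const char *)&len, sizeof(len));             // dataset filename length
        out.write(dataset.data(), len);                         // filename bytes, no '\0'
    }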
@@ -210,43 +231,6 @@ static ggml_type parse_ggml_type(const char * arg) {
     return result;
 }
 
-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char* sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.int_value = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.float_value = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.bool_value = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.bool_value = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -300,6 +284,8 @@ int main(int argc, char ** argv) {
         } else {
             usage(argv[0]);
         }
+    } else if (strcmp(argv[arg_idx], "--keep-split")) {
+        params.keep_split = true;
     } else {
         usage(argv[0]);
     }
@@ -313,10 +299,43 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
+    std::string imatrix_dataset;
     std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
     }
     if (!kv_overrides.empty()) {
         kv_overrides.emplace_back();
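To confirm these overrides actually land in the quantized model, the keys can be read back with ggml's gguf API. A sketch (uses the public gguf_* entry points as I understand them from ggml.h; error handling elided):

    #include "ggml.h"
    #include <cstdio>

    static void print_imatrix_metadata(const char * fname) {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (ctx == NULL) {
            return;
        }
        const int key = gguf_find_key(ctx, "quantize.imatrix.file");
        if (key >= 0) {
            printf("imatrix file: %s\n", gguf_get_val_str(ctx, key));
        }
        gguf_free(ctx);
    }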
@@ -332,20 +351,28 @@ int main(int argc, char ** argv) {
     std::string fname_out;
 
     std::string ftype_str;
+    std::string suffix = ".gguf";
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
         arg_idx++;
         if (ftype_str == "COPY") {
             params.only_copy = true;
         }
     } else {
         fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
         arg_idx++;
 
         if (argc <= arg_idx) {
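The .gguf suffix is held back here because with --keep-split each output shard gets the split suffix appended instead. If I read llama.h correctly, llama_split_path is the helper that produces those shard names; a sketch of what the derived names look like (values illustrative):

    #include "llama.h"
    #include <cstdio>

    int main(void) {
        char split_path[512];
        // expected to print something like "ggml-model-Q4_K_M-00001-of-00003.gguf"
        llama_split_path(split_path, sizeof(split_path), "ggml-model-Q4_K_M", 0, 3);
        printf("%s\n", split_path);
        return 0;
    }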
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp
@@ -23,7 +23,7 @@
 #endif
 
 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.gguf";
+    std::string model = DEFAULT_MODEL_PATH;
     bool verbose = false;
     bool per_layer_stats = false;
     bool print_histogram = false;