@fugood/llama.node 0.0.1-alpha.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +36 -7
- package/README.md +9 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +18 -1
- package/lib/binding.ts +22 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +18 -1
- package/src/common.hpp +11 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
|
@@ -216,17 +216,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
|
|
|
216
216
|
}
|
|
217
217
|
|
|
218
218
|
// Running totals gathered while comparing a model's logits against stored
// base-model log-probabilities. Every member starts at zero so per-thread
// instances can be accumulated and then added into a shared total.
struct kl_divergence_result {
    double sum_nll          = 0.0;  // sum of nll (negative log-likelihood of the evaluated token)
    double sum_nll2         = 0.0;  // sum of nll*nll
    double sum_nll_base     = 0.0;  // sum of nll_base (same quantity from the base log-probs)
    double sum_nll_base2    = 0.0;  // sum of nll_base*nll_base
    double sum_nll_nll_base = 0.0;  // sum of nll*nll_base, used for the covariance estimate
    double sum_kld          = 0.0;  // presumably the sum of per-token KL divergence values -- the accumulation line is outside the visible hunk
    double sum_kld2         = 0.0;  // sum of squared per-token KL divergence values
    double sum_p_diff       = 0.0;  // sum of p_diff = expf(-nll) - expf(-nll_base)
    double sum_p_diff2      = 0.0;  // sum of p_diff^2
    double sum_p_diff4      = 0.0;  // sum of p_diff^4
    float  max_p_diff       = 0.0f; // largest |p_diff| seen so far
    size_t n_same_top       = 0;    // tokens where both models agree on the arg-max (imax == imax_base)
    size_t count            = 0;    // number of tokens accumulated
};
|
|
228
233
|
|
|
229
|
-
static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
|
|
234
|
+
static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
|
|
230
235
|
float max_logit = logits[0];
|
|
231
236
|
int imax = 0;
|
|
232
237
|
for (int i = 1; i < n_vocab; ++i) {
|
|
@@ -244,12 +249,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
|
|
|
244
249
|
const float scale = d[0];
|
|
245
250
|
const float min_log_prob = d[1];
|
|
246
251
|
base_log_prob += 4;
|
|
247
|
-
|
|
252
|
+
|
|
253
|
+
const float nll = max_logit + log_sum_exp - logits[tok];
|
|
248
254
|
kld.sum_nll += nll;
|
|
249
255
|
kld.sum_nll2 += nll*nll;
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
kld.
|
|
256
|
+
|
|
257
|
+
const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
|
|
258
|
+
kld.sum_nll_base += nll_base;
|
|
259
|
+
kld.sum_nll_base2 += nll_base*nll_base;
|
|
260
|
+
|
|
261
|
+
kld.sum_nll_nll_base += nll*nll_base;
|
|
262
|
+
|
|
253
263
|
max_logit += log_sum_exp;
|
|
254
264
|
double sum = 0;
|
|
255
265
|
int imax_base = -1;
|
|
@@ -269,34 +279,50 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
|
|
|
269
279
|
kld.sum_kld2 += sum*sum;
|
|
270
280
|
++kld.count;
|
|
271
281
|
if (imax == imax_base) ++kld.n_same_top;
|
|
272
|
-
|
|
282
|
+
|
|
283
|
+
const float p_base = expf(-nll_base);
|
|
284
|
+
const float p = expf(-nll);
|
|
285
|
+
const float p_diff = p - p_base;
|
|
286
|
+
kld.sum_p_diff += p_diff;
|
|
287
|
+
const double p_diff2 = p_diff*p_diff;
|
|
288
|
+
kld.sum_p_diff2 += p_diff2;
|
|
289
|
+
kld.sum_p_diff4 += p_diff2*p_diff2;
|
|
290
|
+
kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
|
|
291
|
+
|
|
292
|
+
return std::make_pair(sum, p_diff);
|
|
273
293
|
}
|
|
274
294
|
|
|
275
295
|
static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
|
|
276
296
|
std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
|
|
277
|
-
float * kld_values) {
|
|
297
|
+
float * kld_values, float * p_diff_values) {
|
|
278
298
|
std::mutex mutex;
|
|
279
299
|
const int nv = 2*((n_vocab + 1)/2) + 4;
|
|
280
300
|
int counter = 0;
|
|
281
|
-
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
|
|
301
|
+
auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
|
|
282
302
|
kl_divergence_result local_kld;
|
|
283
303
|
while (true) {
|
|
284
304
|
std::unique_lock<std::mutex> lock(mutex);
|
|
285
305
|
int i = counter++;
|
|
286
306
|
if (i >= n_token) {
|
|
287
|
-
kld.sum_nll
|
|
288
|
-
kld.sum_nll2
|
|
289
|
-
kld.
|
|
290
|
-
kld.
|
|
291
|
-
kld.
|
|
292
|
-
kld.
|
|
293
|
-
kld.
|
|
294
|
-
kld.
|
|
307
|
+
kld.sum_nll += local_kld.sum_nll;
|
|
308
|
+
kld.sum_nll2 += local_kld.sum_nll2;
|
|
309
|
+
kld.sum_nll_base += local_kld.sum_nll_base;
|
|
310
|
+
kld.sum_nll_base2 += local_kld.sum_nll_base2;
|
|
311
|
+
kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
|
|
312
|
+
kld.sum_kld += local_kld.sum_kld;
|
|
313
|
+
kld.sum_kld2 += local_kld.sum_kld2;
|
|
314
|
+
kld.sum_p_diff += local_kld.sum_p_diff;
|
|
315
|
+
kld.sum_p_diff2 += local_kld.sum_p_diff2;
|
|
316
|
+
kld.sum_p_diff4 += local_kld.sum_p_diff4;
|
|
317
|
+
kld.n_same_top += local_kld.n_same_top;
|
|
318
|
+
kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff);
|
|
319
|
+
kld.count += local_kld.count;
|
|
295
320
|
break;
|
|
296
321
|
}
|
|
297
322
|
lock.unlock();
|
|
298
|
-
double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
|
|
299
|
-
kld_values[i]
|
|
323
|
+
std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
|
|
324
|
+
kld_values[i] = (float)v.first;
|
|
325
|
+
p_diff_values[i] = v.second;
|
|
300
326
|
}
|
|
301
327
|
};
|
|
302
328
|
for (auto & w : workers) {
|
|
@@ -1711,7 +1737,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|
|
1711
1737
|
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
|
|
1712
1738
|
|
|
1713
1739
|
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
|
1714
|
-
std::vector<float>
|
|
1740
|
+
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
|
1741
|
+
std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
|
1715
1742
|
std::vector<float> logits;
|
|
1716
1743
|
if (num_batches > 1) {
|
|
1717
1744
|
logits.reserve(n_ctx * n_vocab);
|
|
@@ -1728,9 +1755,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|
|
1728
1755
|
df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
|
|
1729
1756
|
return std::make_pair(f, df);
|
|
1730
1757
|
};
|
|
1758
|
+
auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
|
|
1759
|
+
if (count < 10) {
|
|
1760
|
+
return 0.0;
|
|
1761
|
+
}
|
|
1762
|
+
double var = sumab/count - (suma/count)*(sumb/count);
|
|
1763
|
+
var /= count - 1;
|
|
1764
|
+
return var;
|
|
1765
|
+
};
|
|
1731
1766
|
|
|
1732
1767
|
kl_divergence_result kld;
|
|
1733
|
-
auto
|
|
1768
|
+
auto kld_ptr = kld_values.data();
|
|
1769
|
+
auto p_diff_ptr = p_diff_values.data();
|
|
1734
1770
|
|
|
1735
1771
|
for (int i = 0; i < n_chunk; ++i) {
|
|
1736
1772
|
const int start = i * n_ctx;
|
|
@@ -1785,24 +1821,42 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|
|
1785
1821
|
}
|
|
1786
1822
|
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
|
1787
1823
|
|
|
1788
|
-
printf("\nchunk
|
|
1824
|
+
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
|
1789
1825
|
}
|
|
1790
1826
|
|
|
1791
1827
|
const int first = n_ctx/2;
|
|
1792
1828
|
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
|
1793
1829
|
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
|
1794
|
-
workers, log_probs_uint16, kld, kld_ptr);
|
|
1795
|
-
|
|
1830
|
+
workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
|
|
1831
|
+
p_diff_ptr += n_ctx - 1 - first;
|
|
1832
|
+
kld_ptr += n_ctx - 1 - first;
|
|
1833
|
+
|
|
1834
|
+
printf("%4d", i+1);
|
|
1835
|
+
|
|
1836
|
+
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
|
1837
|
+
const double ppl_val = exp(log_ppl.first);
|
|
1838
|
+
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
|
1839
|
+
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
|
1840
|
+
|
|
1841
|
+
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
|
1842
|
+
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
|
1843
|
+
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
|
1844
|
+
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
|
1845
|
+
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
|
1846
|
+
|
|
1847
|
+
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
|
1848
|
+
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
|
1796
1849
|
|
|
1797
|
-
auto
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
|
|
1850
|
+
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
|
1851
|
+
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
|
1852
|
+
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
|
1853
|
+
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
|
1802
1854
|
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1855
|
+
double p_top_val = 1.*kld.n_same_top/kld.count;
|
|
1856
|
+
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
|
|
1857
|
+
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
|
1858
|
+
|
|
1859
|
+
printf("\n");
|
|
1806
1860
|
|
|
1807
1861
|
fflush(stdout);
|
|
1808
1862
|
|
|
@@ -1813,31 +1867,97 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|
|
1813
1867
|
if (kld.count < 100) return; // we do not wish to do statistics on so few values
|
|
1814
1868
|
|
|
1815
1869
|
std::sort(kld_values.begin(), kld_values.end());
|
|
1870
|
+
std::sort(p_diff_values.begin(), p_diff_values.end());
|
|
1871
|
+
|
|
1872
|
+
printf("====== Perplexity statistics ======\n");
|
|
1873
|
+
|
|
1874
|
+
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
|
1875
|
+
const double ppl_val = exp(log_ppl.first);
|
|
1876
|
+
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
|
1877
|
+
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
|
1878
|
+
|
|
1879
|
+
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
|
1880
|
+
const double ppl_base_val = exp(log_ppl_base.first);
|
|
1881
|
+
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
|
|
1882
|
+
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
|
1883
|
+
|
|
1884
|
+
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
|
1885
|
+
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
|
1886
|
+
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
|
|
1887
|
+
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
|
1888
|
+
|
|
1889
|
+
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
|
1890
|
+
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
|
1891
|
+
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
|
1816
1892
|
|
|
1817
|
-
|
|
1893
|
+
const double ppl_ratio_val = exp(log_ppl_ratio_val);
|
|
1894
|
+
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
|
|
1895
|
+
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
|
1896
|
+
|
|
1897
|
+
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
|
|
1898
|
+
const double ppl_diff_val = ppl_val - ppl_base_val;
|
|
1899
|
+
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
|
|
1900
|
+
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
|
1901
|
+
|
|
1902
|
+
printf("\n");
|
|
1903
|
+
|
|
1904
|
+
printf("====== KL divergence statistics ======\n");
|
|
1818
1905
|
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
|
1819
|
-
printf("
|
|
1906
|
+
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
|
1820
1907
|
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
|
|
1821
1908
|
: kld_values[kld_values.size()/2];
|
|
1822
|
-
printf("Median : %10.6f\n", kld_median);
|
|
1823
1909
|
|
|
1824
|
-
auto percentile = [
|
|
1825
|
-
if (fraction <= 0) return
|
|
1826
|
-
if (fraction >= 1) return
|
|
1827
|
-
float p = fraction*(
|
|
1910
|
+
// Linearly interpolated percentile of a sample.
//   values   - the sample; expected to be sorted ascending (the callers sort
//              before use) and non-empty. Taken by const reference so the
//              vector is not copied on every call.
//   fraction - position in the sample; <= 0 yields the first element and
//              >= 1 yields the last.
// Returns the value at fractional index fraction*(size-1), blending the two
// neighbouring elements.
auto percentile = [] (const std::vector<float> & values, float fraction) {
    if (fraction <= 0) return values.front();
    if (fraction >= 1) return values.back();
    float p = fraction*(values.size() - 1);
    size_t ip = size_t(p); p -= ip; // split into integer index and fractional remainder
    // std::min clamps the upper neighbour so the index never passes the last element
    return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
};
|
|
1831
1917
|
|
|
1832
|
-
printf("Maximum: %10.6f\n", kld_values.back());
|
|
1833
|
-
printf("
|
|
1834
|
-
printf("
|
|
1835
|
-
printf("
|
|
1918
|
+
printf("Maximum KLD: %10.6f\n", kld_values.back());
|
|
1919
|
+
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
|
1920
|
+
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
|
1921
|
+
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
|
1922
|
+
printf("Median KLD: %10.6f\n", kld_median);
|
|
1923
|
+
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
|
1924
|
+
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
|
1925
|
+
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
|
1926
|
+
printf("Minimum KLD: %10.6f\n", kld_values.front());
|
|
1927
|
+
|
|
1928
|
+
printf("\n");
|
|
1836
1929
|
|
|
1837
|
-
printf("
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
printf("
|
|
1930
|
+
printf("====== Token probability statistics ======\n");
|
|
1931
|
+
|
|
1932
|
+
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
|
|
1933
|
+
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
|
1934
|
+
|
|
1935
|
+
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
|
|
1936
|
+
: p_diff_values[p_diff_values.size()/2];
|
|
1937
|
+
|
|
1938
|
+
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
|
1939
|
+
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
|
1940
|
+
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
|
1941
|
+
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
|
1942
|
+
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
|
1943
|
+
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
|
1944
|
+
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
|
1945
|
+
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
|
1946
|
+
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
|
1947
|
+
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
|
1948
|
+
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
|
1949
|
+
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
|
1950
|
+
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
|
1951
|
+
|
|
1952
|
+
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
|
1953
|
+
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
|
1954
|
+
|
|
1955
|
+
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
|
1956
|
+
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
|
1957
|
+
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
|
1958
|
+
|
|
1959
|
+
const double same_top_p = 1.0*kld.n_same_top/kld.count;
|
|
1960
|
+
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
|
1841
1961
|
|
|
1842
1962
|
}
|
|
1843
1963
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
set(TARGET quantize)
|
|
2
2
|
add_executable(${TARGET} quantize.cpp)
|
|
3
3
|
install(TARGETS ${TARGET} RUNTIME)
|
|
4
|
-
target_link_libraries(${TARGET} PRIVATE llama
|
|
4
|
+
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
|
5
5
|
target_include_directories(${TARGET} PRIVATE ../../common)
|
|
6
6
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
|
@@ -8,7 +8,6 @@
|
|
|
8
8
|
#include <unordered_map>
|
|
9
9
|
#include <fstream>
|
|
10
10
|
#include <cmath>
|
|
11
|
-
#include <algorithm>
|
|
12
11
|
|
|
13
12
|
struct quant_option {
|
|
14
13
|
std::string name;
|
|
@@ -47,12 +46,17 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|
|
47
46
|
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
|
48
47
|
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
|
49
48
|
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
|
50
|
-
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "
|
|
49
|
+
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
|
|
50
|
+
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
|
51
51
|
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
|
52
52
|
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
|
53
53
|
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
|
54
54
|
};
|
|
55
55
|
|
|
56
|
+
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
|
|
57
|
+
static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
|
|
58
|
+
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
|
|
59
|
+
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
|
|
56
60
|
|
|
57
61
|
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
|
58
62
|
std::string ftype_str;
|
|
@@ -97,6 +101,7 @@ static void usage(const char * executable) {
|
|
|
97
101
|
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
|
98
102
|
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
|
99
103
|
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
|
104
|
+
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
|
100
105
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
|
101
106
|
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
|
102
107
|
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
|
@@ -112,7 +117,7 @@ static void usage(const char * executable) {
|
|
|
112
117
|
exit(1);
|
|
113
118
|
}
|
|
114
119
|
|
|
115
|
-
static
|
|
120
|
+
static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
|
116
121
|
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
|
|
117
122
|
if (!in) {
|
|
118
123
|
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
|
|
@@ -159,18 +164,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
|
|
|
159
164
|
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
|
|
160
165
|
}
|
|
161
166
|
}
|
|
162
|
-
|
|
167
|
+
|
|
168
|
+
// latest imatrix version contains the dataset filename at the end of the file
|
|
169
|
+
int m_last_call = 0;
|
|
170
|
+
if (in.peek() != EOF) {
|
|
171
|
+
in.read((char *)&m_last_call, sizeof(m_last_call));
|
|
172
|
+
int dataset_len;
|
|
173
|
+
in.read((char *)&dataset_len, sizeof(dataset_len));
|
|
174
|
+
std::vector<char> dataset_as_vec(dataset_len);
|
|
175
|
+
in.read(dataset_as_vec.data(), dataset_len);
|
|
176
|
+
imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
|
|
177
|
+
printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
|
|
178
|
+
}
|
|
179
|
+
printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
|
|
180
|
+
return m_last_call;
|
|
163
181
|
}
|
|
164
182
|
|
|
165
|
-
static
|
|
183
|
+
static int prepare_imatrix(const std::string & imatrix_file,
|
|
184
|
+
std::string & imatrix_dataset,
|
|
166
185
|
const std::vector<std::string> & included_weights,
|
|
167
186
|
const std::vector<std::string> & excluded_weights,
|
|
168
187
|
std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
|
|
188
|
+
int m_last_call = -1;
|
|
169
189
|
if (!imatrix_file.empty()) {
|
|
170
|
-
load_imatrix(imatrix_file, imatrix_data);
|
|
190
|
+
m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
|
|
171
191
|
}
|
|
172
192
|
if (imatrix_data.empty()) {
|
|
173
|
-
return;
|
|
193
|
+
return m_last_call;
|
|
174
194
|
}
|
|
175
195
|
if (!excluded_weights.empty()) {
|
|
176
196
|
for (auto& name : excluded_weights) {
|
|
@@ -196,6 +216,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
|
|
|
196
216
|
if (!imatrix_data.empty()) {
|
|
197
217
|
printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
|
|
198
218
|
}
|
|
219
|
+
return m_last_call;
|
|
199
220
|
}
|
|
200
221
|
|
|
201
222
|
static ggml_type parse_ggml_type(const char * arg) {
|
|
@@ -210,43 +231,6 @@ static ggml_type parse_ggml_type(const char * arg) {
|
|
|
210
231
|
return result;
|
|
211
232
|
}
|
|
212
233
|
|
|
213
|
-
static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
|
214
|
-
const char* sep = strchr(data, '=');
|
|
215
|
-
if (sep == nullptr || sep - data >= 128) {
|
|
216
|
-
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
|
|
217
|
-
return false;
|
|
218
|
-
}
|
|
219
|
-
llama_model_kv_override kvo;
|
|
220
|
-
std::strncpy(kvo.key, data, sep - data);
|
|
221
|
-
kvo.key[sep - data] = 0;
|
|
222
|
-
sep++;
|
|
223
|
-
if (strncmp(sep, "int:", 4) == 0) {
|
|
224
|
-
sep += 4;
|
|
225
|
-
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
|
226
|
-
kvo.int_value = std::atol(sep);
|
|
227
|
-
} else if (strncmp(sep, "float:", 6) == 0) {
|
|
228
|
-
sep += 6;
|
|
229
|
-
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
|
230
|
-
kvo.float_value = std::atof(sep);
|
|
231
|
-
} else if (strncmp(sep, "bool:", 5) == 0) {
|
|
232
|
-
sep += 5;
|
|
233
|
-
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
|
234
|
-
if (std::strcmp(sep, "true") == 0) {
|
|
235
|
-
kvo.bool_value = true;
|
|
236
|
-
} else if (std::strcmp(sep, "false") == 0) {
|
|
237
|
-
kvo.bool_value = false;
|
|
238
|
-
} else {
|
|
239
|
-
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
|
240
|
-
return false;
|
|
241
|
-
}
|
|
242
|
-
} else {
|
|
243
|
-
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
|
|
244
|
-
return false;
|
|
245
|
-
}
|
|
246
|
-
overrides.emplace_back(std::move(kvo));
|
|
247
|
-
return true;
|
|
248
|
-
}
|
|
249
|
-
|
|
250
234
|
int main(int argc, char ** argv) {
|
|
251
235
|
if (argc < 3) {
|
|
252
236
|
usage(argv[0]);
|
|
@@ -300,6 +284,8 @@ int main(int argc, char ** argv) {
|
|
|
300
284
|
} else {
|
|
301
285
|
usage(argv[0]);
|
|
302
286
|
}
|
|
287
|
+
} else if (strcmp(argv[arg_idx], "--keep-split") == 0) { // strcmp() returns 0 on an exact match
|
|
288
|
+
params.keep_split = true;
|
|
303
289
|
} else {
|
|
304
290
|
usage(argv[0]);
|
|
305
291
|
}
|
|
@@ -313,10 +299,43 @@ int main(int argc, char ** argv) {
|
|
|
313
299
|
usage(argv[0]);
|
|
314
300
|
}
|
|
315
301
|
|
|
302
|
+
std::string imatrix_dataset;
|
|
316
303
|
std::unordered_map<std::string, std::vector<float>> imatrix_data;
|
|
317
|
-
prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
|
|
304
|
+
int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
|
|
318
305
|
if (!imatrix_data.empty()) {
|
|
319
306
|
params.imatrix = &imatrix_data;
|
|
307
|
+
{
|
|
308
|
+
llama_model_kv_override kvo;
|
|
309
|
+
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
|
|
310
|
+
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
|
311
|
+
strncpy(kvo.val_str, imatrix_file.c_str(), 127);
|
|
312
|
+
kvo.val_str[127] = '\0';
|
|
313
|
+
kv_overrides.emplace_back(std::move(kvo));
|
|
314
|
+
}
|
|
315
|
+
if (!imatrix_dataset.empty()) {
|
|
316
|
+
llama_model_kv_override kvo;
|
|
317
|
+
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
|
|
318
|
+
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
|
319
|
+
strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
|
|
320
|
+
kvo.val_str[127] = '\0';
|
|
321
|
+
kv_overrides.emplace_back(std::move(kvo));
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
{
|
|
325
|
+
llama_model_kv_override kvo;
|
|
326
|
+
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
|
|
327
|
+
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
|
328
|
+
kvo.val_i64 = imatrix_data.size();
|
|
329
|
+
kv_overrides.emplace_back(std::move(kvo));
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
if (m_last_call > 0) {
|
|
333
|
+
llama_model_kv_override kvo;
|
|
334
|
+
std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
|
|
335
|
+
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
|
336
|
+
kvo.val_i64 = m_last_call;
|
|
337
|
+
kv_overrides.emplace_back(std::move(kvo));
|
|
338
|
+
}
|
|
320
339
|
}
|
|
321
340
|
if (!kv_overrides.empty()) {
|
|
322
341
|
kv_overrides.emplace_back();
|
|
@@ -332,20 +351,28 @@ int main(int argc, char ** argv) {
|
|
|
332
351
|
std::string fname_out;
|
|
333
352
|
|
|
334
353
|
std::string ftype_str;
|
|
354
|
+
std::string suffix = ".gguf";
|
|
335
355
|
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
|
|
336
356
|
std::string fpath;
|
|
337
357
|
const size_t pos = fname_inp.find_last_of("/\\");
|
|
338
358
|
if (pos != std::string::npos) {
|
|
339
359
|
fpath = fname_inp.substr(0, pos + 1);
|
|
340
360
|
}
|
|
341
|
-
|
|
342
|
-
|
|
361
|
+
|
|
362
|
+
// export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
|
|
363
|
+
fname_out = fpath + "ggml-model-" + ftype_str;
|
|
364
|
+
if (!params.keep_split) {
|
|
365
|
+
fname_out += suffix;
|
|
366
|
+
}
|
|
343
367
|
arg_idx++;
|
|
344
368
|
if (ftype_str == "COPY") {
|
|
345
369
|
params.only_copy = true;
|
|
346
370
|
}
|
|
347
371
|
} else {
|
|
348
372
|
fname_out = argv[arg_idx];
|
|
373
|
+
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
|
|
374
|
+
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
|
|
375
|
+
}
|
|
349
376
|
arg_idx++;
|
|
350
377
|
|
|
351
378
|
if (argc <= arg_idx) {
|