@fugood/llama.node 0.0.1-alpha.4 → 0.2.0

This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Files changed (84)
  1. package/CMakeLists.txt +42 -7
  2. package/README.md +10 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/lib/binding.js +1 -1
  12. package/lib/binding.ts +16 -2
  13. package/lib/index.ts +2 -2
  14. package/package.json +15 -3
  15. package/src/DetokenizeWorker.cpp +22 -0
  16. package/src/DetokenizeWorker.h +19 -0
  17. package/src/EmbeddingWorker.cpp +46 -0
  18. package/src/EmbeddingWorker.h +23 -0
  19. package/src/LlamaCompletionWorker.cpp +5 -1
  20. package/src/LlamaCompletionWorker.h +4 -0
  21. package/src/LlamaContext.cpp +80 -1
  22. package/src/LlamaContext.h +3 -0
  23. package/src/TokenizeWorker.cpp +26 -0
  24. package/src/TokenizeWorker.h +23 -0
  25. package/src/common.hpp +12 -7
  26. package/src/llama.cpp/CMakeLists.txt +13 -7
  27. package/src/llama.cpp/common/common.cpp +221 -173
  28. package/src/llama.cpp/common/common.h +19 -8
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/log.h +2 -2
  31. package/src/llama.cpp/common/sampling.cpp +17 -1
  32. package/src/llama.cpp/common/sampling.h +28 -20
  33. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  36. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  39. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  41. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  42. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  43. package/src/llama.cpp/examples/main/main.cpp +10 -8
  44. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  45. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  47. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  48. package/src/llama.cpp/examples/server/server.cpp +97 -86
  49. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  50. package/src/llama.cpp/ggml-backend.c +7 -5
  51. package/src/llama.cpp/ggml-impl.h +339 -4
  52. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  53. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  54. package/src/llama.cpp/ggml-quants.c +302 -293
  55. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  56. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  57. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  58. package/src/llama.cpp/ggml.c +1469 -116
  59. package/src/llama.cpp/ggml.h +37 -7
  60. package/src/llama.cpp/llama.cpp +969 -432
  61. package/src/llama.cpp/llama.h +46 -14
  62. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  63. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  64. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  65. package/src/llama.cpp/requirements.txt +1 -0
  66. package/src/llama.cpp/sgemm.cpp +134 -103
  67. package/src/llama.cpp/sgemm.h +4 -2
  68. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  70. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  71. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  72. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  73. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  74. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  75. package/src/llama.cpp/unicode-data.cpp +1188 -656
  76. package/src/llama.cpp/unicode-data.h +4 -3
  77. package/src/llama.cpp/unicode.cpp +590 -49
  78. package/src/llama.cpp/unicode.h +6 -3
  79. package/bin/win32/arm64/llama-node.node +0 -0
  80. package/bin/win32/arm64/node.lib +0 -0
  81. package/bin/win32/x64/llama-node.node +0 -0
  82. package/bin/win32/x64/node.lib +0 -0
  83. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  84. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/common/common.h

@@ -31,6 +31,8 @@
  fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
  } while(0)

+ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
  // build info
  extern int LLAMA_BUILD_NUMBER;
  extern char const *LLAMA_COMMIT;
@@ -86,13 +88,13 @@ struct gpt_params {

  ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

- llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
- llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+ enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+ enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

  // // sampling parameters
  struct llama_sampling_params sparams;

- std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+ std::string model = ""; // model path
  std::string model_draft = ""; // draft model for speculative decoding
  std::string model_alias = "unknown"; // model alias
  std::string model_url = ""; // model url to download
@@ -133,11 +135,12 @@ struct gpt_params {
  bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
  size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

- bool kl_divergence = false; // compute KL-divergence
+ bool kl_divergence = false; // compute KL divergence

  bool random_prompt = false; // do not randomize prompt if none provided
  bool use_color = false; // use color to distinguish generations and inputs
  bool interactive = false; // interactive mode
+ bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
  bool chatml = false; // chatml mode (used for models trained on chatml syntax)
  bool prompt_cache_all = false; // save user input and generations to prompt cache
  bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -148,6 +151,7 @@ struct gpt_params {
  bool multiline_input = false; // reverse the usage of `\`
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
+ bool flash_attn = false; // flash attention

  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
  bool ignore_eos = false; // ignore generated EOS tokens
@@ -161,15 +165,20 @@ struct gpt_params {
  bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
  bool no_kv_offload = false; // disable KV offloading
  bool warmup = true; // warmup run
+ bool check_tensors = false; // validate tensor data

  std::string cache_type_k = "f16"; // KV cache data type for the K
  std::string cache_type_v = "f16"; // KV cache data type for the V

  // multimodal models (see examples/llava)
- std::string mmproj = ""; // path to multimodal projector
- std::string image = ""; // path to an image file
+ std::string mmproj = ""; // path to multimodal projector
+ std::vector<std::string> image; // path to image file(s)
  };

+ void gpt_params_handle_model_default(gpt_params & params);
+
+ bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
  bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

  bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -193,6 +202,7 @@ bool validate_file_name(const std::string & filename);
  std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
  std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
  std::vector<std::string> string_split(std::string input, char separator);
+ std::string string_strip(const std::string & str);
  std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

  //
@@ -237,11 +247,12 @@ std::vector<llama_token> llama_tokenize(
  bool add_special,
  bool parse_special = false);

- // tokenizes a token into a piece
+ // tokenizes a token into a piece, optionally renders special/control tokens
  // should work similar to Python's `tokenizer.id_to_piece`
  std::string llama_token_to_piece(
  const struct llama_context * ctx,
- llama_token token);
+ llama_token token,
+ bool special = true);

  // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
  // that takes into account the tokenizer type and decides how to handle the leading space
package/src/llama.cpp/common/json-schema-to-grammar.h

@@ -1,4 +1,8 @@
  #pragma once
+
+ #include "ggml.h"
+ // Change JSON_ASSERT from assert() to GGML_ASSERT:
+ #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"

  std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
package/src/llama.cpp/common/log.h

@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  // INTERNAL, DO NOT USE
  // USE LOG() INSTEAD
  //
- #if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+ #if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
  #define LOG_IMPL(str, ...) \
  do { \
  if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
  // INTERNAL, DO NOT USE
  // USE LOG_TEE() INSTEAD
  //
- #if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+ #if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
  #define LOG_TEE_IMPL(str, ...) \
  do { \
  if (LOG_TARGET != nullptr) \
package/src/llama.cpp/common/sampling.cpp

@@ -1,4 +1,6 @@
+ #define LLAMA_API_INTERNAL
  #include "sampling.h"
+ #include <random>

  struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
  struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,10 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

  result->prev.resize(params.n_prev);

+ result->n_considered = 0;
+
+ llama_sampling_set_rng_seed(result, params.seed);
+
  return result;
  }

@@ -60,6 +66,14 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

  std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
  ctx->cur.clear();
+ ctx->n_considered = 0;
+ }
+
+ void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+ if (seed == LLAMA_DEFAULT_SEED) {
+ seed = std::random_device{}();
+ }
+ ctx->rng.seed(seed);
  }

  void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
@@ -203,7 +217,7 @@ static llama_token llama_sampling_sample_impl(

  sampler_queue(ctx_main, params, cur_p, min_keep);

- id = llama_sample_token(ctx_main, &cur_p);
+ id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);

  //{
  // const int n_top = 10;
@@ -242,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
  }
  }

+ ctx_sampling->n_considered = cur_p.size;
+
  return id;
  }

package/src/llama.cpp/common/sampling.h

@@ -4,9 +4,10 @@

  #include "grammar-parser.h"

+ #include <random>
  #include <string>
- #include <vector>
  #include <unordered_map>
+ #include <vector>

  // sampler types
  enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {

  // sampling parameters
  typedef struct llama_sampling_params {
- int32_t n_prev = 64; // number of previous tokens to remember
- int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
- int32_t top_k = 40; // <= 0 to use vocab size
- float top_p = 0.95f; // 1.0 = disabled
- float min_p = 0.05f; // 0.0 = disabled
- float tfs_z = 1.00f; // 1.0 = disabled
- float typical_p = 1.00f; // 1.0 = disabled
- float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
- float dynatemp_range = 0.00f; // 0.0 = disabled
- float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
- int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
- float penalty_repeat = 1.00f; // 1.0 = disabled
- float penalty_freq = 0.00f; // 0.0 = disabled
- float penalty_present = 0.00f; // 0.0 = disabled
- int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
- float mirostat_tau = 5.00f; // target entropy
- float mirostat_eta = 0.10f; // learning rate
- bool penalize_nl = false; // consider newlines as a repeatable token
+ int32_t n_prev = 64; // number of previous tokens to remember
+ int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+ int32_t top_k = 40; // <= 0 to use vocab size
+ float top_p = 0.95f; // 1.0 = disabled
+ float min_p = 0.05f; // 0.0 = disabled
+ float tfs_z = 1.00f; // 1.0 = disabled
+ float typical_p = 1.00f; // 1.0 = disabled
+ float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+ float dynatemp_range = 0.00f; // 0.0 = disabled
+ float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+ int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+ float penalty_repeat = 1.00f; // 1.0 = disabled
+ float penalty_freq = 0.00f; // 0.0 = disabled
+ float penalty_present = 0.00f; // 0.0 = disabled
+ int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+ float mirostat_tau = 5.00f; // target entropy
+ float mirostat_eta = 0.10f; // learning rate
+ bool penalize_nl = false; // consider newlines as a repeatable token
+ uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context

  std::vector<llama_sampler_type> samplers_sequence = {
  llama_sampler_type::TOP_K,
@@ -79,6 +81,9 @@ struct llama_sampling_context {
  // TODO: replace with ring-buffer
  std::vector<llama_token> prev;
  std::vector<llama_token_data> cur;
+ size_t n_considered;
+
+ std::mt19937 rng;
  };

  #include "common.h"
@@ -93,6 +98,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
  // - reset grammar
  void llama_sampling_reset(llama_sampling_context * ctx);

+ // Set the sampler seed
+ void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
  // Copy the sampler context
  void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -32,7 +32,7 @@ int main(int argc, char ** argv) {
  gpt_params params;

  if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+ printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
  printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
  printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
  return 1 ;
@@ -41,6 +41,7 @@ int main(int argc, char ** argv) {
  int n_kv_max = 2048;
  int n_batch = 2048;
  int n_ubatch = 512;
+ bool flash_attn = false;
  int is_pp_shared = 0;
  int n_gpu_layers = 0;

@@ -66,23 +67,27 @@ int main(int argc, char ** argv) {
  }

  if (argc >= 6) {
- is_pp_shared = std::atoi(argv[5]);
+ flash_attn = std::atoi(argv[5]);
  }

  if (argc >= 7) {
- n_gpu_layers = std::atoi(argv[6]);
+ is_pp_shared = std::atoi(argv[6]);
  }

  if (argc >= 8) {
- n_pp = parse_list(argv[7]);
+ n_gpu_layers = std::atoi(argv[7]);
  }

  if (argc >= 9) {
- n_tg = parse_list(argv[8]);
+ n_pp = parse_list(argv[8]);
  }

  if (argc >= 10) {
- n_pl = parse_list(argv[9]);
+ n_tg = parse_list(argv[9]);
+ }
+
+ if (argc >= 11) {
+ n_pl = parse_list(argv[10]);
  }

  // init LLM
@@ -108,10 +113,11 @@ int main(int argc, char ** argv) {

  llama_context_params ctx_params = llama_context_default_params();

- ctx_params.seed = 1234;
- ctx_params.n_ctx = n_kv_max;
- ctx_params.n_batch = n_batch;
- ctx_params.n_ubatch = n_ubatch;
+ ctx_params.seed = 1234;
+ ctx_params.n_ctx = n_kv_max;
+ ctx_params.n_batch = n_batch;
+ ctx_params.n_ubatch = n_ubatch;
+ ctx_params.flash_attn = flash_attn;

  ctx_params.n_threads = params.n_threads;
  ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -169,7 +175,7 @@ int main(int argc, char ** argv) {
  }

  LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
  LOG_TEE("\n");

  LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp

@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
  size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
  float v;
  if (type == GGML_TYPE_F16) {
- v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+ v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
  } else if (type == GGML_TYPE_F32) {
- v = *(float *) data + i;
+ v = *(float *) &data[i];
  } else if (type == GGML_TYPE_I32) {
- v = (float) *(int32_t *) data + i;
+ v = (float) *(int32_t *) &data[i];
  } else if (type == GGML_TYPE_I16) {
- v = (float) *(int16_t *) data + i;
+ v = (float) *(int16_t *) &data[i];
  } else if (type == GGML_TYPE_I8) {
- v = (float) *(int8_t *) data + i;
+ v = (float) *(int8_t *) &data[i];
  } else {
  GGML_ASSERT(false);
  }
package/src/llama.cpp/examples/finetune/finetune.cpp

@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
  GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

  auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
- if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+ if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
  return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
  } else if (a->type == GGML_TYPE_F32) {
  return ggml_add(ctx, a, b);
package/src/llama.cpp/examples/gguf-split/gguf-split.cpp

@@ -32,6 +32,7 @@ struct split_params {
  int n_split_tensors = 128;
  std::string input;
  std::string output;
+ bool no_tensor_first_split = false;
  bool dry_run = false;
  };

@@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
  printf(" --merge merge multiple GGUF to a single GGUF\n");
  printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
  printf(" --split-max-size N(M|G) max size per split\n");
+ printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
  printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
  printf("\n");
  }
@@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
  arg_found = true;
  params.dry_run = true;
  }
+ if (arg == "--no-tensor-first-split") {
+ arg_found = true;
+ params.no_tensor_first_split = true;
+ }

  if (is_op_set) {
  throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@@ -200,10 +206,10 @@ struct split_strategy {
  // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
  int i_split = -1;
  struct gguf_context * ctx_out = NULL;
- auto new_ctx_out = [&]() {
+ auto new_ctx_out = [&](bool allow_no_tensors) {
  i_split++;
  if (ctx_out != NULL) {
- if (gguf_get_n_tensors(ctx_out) == 0) {
+ if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
  fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
  exit(EXIT_FAILURE);
  }
@@ -220,7 +226,12 @@ struct split_strategy {
  };

  // initialize ctx_out for the first split
- new_ctx_out();
+ new_ctx_out(false);
+
+ // skip first split if no_tensor_first_split is set
+ if (params.no_tensor_first_split) {
+ new_ctx_out(true);
+ }

  // process tensors one by one
  size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@@ -230,7 +241,7 @@ struct split_strategy {
  size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
  size_t next_tensors_size = curr_tensors_size + n_bytes;
  if (should_split(i, next_tensors_size)) {
- new_ctx_out();
+ new_ctx_out(false);
  curr_tensors_size = n_bytes;
  } else {
  curr_tensors_size = next_tensors_size;
package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -19,10 +19,12 @@

  struct Stats {
  std::vector<float> values;
+ std::vector<int> counts;
  int ncall = 0;
  };

  struct StatParams {
+ std::string dataset;
  std::string ofile = "imatrix.dat";
  int n_output_frequency = 10;
  int verbosity = 1;
@@ -46,7 +48,7 @@ private:
  std::vector<float> m_src1_data;
  std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
  //
- void save_imatrix(const char * file_name) const;
+ void save_imatrix(const char * file_name, const char * dataset) const;
  void keep_imatrix(int ncall) const;
  };

@@ -120,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
  auto & e = m_stats[wname];

  ++e.ncall;
- // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
- // using the following line, we can correct for that if needed by replacing the line above with:
- //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

  if (e.values.empty()) {
  e.values.resize(src1->ne[0]*n_as, 0);
+ e.counts.resize(src1->ne[0]*n_as, 0);
  }
  else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
  fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@@ -152,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

  for (int j = 0; j < (int)src1->ne[0]; ++j) {
  e.values[e_start + j] += x[j]*x[j];
+ e.counts[e_start + j]++;
  }
  }
  }
@@ -169,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
  auto& e = m_stats[wname];
  if (e.values.empty()) {
  e.values.resize(src1->ne[0], 0);
+ e.counts.resize(src1->ne[0], 0);
  }
  else if (e.values.size() != (size_t)src1->ne[0]) {
  fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@@ -182,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
  const float * x = data + row * src1->ne[0];
  for (int j = 0; j < (int)src1->ne[0]; ++j) {
  e.values[j] += x[j]*x[j];
+ e.counts[j]++;
  }
  }
  if (e.ncall > m_last_call) {
@@ -199,7 +202,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
  }

  void IMatrixCollector::save_imatrix() const {
- save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+ save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
  }

  void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -207,24 +210,39 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
  if (file_name.empty()) file_name = "imatrix.dat";
  file_name += ".at_";
  file_name += std::to_string(ncall);
- save_imatrix(file_name.c_str());
+ save_imatrix(file_name.c_str(), m_params.dataset.c_str());
  }

- void IMatrixCollector::save_imatrix(const char * fname) const {
+ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
  std::ofstream out(fname, std::ios::binary);
  int n_entries = m_stats.size();
- out.write((const char*)&n_entries, sizeof(n_entries));
- for (auto& p : m_stats) {
+ out.write((const char *) &n_entries, sizeof(n_entries));
+ for (const auto & p : m_stats) {
  int len = p.first.size();
- out.write((const char*)&len, sizeof(len));
+ out.write((const char *) &len, sizeof(len));
  out.write(p.first.c_str(), len);
- out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+ out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
  int nval = p.second.values.size();
- out.write((const char*)&nval, sizeof(nval));
- if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+ out.write((const char *) &nval, sizeof(nval));
+ if (nval > 0) {
+ std::vector<float> tmp(nval);
+ for (int i = 0; i < nval; i++) {
+ tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
+ }
+ out.write((const char*)tmp.data(), nval*sizeof(float));
+ }
  }
+
+ // Write the number of call the matrix was computed with
+ out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+ // Write the dataset name at the end of the file to later on specify it in quantize
+ int n_dataset = strlen(dataset);
+ out.write((const char *) &n_dataset, sizeof(n_dataset));
+ out.write(dataset, n_dataset);
+
  if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+ fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
  }
  }

@@ -260,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
  imatrix_data = {};
  return false;
  }
- e.values.resize(nval);
- in.read((char*)e.values.data(), nval*sizeof(float));
+
+ // When re-called from load_imatrix() with add set, this will already be created.
+ if (e.values.empty()) {
+ e.values.resize(nval, 0);
+ e.counts.resize(nval, 0);
+ }
+
+ std::vector<float> tmp(nval);
+ in.read((char*)tmp.data(), nval*sizeof(float));
  if (in.fail()) {
  printf("%s: failed reading data for entry %d\n",__func__,i);
  imatrix_data = {};
  return false;
  }
- e.ncall = ncall;
+
+ // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+ for (int i = 0; i < nval; i++) {
+ e.values[i] += tmp[i];
+ e.counts[i] += ncall;
+ }
+ e.ncall += ncall;
+
  }
  return true;
  }
@@ -547,6 +579,29 @@ int main(int argc, char ** argv) {
  }
  }

+ gpt_params params;
+ params.n_batch = 512;
+ if (!gpt_params_parse(args.size(), args.data(), params)) {
+ return 1;
+ }
+
+ params.logits_all = true;
+ params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+ print_build_info();
+
+ if (params.seed == LLAMA_DEFAULT_SEED) {
+ params.seed = time(NULL);
+ }
+
+ fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ sparams.dataset = params.prompt_file;
  g_collector.set_parameters(std::move(sparams));

  if (!combine_files.empty()) {
@@ -585,28 +640,6 @@ int main(int argc, char ** argv) {
  }
  }

- gpt_params params;
- params.n_batch = 512;
- if (!gpt_params_parse(args.size(), args.data(), params)) {
- return 1;
- }
-
- params.logits_all = true;
- params.n_batch = std::min(params.n_batch, params.n_ctx);
-
- print_build_info();
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
-
  llama_backend_init();
  llama_numa_init(params.numa);