cui-llama.rn 1.2.3 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -2
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +0 -3
- package/android/src/main/jni.cpp +9 -11
- package/cpp/common.cpp +85 -75
- package/cpp/common.h +127 -91
- package/cpp/ggml-aarch64.c +269 -0
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend-impl.h +4 -15
- package/cpp/ggml-backend.cpp +1697 -1626
- package/cpp/ggml-backend.h +13 -25
- package/cpp/ggml-cpp.h +38 -0
- package/cpp/ggml-cpu.c +13720 -0
- package/cpp/ggml-cpu.h +150 -0
- package/cpp/ggml-impl.h +95 -0
- package/cpp/ggml-metal.m +185 -71
- package/cpp/ggml-quants.c +38 -51
- package/cpp/ggml.c +4468 -19500
- package/cpp/ggml.h +26 -146
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +742 -249
- package/cpp/llama-sampling.h +21 -2
- package/cpp/llama-vocab.cpp +49 -9
- package/cpp/llama-vocab.h +35 -11
- package/cpp/llama.cpp +2468 -2307
- package/cpp/llama.h +65 -32
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +117 -118
- package/cpp/sampling.h +20 -20
- package/cpp/sgemm.cpp +57 -0
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +0 -1
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +0 -1
package/cpp/sampling.cpp
CHANGED
@@ -98,8 +98,8 @@ struct ring_buffer {
|
|
98
98
|
std::vector<T> data;
|
99
99
|
};
|
100
100
|
|
101
|
-
struct
|
102
|
-
|
101
|
+
struct common_sampler {
|
102
|
+
common_sampler_params params;
|
103
103
|
|
104
104
|
struct llama_sampler * grmr;
|
105
105
|
struct llama_sampler * chain;
|
@@ -125,26 +125,28 @@ struct gpt_sampler {
|
|
125
125
|
}
|
126
126
|
};
|
127
127
|
|
128
|
-
std::string
|
128
|
+
std::string common_sampler_params::print() const {
|
129
129
|
char result[1024];
|
130
130
|
|
131
131
|
snprintf(result, sizeof(result),
|
132
132
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
133
|
-
"\
|
133
|
+
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
|
134
|
+
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
|
134
135
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
135
136
|
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
136
|
-
|
137
|
+
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
|
138
|
+
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
|
137
139
|
mirostat, mirostat_eta, mirostat_tau);
|
138
140
|
|
139
141
|
return std::string(result);
|
140
142
|
}
|
141
143
|
|
142
|
-
struct
|
144
|
+
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
|
143
145
|
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
144
146
|
|
145
147
|
lparams.no_perf = params.no_perf;
|
146
148
|
|
147
|
-
auto * result = new
|
149
|
+
auto * result = new common_sampler {
|
148
150
|
/* .params = */ params,
|
149
151
|
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
150
152
|
/* .chain = */ llama_sampler_chain_init(lparams),
|
@@ -170,64 +172,61 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
|
|
170
172
|
params.penalty_present,
|
171
173
|
params.penalize_nl,
|
172
174
|
params.ignore_eos));
|
173
|
-
|
174
|
-
if (params.
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
llama_sampler_chain_add(result->chain,
|
186
|
-
|
187
|
-
case GPT_SAMPLER_TYPE_TFS_Z:
|
188
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
189
|
-
break;
|
190
|
-
case GPT_SAMPLER_TYPE_TYPICAL_P:
|
191
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
192
|
-
break;
|
193
|
-
case GPT_SAMPLER_TYPE_XTC:
|
194
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.min_keep, params.seed));
|
195
|
-
break;
|
196
|
-
case GPT_SAMPLER_TYPE_TEMPERATURE:
|
197
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
175
|
+
|
176
|
+
if (params.mirostat == 0) {
|
177
|
+
for (const auto & cnstr : params.samplers) {
|
178
|
+
switch (cnstr) {
|
179
|
+
case COMMON_SAMPLER_TYPE_DRY:
|
180
|
+
{
|
181
|
+
std::vector<const char*> c_breakers;
|
182
|
+
c_breakers.reserve(params.dry_sequence_breakers.size());
|
183
|
+
for (const auto& str : params.dry_sequence_breakers) {
|
184
|
+
c_breakers.push_back(str.c_str());
|
185
|
+
}
|
186
|
+
|
187
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
188
|
+
}
|
198
189
|
break;
|
199
|
-
|
200
|
-
|
201
|
-
|
190
|
+
case COMMON_SAMPLER_TYPE_TOP_K:
|
191
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
192
|
+
break;
|
193
|
+
case COMMON_SAMPLER_TYPE_TOP_P:
|
194
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
195
|
+
break;
|
196
|
+
case COMMON_SAMPLER_TYPE_MIN_P:
|
197
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
198
|
+
break;
|
199
|
+
case COMMON_SAMPLER_TYPE_XTC:
|
200
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
201
|
+
break;
|
202
|
+
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
203
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
204
|
+
break;
|
205
|
+
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
206
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
207
|
+
break;
|
208
|
+
case COMMON_SAMPLER_TYPE_INFILL:
|
209
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
|
210
|
+
break;
|
211
|
+
default:
|
212
|
+
LM_GGML_ASSERT(false && "unknown sampler type");
|
202
213
|
}
|
203
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
|
204
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
205
|
-
} else if (params.mirostat == 1) {
|
206
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
207
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
208
|
-
} else if (params.mirostat == 2) {
|
209
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
210
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
211
|
-
} else {
|
212
|
-
LM_GGML_ASSERT(false && "unknown mirostat version");
|
213
214
|
}
|
215
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
216
|
+
} else if (params.mirostat == 1) {
|
217
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
218
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
219
|
+
} else if (params.mirostat == 2) {
|
220
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
221
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
214
222
|
} else {
|
215
|
-
|
216
|
-
// some use cases require to sample greedily, but still obtain the probabilities of the top tokens
|
217
|
-
// ref: https://github.com/ggerganov/llama.cpp/pull/9605
|
218
|
-
//
|
219
|
-
// the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
|
220
|
-
// it is much faster, since we avoid sorting all tokens and should give a good approximation
|
221
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
|
222
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
|
223
|
-
}
|
224
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
|
223
|
+
LM_GGML_ASSERT(false && "unknown mirostat version");
|
225
224
|
}
|
226
225
|
|
227
226
|
return result;
|
228
227
|
}
|
229
228
|
|
230
|
-
void
|
229
|
+
void common_sampler_free(struct common_sampler * gsmpl) {
|
231
230
|
if (gsmpl) {
|
232
231
|
llama_sampler_free(gsmpl->grmr);
|
233
232
|
|
@@ -237,7 +236,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
|
|
237
236
|
}
|
238
237
|
}
|
239
238
|
|
240
|
-
void
|
239
|
+
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
241
240
|
if (accept_grammar) {
|
242
241
|
llama_sampler_accept(gsmpl->grmr, token);
|
243
242
|
}
|
@@ -247,14 +246,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
|
|
247
246
|
gsmpl->prev.push_back(token);
|
248
247
|
}
|
249
248
|
|
250
|
-
void
|
249
|
+
void common_sampler_reset(struct common_sampler * gsmpl) {
|
251
250
|
llama_sampler_reset(gsmpl->grmr);
|
252
251
|
|
253
252
|
llama_sampler_reset(gsmpl->chain);
|
254
253
|
}
|
255
254
|
|
256
|
-
struct
|
257
|
-
return new
|
255
|
+
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
256
|
+
return new common_sampler {
|
258
257
|
/* .params = */ gsmpl->params,
|
259
258
|
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
260
259
|
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
@@ -264,7 +263,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
|
|
264
263
|
};
|
265
264
|
}
|
266
265
|
|
267
|
-
void
|
266
|
+
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
268
267
|
// TODO: measure grammar performance
|
269
268
|
|
270
269
|
if (gsmpl) {
|
@@ -275,7 +274,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
|
275
274
|
}
|
276
275
|
}
|
277
276
|
|
278
|
-
llama_token
|
277
|
+
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
279
278
|
gsmpl->set_logits(ctx, idx);
|
280
279
|
|
281
280
|
auto & grmr = gsmpl->grmr;
|
@@ -321,21 +320,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
|
|
321
320
|
return cur_p.data[cur_p.selected].id;
|
322
321
|
}
|
323
322
|
|
324
|
-
uint32_t
|
323
|
+
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
325
324
|
return llama_sampler_get_seed(gsmpl->chain);
|
326
325
|
}
|
327
326
|
|
328
327
|
// helpers
|
329
328
|
|
330
|
-
llama_token_data_array *
|
329
|
+
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
331
330
|
return &gsmpl->cur_p;
|
332
331
|
}
|
333
332
|
|
334
|
-
llama_token
|
333
|
+
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
335
334
|
return gsmpl->prev.rat(0);
|
336
335
|
}
|
337
336
|
|
338
|
-
std::string
|
337
|
+
std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
339
338
|
std::string result = "logits ";
|
340
339
|
|
341
340
|
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
@@ -346,7 +345,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
|
346
345
|
return result;
|
347
346
|
}
|
348
347
|
|
349
|
-
std::string
|
348
|
+
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
|
350
349
|
n = std::min(n, (int) gsmpl->prev.size());
|
351
350
|
|
352
351
|
if (n <= 0) {
|
@@ -361,68 +360,67 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
|
|
361
360
|
|
362
361
|
LM_GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
363
362
|
|
364
|
-
result +=
|
363
|
+
result += common_token_to_piece(ctx_main, id);
|
365
364
|
}
|
366
365
|
|
367
366
|
return result;
|
368
367
|
}
|
369
368
|
|
370
|
-
char
|
369
|
+
char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
371
370
|
switch (cnstr) {
|
372
|
-
case
|
373
|
-
case
|
374
|
-
case
|
375
|
-
case
|
376
|
-
case
|
377
|
-
case
|
378
|
-
case
|
371
|
+
case COMMON_SAMPLER_TYPE_DRY: return 'd';
|
372
|
+
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
373
|
+
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
374
|
+
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
375
|
+
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
376
|
+
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
377
|
+
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
378
|
+
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
|
379
379
|
default : return '?';
|
380
380
|
}
|
381
381
|
}
|
382
382
|
|
383
|
-
std::string
|
383
|
+
std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
384
384
|
switch (cnstr) {
|
385
|
-
case
|
386
|
-
case
|
387
|
-
case
|
388
|
-
case
|
389
|
-
case
|
390
|
-
case
|
391
|
-
case
|
385
|
+
case COMMON_SAMPLER_TYPE_DRY: return "dry";
|
386
|
+
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
387
|
+
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
388
|
+
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
389
|
+
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
390
|
+
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
391
|
+
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
392
|
+
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
|
392
393
|
default : return "";
|
393
394
|
}
|
394
395
|
}
|
395
396
|
|
396
|
-
std::vector<
|
397
|
-
std::unordered_map<std::string,
|
398
|
-
{ "
|
399
|
-
{ "
|
400
|
-
{ "
|
401
|
-
{ "
|
402
|
-
{ "
|
403
|
-
{ "
|
404
|
-
{ "
|
397
|
+
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
398
|
+
std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
|
399
|
+
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
400
|
+
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
401
|
+
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
402
|
+
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
403
|
+
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
404
|
+
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
405
|
+
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
406
|
+
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
405
407
|
};
|
406
408
|
|
407
409
|
// since samplers names are written multiple ways
|
408
410
|
// make it ready for both system names and input names
|
409
|
-
std::unordered_map<std::string,
|
410
|
-
{ "top-k",
|
411
|
-
{ "top-p",
|
412
|
-
{ "nucleus",
|
413
|
-
{ "typical-p",
|
414
|
-
{ "typical",
|
415
|
-
{ "typ-p",
|
416
|
-
{ "typ",
|
417
|
-
{ "min-p",
|
418
|
-
{ "
|
419
|
-
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
|
420
|
-
{ "xtc_p", GPT_SAMPLER_TYPE_XTC},
|
421
|
-
{ "xtc_t", GPT_SAMPLER_TYPE_XTC},
|
422
|
-
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
|
411
|
+
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
412
|
+
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
413
|
+
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
414
|
+
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
415
|
+
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
416
|
+
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
417
|
+
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
418
|
+
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
419
|
+
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
|
420
|
+
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
423
421
|
};
|
424
422
|
|
425
|
-
std::vector<
|
423
|
+
std::vector<common_sampler_type> samplers;
|
426
424
|
samplers.reserve(names.size());
|
427
425
|
|
428
426
|
for (const auto & name : names) {
|
@@ -442,18 +440,19 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
|
|
442
440
|
return samplers;
|
443
441
|
}
|
444
442
|
|
445
|
-
std::vector<
|
446
|
-
std::unordered_map<char,
|
447
|
-
{
|
448
|
-
{
|
449
|
-
{
|
450
|
-
{
|
451
|
-
{
|
452
|
-
{
|
453
|
-
{
|
443
|
+
std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
|
444
|
+
std::unordered_map<char, common_sampler_type> sampler_name_map = {
|
445
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
|
446
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
447
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
448
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
449
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
450
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
451
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
452
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
|
454
453
|
};
|
455
454
|
|
456
|
-
std::vector<
|
455
|
+
std::vector<common_sampler_type> samplers;
|
457
456
|
samplers.reserve(chars.size());
|
458
457
|
|
459
458
|
for (const auto & c : chars) {
|
package/cpp/sampling.h
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
#include <string>
|
8
8
|
#include <vector>
|
9
9
|
|
10
|
-
//
|
10
|
+
// common_sampler extends llama_sampler with additional functionality:
|
11
11
|
//
|
12
12
|
// - grammar support
|
13
13
|
// - custom sampler logic based on the parameters
|
@@ -23,30 +23,30 @@
|
|
23
23
|
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
|
24
24
|
// grammar constraints are applied to the full vocabulary and the token is resampled.
|
25
25
|
//
|
26
|
-
// The
|
26
|
+
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
|
27
27
|
// be moved into the core llama library.
|
28
28
|
//
|
29
|
-
// For convenience, the
|
29
|
+
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
|
30
30
|
// This can be used to access the probabilities of the rest of the non-sampled tokens.
|
31
31
|
//
|
32
32
|
// TODO: measure grammar performance
|
33
33
|
//
|
34
34
|
|
35
|
-
struct
|
35
|
+
struct common_sampler;
|
36
36
|
|
37
37
|
// llama_sampler API overloads
|
38
38
|
|
39
|
-
struct
|
39
|
+
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
|
40
40
|
|
41
|
-
void
|
41
|
+
void common_sampler_free(struct common_sampler * gsmpl);
|
42
42
|
|
43
43
|
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
44
|
-
void
|
45
|
-
void
|
46
|
-
struct
|
44
|
+
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
|
45
|
+
void common_sampler_reset (struct common_sampler * gsmpl);
|
46
|
+
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
|
47
47
|
|
48
48
|
// arguments can be nullptr to skip printing
|
49
|
-
void
|
49
|
+
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
|
50
50
|
|
51
51
|
// extended sampling implementation:
|
52
52
|
//
|
@@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
|
58
58
|
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
59
59
|
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
60
60
|
//
|
61
|
-
llama_token
|
61
|
+
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
62
62
|
|
63
|
-
uint32_t
|
63
|
+
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
64
64
|
|
65
65
|
// helpers
|
66
66
|
|
67
67
|
// access the internal list of current candidate tokens
|
68
|
-
llama_token_data_array *
|
68
|
+
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
|
69
69
|
|
70
70
|
// get the last accepted token
|
71
|
-
llama_token
|
71
|
+
llama_token common_sampler_last(const struct common_sampler * gsmpl);
|
72
72
|
|
73
73
|
// print the sampler chain into a string
|
74
|
-
std::string
|
74
|
+
std::string common_sampler_print(const struct common_sampler * gsmpl);
|
75
75
|
|
76
76
|
// get a string representation of the last accepted tokens
|
77
|
-
std::string
|
77
|
+
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
|
78
78
|
|
79
|
-
char
|
80
|
-
std::string
|
79
|
+
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
|
80
|
+
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
|
81
81
|
|
82
|
-
std::vector<enum
|
83
|
-
std::vector<enum
|
82
|
+
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
83
|
+
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
|
package/cpp/sgemm.cpp
CHANGED
@@ -942,6 +942,36 @@ class tinyBLAS_Q0_AVX {
|
|
942
942
|
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
|
943
943
|
}
|
944
944
|
|
945
|
+
inline __m256i load(const block_q5_0 *b) {
|
946
|
+
return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh));
|
947
|
+
}
|
948
|
+
|
949
|
+
inline __m128i load0(const block_q5_0* b) {
|
950
|
+
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
951
|
+
uint32_t x32;
|
952
|
+
memcpy(&x32, b->qh, sizeof(uint32_t));
|
953
|
+
__m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x);
|
954
|
+
__m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
|
955
|
+
_mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
|
956
|
+
_mm_shuffle_epi8(_mm_set1_epi32(x32),
|
957
|
+
_mm_set_epi64x(0x0101010101010101, 0x0000000000000000))));
|
958
|
+
bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0));
|
959
|
+
return _mm_or_si128(qxl, bytesl);
|
960
|
+
}
|
961
|
+
|
962
|
+
inline __m128i load1(const block_q5_0* b) {
|
963
|
+
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
964
|
+
uint32_t x32;
|
965
|
+
memcpy(&x32, b->qh, sizeof(uint32_t));
|
966
|
+
__m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4));
|
967
|
+
__m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
|
968
|
+
_mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
|
969
|
+
_mm_shuffle_epi8(_mm_set1_epi32(x32),
|
970
|
+
_mm_set_epi64x(0x0303030303030303, 0x0202020202020202))));
|
971
|
+
bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0));
|
972
|
+
return _mm_or_si128(qxh, bytesh);
|
973
|
+
}
|
974
|
+
|
945
975
|
inline __m256i load(const block_iq4_nl *b) {
|
946
976
|
return MM256_SET_M128I(load1(b), load0(b));
|
947
977
|
}
|
@@ -973,6 +1003,17 @@ class tinyBLAS_Q0_AVX {
|
|
973
1003
|
_mm_srli_epi16(x, 4), 1));
|
974
1004
|
}
|
975
1005
|
|
1006
|
+
static inline __m256i bittobyte(const uint8_t *p) {
|
1007
|
+
uint32_t x32;
|
1008
|
+
memcpy(&x32, p, sizeof(uint32_t));
|
1009
|
+
__m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
|
1010
|
+
_mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe),
|
1011
|
+
_mm256_shuffle_epi8(_mm256_set1_epi32(x32),
|
1012
|
+
_mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
|
1013
|
+
0x0101010101010101, 0x0000000000000000))));
|
1014
|
+
return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0));
|
1015
|
+
}
|
1016
|
+
|
976
1017
|
const TA *const A;
|
977
1018
|
const TB *const B;
|
978
1019
|
TC *const C;
|
@@ -1182,6 +1223,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
|
1182
1223
|
#endif
|
1183
1224
|
}
|
1184
1225
|
|
1226
|
+
case LM_GGML_TYPE_Q5_0: {
|
1227
|
+
if (Btype != LM_GGML_TYPE_Q8_0)
|
1228
|
+
return false;
|
1229
|
+
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
1230
|
+
tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
|
1231
|
+
k, (const block_q5_0 *)A, lda,
|
1232
|
+
(const block_q8_0 *)B, ldb,
|
1233
|
+
(float *)C, ldc,
|
1234
|
+
ith, nth};
|
1235
|
+
tb.matmul(m, n);
|
1236
|
+
return true;
|
1237
|
+
#else
|
1238
|
+
return false;
|
1239
|
+
#endif
|
1240
|
+
}
|
1241
|
+
|
1185
1242
|
case LM_GGML_TYPE_IQ4_NL: {
|
1186
1243
|
if (Btype != LM_GGML_TYPE_Q8_0)
|
1187
1244
|
return false;
|
@@ -1 +1 @@
|
|
1
|
-
{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,
|
1
|
+
{"version":3,"names":["_reactNative","require","_default","TurboModuleRegistry","get","exports","default"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":";;;;;;AACA,IAAAA,YAAA,GAAAC,OAAA;AAAkD,IAAAC,QAAA,GAqKnCC,gCAAmB,CAACC,GAAG,CAAO,SAAS,CAAC;AAAAC,OAAA,CAAAC,OAAA,GAAAJ,QAAA"}
|
@@ -1 +1 @@
|
|
1
|
-
{"version":3,"names":["TurboModuleRegistry","get"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":"AACA,SAASA,mBAAmB,QAAQ,cAAc;
|
1
|
+
{"version":3,"names":["TurboModuleRegistry","get"],"sourceRoot":"..\\..\\src","sources":["NativeRNLlama.ts"],"mappings":"AACA,SAASA,mBAAmB,QAAQ,cAAc;AAqKlD,eAAeA,mBAAmB,CAACC,GAAG,CAAO,SAAS,CAAC"}
|
@@ -1 +1 @@
|
|
1
|
-
{"version":3,"file":"NativeRNLlama.d.ts","sourceRoot":"","sources":["../../src/NativeRNLlama.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAA;AAG/C,MAAM,MAAM,mBAAmB,GAAG;IAChC,KAAK,EAAE,MAAM,CAAA;IACb,cAAc,CAAC,EAAE,OAAO,CAAA;IAExB,SAAS,CAAC,EAAE,OAAO,CAAA;IAEnB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAEhB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;IAErB,SAAS,CAAC,EAAE,OAAO,CAAA;IACnB,QAAQ,CAAC,EAAE,OAAO,CAAA;IAClB,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,WAAW,CAAC,EAAE,MAAM,CAAA;IAEpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,eAAe,CAAC,EAAE,MAAM,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,MAAM,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,IAAI,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;IAEpB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,
|
1
|
+
{"version":3,"file":"NativeRNLlama.d.ts","sourceRoot":"","sources":["../../src/NativeRNLlama.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAA;AAG/C,MAAM,MAAM,mBAAmB,GAAG;IAChC,KAAK,EAAE,MAAM,CAAA;IACb,cAAc,CAAC,EAAE,OAAO,CAAA;IAExB,SAAS,CAAC,EAAE,OAAO,CAAA;IAEnB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAEhB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;IAErB,SAAS,CAAC,EAAE,OAAO,CAAA;IACnB,QAAQ,CAAC,EAAE,OAAO,CAAA;IAClB,UAAU,CAAC,EAAE,OAAO,CAAA;IAEpB,IAAI,CAAC,EAAE,MAAM,CAAA;IACb,WAAW,CAAC,EAAE,MAAM,CAAA;IAEpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,eAAe,CAAC,EAAE,MAAM,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,MAAM,EAAE,MAAM,CAAA;IACd,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,IAAI,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;IAEpB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,cAAc,CAAC,EAAE,MAAM,CAAA;IACvB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,eAAe,CAAC,EAAE,MAAM,CAAA;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,WAAW,CAAC,EAAE,OAAO,CAAA;IACrB,IAAI,CAAC,EAAE,MAAM,CAAA;IAEb,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB,UAAU,CAAC,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAA;IAEjC,uBAAuB,EAAE,OAAO,CAAA;CACjC,CAAA;AAED,MAAM,MAAM,6BAA6B,GAAG;IAC1C,OAAO,EAAE,MAAM,CAAA;IACf,IAAI,EAAE,MAAM,CAAA;CACb,CAAA;AAED,MAAM,MAAM,yBAAyB,GAAG;IACtC,OAAO,EAAE,MAAM,CAAA;IACf,KAAK,EAAE,KAAK,CAAC,6BAA6B,CAAC,CAAA;CAC5C,CAAA;AAED,MAAM,MAAM,6BAA6B,GAAG;IAC1C,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;IACjB,mBAAmB,EAAE,MAAM,CAAA;IAC3B,iBAAiB,EAAE,MAAM,CAAA;IACzB,WAAW,EAAE,MAAM,CAAA;IACnB,YAAY,EAAE,MAAM,CAAA;IACpB,sBAAsB,EAAE,MAAM,CAAA;IAC9B,oBAAoB,EAAE,MAAM,CAAA;CAC7B,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,IAAI,EAAE,MAAM,CAAA;IAEZ,gBAAgB,EAAE,MAAM,CAAA;IACxB,gBAAgB,EAAE,MAAM,CAAA;IACxB,SAAS,EAAE,OAAO,CAAA;IAClB,WAAW,EAAE,OAAO,CAAA;IACpB,YAAY,EAAE,MAAM,CAAA;IACpB,aAAa,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,OAAO,EAAE,6BAA6B,CAAA;IAEtC,wBAAwB,CAAC,EAAE,KAAK,CAAC,yBAAyB,CAAC,CAAA;CAC5D,CAAA;AAED,MAAM,MAAM,oBAAoB,GAAG;IACjC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;CACtB,CAAA;AAED,MAAM,MAAM,qBAAqB,GAAG;IAClC,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,CAAA;CACzB,CAAA;AAED,MAAM,MAAM,kBAAkB,GAAG;IAC/B,SAAS,EAAE,MAAM,CAAA;IACjB,GAAG,EAAE,OAAO,CAAA;IACZ,WAAW,EAAE,MAAM,CAAA;IACnB,KAAK,EAAE,MAAM,CAAA;CACd,CAAA;AAED,MAAM,MAAM,uBAAuB,GAAG;IACpC,aAAa,EAAE,MAAM,CAAA;IACrB,MAAM,EAAE,MAAM,CAAA;CACf,CAAA;AAED,MAAM,MAAM,sBAAsB,GAAG;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,MAAM,CAAA;CAChB,CAAA;AAED,MAAM,MAAM,iBAAiB,GAAG;IAC9B,KAAK,EAAE,OAAO,CAAA;IACd,IAAI,EAAE,OAAO,CAAA;IACb,OAAO,EAAE,OAAO,CAAA;CACjB,CAAA;AAED,MAAM,WAAW,IAAK,SAAQ,WAAW;IACvC,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAC7C,WAAW,CAAC,MAAM,EAAE,mBAAmB,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAAA;IAErE,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,uBAAuB,CAAC,CAAA;IACnC,WAAW,CACT,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,EAChB,IAAI,EAAE,MAAM,GACX,OAAO,CAAC,MAAM,CAAC,CAAA;IAClB,UAAU,CACR,SAAS,EAAE,MAAM,EACjB,MAAM,EAAE,sBAAsB,GAC7B,OAAO,CAAC,sBAAsB,CAAC,CAAA;IAClC,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAChD,aAAa,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC,CAAA;IAC7E,YAAY,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,oBAAoB,CAAA;IACnE,cAAc,IAAK,OAAO,CAAC,iBAAiB,CAAC,CAAA;IAC7C,gBAAgB,CACd,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,sBAAsB,EAAE,EAClC,YAAY,CAAC,EAAE,MAAM,GACpB,OAAO,CAAC,MAAM,CAAC,CAAA;IAClB,UAAU,CAAC,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,CAAA;IAChE,SAAS,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAAA;IAC1E,KAAK,CACH,SAAS,EAAE,MAAM,EACjB,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,MAAM,CAAC,CAAA;IAElB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAA;IAEhD,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CAAA;CACpC;;AAED,wBAA+D"}
|
package/package.json
CHANGED