cui-llama.rn 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +3 -4
- package/cpp/common.cpp +183 -1990
- package/cpp/common.h +101 -130
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +38 -28
- package/cpp/ggml-quants.c +275 -84
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +30 -67
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +218 -102
- package/cpp/llama.cpp +599 -120
- package/cpp/llama.h +33 -25
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +9 -11
- package/cpp/sampling.cpp +12 -9
- package/cpp/sampling.h +4 -56
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/llama-sampling.cpp
CHANGED
@@ -8,49 +8,45 @@
|
|
8
8
|
#include <cstring>
|
9
9
|
#include <ctime>
|
10
10
|
#include <cfloat>
|
11
|
+
#include <chrono>
|
12
|
+
#include <cmath>
|
11
13
|
#include <numeric>
|
12
14
|
#include <random>
|
13
15
|
#include <unordered_map>
|
14
16
|
|
15
|
-
static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng
|
16
|
-
|
17
|
-
|
18
|
-
for (size_t i = 0; i < cur_p->size; ++i) {
|
19
|
-
probs[i] = cur_p->data[i].p;
|
20
|
-
}
|
21
|
-
|
22
|
-
std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
|
23
|
-
#else
|
24
|
-
// avoid the copy with a custom iterator
|
17
|
+
static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
|
18
|
+
// iterator for the probabilities
|
19
|
+
#ifdef __GNUC__
|
25
20
|
#pragma GCC diagnostic push
|
26
21
|
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
|
22
|
+
#endif
|
27
23
|
|
28
24
|
struct probs_iterator {
|
29
25
|
typedef std::input_iterator_tag iterator_category;
|
30
26
|
typedef float value_type;
|
31
27
|
typedef float * pointer;
|
32
28
|
typedef float & reference;
|
33
|
-
typedef
|
29
|
+
typedef ptrdiff_t difference_type;
|
34
30
|
|
35
|
-
const
|
36
|
-
size_t i;
|
31
|
+
const llama_token_data * data;
|
37
32
|
|
38
|
-
bool operator==(const probs_iterator & other) const { return data
|
39
|
-
bool operator!=(const probs_iterator & other) const { return data
|
40
|
-
float operator*() const { return data->
|
41
|
-
probs_iterator & operator++() { ++
|
42
|
-
probs_iterator operator++(int) { probs_iterator tmp = *this; ++
|
33
|
+
bool operator==(const probs_iterator & other) const { return data == other.data; }
|
34
|
+
bool operator!=(const probs_iterator & other) const { return data != other.data; }
|
35
|
+
const float & operator*() const { return data->p; }
|
36
|
+
probs_iterator & operator++() { ++data; return *this; }
|
37
|
+
probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; }
|
43
38
|
};
|
44
|
-
#pragma GCC diagnostic pop
|
45
|
-
|
46
|
-
std::discrete_distribution<size_t> dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size});
|
47
39
|
|
48
|
-
|
40
|
+
#ifdef __GNUC__
|
41
|
+
#pragma GCC diagnostic pop
|
49
42
|
#endif
|
50
43
|
|
44
|
+
std::discrete_distribution<int> dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size});
|
45
|
+
|
51
46
|
return dist(rng);
|
52
47
|
}
|
53
48
|
|
49
|
+
/*
|
54
50
|
static void llama_log_softmax(float * array, size_t size) {
|
55
51
|
float max_l = *std::max_element(array, array + size);
|
56
52
|
float sum = 0.f;
|
@@ -64,6 +60,7 @@ static void llama_log_softmax(float * array, size_t size) {
|
|
64
60
|
array[i] = logf(array[i] / sum);
|
65
61
|
}
|
66
62
|
}
|
63
|
+
*/
|
67
64
|
|
68
65
|
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
|
69
66
|
LM_GGML_ASSERT(cur_p->size > 0);
|
@@ -166,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
|
166
163
|
cur_p->size = k;
|
167
164
|
}
|
168
165
|
|
166
|
+
static uint32_t get_rng_seed(uint32_t seed) {
|
167
|
+
if (seed == LLAMA_DEFAULT_SEED) {
|
168
|
+
// use system clock if std::random_device is not a true RNG
|
169
|
+
static bool is_rd_prng = std::random_device().entropy() == 0;
|
170
|
+
if (is_rd_prng) {
|
171
|
+
return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count();
|
172
|
+
}
|
173
|
+
std::random_device rd;
|
174
|
+
return rd();
|
175
|
+
}
|
176
|
+
return seed;
|
177
|
+
}
|
178
|
+
|
169
179
|
// llama_sampler API
|
170
180
|
|
171
181
|
const char * llama_sampler_name(const struct llama_sampler * smpl) {
|
@@ -231,67 +241,92 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
|
|
231
241
|
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
232
242
|
}
|
233
243
|
|
234
|
-
llama_token_data_array cur_p = {
|
244
|
+
llama_token_data_array cur_p = {
|
245
|
+
/* .data = */ cur.data(),
|
246
|
+
/* .size = */ cur.size(),
|
247
|
+
/* .selected = */ -1,
|
248
|
+
/* .sorted = */ false,
|
249
|
+
};
|
235
250
|
|
236
251
|
llama_sampler_apply(smpl, &cur_p);
|
237
252
|
|
238
|
-
|
253
|
+
LM_GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
|
254
|
+
|
255
|
+
auto token = cur_p.data[cur_p.selected].id;
|
256
|
+
|
257
|
+
llama_sampler_accept(smpl, token);
|
258
|
+
|
259
|
+
return token;
|
239
260
|
}
|
240
261
|
|
241
262
|
// sampler chain
|
242
263
|
|
243
|
-
static struct
|
244
|
-
|
245
|
-
|
246
|
-
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
264
|
+
static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
|
265
|
+
return "chain";
|
266
|
+
}
|
247
267
|
|
248
|
-
|
268
|
+
static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) {
|
269
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
249
270
|
|
250
|
-
|
251
|
-
llama_sampler_accept(smpl, token);
|
252
|
-
}
|
271
|
+
time_meas tm(chain->t_sample_us, chain->params.no_perf);
|
253
272
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
273
|
+
for (auto * smpl : chain->samplers) {
|
274
|
+
llama_sampler_accept(smpl, token);
|
275
|
+
}
|
258
276
|
|
259
|
-
|
277
|
+
chain->n_sample++;
|
278
|
+
}
|
260
279
|
|
261
|
-
|
262
|
-
|
263
|
-
}
|
264
|
-
},
|
265
|
-
/* .reset = */ [](struct llama_sampler * smpl) {
|
266
|
-
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
280
|
+
static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
281
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
267
282
|
|
268
|
-
|
269
|
-
llama_sampler_reset(smpl);
|
270
|
-
}
|
283
|
+
time_meas tm(chain->t_sample_us, chain->params.no_perf);
|
271
284
|
|
272
|
-
|
273
|
-
|
274
|
-
}
|
275
|
-
|
276
|
-
const auto * chain_src = (const llama_sampler_chain *) smpl->ctx;
|
285
|
+
for (auto * smpl : chain->samplers) {
|
286
|
+
llama_sampler_apply(smpl, cur_p);
|
287
|
+
}
|
288
|
+
}
|
277
289
|
|
278
|
-
|
290
|
+
static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
|
291
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
279
292
|
|
280
|
-
|
281
|
-
|
282
|
-
|
293
|
+
for (auto * smpl : chain->samplers) {
|
294
|
+
llama_sampler_reset(smpl);
|
295
|
+
}
|
283
296
|
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
297
|
+
chain->t_sample_us = 0;
|
298
|
+
chain->n_sample = 0;
|
299
|
+
}
|
288
300
|
|
289
|
-
|
290
|
-
|
291
|
-
|
301
|
+
static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
|
302
|
+
const auto * chain_src = (const llama_sampler_chain *) smpl->ctx;
|
303
|
+
|
304
|
+
auto * result = llama_sampler_chain_init(chain_src->params);
|
305
|
+
|
306
|
+
for (auto * smpl : chain_src->samplers) {
|
307
|
+
llama_sampler_chain_add(result, llama_sampler_clone(smpl));
|
308
|
+
}
|
309
|
+
|
310
|
+
return result;
|
311
|
+
}
|
312
|
+
|
313
|
+
static void llama_sampler_chain_free(struct llama_sampler * smpl) {
|
314
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
292
315
|
|
293
|
-
|
294
|
-
|
316
|
+
for (auto * smpl : chain->samplers) {
|
317
|
+
llama_sampler_free(smpl);
|
318
|
+
}
|
319
|
+
|
320
|
+
delete chain;
|
321
|
+
}
|
322
|
+
|
323
|
+
static struct llama_sampler_i llama_sampler_chain_i = {
|
324
|
+
/* .name = */ llama_sampler_chain_name,
|
325
|
+
/* .accept = */ llama_sampler_chain_accept,
|
326
|
+
/* .apply = */ llama_sampler_chain_apply,
|
327
|
+
/* .reset = */ llama_sampler_chain_reset,
|
328
|
+
/* .clone = */ llama_sampler_chain_clone,
|
329
|
+
/* .free = */ llama_sampler_chain_free,
|
295
330
|
};
|
296
331
|
|
297
332
|
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
|
@@ -311,25 +346,30 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
|
|
311
346
|
p->samplers.push_back(smpl);
|
312
347
|
}
|
313
348
|
|
314
|
-
llama_sampler_timings llama_sampler_chain_timings(struct llama_sampler * chain) {
|
315
|
-
auto * p = (llama_sampler_chain *) chain->ctx;
|
316
|
-
struct llama_sampler_timings result = {
|
317
|
-
p -> t_sample_us,
|
318
|
-
p -> n_sample
|
319
|
-
};
|
320
|
-
return result;
|
321
|
-
}
|
322
349
|
|
323
350
|
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
|
324
351
|
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
325
352
|
|
326
|
-
if (i < 0 || i >=
|
353
|
+
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
327
354
|
return nullptr;
|
328
355
|
}
|
329
356
|
|
330
357
|
return p->samplers[i];
|
331
358
|
}
|
332
359
|
|
360
|
+
struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
|
361
|
+
auto * p = (llama_sampler_chain *) chain->ctx;
|
362
|
+
|
363
|
+
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
364
|
+
return nullptr;
|
365
|
+
}
|
366
|
+
|
367
|
+
auto * result = p->samplers[i];
|
368
|
+
p->samplers.erase(p->samplers.begin() + i);
|
369
|
+
|
370
|
+
return result;
|
371
|
+
}
|
372
|
+
|
333
373
|
int llama_sampler_chain_n(const struct llama_sampler * chain) {
|
334
374
|
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
335
375
|
|
@@ -375,10 +415,9 @@ struct llama_sampler * llama_sampler_init_greedy() {
|
|
375
415
|
|
376
416
|
struct llama_sampler_dist {
|
377
417
|
const uint32_t seed;
|
418
|
+
uint32_t seed_cur;
|
378
419
|
|
379
420
|
std::mt19937 rng;
|
380
|
-
|
381
|
-
std::vector<float> probs; // work array
|
382
421
|
};
|
383
422
|
|
384
423
|
static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
|
@@ -387,7 +426,7 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
|
|
387
426
|
|
388
427
|
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
389
428
|
auto * ctx = (llama_sampler_dist *) smpl->ctx;
|
390
|
-
cur_p->selected = llama_sample_dist(cur_p, ctx->rng
|
429
|
+
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
|
391
430
|
}
|
392
431
|
|
393
432
|
static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
|
@@ -406,7 +445,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
|
|
406
445
|
|
407
446
|
static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
|
408
447
|
auto * ctx = (llama_sampler_dist *) smpl->ctx;
|
409
|
-
ctx->
|
448
|
+
ctx->seed_cur = get_rng_seed(ctx->seed);
|
449
|
+
ctx->rng.seed(ctx->seed_cur);
|
410
450
|
}
|
411
451
|
|
412
452
|
static void llama_sampler_dist_free(struct llama_sampler * smpl) {
|
@@ -423,12 +463,13 @@ static struct llama_sampler_i llama_sampler_dist_i = {
|
|
423
463
|
};
|
424
464
|
|
425
465
|
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
466
|
+
auto seed_cur = get_rng_seed(seed);
|
426
467
|
return new llama_sampler {
|
427
468
|
/* .iface = */ &llama_sampler_dist_i,
|
428
469
|
/* .ctx = */ new llama_sampler_dist {
|
429
|
-
/* .seed
|
430
|
-
/* .
|
431
|
-
/* .
|
470
|
+
/* .seed = */ seed,
|
471
|
+
/* .seed_cur = */ seed_cur,
|
472
|
+
/* .rng = */ std::mt19937(seed_cur),
|
432
473
|
},
|
433
474
|
};
|
434
475
|
}
|
@@ -1167,6 +1208,7 @@ struct llama_sampler_mirostat {
|
|
1167
1208
|
const int32_t n_vocab;
|
1168
1209
|
|
1169
1210
|
const uint32_t seed;
|
1211
|
+
uint32_t seed_cur;
|
1170
1212
|
|
1171
1213
|
const float tau;
|
1172
1214
|
const float eta;
|
@@ -1176,8 +1218,6 @@ struct llama_sampler_mirostat {
|
|
1176
1218
|
float mu;
|
1177
1219
|
|
1178
1220
|
std::mt19937 rng;
|
1179
|
-
|
1180
|
-
std::vector<float> probs;
|
1181
1221
|
};
|
1182
1222
|
|
1183
1223
|
static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) {
|
@@ -1208,7 +1248,7 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke
|
|
1208
1248
|
llama_sampler_top_k_impl(cur_p, std::max(int(k), 1));
|
1209
1249
|
llama_sampler_softmax_impl(cur_p);
|
1210
1250
|
|
1211
|
-
const int idx = llama_sample_dist(cur_p, ctx->rng
|
1251
|
+
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
1212
1252
|
|
1213
1253
|
cur_p->selected = idx;
|
1214
1254
|
|
@@ -1237,7 +1277,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa
|
|
1237
1277
|
static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) {
|
1238
1278
|
auto * ctx = (llama_sampler_mirostat *) smpl->ctx;
|
1239
1279
|
ctx->mu = 2.0f*ctx->tau;
|
1240
|
-
ctx->
|
1280
|
+
ctx->seed_cur = get_rng_seed(ctx->seed);
|
1281
|
+
ctx->rng.seed(ctx->seed_cur);
|
1241
1282
|
}
|
1242
1283
|
|
1243
1284
|
static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
|
@@ -1254,17 +1295,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
|
|
1254
1295
|
};
|
1255
1296
|
|
1256
1297
|
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
|
1298
|
+
auto seed_cur = get_rng_seed(seed);
|
1257
1299
|
return new llama_sampler {
|
1258
1300
|
/* .iface = */ &llama_sampler_mirostat_i,
|
1259
1301
|
/* .ctx = */ new llama_sampler_mirostat {
|
1260
|
-
/* .n_vocab
|
1261
|
-
/* .seed
|
1262
|
-
/* .
|
1263
|
-
/* .
|
1264
|
-
/* .
|
1265
|
-
/* .
|
1266
|
-
/* .
|
1267
|
-
/* .
|
1302
|
+
/* .n_vocab = */ n_vocab,
|
1303
|
+
/* .seed = */ seed,
|
1304
|
+
/* .seed_cur = */ seed_cur,
|
1305
|
+
/* .tau = */ tau,
|
1306
|
+
/* .eta = */ eta,
|
1307
|
+
/* .m = */ m,
|
1308
|
+
/* .mu = */ 2.0f*tau,
|
1309
|
+
/* .rng = */ std::mt19937(seed_cur),
|
1268
1310
|
},
|
1269
1311
|
};
|
1270
1312
|
}
|
@@ -1273,6 +1315,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
|
|
1273
1315
|
|
1274
1316
|
struct llama_sampler_mirostat_v2 {
|
1275
1317
|
const uint32_t seed;
|
1318
|
+
uint32_t seed_cur;
|
1276
1319
|
|
1277
1320
|
const float tau;
|
1278
1321
|
const float eta;
|
@@ -1280,8 +1323,6 @@ struct llama_sampler_mirostat_v2 {
|
|
1280
1323
|
float mu;
|
1281
1324
|
|
1282
1325
|
std::mt19937 rng;
|
1283
|
-
|
1284
|
-
std::vector<float> probs;
|
1285
1326
|
};
|
1286
1327
|
|
1287
1328
|
static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) {
|
@@ -1305,7 +1346,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
|
|
1305
1346
|
// Normalize the probabilities of the remaining words
|
1306
1347
|
llama_sampler_softmax_impl(cur_p);
|
1307
1348
|
|
1308
|
-
const int idx = llama_sample_dist(cur_p, ctx->rng
|
1349
|
+
const int idx = llama_sample_dist(cur_p, ctx->rng);
|
1309
1350
|
|
1310
1351
|
cur_p->selected = idx;
|
1311
1352
|
|
@@ -1319,7 +1360,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t
|
|
1319
1360
|
static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) {
|
1320
1361
|
auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx;
|
1321
1362
|
ctx->mu = 2.0f*ctx->tau;
|
1322
|
-
ctx->
|
1363
|
+
ctx->seed_cur = get_rng_seed(ctx->seed);
|
1364
|
+
ctx->rng.seed(ctx->seed_cur);
|
1323
1365
|
}
|
1324
1366
|
|
1325
1367
|
static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) {
|
@@ -1352,15 +1394,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
|
|
1352
1394
|
};
|
1353
1395
|
|
1354
1396
|
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
|
1397
|
+
auto seed_cur = get_rng_seed(seed);
|
1355
1398
|
return new llama_sampler {
|
1356
1399
|
/* .iface = */ &llama_sampler_mirostat_v2_i,
|
1357
1400
|
/* .ctx = */ new llama_sampler_mirostat_v2 {
|
1358
|
-
/* .seed
|
1359
|
-
/* .
|
1360
|
-
/* .
|
1361
|
-
/* .
|
1362
|
-
/* .
|
1363
|
-
/* .
|
1401
|
+
/* .seed = */ seed,
|
1402
|
+
/* .seed_cur = */ seed_cur,
|
1403
|
+
/* .tau = */ tau,
|
1404
|
+
/* .eta = */ eta,
|
1405
|
+
/* .mu = */ 2.0f*tau,
|
1406
|
+
/* .rng = */ std::mt19937(seed_cur),
|
1364
1407
|
},
|
1365
1408
|
};
|
1366
1409
|
}
|
@@ -1646,6 +1689,8 @@ struct llama_sampler * llama_sampler_init_penalties(
|
|
1646
1689
|
ignore_eos = false;
|
1647
1690
|
}
|
1648
1691
|
|
1692
|
+
penalty_last_n = std::max(penalty_last_n, 0);
|
1693
|
+
|
1649
1694
|
return new llama_sampler {
|
1650
1695
|
/* .iface = */ &llama_sampler_penalties_i,
|
1651
1696
|
/* .ctx = */ new llama_sampler_penalties {
|
@@ -1680,6 +1725,10 @@ static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /
|
|
1680
1725
|
static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
1681
1726
|
auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
|
1682
1727
|
|
1728
|
+
if (ctx->logit_bias.empty()) {
|
1729
|
+
return;
|
1730
|
+
}
|
1731
|
+
|
1683
1732
|
ctx->to_search.clear();
|
1684
1733
|
|
1685
1734
|
// update the candidates that have not been shuffled in the vocabulary (i.e. idx == id)
|
@@ -1691,6 +1740,10 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
|
|
1691
1740
|
}
|
1692
1741
|
}
|
1693
1742
|
|
1743
|
+
if (ctx->to_search.empty()) {
|
1744
|
+
return;
|
1745
|
+
}
|
1746
|
+
|
1694
1747
|
// search for the remaining candidates that were not found in the previous step
|
1695
1748
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
1696
1749
|
for (const auto & lb : ctx->to_search) {
|
@@ -1701,6 +1754,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to
|
|
1701
1754
|
}
|
1702
1755
|
}
|
1703
1756
|
}
|
1757
|
+
|
1704
1758
|
static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) {
|
1705
1759
|
const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx;
|
1706
1760
|
return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data());
|
@@ -1732,3 +1786,65 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
|
1732
1786
|
},
|
1733
1787
|
};
|
1734
1788
|
}
|
1789
|
+
|
1790
|
+
// utils
|
1791
|
+
|
1792
|
+
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
1793
|
+
if (smpl->iface == &llama_sampler_dist_i) {
|
1794
|
+
return ((const llama_sampler_dist *) smpl->ctx)->seed_cur;
|
1795
|
+
}
|
1796
|
+
|
1797
|
+
if (smpl->iface == &llama_sampler_mirostat_i) {
|
1798
|
+
return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur;
|
1799
|
+
}
|
1800
|
+
|
1801
|
+
if (smpl->iface == &llama_sampler_mirostat_v2_i) {
|
1802
|
+
return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur;
|
1803
|
+
}
|
1804
|
+
|
1805
|
+
if (smpl->iface == &llama_sampler_chain_i) {
|
1806
|
+
const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
|
1807
|
+
for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
|
1808
|
+
const uint32_t seed = llama_sampler_get_seed(*it);
|
1809
|
+
if (seed != LLAMA_DEFAULT_SEED) {
|
1810
|
+
return seed;
|
1811
|
+
}
|
1812
|
+
}
|
1813
|
+
}
|
1814
|
+
|
1815
|
+
return LLAMA_DEFAULT_SEED;
|
1816
|
+
}
|
1817
|
+
|
1818
|
+
// perf
|
1819
|
+
|
1820
|
+
struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
|
1821
|
+
struct llama_perf_sampler_data data = {};
|
1822
|
+
|
1823
|
+
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
|
1824
|
+
LM_GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
|
1825
|
+
}
|
1826
|
+
|
1827
|
+
const auto * ctx = (const struct llama_sampler_chain *) chain->ctx;
|
1828
|
+
|
1829
|
+
data.t_sample_ms = 1e-3 * ctx->t_sample_us;
|
1830
|
+
data.n_sample = std::max(0, ctx->n_sample);
|
1831
|
+
|
1832
|
+
return data;
|
1833
|
+
}
|
1834
|
+
|
1835
|
+
void llama_perf_sampler_print(const struct llama_sampler * chain) {
|
1836
|
+
const auto data = llama_perf_sampler(chain);
|
1837
|
+
|
1838
|
+
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
1839
|
+
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
|
1840
|
+
}
|
1841
|
+
|
1842
|
+
void llama_perf_sampler_reset(struct llama_sampler * chain) {
|
1843
|
+
if (chain == nullptr || chain->iface != &llama_sampler_chain_i) {
|
1844
|
+
LM_GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__);
|
1845
|
+
}
|
1846
|
+
|
1847
|
+
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
|
1848
|
+
|
1849
|
+
ctx->t_sample_us = ctx->n_sample = 0;
|
1850
|
+
}
|