cui-llama.rn 1.1.2 → 1.1.4
This diff reflects the published contents of the two package versions as they appear in their public registry and is provided for informational purposes only.
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +2028 -1520
- package/cpp/common.h +134 -18
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +4 -7
- package/cpp/ggml-metal.m +63 -2
- package/cpp/ggml-quants.c +690 -2
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1650 -317
- package/cpp/ggml.h +155 -48
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1361 -356
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +810 -307
- package/cpp/llama.h +213 -259
- package/cpp/rn-llama.hpp +17 -14
- package/cpp/sampling.cpp +347 -355
- package/cpp/sampling.h +106 -135
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
package/cpp/rn-llama.hpp
CHANGED
@@ -163,8 +163,8 @@ struct llama_rn_context
163  163
164  164         llama_model *model = nullptr;
165  165         llama_context *ctx = nullptr;
166        -
167        -
     166  +     gpt_sampler *ctx_sampling = nullptr;
     167  +
168  168         int n_ctx;
169  169
170  170         bool truncated = false;
@@ -188,7 +188,7 @@ struct llama_rn_context
188  188             }
189  189             if (ctx_sampling != nullptr)
190  190             {
191        -
     191  +             gpt_sampler_free(ctx_sampling);
192  192             }
193  193         }
194  194
@@ -215,9 +215,9 @@ struct llama_rn_context
215  215
216  216         bool initSampling() {
217  217             if (ctx_sampling != nullptr) {
218        -
     218  +             gpt_sampler_free(ctx_sampling);
219  219             }
220        -         ctx_sampling =
     220  +         ctx_sampling = gpt_sampler_init(model, params.sparams);
221  221             return ctx_sampling != nullptr;
222  222         }
223  223
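
The hunks above and below move this file from the old sampling context to the gpt_sampler API bundled with the package (see sampling.h / sampling.cpp in the file list). A minimal sketch of the init/free lifecycle the updated destructor and initSampling() now rely on; reset_sampler is a hypothetical helper for illustration, not code from the package:

    #include "common.h"
    #include "sampling.h"

    // Free any previous sampler, then build a fresh one from the model and the
    // sampling sub-parameters, mirroring the updated initSampling() above.
    static gpt_sampler * reset_sampler(gpt_sampler * old_smpl,
                                       const llama_model * model,
                                       const gpt_params & params) {
        if (old_smpl != nullptr) {
            gpt_sampler_free(old_smpl); // same call the destructor now makes
        }
        return gpt_sampler_init(model, params.sparams); // returns nullptr on failure
    }
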
@@ -304,7 +304,7 @@ struct llama_rn_context
304  304             // push the prompt into the sampling context (do not apply grammar)
305  305             for (auto & token : prompt_tokens)
306  306             {
307        -
     307  +             gpt_sampler_accept(ctx_sampling, token, false);
308  308             }
309  309             // compare the evaluated prompt with the new prompt
310  310             n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -334,8 +334,7 @@ struct llama_rn_context
334  334         {
335  335             // number of tokens to keep when resetting context
336  336             n_remain = params.n_predict;
337        -
338        -
     337  +         llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
339  338             is_predicting = true;
340  339         }
341  340
@@ -383,7 +382,7 @@ struct llama_rn_context
383  382             LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
384  383                 n_eval,
385  384                 n_past,
386        -             params.n_threads,
     385  +             params.cpuparams.n_threads,
387  386                 tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
388  387             );
389  388             has_next_token = false;
@@ -411,22 +410,26 @@ struct llama_rn_context
411  410         std::vector<llama_token_data> candidates;
412  411         candidates.reserve(llama_n_vocab(model));
413  412
414        -     result.tok =
415        -
416        -     llama_token_data_array cur_p =
     413  +     result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
     414  +
     415  +     llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
417  416
418  417         const int32_t n_probs = params.sparams.n_probs;
     418  +
     419  +
419  420         if (params.sparams.temp <= 0 && n_probs > 0)
420  421         {
421  422             // For llama_sample_token_greedy we need to sort candidates
422        -
     423  +         llama_sampler_init_softmax();
423  424         }
     425  +
424  426
425  427         for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
426  428         {
427  429             result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
428  430         }
429        -
     431  +
     432  +     gpt_sampler_accept(ctx_sampling, result.tok, true);
430  433         if (tg) {
431  434             num_tokens_predicted++;
432  435         }
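
For reference, a minimal sketch of the per-token flow the final hunk switches to, built only from the gpt_sampler calls visible above; sample_next_token is a hypothetical helper, not code from the package, and -1 selects the logits of the last decoded position:

    #include "sampling.h"

    static llama_token sample_next_token(gpt_sampler * smpl, llama_context * ctx) {
        // sample a token from the most recent logits
        const llama_token tok = gpt_sampler_sample(smpl, ctx, -1);

        // the candidate array (token ids + probabilities) backs the n_probs reporting
        const llama_token_data_array * cur_p = gpt_sampler_get_candidates(smpl);
        (void) cur_p;

        // record the accepted token so repetition penalties and the grammar
        // (accept_grammar = true) see it on the next step
        gpt_sampler_accept(smpl, tok, true);
        return tok;
    }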