cui-llama.rn 1.1.2 → 1.1.5
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +181 -1584
- package/cpp/common.h +131 -52
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +36 -7
- package/cpp/ggml-metal.m +68 -8
- package/cpp/ggml-quants.c +932 -50
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1712 -325
- package/cpp/ggml.h +169 -100
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1483 -354
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +824 -327
- package/cpp/llama.h +235 -256
- package/cpp/rn-llama.hpp +18 -14
- package/cpp/sampling.cpp +353 -354
- package/cpp/sampling.h +62 -143
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
package/cpp/rn-llama.hpp
CHANGED
@@ -7,6 +7,7 @@
 #include "llama.h"
 
 #include <android/log.h>
+#include "sampling.h"
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
@@ -163,8 +164,8 @@ struct llama_rn_context
 
 llama_model *model = nullptr;
 llama_context *ctx = nullptr;
-
-
+gpt_sampler *ctx_sampling = nullptr;
+
 int n_ctx;
 
 bool truncated = false;
@@ -188,7 +189,7 @@ struct llama_rn_context
 }
 if (ctx_sampling != nullptr)
 {
-
+gpt_sampler_free(ctx_sampling);
 }
 }
 
@@ -215,9 +216,9 @@ struct llama_rn_context
 
 bool initSampling() {
 if (ctx_sampling != nullptr) {
-
+gpt_sampler_free(ctx_sampling);
 }
-ctx_sampling =
+ctx_sampling = gpt_sampler_init(model, params.sparams);
 return ctx_sampling != nullptr;
 }
 
@@ -304,7 +305,7 @@ struct llama_rn_context
 // push the prompt into the sampling context (do not apply grammar)
 for (auto & token : prompt_tokens)
 {
-
+gpt_sampler_accept(ctx_sampling, token, false);
 }
 // compare the evaluated prompt with the new prompt
 n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -334,8 +335,7 @@ struct llama_rn_context
 {
 // number of tokens to keep when resetting context
 n_remain = params.n_predict;
-
-
+llama_perf_context_reset(ctx);
 is_predicting = true;
 }
 
@@ -383,7 +383,7 @@ struct llama_rn_context
 LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
 n_eval,
 n_past,
-params.n_threads,
+params.cpuparams.n_threads,
 tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
 );
 has_next_token = false;
@@ -411,22 +411,26 @@ struct llama_rn_context
 std::vector<llama_token_data> candidates;
 candidates.reserve(llama_n_vocab(model));
 
-result.tok =
-
-llama_token_data_array cur_p =
+result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+
+llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
 
 const int32_t n_probs = params.sparams.n_probs;
+
+
 if (params.sparams.temp <= 0 && n_probs > 0)
 {
 // For llama_sample_token_greedy we need to sort candidates
-
+llama_sampler_init_softmax();
 }
+
 
 for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
 {
 result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
 }
-
+
+gpt_sampler_accept(ctx_sampling, result.tok, true);
 if (tg) {
 num_tokens_predicted++;
 }
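For orientation, the rn-llama.hpp hunks above move the sampling path onto the gpt_sampler API that this llama.cpp drop ships in its vendored common/sampling code. The sketch below is only a minimal illustration that strings together the calls actually visible in the diff (gpt_sampler_init, gpt_sampler_accept, gpt_sampler_sample, gpt_sampler_get_candidates, gpt_sampler_free); the wrapping function, its name, and the gpt_params argument are assumptions for the sake of the example, not code from the package.

// Minimal sketch of the new sampling flow used by rn-llama.hpp after this diff.
// Assumes the llama.cpp "common" headers vendored in package/cpp (common.h, sampling.h);
// predict_one_token is a hypothetical helper, not part of cui-llama.rn.
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <vector>

static llama_token predict_one_token(llama_model * model,
                                     llama_context * ctx,
                                     gpt_params & params,
                                     const std::vector<llama_token> & prompt_tokens) {
    // Build the sampler from the model and sampling params, as initSampling() now does.
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    // Push the prompt into the sampler without applying the grammar,
    // mirroring the prompt-loading hunk.
    for (const llama_token token : prompt_tokens) {
        gpt_sampler_accept(smpl, token, false);
    }

    // Sample the next token from the last set of logits (index -1) and read back
    // the candidate array, as the completion hunk does when n_probs > 0.
    const llama_token tok = gpt_sampler_sample(smpl, ctx, -1);
    const llama_token_data_array cur_p = *gpt_sampler_get_candidates(smpl);
    (void) cur_p; // cur_p.data[i].id / cur_p.data[i].p hold the top candidates

    // Accept the sampled token, this time letting the grammar see it.
    gpt_sampler_accept(smpl, tok, true);

    // The package frees the sampler in the context destructor and on re-init.
    gpt_sampler_free(smpl);
    return tok;
}

One design consequence worth noting: the sampler now owns the grammar and candidate bookkeeping, which lines up with grammar-parser.cpp/h being dropped and llama-grammar.cpp/llama-sampling.cpp growing substantially in the file list above.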