cui-llama.rn 1.1.2 → 1.1.4
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +2028 -1520
- package/cpp/common.h +134 -18
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +4 -7
- package/cpp/ggml-metal.m +63 -2
- package/cpp/ggml-quants.c +690 -2
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1650 -317
- package/cpp/ggml.h +155 -48
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1361 -356
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +810 -307
- package/cpp/llama.h +213 -259
- package/cpp/rn-llama.hpp +17 -14
- package/cpp/sampling.cpp +347 -355
- package/cpp/sampling.h +106 -135
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
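Judging by the renames in the hunks below, most of this churn tracks upstream llama.cpp's sampling and timing refactor: grammar-parser.cpp/.h are deleted outright, with grammar handling folded into the much larger llama-grammar.cpp, llama-sampling.cpp is substantially rewritten around the new gpt_sampler API, and ggml picks up the new ggml-aarch64.c source. The CMake and JNI diffs below are the call-site fallout.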
package/android/src/main/CMakeLists.txt
CHANGED
@@ -19,7 +19,6 @@ set(
     ${RNLLAMA_LIB_DIR}/ggml.c
     ${RNLLAMA_LIB_DIR}/ggml-quants.c
     ${RNLLAMA_LIB_DIR}/common.cpp
-    ${RNLLAMA_LIB_DIR}/grammar-parser.cpp
     ${RNLLAMA_LIB_DIR}/json.hpp
     ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
     ${RNLLAMA_LIB_DIR}/sampling.cpp
@@ -65,6 +64,7 @@ function(build_library target_name cpu_flags)
 # endif ()
 endfunction()
 
+
 # Default target (no specific CPU features)
 build_library("rnllama" "")
 
@@ -78,5 +78,4 @@ if (${ANDROID_ABI} STREQUAL "arm64-v8a")
 elseif (${ANDROID_ABI} STREQUAL "x86_64")
     # x86_64 target
     build_library("rnllama_x86_64" "-march=x86-64" "-mtune=intel" "-msse4.2" "-mpopcnt")
-
 endif ()
package/android/src/main/jni.cpp
CHANGED
@@ -3,6 +3,7 @@
 // #include <android/asset_manager_jni.h>
 #include <android/log.h>
 #include <cstdlib>
+#include <ctime>
 #include <sys/sysinfo.h>
 #include <string>
 #include <thread>
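The new <ctime> include supports the seed handling change further down: doCompletion now resolves a seed of -1 to time(NULL) instead of passing it through.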
@@ -165,7 +166,7 @@ Java_com_rnllama_LlamaContext_initContext(
     int max_threads = std::thread::hardware_concurrency();
     // Use 2 threads by default on 4-core devices, 4 threads on more cores
     int default_n_threads = max_threads == 4 ? 2 : min(4, max_threads);
-    defaultParams.n_threads = n_threads > 0 ? n_threads : default_n_threads;
+    defaultParams.cpuparams.n_threads = n_threads > 0 ? n_threads : default_n_threads;
 
     defaultParams.n_gpu_layers = n_gpu_layers;
 
@@ -385,18 +386,18 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
     llama->rewind();
 
-    llama_reset_timings(llama->ctx);
+    //llama_reset_timings(llama->ctx);
 
     llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
-    llama->params.seed = seed;
+    llama->params.sparams.seed = (seed == -1) ? time(NULL) : seed;
 
     int max_threads = std::thread::hardware_concurrency();
     // Use 2 threads by default on 4-core devices, 4 threads on more cores
     int default_n_threads = max_threads == 4 ? 2 : min(4, max_threads);
-    llama->params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
+    llama->params.cpuparams.n_threads = n_threads > 0 ? n_threads : default_n_threads;
 
     llama->params.n_predict = n_predict;
-    llama->params.ignore_eos = ignore_eos;
+    llama->params.sparams.ignore_eos = ignore_eos;
 
     auto & sparams = llama->params.sparams;
     sparams.temp = temperature;
@@ -412,7 +413,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
     sparams.top_p = top_p;
     sparams.min_p = min_p;
     sparams.tfs_z = tfs_z;
-    sparams.typical_p = typical_p;
+    sparams.typ_p = typical_p;
     sparams.n_probs = n_probs;
     sparams.grammar = env->GetStringUTFChars(grammar, nullptr);
     sparams.xtc_t = xtc_t;
@@ -420,7 +421,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
 
     sparams.logit_bias.clear();
     if (ignore_eos) {
-        sparams.logit_bias[llama_token_eos(llama->model)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
     }
 
     const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
@@ -434,9 +435,9 @@ Java_com_rnllama_LlamaContext_doCompletion(
         llama_token tok = static_cast<llama_token>(doubleArray[0]);
         if (tok >= 0 && tok < n_vocab) {
             if (doubleArray[1] != 0) { // If the second element is not false (0)
-                sparams.logit_bias[tok] = doubleArray[1];
+                sparams.logit_bias[tok].bias = doubleArray[1];
             } else {
-                sparams.logit_bias[tok] = -INFINITY;
+                sparams.logit_bias[tok].bias = -INFINITY;
             }
         }
 
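Both logit-bias hunks reflect the same underlying change: entries of sparams.logit_bias are no longer plain floats but structs whose .bias field carries the value, in line with the llama_logit_bias type from the upstream sampling rework.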
@@ -522,7 +523,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
         }
     }
 
-
+    llama_perf_print(llama->ctx, LLAMA_PERF_TYPE_CONTEXT);
     llama->is_predicting = false;
 
     auto result = createWriteableMap(env);
@@ -537,16 +538,17 @@ Java_com_rnllama_LlamaContext_doCompletion(
     putString(env, result, "stopping_word", llama->stopping_word.c_str());
     putInt(env, result, "tokens_cached", llama->n_past);
 
-    const auto timings = llama_get_timings(llama->ctx);
+    const auto timings_token = llama_get_token_timings(llama->ctx);
+
     auto timingsResult = createWriteableMap(env);
-    putInt(env, timingsResult, "prompt_n", timings.n_p_eval);
-    putInt(env, timingsResult, "prompt_ms", timings.t_p_eval_ms);
-    putInt(env, timingsResult, "prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval);
-    putDouble(env, timingsResult, "prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    putInt(env, timingsResult, "predicted_n", timings.n_eval);
-    putInt(env, timingsResult, "predicted_ms", timings.t_eval_ms);
-    putInt(env, timingsResult, "predicted_per_token_ms", timings.t_eval_ms / timings.n_eval);
-    putDouble(env, timingsResult, "predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval);
+    putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
+    putInt(env, timingsResult, "prompt_ms", timings_token.t_p_eval_ms);
+    putInt(env, timingsResult, "prompt_per_token_ms", timings_token.t_p_eval_ms / timings_token.n_p_eval);
+    putDouble(env, timingsResult, "prompt_per_second", 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval);
+    putInt(env, timingsResult, "predicted_n", timings_token.n_eval);
+    putInt(env, timingsResult, "predicted_ms", timings_token.t_eval_ms);
+    putInt(env, timingsResult, "predicted_per_token_ms", timings_token.t_eval_ms / timings_token.n_eval);
+    putDouble(env, timingsResult, "predicted_per_second", 1e3 / timings_token.t_eval_ms * timings_token.n_eval);
 
     putMap(env, result, "timings", timingsResult);
 
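Note that llama_get_token_timings is not an upstream llama.cpp symbol; judging by the large llama.h/llama.cpp deltas listed above, it is a helper this fork adds, exposing the same n_p_eval / t_p_eval_ms style fields the old llama_get_timings struct carried.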
@@ -633,7 +635,10 @@ Java_com_rnllama_LlamaContext_embedding(
 
     llama->rewind();
 
-    llama_reset_timings(llama->ctx);
+    // llama_reset_timings(llama->ctx);
+    llama_perf_reset(llama->ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_sampler_reset(llama->ctx_sampling);
+
 
     llama->params.prompt = text_chars;
 
@@ -691,7 +696,7 @@ Java_com_rnllama_LlamaContext_freeContext(
     }
     if (llama->ctx_sampling != nullptr)
     {
-
+        gpt_sampler_free(llama->ctx_sampling);
     }
     context_map.erase((long) llama->ctx);
 }
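
Taken together, the JNI changes amount to a small migration recipe. Below is a minimal sketch of the new call shapes, assuming the llama.cpp/common headers bundled with this package version; the header names, the map-like logit_bias container, and the llama_perf_* / gpt_sampler_* symbols are taken from the hunks above and are not stable upstream API.

    #include <cmath>      // INFINITY
    #include <ctime>      // time(NULL), the new default-seed source
    #include "llama.h"    // llama_model, llama_token_eos
    #include "common.h"   // gpt_params: thread/sampling fields moved under cpuparams/sparams
    #include "sampling.h" // gpt_sampler_reset / gpt_sampler_free

    // Mirrors the old->new field and call renames shown in the diff.
    void configure(gpt_params & params, llama_model * model, int seed, int n_threads) {
        params.cpuparams.n_threads = n_threads;                 // was: params.n_threads
        params.sparams.seed = (seed == -1) ? time(NULL) : seed; // was: params.seed = seed
        params.sparams.ignore_eos = true;                       // was: params.ignore_eos

        // Logit-bias entries are structs now; the value lives in .bias:
        params.sparams.logit_bias[llama_token_eos(model)].bias = -INFINITY;
    }

    // Timing and sampler teardown follow the same pattern:
    //   llama_reset_timings(ctx) -> llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT)
    //   (timings print)          -> llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT)
    //   (sampler teardown)       -> gpt_sampler_free(llama->ctx_sampling)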