cui-llama.rn 1.1.2 → 1.1.4

This diff shows the changes between the two published package versions as they appear in their public registry, and is provided for informational purposes only.
@@ -19,7 +19,6 @@ set(
     ${RNLLAMA_LIB_DIR}/ggml.c
     ${RNLLAMA_LIB_DIR}/ggml-quants.c
     ${RNLLAMA_LIB_DIR}/common.cpp
-    ${RNLLAMA_LIB_DIR}/grammar-parser.cpp
     ${RNLLAMA_LIB_DIR}/json.hpp
     ${RNLLAMA_LIB_DIR}/json-schema-to-grammar.cpp
     ${RNLLAMA_LIB_DIR}/sampling.cpp
@@ -65,6 +64,7 @@ function(build_library target_name cpu_flags)
     # endif ()
 endfunction()

+
 # Default target (no specific CPU features)
 build_library("rnllama" "")

@@ -78,5 +78,4 @@ if (${ANDROID_ABI} STREQUAL "arm64-v8a")
 elseif (${ANDROID_ABI} STREQUAL "x86_64")
     # x86_64 target
     build_library("rnllama_x86_64" "-march=x86-64" "-mtune=intel" "-msse4.2" "-mpopcnt")
-
 endif ()
@@ -3,6 +3,7 @@
 // #include <android/asset_manager_jni.h>
 #include <android/log.h>
 #include <cstdlib>
+#include <ctime>
 #include <sys/sysinfo.h>
 #include <string>
 #include <thread>
@@ -165,7 +166,7 @@ Java_com_rnllama_LlamaContext_initContext(
     int max_threads = std::thread::hardware_concurrency();
     // Use 2 threads by default on 4-core devices, 4 threads on more cores
     int default_n_threads = max_threads == 4 ? 2 : min(4, max_threads);
-    defaultParams.n_threads = n_threads > 0 ? n_threads : default_n_threads;
+    defaultParams.cpuparams.n_threads = n_threads > 0 ? n_threads : default_n_threads;

     defaultParams.n_gpu_layers = n_gpu_layers;

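The thread count now lives on a nested cpuparams struct, and the default is derived from the device's core count. A minimal, self-contained sketch of the same heuristic, using stub types rather than the real llama.cpp parameter structs (the stub names are assumptions made for illustration):

    // Sketch only: cpu_params_stub stands in for the nested cpuparams struct;
    // everything except the n_threads field is an assumption.
    #include <algorithm>
    #include <cstdio>
    #include <thread>

    struct cpu_params_stub { int n_threads = 4; };
    struct params_stub     { cpu_params_stub cpuparams; };

    int pick_default_threads() {
        int max_threads = (int) std::thread::hardware_concurrency();
        // Same rule as the JNI code: 2 threads on 4-core devices, otherwise up to 4.
        return max_threads == 4 ? 2 : std::min(4, max_threads);
    }

    int main() {
        params_stub p;
        int requested = 0;  // 0 or negative means "let the default heuristic decide"
        p.cpuparams.n_threads = requested > 0 ? requested : pick_default_threads();
        std::printf("n_threads = %d\n", p.cpuparams.n_threads);
    }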
@@ -385,18 +386,18 @@ Java_com_rnllama_LlamaContext_doCompletion(

     llama->rewind();

-    llama_reset_timings(llama->ctx);
+    //llama_reset_timings(llama->ctx);

     llama->params.prompt = env->GetStringUTFChars(prompt, nullptr);
-    llama->params.seed = seed;
+    llama->params.sparams.seed = (seed == -1) ? time(NULL) : seed;

     int max_threads = std::thread::hardware_concurrency();
     // Use 2 threads by default on 4-core devices, 4 threads on more cores
     int default_n_threads = max_threads == 4 ? 2 : min(4, max_threads);
-    llama->params.n_threads = n_threads > 0 ? n_threads : default_n_threads;
+    llama->params.cpuparams.n_threads = n_threads > 0 ? n_threads : default_n_threads;

     llama->params.n_predict = n_predict;
-    llama->params.ignore_eos = ignore_eos;
+    llama->params.sparams.ignore_eos = ignore_eos;

     auto & sparams = llama->params.sparams;
     sparams.temp = temperature;
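The seed moves onto the sampling parameters and gains a wall-clock fallback when the caller passes -1. A small sketch of that fallback, with sampler_params_stub as a hypothetical stand-in for the real sparams type:

    #include <cstdint>
    #include <cstdio>
    #include <ctime>

    struct sampler_params_stub { uint32_t seed = 0; };

    int main() {
        sampler_params_stub sparams;
        long requested_seed = -1;  // -1 means "pick a seed for me"
        // Fall back to the current time when no explicit seed was supplied.
        sparams.seed = (requested_seed == -1) ? (uint32_t) time(NULL)
                                              : (uint32_t) requested_seed;
        std::printf("effective seed = %u\n", sparams.seed);
    }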
@@ -412,7 +413,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
     sparams.top_p = top_p;
     sparams.min_p = min_p;
     sparams.tfs_z = tfs_z;
-    sparams.typical_p = typical_p;
+    sparams.typ_p = typical_p;
     sparams.n_probs = n_probs;
     sparams.grammar = env->GetStringUTFChars(grammar, nullptr);
     sparams.xtc_t = xtc_t;
@@ -420,7 +421,7 @@ Java_com_rnllama_LlamaContext_doCompletion(

     sparams.logit_bias.clear();
     if (ignore_eos) {
-        sparams.logit_bias[llama_token_eos(llama->model)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama->model)].bias = -INFINITY;
     }

     const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx));
@@ -434,9 +435,9 @@ Java_com_rnllama_LlamaContext_doCompletion(
         llama_token tok = static_cast<llama_token>(doubleArray[0]);
         if (tok >= 0 && tok < n_vocab) {
             if (doubleArray[1] != 0) { // If the second element is not false (0)
-                sparams.logit_bias[tok] = doubleArray[1];
+                sparams.logit_bias[tok].bias = doubleArray[1];
             } else {
-                sparams.logit_bias[tok] = -INFINITY;
+                sparams.logit_bias[tok].bias = -INFINITY;
             }
         }

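Each logit-bias entry is no longer a bare float; the map value is a struct whose bias member carries the adjustment, and a second array element of 0 means "never sample this token". A sketch under those assumptions, with logit_bias_stub standing in for the real entry type and a made-up vocabulary size:

    #include <cmath>
    #include <cstdio>
    #include <map>
    #include <vector>

    // Stand-in for the entry type whose .bias member the JNI code now assigns.
    struct logit_bias_stub { float bias = 0.0f; };

    int main() {
        std::map<int, logit_bias_stub> logit_bias;
        // Each pair mirrors one doubleArray from the JNI bridge: {token, bias-or-0}.
        std::vector<std::pair<int, double>> requests = {{11, 1.5}, {42, 0.0}};
        const int n_vocab = 32000;  // assumed vocabulary size for the sketch

        for (const auto & [tok, value] : requests) {
            if (tok < 0 || tok >= n_vocab) continue;
            // A value of 0 bans the token outright.
            logit_bias[tok].bias = (value != 0.0) ? (float) value : -INFINITY;
        }
        for (const auto & [tok, entry] : logit_bias) {
            std::printf("token %d -> bias %f\n", tok, entry.bias);
        }
    }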
@@ -522,7 +523,7 @@ Java_com_rnllama_LlamaContext_doCompletion(
         }
     }

-    llama_print_timings(llama->ctx);
+    llama_perf_print(llama->ctx, LLAMA_PERF_TYPE_CONTEXT);
    llama->is_predicting = false;

     auto result = createWriteableMap(env);
@@ -537,16 +538,17 @@ Java_com_rnllama_LlamaContext_doCompletion(
     putString(env, result, "stopping_word", llama->stopping_word.c_str());
     putInt(env, result, "tokens_cached", llama->n_past);

-    const auto timings = llama_get_timings(llama->ctx);
+    const auto timings_token = llama_get_token_timings(llama->ctx);
+
     auto timingsResult = createWriteableMap(env);
-    putInt(env, timingsResult, "prompt_n", timings.n_p_eval);
-    putInt(env, timingsResult, "prompt_ms", timings.t_p_eval_ms);
-    putInt(env, timingsResult, "prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval);
-    putDouble(env, timingsResult, "prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    putInt(env, timingsResult, "predicted_n", timings.n_eval);
-    putInt(env, timingsResult, "predicted_ms", timings.t_eval_ms);
-    putInt(env, timingsResult, "predicted_per_token_ms", timings.t_eval_ms / timings.n_eval);
-    putDouble(env, timingsResult, "predicted_per_second", 1e3 / timings.t_eval_ms * timings.n_eval);
+    putInt(env, timingsResult, "prompt_n", timings_token.n_p_eval);
+    putInt(env, timingsResult, "prompt_ms", timings_token.t_p_eval_ms);
+    putInt(env, timingsResult, "prompt_per_token_ms", timings_token.t_p_eval_ms / timings_token.n_p_eval);
+    putDouble(env, timingsResult, "prompt_per_second", 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval);
+    putInt(env, timingsResult, "predicted_n", timings_token.n_eval);
+    putInt(env, timingsResult, "predicted_ms", timings_token.t_eval_ms);
+    putInt(env, timingsResult, "predicted_per_token_ms", timings_token.t_eval_ms / timings_token.n_eval);
+    putDouble(env, timingsResult, "predicted_per_second", 1e3 / timings_token.t_eval_ms * timings_token.n_eval);

     putMap(env, result, "timings", timingsResult);

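The derived timing fields are plain ratios of the raw counters: milliseconds per token is total milliseconds divided by token count, and tokens per second is 1e3 / ms * n. A worked sketch of that arithmetic with made-up counter values:

    #include <cstdio>

    // Hypothetical raw counters, shaped like the fields read from the token timings.
    struct token_timings_stub {
        int    n_p_eval    = 32;     // prompt tokens evaluated
        double t_p_eval_ms = 180.0;  // total prompt time in ms
        int    n_eval      = 64;     // generated tokens
        double t_eval_ms   = 960.0;  // total generation time in ms
    };

    int main() {
        token_timings_stub t;
        double prompt_per_token_ms    = t.t_p_eval_ms / t.n_p_eval;        // 5.625 ms/token
        double prompt_per_second      = 1e3 / t.t_p_eval_ms * t.n_p_eval;  // ~177.8 tokens/s
        double predicted_per_token_ms = t.t_eval_ms / t.n_eval;            // 15 ms/token
        double predicted_per_second   = 1e3 / t.t_eval_ms * t.n_eval;      // ~66.7 tokens/s
        std::printf("prompt:  %.3f ms/token, %.1f tok/s\n", prompt_per_token_ms, prompt_per_second);
        std::printf("predict: %.3f ms/token, %.1f tok/s\n", predicted_per_token_ms, predicted_per_second);
    }

As in the JNI code above, these ratios assume the token counts are non-zero.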
@@ -633,7 +635,10 @@ Java_com_rnllama_LlamaContext_embedding(

     llama->rewind();

-    llama_reset_timings(llama->ctx);
+    // llama_reset_timings(llama->ctx);
+    llama_perf_reset(llama->ctx, LLAMA_PERF_TYPE_CONTEXT);
+    gpt_sampler_reset(llama->ctx_sampling);
+

     llama->params.prompt = text_chars;

@@ -691,7 +696,7 @@ Java_com_rnllama_LlamaContext_freeContext(
     }
     if (llama->ctx_sampling != nullptr)
     {
-        llama_sampling_free(llama->ctx_sampling);
+        gpt_sampler_free(llama->ctx_sampling);
     }
     context_map.erase((long) llama->ctx);
 }
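Teardown keeps the usual guard-then-free shape: release the sampler only if it was ever created, then drop the context from the lookup map. A generic sketch of that pattern, with free_sampler as a hypothetical stand-in for the actual free call:

    #include <cstdio>
    #include <map>

    struct sampler_stub {};                                    // stand-in for the sampler object
    static void free_sampler(sampler_stub * s) { delete s; }   // stand-in for the real free call

    int main() {
        std::map<long, sampler_stub *> context_map;
        sampler_stub * ctx_sampling = new sampler_stub();
        context_map[(long) 0x1] = ctx_sampling;

        // Guard against a sampler that was never initialised before freeing it.
        if (ctx_sampling != nullptr) {
            free_sampler(ctx_sampling);
            ctx_sampling = nullptr;
        }
        context_map.erase((long) 0x1);
        std::printf("contexts remaining: %zu\n", context_map.size());
    }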