cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -7,6 +7,7 @@
 #include "llama.h"
 
 #include <android/log.h>
+#include "sampling.h"
 #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
 #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
@@ -163,8 +164,8 @@ struct llama_rn_context
 
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
-    llama_sampling_context *ctx_sampling = nullptr;
-
+    gpt_sampler *ctx_sampling = nullptr;
+
     int n_ctx;
 
     bool truncated = false;
@@ -188,7 +189,7 @@ struct llama_rn_context
         }
         if (ctx_sampling != nullptr)
         {
-            llama_sampling_free(ctx_sampling);
+            gpt_sampler_free(ctx_sampling);
         }
     }
 
@@ -215,9 +216,9 @@ struct llama_rn_context
 
     bool initSampling() {
         if (ctx_sampling != nullptr) {
-            llama_sampling_free(ctx_sampling);
+            gpt_sampler_free(ctx_sampling);
         }
-        ctx_sampling = llama_sampling_init(params.sparams);
+        ctx_sampling = gpt_sampler_init(model, params.sparams);
         return ctx_sampling != nullptr;
     }
 
@@ -304,7 +305,7 @@ struct llama_rn_context
         // push the prompt into the sampling context (do not apply grammar)
         for (auto & token : prompt_tokens)
         {
-            llama_sampling_accept(ctx_sampling, ctx, token, false);
+            gpt_sampler_accept(ctx_sampling, token, false);
         }
         // compare the evaluated prompt with the new prompt
         n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -334,8 +335,7 @@ struct llama_rn_context
     {
         // number of tokens to keep when resetting context
         n_remain = params.n_predict;
-        llama_set_rng_seed(ctx, params.seed);
-
+        llama_perf_context_reset(ctx);
         is_predicting = true;
     }
 
@@ -383,7 +383,7 @@ struct llama_rn_context
             LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                 n_eval,
                 n_past,
-                params.n_threads,
+                params.cpuparams.n_threads,
                 tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
             );
             has_next_token = false;
@@ -411,22 +411,26 @@ struct llama_rn_context
         std::vector<llama_token_data> candidates;
         candidates.reserve(llama_n_vocab(model));
 
-        result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
-
-        llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
+        result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+
+        llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
 
         const int32_t n_probs = params.sparams.n_probs;
+
+
         if (params.sparams.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
-            llama_sample_softmax(ctx, &cur_p);
+            llama_sampler_init_softmax();
         }
+
 
         for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
         {
             result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
         }
-        llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+
+        gpt_sampler_accept(ctx_sampling, result.tok, true);
         if (tg) {
             num_tokens_predicted++;
         }
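
The last hunk also changes how per-token probabilities are gathered: instead of building a llama_token_data_array by hand from ctx_sampling->cur, the candidate list is read back from the sampler via gpt_sampler_get_candidates. A hedged sketch of that pattern is below; the returned probs container stands in for result.probs in the hunk, and only functions visible in this diff are used.

    // Illustrative sketch: collecting the top n_probs candidates after sampling,
    // mirroring the hunk above.
    #include <algorithm>
    #include <utility>
    #include <vector>
    #include "sampling.h"

    static std::vector<std::pair<llama_token, float>>
    sample_with_probs(gpt_sampler * smpl, llama_context * ctx, int32_t n_probs) {
        const llama_token tok = gpt_sampler_sample(smpl, ctx, -1);

        // The sampler now owns the candidate list; no manual construction
        // from ctx_sampling->cur as in the removed lines.
        llama_token_data_array cur_p = *gpt_sampler_get_candidates(smpl);

        std::vector<std::pair<llama_token, float>> probs;
        for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
            probs.push_back({cur_p.data[i].id, cur_p.data[i].p});   // token id and its probability
        }

        gpt_sampler_accept(smpl, tok, /* accept_grammar = */ true); // update sampler state with the chosen token
        return probs;
    }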