cui-llama.rn 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/rn-llama.hpp CHANGED
@@ -163,8 +163,8 @@ struct llama_rn_context
 
  llama_model *model = nullptr;
  llama_context *ctx = nullptr;
- llama_sampling_context *ctx_sampling = nullptr;
-
+ gpt_sampler *ctx_sampling = nullptr;
+
  int n_ctx;
 
  bool truncated = false;
@@ -188,7 +188,7 @@ struct llama_rn_context
  }
  if (ctx_sampling != nullptr)
  {
- llama_sampling_free(ctx_sampling);
+ gpt_sampler_free(ctx_sampling);
  }
  }
 
@@ -215,9 +215,9 @@ struct llama_rn_context
 
  bool initSampling() {
  if (ctx_sampling != nullptr) {
- llama_sampling_free(ctx_sampling);
+ gpt_sampler_free(ctx_sampling);
  }
- ctx_sampling = llama_sampling_init(params.sparams);
+ ctx_sampling = gpt_sampler_init(model, params.sparams);
  return ctx_sampling != nullptr;
  }
 
@@ -304,7 +304,7 @@ struct llama_rn_context
  // push the prompt into the sampling context (do not apply grammar)
  for (auto & token : prompt_tokens)
  {
- llama_sampling_accept(ctx_sampling, ctx, token, false);
+ gpt_sampler_accept(ctx_sampling, token, false);
  }
  // compare the evaluated prompt with the new prompt
  n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
@@ -334,8 +334,7 @@ struct llama_rn_context
  {
  // number of tokens to keep when resetting context
  n_remain = params.n_predict;
- llama_set_rng_seed(ctx, params.seed);
-
+ llama_perf_reset(ctx, LLAMA_PERF_TYPE_CONTEXT);
  is_predicting = true;
  }
 
@@ -383,7 +382,7 @@ struct llama_rn_context
  LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
  n_eval,
  n_past,
- params.n_threads,
+ params.cpuparams.n_threads,
  tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
  );
  has_next_token = false;
@@ -411,22 +410,26 @@ struct llama_rn_context
  std::vector<llama_token_data> candidates;
  candidates.reserve(llama_n_vocab(model));
 
- result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
-
- llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
+ result.tok = gpt_sampler_sample(ctx_sampling, ctx, -1);
+
+ llama_token_data_array cur_p = *gpt_sampler_get_candidates(ctx_sampling);
 
  const int32_t n_probs = params.sparams.n_probs;
+
+
  if (params.sparams.temp <= 0 && n_probs > 0)
  {
  // For llama_sample_token_greedy we need to sort candidates
- llama_sample_softmax(ctx, &cur_p);
+ llama_sampler_init_softmax();
  }
+
 
  for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
  {
  result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
  }
- llama_sampling_accept(ctx_sampling, ctx, result.tok, true);
+
+ gpt_sampler_accept(ctx_sampling, result.tok, true);
  if (tg) {
  num_tokens_predicted++;
  }
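
For context, the hunks above track llama.cpp's move from the llama_sampling_* helpers to the gpt_sampler API in the bundled common code: init now takes the model, accept no longer needs the context, and sampling reads candidates through gpt_sampler_get_candidates. The fragment below is a minimal sketch of that lifecycle pieced together only from the calls visible in this diff; the sample_next wrapper, the header names, and the pre-loaded model, ctx, and prompt_tokens are illustrative assumptions, not part of the package.

// Minimal sketch of the gpt_sampler lifecycle used in rn-llama.hpp above.
// Assumes `model` and `ctx` were created elsewhere and that the llama.cpp
// common headers ("common.h", "sampling.h") are on the include path.
#include <vector>
#include "common.h"
#include "sampling.h"

static llama_token sample_next(llama_model * model,
                               llama_context * ctx,
                               const std::vector<llama_token> & prompt_tokens,
                               const gpt_params & params) {
    // gpt_sampler_init now takes the model in addition to the sampler params
    // (previously llama_sampling_init(params.sparams)).
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    // Push the prompt without applying grammar, as in loadPrompt() above.
    for (const llama_token tok : prompt_tokens) {
        gpt_sampler_accept(smpl, tok, /*accept_grammar=*/false);
    }

    // Sample from the last set of logits (idx == -1), mirroring nextToken().
    const llama_token id = gpt_sampler_sample(smpl, ctx, /*idx=*/-1);

    // Candidate list, e.g. for n_probs-style probability reporting.
    const llama_token_data_array cur_p = *gpt_sampler_get_candidates(smpl);
    (void) cur_p;

    // Accept the sampled token with grammar enabled.
    gpt_sampler_accept(smpl, id, /*accept_grammar=*/true);

    gpt_sampler_free(smpl); // replaces llama_sampling_free()
    return id;
}

The teardown path keeps its shape: gpt_sampler_free simply replaces llama_sampling_free, and initSampling() still frees any previous sampler before creating a new one.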