cui-llama.rn 1.0.7 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.h CHANGED
@@ -93,15 +93,14 @@ extern "C" {
   LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
   LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
   LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+  LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
   };
 
-  // note: these values should be synchronized with lm_ggml_rope
-  // TODO: maybe move this enum to ggml.h (lm_ggml_rope_type)
   enum llama_rope_type {
   LLAMA_ROPE_TYPE_NONE = -1,
-  LLAMA_ROPE_TYPE_NORM = 0,
-  LLAMA_ROPE_TYPE_NEOX = 2,
-  LLAMA_ROPE_TYPE_GLM = 4,
+  LLAMA_ROPE_TYPE_NORM = 0,
+  LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
   };
 
   enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
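Note: the rope-type enum no longer hard-codes the NeoX and GLM values; LLAMA_ROPE_TYPE_NEOX now aliases LM_GGML_ROPE_TYPE_NEOX from the ggml side, and the GLM entry is dropped. A minimal sketch of consuming the new constants, assuming a model loaded elsewhere and that llama_rope_type() is available as in upstream llama.cpp:

    // Sketch only: report which RoPE variant a loaded model uses.
    static const char * rope_type_name(const struct llama_model * model) {
        switch (llama_rope_type(model)) {
            case LLAMA_ROPE_TYPE_NONE: return "none";
            case LLAMA_ROPE_TYPE_NORM: return "normal";
            case LLAMA_ROPE_TYPE_NEOX: return "neox"; // same value as LM_GGML_ROPE_TYPE_NEOX
            default:                   return "unknown";
        }
    }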
@@ -345,7 +344,7 @@ extern "C" {
   int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
   enum llama_ftype ftype; // quantize to this llama_ftype
   enum lm_ggml_type output_tensor_type; // output tensor type
-  enum lm_ggml_type token_embedding_type; // itoken embeddings tensor type
+  enum lm_ggml_type token_embedding_type; // token embeddings tensor type
   bool allow_requantize; // allow quantizing non-f32/f16 tensors
   bool quantize_output_tensor; // quantize output.weight
   bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -504,6 +503,9 @@ extern "C" {
   // Returns true if the model contains an encoder that requires llama_encode() call
   LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
+  // Returns true if the model contains a decoder that requires llama_decode() call
+  LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
+
   // For encoder-decoder models, this function returns id of the token that must be provided
   // to the decoder to start generating output sequence. For other models, it returns -1.
   LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
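Note: llama_model_has_decoder() is added as the counterpart to llama_model_has_encoder(), letting callers branch on the model architecture before evaluation. A minimal sketch under that assumption (batch setup elided; model, ctx and batch come from elsewhere):

    // Sketch only: pick the evaluation path based on the model's architecture.
    if (llama_model_has_encoder(model)) {
        llama_encode(ctx, batch); // run the encoder over the prompt first
        const llama_token dec_start = llama_model_decoder_start_token(model);
        if (dec_start != -1) {
            // seed the decoder with dec_start before generating
        }
    }
    if (llama_model_has_decoder(model)) {
        llama_decode(ctx, batch); // autoregressive decoding as usual
    }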
@@ -912,11 +914,8 @@ extern "C" {
   LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
   LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
-  // Returns -1 if unknown, 1 for true or 0 for false.
-  LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
-  // Returns -1 if unknown, 1 for true or 0 for false.
-  LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+  LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+  LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
   // Codellama infill tokens
   LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
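Note: llama_add_bos_token() and llama_add_eos_token() change from a tri-state int32_t (-1 unknown, 0 false, 1 true) to a plain bool, so callers that special-cased -1 need updating. A minimal migration sketch (the fallback value is illustrative):

    // Before (1.0.7): int32_t result, -1 meant "unknown".
    // const int32_t add_bos_raw = llama_add_bos_token(model);
    // const bool add_bos = add_bos_raw != -1 ? bool(add_bos_raw) : true; // illustrative fallback

    // After (1.0.10): plain boolean, no "unknown" state to handle.
    const bool add_bos = llama_add_bos_token(model);
    const bool add_eos = llama_add_eos_token(model);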
package/cpp/rn-llama.hpp CHANGED
@@ -6,13 +6,10 @@
   #include "common.h"
   #include "llama.h"
 
-
   #include <android/log.h>
   #define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID"
   #define LLAMA_LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO , LLAMA_ANDROID_TAG, __VA_ARGS__)
 
-
-
   namespace rnllama {
 
   static void llama_batch_clear(llama_batch *batch) {
@@ -227,7 +224,9 @@ struct llama_rn_context
   bool loadModel(gpt_params &params_)
   {
   params = params_;
-  std::tie(model, ctx) = llama_init_from_gpt_params(params);
+  llama_init_result result = llama_init_from_gpt_params(params);
+  model = result.model;
+  ctx = result.context;
   if (model == nullptr)
   {
   LOG_ERROR("unable to load model: %s", params_.model.c_str());
@@ -298,7 +297,9 @@ struct llama_rn_context
   }
 
   // do Context Shift , may be buggy! TODO: Verify functionality
-  purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+  if(!params.embedding){
+  purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
+  }
 
   // push the prompt into the sampling context (do not apply grammar)
   for (auto & token : prompt_tokens)
@@ -306,7 +307,7 @@ struct llama_rn_context
   llama_sampling_accept(ctx_sampling, ctx, token, false);
   }
   // compare the evaluated prompt with the new prompt
-  n_past = common_part(embd, prompt_tokens);
+  n_past = params.embedding? 0 : common_part(embd, prompt_tokens);
   LLAMA_LOG_INFO("%s: n_past: %zu", __func__, n_past);
   LLAMA_LOG_INFO("%s: embd size: %zu", __func__, embd.size());
   LLAMA_LOG_INFO("%s: prompt_tokens size: %zu", __func__, prompt_tokens.size());
@@ -343,9 +344,9 @@ struct llama_rn_context
   completion_token_output result;
   result.tok = -1;
 
+  // this truncation should never trigger with good context shifting
   if (embd.size() >= (size_t)params.n_ctx)
   {
-  // Shift context
 
   const int n_left = n_past - params.n_keep - 1;
   const int n_discard = n_left/2;
@@ -547,9 +548,21 @@ struct llama_rn_context
   LOG_WARNING("embedding disabled, embedding: %s", params.embedding);
   return std::vector<float>(n_embd, 0.0f);
   }
-  const float *data = llama_get_embeddings(ctx);
-  std::vector<float> embedding(data, data + n_embd);
-  return embedding;
+  float *data;
+
+  if(params.pooling_type == 0){
+  data = llama_get_embeddings(ctx);
+  }
+  else {
+  data = llama_get_embeddings_seq(ctx, 0);
+  }
+
+  if(!data) {
+  return std::vector<float>(n_embd, 0.0f);
+  }
+  std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
+  llama_embd_normalize(embedding.data(), out.data(), n_embd, params.embd_normalize);
+  return out;
   }
 
   std::string bench(int pp, int tg, int pl, int nr)
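Note: the embedding getter now honours the pooling and normalization settings: with pooling disabled it reads per-token embeddings via llama_get_embeddings(), otherwise the pooled sequence embedding via llama_get_embeddings_seq(ctx, 0), and the result goes through llama_embd_normalize() before being returned. A self-contained sketch of the same flow (params fields mirror the ones used above; llama_embd_normalize comes from upstream common.h):

    // Sketch only: fetch and normalize one embedding the way the hunk above does.
    #include <vector>
    #include "common.h"
    #include "llama.h"

    static std::vector<float> get_normalized_embedding(llama_context * ctx, const gpt_params & params) {
        const int n_embd = llama_n_embd(llama_get_model(ctx));

        const float * data = params.pooling_type == 0
            ? llama_get_embeddings(ctx)            // per-token embeddings (no pooling)
            : llama_get_embeddings_seq(ctx, 0);    // pooled embedding for sequence 0
        if (data == nullptr) {
            return std::vector<float>(n_embd, 0.0f); // keep the zero-vector fallback
        }

        std::vector<float> out(n_embd);
        llama_embd_normalize(data, out.data(), n_embd, params.embd_normalize);
        return out;
    }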
package/package.json CHANGED
@@ -1,6 +1,6 @@
   {
   "name": "cui-llama.rn",
-  "version": "1.0.7",
+  "version": "1.0.10",
   "description": "Fork of llama.rn for ChatterUI",
   "main": "lib/commonjs/index",
   "module": "lib/module/index",