cui-llama.rn 1.0.6 → 1.0.7

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.
package/README.md CHANGED
@@ -10,6 +10,7 @@ The following features have been added for Android:
  - Added stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
  - `vocab_only` mode: utilize the llama.cpp tokenizer
  - tokenizeSync: non-blocking, synchronous tokenizer function
+ - Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)

  Original repo README.md below.

@@ -211,7 +211,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
  for (int i = 0; i < count; i++) {
      char key[256];
      llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-     char val[256];
+     char val[2048];
      llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));

      putString(env, meta, key, val);
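The `char val[256]` → `char val[2048]` change gives `llama_model_meta_val_str_by_index` room for long metadata values (chat templates in particular can exceed 256 bytes). For comparison, here is a minimal sketch, not part of this diff, that avoids a fixed cap by using the call's return value, assuming the llama.cpp convention that these functions return the full value length (or -1 when unavailable):

    // Sketch: enumerate GGUF metadata without a fixed-size value buffer.
    // Assumes llama_model_meta_val_str_by_index returns the full value
    // length (snprintf-style), so a truncated read can be detected and retried.
    #include <cstdio>
    #include <vector>
    #include "llama.h"

    static void print_model_meta(const llama_model * model) {
        const int32_t count = llama_model_meta_count(model);
        for (int32_t i = 0; i < count; i++) {
            char key[256];
            llama_model_meta_key_by_index(model, i, key, sizeof(key));

            std::vector<char> val(256);
            int32_t len = llama_model_meta_val_str_by_index(model, i, val.data(), val.size());
            if (len < 0) {
                continue; // no value stored at this index
            }
            if ((size_t) len >= val.size()) {
                val.resize(len + 1); // grow and read again, untruncated
                llama_model_meta_val_str_by_index(model, i, val.data(), val.size());
            }
            printf("%s = %s\n", key, val.data());
        }
    }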
package/cpp/common.cpp CHANGED
@@ -1640,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
  options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
  options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
  options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
- options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
  options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
  options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
  options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
package/cpp/ggml-quants.c CHANGED
@@ -6449,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
  // compute mask for subtraction
  vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
- vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
  m <<= 1;

  vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
  m <<= 1;

  vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
  m <<= 1;

  vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
- vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
  m <<= 1;

  // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
  vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
  vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
- vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
  m <<= 1;

  vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
  vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
  vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
- vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
  m <<= 1;

  vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
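The `_m` → `_mu` renames track the current RISC-V vector intrinsics spec: the 4-argument `_m` form used before carries no merge operand, so inactive lanes come back unspecified, while the `_mu` (mask-undisturbed) form takes an explicit merge vector whose lanes are preserved where the mask is clear, which is what these kernels rely on. A standalone sketch of the same masked-subtract pattern, not part of this diff, assuming the v1.0 intrinsics in `<riscv_vector.h>` and a vector-enabled toolchain:

    // Sketch: subtract 4 only in lanes whose selected qh bit is zero,
    // leaving every other lane unchanged -- the same pattern as the q3_K path.
    #include <riscv_vector.h>
    #include <stddef.h>
    #include <stdint.h>

    void sub4_where_bit_clear(int8_t * q, const uint8_t * qh, uint8_t bit, size_t n) {
        for (size_t i = 0; i < n; ) {
            size_t vl = __riscv_vsetvl_e8m1(n - i);

            vint8m1_t  vq  = __riscv_vle8_v_i8m1(q + i, vl);
            vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh + i, vl);

            // mask is set where the selected bit of qh is zero
            vbool8_t mask = __riscv_vmseq_vx_u8m1_b8(__riscv_vand_vx_u8m1(vqh, bit, vl), 0, vl);

            // _mu: lanes where the mask is clear keep the value of the merge
            // operand (vq); lanes where it is set get vq - 4
            vint8m1_t res = __riscv_vsub_vx_i8m1_mu(mask, vq, vq, 0x4, vl);

            __riscv_vse8_v_i8m1(q + i, res, vl);
            i += vl;
        }
    }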
package/cpp/ggml.c CHANGED
@@ -141,7 +141,51 @@ typedef pthread_t lm_ggml_thread_t;

  #include <sys/wait.h>

- #if defined(__linux__)
+ #if defined(__ANDROID__)
+ #include <unwind.h>
+ #include <dlfcn.h>
+ #include <stdio.h>
+
+ struct backtrace_state {
+     void ** current;
+     void ** end;
+ };
+
+ static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+     struct backtrace_state * state = (struct backtrace_state *)arg;
+     uintptr_t pc = _Unwind_GetIP(context);
+     if (pc) {
+         if (state->current == state->end) {
+             return _URC_END_OF_STACK;
+         } else {
+             *state->current++ = (void*)pc;
+         }
+     }
+     return _URC_NO_REASON;
+ }
+
+ static void lm_ggml_print_backtrace_symbols(void) {
+     const int max = 100;
+     void* buffer[max];
+
+     struct backtrace_state state = {buffer, buffer + max};
+     _Unwind_Backtrace(unwind_callback, &state);
+
+     int count = state.current - buffer;
+
+     for (int idx = 0; idx < count; ++idx) {
+         const void * addr = buffer[idx];
+         const char * symbol = "";
+
+         Dl_info info;
+         if (dladdr(addr, &info) && info.dli_sname) {
+             symbol = info.dli_sname;
+         }
+
+         fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+     }
+ }
+ #elif defined(__linux__)
  #include <execinfo.h>
  static void lm_ggml_print_backtrace_symbols(void) {
  // void * trace[100];
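Since `<execinfo.h>` is not reliably available on Android's Bionic libc, the new `__ANDROID__` branch walks the stack with `_Unwind_Backtrace` and resolves frame addresses through `dladdr`. The names it prints are still mangled; a small follow-on sketch, not part of the diff, showing how a frame could additionally be demangled with `abi::__cxa_demangle` from `<cxxabi.h>`:

    // Sketch: pretty-print one resolved frame, demangling C++ symbol names.
    // Assumes the Itanium ABI helper from <cxxabi.h>, which the NDK provides.
    #include <cxxabi.h>
    #include <dlfcn.h>
    #include <cstdio>
    #include <cstdlib>

    static void print_frame(int idx, const void * addr) {
        const char * name = "";
        Dl_info info;
        if (dladdr(addr, &info) && info.dli_sname) {
            name = info.dli_sname;
        }

        int status = 0;
        char * demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
        fprintf(stderr, "%d: %p %s\n", idx, addr, status == 0 ? demangled : name);
        free(demangled); // safe: free(nullptr) is a no-op
    }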
package/cpp/llama.cpp CHANGED
@@ -4980,6 +4980,7 @@ static void llm_load_hparams(
  hparams.attn_soft_cap = true;

  switch (hparams.n_layer) {
+     case 26: model.type = e_model::MODEL_2B; break;
      case 42: model.type = e_model::MODEL_9B; break;
      case 46: model.type = e_model::MODEL_27B; break;
      default: model.type = e_model::MODEL_UNKNOWN;
@@ -11747,6 +11748,7 @@ struct llm_build_context {

  // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  switch (model.type) {
+     case e_model::MODEL_2B:
      case e_model::MODEL_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
      case e_model::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
      default: LM_GGML_ABORT("fatal error");
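The new `case 26` maps Gemma-2 2B (26 layers) to `MODEL_2B`, and the graph builder then scales its queries by `1.0f / sqrtf(n_embd_head_k)` like the 9B, rather than by `1.0f / sqrtf(n_embd / n_head)` like the 27B. The distinction only matters because the 2B's head dimension is not `n_embd / n_head`; a quick arithmetic sketch using the Gemma-2 2B shape as I understand it (n_embd = 2304, n_head = 8, head dim = 256 -- an assumption, not taken from this diff):

    // Sketch: the two candidate query scales for the assumed Gemma-2 2B dims.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_embd        = 2304.0f; // assumed Gemma-2 2B hidden size
        const float n_head        = 8.0f;    // assumed attention head count
        const float n_embd_head_k = 256.0f;  // assumed per-head query/key dim

        // MODEL_2B / MODEL_9B branch: 1/sqrt(head dim)
        printf("1/sqrt(n_embd_head_k) = %f\n", 1.0f / sqrtf(n_embd_head_k));   // ~0.0625
        // MODEL_27B branch: 1/sqrt(n_embd / n_head)
        printf("1/sqrt(n_embd/n_head) = %f\n", 1.0f / sqrtf(n_embd / n_head)); // ~0.0589
        return 0;
    }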
package/cpp/rn-llama.hpp CHANGED
@@ -240,9 +240,11 @@ struct llama_rn_context
  bool validateModelChatTemplate() const {
      llama_chat_message chat[] = {{"user", "test"}};

-     const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+     std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+     std::string template_key = "tokenizer.chat_template";
+     int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());

-     return res > 0;
+     return res >= 0;
  }

  void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
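`validateModelChatTemplate` now looks for the `tokenizer.chat_template` GGUF key directly and treats any non-negative result (the stored value's length, possibly 0) as present, instead of probing `llama_chat_apply_template` with a null buffer. A hedged sketch, not from this diff, of how a caller might pair that presence check with actually formatting a conversation, assuming the llama.cpp convention that both calls return the required length (negative on failure):

    // Sketch: chat-template presence check plus two-pass formatting.
    #include <string>
    #include <vector>
    #include "llama.h"

    static bool has_chat_template(const llama_model * model) {
        char tmp[1];
        // -1 => key absent; >= 0 => length of the stored template (may be 0)
        return llama_model_meta_val_str(model, "tokenizer.chat_template", tmp, sizeof(tmp)) >= 0;
    }

    static std::string format_chat(const llama_model * model,
                                   const std::vector<llama_chat_message> & msgs) {
        std::vector<char> out(4096);
        int32_t len = llama_chat_apply_template(model, nullptr /* use the model's template */,
                                                msgs.data(), msgs.size(), /* add assistant turn */ true,
                                                out.data(), (int32_t) out.size());
        if (len < 0) {
            return ""; // no usable template
        }
        if ((size_t) len > out.size()) {
            out.resize(len); // buffer was too small: grow to the reported length and retry
            len = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(), true,
                                            out.data(), (int32_t) out.size());
        }
        return std::string(out.data(), len);
    }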
@@ -102,7 +102,7 @@
  for (int i = 0; i < count; i++) {
      char key[256];
      llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-     char val[256];
+     char val[2048];
      llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));

      NSString *keyStr = [NSString stringWithUTF8String:key];
package/jest/mock.js CHANGED
@@ -10,6 +10,9 @@ if (!NativeModules.RNLlama) {
      }),
    ),

+   // TODO: Use jinja parser
+   getFormattedChat: jest.fn(() => ''),
+
    completion: jest.fn(async (contextId, jobId) => {
      const testResult = {
        text: '*giggles*',
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "cui-llama.rn",
-   "version": "1.0.6",
+   "version": "1.0.7",
    "description": "Fork of llama.rn for ChatterUI",
    "main": "lib/commonjs/index",
    "module": "lib/module/index",