cui-llama.rn 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +1 -1
- package/cpp/ggml-quants.c +6 -6
- package/cpp/ggml.c +45 -1
- package/cpp/llama.cpp +2 -0
- package/cpp/rn-llama.hpp +4 -2
- package/ios/RNLlamaContext.mm +1 -1
- package/jest/mock.js +3 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -10,6 +10,7 @@ The following features have been added for Android:
 - Added stopping prompt processing between batches, vital for mobile devices with very slow prompt processing
 - `vocab_only` mode: utilize the llama.cpp tokenizer
 - tokenizeSync: non-blocking, synchronous tokenizer function
+- Context Shift taken from [kobold.cpp](https://github.com/LostRuins/koboldcpp)
 
 Original repo README.md below.
 
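The "Context Shift" entry added above refers to the usual trick of evicting the oldest generated tokens from the KV cache and sliding the remainder back, so decoding can continue once the context window is full. The sketch below illustrates that idea with public llama.cpp KV-cache calls; the helper name context_shift and its parameters are illustrative only, and this is not the package's actual implementation.

// Minimal sketch of a KV-cache context shift, assuming llama.cpp's
// llama_kv_cache_seq_rm / llama_kv_cache_seq_add API. Illustrative only.
#include "llama.h"

static void context_shift(llama_context * ctx, llama_seq_id seq,
                          int n_keep, int n_past, int n_discard) {
    // drop the oldest n_discard tokens after the protected prefix (e.g. the system prompt)
    llama_kv_cache_seq_rm (ctx, seq, n_keep, n_keep + n_discard);
    // slide the surviving cells [n_keep + n_discard, n_past) back by n_discard positions
    llama_kv_cache_seq_add(ctx, seq, n_keep + n_discard, n_past, -n_discard);
    // the caller then continues decoding at position n_past - n_discard
}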
package/android/src/main/jni.cpp
CHANGED
@@ -211,7 +211,7 @@ Java_com_rnllama_LlamaContext_loadModelDetails(
     for (int i = 0; i < count; i++) {
         char key[256];
         llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-        char val[
+        char val[2048];
         llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
 
         putString(env, meta, key, val);
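The buffer bump above exists because GGUF metadata values (chat templates in particular) can easily exceed a few hundred bytes. As a hedged illustration of the API involved, the sketch below enumerates the metadata and flags values that would not fit in a fixed buffer; it assumes llama_model_meta_val_str_by_index() behaves like snprintf (truncates to the buffer while returning the full length), which should be verified against the llama.cpp revision in use.

// Hedged sketch: enumerate GGUF metadata and detect oversized values.
// The helper name dump_meta is illustrative, not part of the package.
#include <cstdio>
#include "llama.h"

static void dump_meta(const llama_model * model) {
    const int32_t count = llama_model_meta_count(model);
    for (int32_t i = 0; i < count; i++) {
        char key[256];
        char val[2048];
        llama_model_meta_key_by_index(model, i, key, sizeof(key));
        const int32_t len = llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        if (len < 0) continue;                            // no value for this index
        const bool truncated = len >= (int32_t) sizeof(val);
        printf("%s = %s%s\n", key, val, truncated ? " ...(truncated)" : "");
    }
}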
package/cpp/common.cpp
CHANGED
@@ -1640,7 +1640,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
     options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server", " --embedding(s)", "
+    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
     options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
package/cpp/ggml-quants.c
CHANGED
@@ -6449,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
         // compute mask for subtraction
         vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-        vint8m1_t q3_m0 =
+        vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
         m <<= 1;
 
         vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-        vint8m1_t q3_m1 =
+        vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
         m <<= 1;
 
         vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-        vint8m1_t q3_m2 =
+        vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
         m <<= 1;
 
         vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-        vint8m1_t q3_m3 =
+        vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
         m <<= 1;
 
         // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
         vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
         vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-        vint8m1_t q5_m1 =
+        vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
         m <<= 1;
 
         vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
         vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
         vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-        vint8m1_t q5_m2 =
+        vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
         m <<= 1;
 
         vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
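The completed lines use the RVV masked-undisturbed (_mu) intrinsics: lanes where the mask is active get the operation applied, and all other lanes keep their previous value. Below is a scalar sketch of the Q3_K case (subtract 4 where the corresponding high bit in qh is clear); the Q5_K hunk is the mirror image, adding 16 where the bit is set. The helper name and loop are illustrative only, not part of ggml.

// Scalar illustration of __riscv_vsub_vx_i8m1_mu(vmask, q3, q3, 0x4, vl) as used above:
// active lanes (high bit clear) get 4 subtracted, inactive lanes are left "undisturbed".
#include <cstdint>
#include <cstddef>

static void q3_apply_high_bit(int8_t * q3, const uint8_t * qh, uint8_t m, size_t vl) {
    for (size_t i = 0; i < vl; i++) {
        if ((qh[i] & m) == 0) {   // vmseq(qh & m, 0): this lane is active
            q3[i] -= 0x4;         // masked subtract
        }                         // else: lane keeps its old value (undisturbed)
    }
}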
package/cpp/ggml.c
CHANGED
@@ -141,7 +141,51 @@ typedef pthread_t lm_ggml_thread_t;
 
 #include <sys/wait.h>
 
-#if defined(
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+struct backtrace_state {
+    void ** current;
+    void ** end;
+};
+
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+    struct backtrace_state * state = (struct backtrace_state *)arg;
+    uintptr_t pc = _Unwind_GetIP(context);
+    if (pc) {
+        if (state->current == state->end) {
+            return _URC_END_OF_STACK;
+        } else {
+            *state->current++ = (void*)pc;
+        }
+    }
+    return _URC_NO_REASON;
+}
+
+static void lm_ggml_print_backtrace_symbols(void) {
+    const int max = 100;
+    void* buffer[max];
+
+    struct backtrace_state state = {buffer, buffer + max};
+    _Unwind_Backtrace(unwind_callback, &state);
+
+    int count = state.current - buffer;
+
+    for (int idx = 0; idx < count; ++idx) {
+        const void * addr = buffer[idx];
+        const char * symbol = "";
+
+        Dl_info info;
+        if (dladdr(addr, &info) && info.dli_sname) {
+            symbol = info.dli_sname;
+        }
+
+        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+    }
+}
+#elif defined(__linux__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
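The block added above gives ggml a working lm_ggml_print_backtrace_symbols() on Android, where bionic lacks execinfo.h/backtrace(): frames are collected with _Unwind_Backtrace and names resolved with dladdr. As a hedged companion (not part of the diff), the sketch below shows how a caller could additionally turn each absolute PC into a load-bias-relative offset suitable for addr2line or ndk-stack and demangle C++ names; the helper print_frame is hypothetical.

// Hypothetical helper: resolve one backtrace address to "library +offset symbol".
#include <dlfcn.h>
#include <cxxabi.h>
#include <cstdio>
#include <cstdlib>
#include <cstdint>

static void print_frame(int idx, const void * addr) {
    Dl_info info = {};
    if (dladdr(addr, &info) && info.dli_fbase) {
        // offset relative to the library's load base, as addr2line/ndk-stack expect
        const uintptr_t rel = (uintptr_t) addr - (uintptr_t) info.dli_fbase;
        int status = 0;
        char * demangled = info.dli_sname
            ? abi::__cxa_demangle(info.dli_sname, nullptr, nullptr, &status)
            : nullptr;
        fprintf(stderr, "%d: %s +0x%zx %s\n", idx,
                info.dli_fname ? info.dli_fname : "?", (size_t) rel,
                (demangled && status == 0) ? demangled
                                           : (info.dli_sname ? info.dli_sname : ""));
        free(demangled);
    } else {
        fprintf(stderr, "%d: %p\n", idx, addr);
    }
}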
package/cpp/llama.cpp
CHANGED
@@ -4980,6 +4980,7 @@ static void llm_load_hparams(
                 hparams.attn_soft_cap = true;
 
                 switch (hparams.n_layer) {
+                    case 26: model.type = e_model::MODEL_2B; break;
                     case 42: model.type = e_model::MODEL_9B; break;
                     case 46: model.type = e_model::MODEL_27B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -11747,6 +11748,7 @@ struct llm_build_context {
 
                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                 switch (model.type) {
+                    case e_model::MODEL_2B:
                     case e_model::MODEL_9B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
                     case e_model::MODEL_27B: Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                     default: LM_GGML_ABORT("fatal error");
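The two hunks above register the 26-layer Gemma 2 2B as MODEL_2B and route it through the same attention-query scaling as the 9B model, 1 / sqrt(n_embd_head_k), rather than the 27B's 1 / sqrt(n_embd / n_head). A worked example is below, assuming a per-head key size of n_embd_head_k = 256 for Gemma 2 2B (an assumption; the real value comes from the GGUF hparams at load time, not from this diff).

// Worked example of the query scale selected by the new MODEL_2B case.
#include <cmath>
#include <cstdio>

int main() {
    const float n_embd_head_k = 256.0f;                // assumed head size, see note above
    const float scale = 1.0f / sqrtf(n_embd_head_k);   // same formula as the MODEL_9B branch
    printf("Qcur scale = %.4f\n", scale);              // prints 0.0625
    return 0;
}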
package/cpp/rn-llama.hpp
CHANGED
@@ -240,9 +240,11 @@ struct llama_rn_context
     bool validateModelChatTemplate() const {
         llama_chat_message chat[] = {{"user", "test"}};
 
-
+        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+        std::string template_key = "tokenizer.chat_template";
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
 
-        return res
+        return res >= 0;
     }
 
     void truncatePrompt(std::vector<llama_token> &prompt_tokens) {
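The rewritten validateModelChatTemplate() above now passes only if the GGUF metadata key tokenizer.chat_template is present (res >= 0). For context, the sketch below shows one hedged way the stored template could then be fetched and applied through llama_chat_apply_template(); the helper name render_one, the buffer sizes, and the retry logic are illustrative, and the return-value semantics (total output length, negative on failure) should be checked against the llama.cpp revision in use.

// Hedged sketch: read the model's chat template and render a single message.
#include <string>
#include <vector>
#include "llama.h"

static std::string render_one(const llama_model * model) {
    std::vector<char> tmpl(2048, 0);
    if (llama_model_meta_val_str(model, "tokenizer.chat_template", tmpl.data(), tmpl.size()) < 0) {
        return "";   // model ships no template; the validation above would also fail
    }
    llama_chat_message chat[] = {{"user", "test"}};
    std::vector<char> out(1024, 0);
    int32_t n = llama_chat_apply_template(model, tmpl.data(), chat, 1, true, out.data(), out.size());
    if (n < 0) {
        return "";   // template could not be applied
    }
    if ((size_t) n > out.size()) {   // output did not fit: grow and retry
        out.resize(n);
        n = llama_chat_apply_template(model, tmpl.data(), chat, 1, true, out.data(), out.size());
    }
    return std::string(out.data(), n);
}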
package/ios/RNLlamaContext.mm
CHANGED
@@ -102,7 +102,7 @@
     for (int i = 0; i < count; i++) {
         char key[256];
         llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
-        char val[
+        char val[2048];
         llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
 
         NSString *keyStr = [NSString stringWithUTF8String:key];
package/jest/mock.js
CHANGED