cui-llama.rn 1.3.5 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -1
- package/android/src/main/CMakeLists.txt +25 -20
- package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
- package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
- package/android/src/main/jni-utils.h +94 -0
- package/android/src/main/jni.cpp +108 -37
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
- package/cpp/common.cpp +1982 -1965
- package/cpp/common.h +665 -657
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +5 -2
- package/cpp/ggml-cpp.h +1 -0
- package/cpp/ggml-cpu-aarch64.cpp +6 -1
- package/cpp/ggml-cpu-quants.c +5 -1
- package/cpp/ggml-cpu.c +14122 -14122
- package/cpp/ggml-cpu.cpp +627 -627
- package/cpp/ggml-impl.h +11 -16
- package/cpp/ggml-metal-impl.h +288 -0
- package/cpp/ggml-metal.m +2 -2
- package/cpp/ggml-opt.cpp +854 -0
- package/cpp/ggml-opt.h +216 -0
- package/cpp/ggml.c +0 -1276
- package/cpp/ggml.h +0 -140
- package/cpp/gguf.cpp +1325 -0
- package/cpp/gguf.h +202 -0
- package/cpp/llama-adapter.cpp +346 -0
- package/cpp/llama-adapter.h +73 -0
- package/cpp/llama-arch.cpp +1434 -0
- package/cpp/llama-arch.h +395 -0
- package/cpp/llama-batch.cpp +368 -0
- package/cpp/llama-batch.h +88 -0
- package/cpp/llama-chat.cpp +567 -0
- package/cpp/llama-chat.h +51 -0
- package/cpp/llama-context.cpp +1771 -0
- package/cpp/llama-context.h +128 -0
- package/cpp/llama-cparams.cpp +1 -0
- package/cpp/llama-cparams.h +37 -0
- package/cpp/llama-cpp.h +30 -0
- package/cpp/llama-grammar.cpp +1 -0
- package/cpp/llama-grammar.h +3 -1
- package/cpp/llama-hparams.cpp +71 -0
- package/cpp/llama-hparams.h +140 -0
- package/cpp/llama-impl.cpp +167 -0
- package/cpp/llama-impl.h +16 -136
- package/cpp/llama-kv-cache.cpp +718 -0
- package/cpp/llama-kv-cache.h +218 -0
- package/cpp/llama-mmap.cpp +589 -0
- package/cpp/llama-mmap.h +67 -0
- package/cpp/llama-model-loader.cpp +1011 -0
- package/cpp/llama-model-loader.h +158 -0
- package/cpp/llama-model.cpp +2202 -0
- package/cpp/llama-model.h +391 -0
- package/cpp/llama-sampling.cpp +117 -4
- package/cpp/llama-vocab.cpp +21 -28
- package/cpp/llama-vocab.h +13 -1
- package/cpp/llama.cpp +12547 -23528
- package/cpp/llama.h +31 -6
- package/cpp/rn-llama.hpp +90 -87
- package/cpp/sgemm.cpp +776 -70
- package/cpp/sgemm.h +14 -14
- package/cpp/unicode.cpp +6 -0
- package/ios/RNLlama.mm +47 -0
- package/ios/RNLlamaContext.h +3 -1
- package/ios/RNLlamaContext.mm +71 -14
- package/jest/mock.js +15 -3
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +33 -37
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +31 -35
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +26 -6
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +21 -36
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +4 -18
- package/package.json +2 -3
- package/src/NativeRNLlama.ts +32 -13
- package/src/index.ts +52 -47
package/cpp/ggml-backend-reg.cpp
CHANGED
```diff
@@ -574,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
     lm_ggml_backend_load_best("opencl", silent, dir_path);
     lm_ggml_backend_load_best("musa", silent, dir_path);
     lm_ggml_backend_load_best("cpu", silent, dir_path);
+    // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
+    const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
+    if (backend_path) {
+        lm_ggml_backend_load(backend_path);
+    }
 }
```
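The new lines make the backend loader also honour an LM_GGML_BACKEND_PATH environment variable after probing the bundled backends. A minimal sketch of how a host application could use this, assuming `lm_ggml_backend_load_all_from_path` is declared in `ggml-backend.h` as in upstream ggml and that POSIX `setenv` is available; the paths are made up for illustration:

```cpp
// Sketch only, not part of the package: point the runtime at an out-of-tree
// backend via the new LM_GGML_BACKEND_PATH hook, then trigger backend loading.
#include <cstdlib>          // std::getenv / setenv (POSIX)
#include "ggml-backend.h"   // assumed to declare lm_ggml_backend_load_all_from_path

int main() {
    // The loader now checks this variable after probing the built-in backends.
    setenv("LM_GGML_BACKEND_PATH", "/data/local/tmp/libggml-custom.so", /*overwrite=*/1);

    // Probe a (hypothetical) directory for backends; the env-var backend is
    // loaded in addition, per the added code above.
    lm_ggml_backend_load_all_from_path("/data/local/tmp/ggml-backends");
    return 0;
}
```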
package/cpp/ggml-backend.cpp
CHANGED
```diff
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
         if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1) {
+            if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
```
```diff
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
+            LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    LM_GGML_LOG_DEBUG(": ");
+                }
                 LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
```
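The first hunk tightens the scheduler's weight-offload heuristic: a weight assigned to the lowest-priority (CPU) backend is now only considered for offload to a higher-priority backend when its buffer actually lives in host memory. A toy sketch of that predicate using stand-in types (`FakeBuffer`/`FakeWeight` are illustrative only, not the real lm_ggml structs):

```cpp
// Illustrative sketch of the new gate, with simplified stand-in types:
// offload is considered only for host-resident weights on the last backend.
#include <cstdio>

struct FakeBuffer { bool is_host; };
struct FakeWeight { FakeBuffer * buffer; int backend_id; };

static bool may_offload(const FakeWeight & w, int n_backends) {
    // mirrors: src_backend_id == sched->n_backends - 1 && buffer_is_host(src->buffer)
    return w.backend_id == n_backends - 1 && w.buffer != nullptr && w.buffer->is_host;
}

int main() {
    FakeBuffer host{true}, device{false};
    FakeWeight on_cpu{&host, 3}, on_gpu{&device, 3};
    std::printf("host weight: %d, device weight: %d\n",
                may_offload(on_cpu, 4), may_offload(on_gpu, 4));  // prints 1, 0
    return 0;
}
```

The second hunk only changes the debug print so the trailing ": " is emitted when a split actually has inputs.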
package/cpp/ggml-cpp.h
CHANGED
package/cpp/ggml-cpu-aarch64.cpp
CHANGED
```diff
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
 }
 
 static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
```
```diff
@@ -4166,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
     buffer->buft = buft;
     buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
     buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
+    buffer->iface.get_tensor = nullptr;
+    buffer->iface.cpy_tensor = nullptr;
     return buffer;
 }
 
```
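The first hunk splits the VNNI path so that plain AVX-VNNI builds use the non-EVEX `_mm256_dpbusd_avx_epi32` intrinsic, while `_mm256_dpbusd_epi32` is reserved for AVX512-VNNI/VL builds. Both intrinsics compute the same thing; below is a scalar reference of what one 32-bit lane accumulates, ignoring overflow corner cases (the helper name is hypothetical, purely for illustration):

```cpp
// Scalar reference sketch of one lane of vpdpbusd: dot product of 4 unsigned
// bytes with 4 signed bytes, added into the 32-bit accumulator.
#include <cstdint>
#include <cstdio>

static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
    for (int k = 0; k < 4; ++k) {
        acc += static_cast<int32_t>(a[k]) * static_cast<int32_t>(b[k]);
    }
    return acc;
}

int main() {
    const uint8_t a[4] = {1, 2, 3, 4};
    const int8_t  b[4] = {-1, 2, -3, 4};
    std::printf("%d\n", dpbusd_lane(0, a, b));  // 1*-1 + 2*2 + 3*-3 + 4*4 = 10
    return 0;
}
```

The second hunk explicitly clears the `get_tensor` and `cpy_tensor` callbacks of the aarch64 repacked buffer, since its repacked layout cannot be read back through the generic interface.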
package/cpp/ggml-cpu-quants.c
CHANGED
```diff
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
```