cui-llama.rn 1.2.6 → 1.3.3
This diff shows the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
- package/README.md +3 -2
- package/android/src/main/CMakeLists.txt +26 -6
- package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
- package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
- package/android/src/main/jni.cpp +228 -40
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
- package/cpp/amx/amx.cpp +196 -0
- package/cpp/amx/amx.h +20 -0
- package/cpp/amx/common.h +101 -0
- package/cpp/amx/mmq.cpp +2524 -0
- package/cpp/amx/mmq.h +16 -0
- package/cpp/common.cpp +118 -251
- package/cpp/common.h +53 -30
- package/cpp/ggml-aarch64.c +46 -3395
- package/cpp/ggml-aarch64.h +0 -20
- package/cpp/ggml-alloc.c +6 -8
- package/cpp/ggml-backend-impl.h +33 -11
- package/cpp/ggml-backend-reg.cpp +423 -0
- package/cpp/ggml-backend.cpp +14 -676
- package/cpp/ggml-backend.h +46 -9
- package/cpp/ggml-common.h +6 -0
- package/cpp/ggml-cpu-aarch64.c +3823 -0
- package/cpp/ggml-cpu-aarch64.h +32 -0
- package/cpp/ggml-cpu-impl.h +14 -242
- package/cpp/ggml-cpu-quants.c +10835 -0
- package/cpp/ggml-cpu-quants.h +63 -0
- package/cpp/ggml-cpu.c +13971 -13720
- package/cpp/ggml-cpu.cpp +715 -0
- package/cpp/ggml-cpu.h +65 -63
- package/cpp/ggml-impl.h +285 -25
- package/cpp/ggml-metal.h +8 -8
- package/cpp/ggml-metal.m +1221 -728
- package/cpp/ggml-quants.c +189 -10681
- package/cpp/ggml-quants.h +78 -125
- package/cpp/ggml-threading.cpp +12 -0
- package/cpp/ggml-threading.h +12 -0
- package/cpp/ggml.c +688 -1460
- package/cpp/ggml.h +58 -244
- package/cpp/json-schema-to-grammar.cpp +1045 -1045
- package/cpp/json.hpp +24766 -24766
- package/cpp/llama-sampling.cpp +5 -2
- package/cpp/llama.cpp +409 -123
- package/cpp/llama.h +8 -4
- package/cpp/rn-llama.hpp +89 -25
- package/cpp/sampling.cpp +42 -3
- package/cpp/sampling.h +22 -1
- package/cpp/sgemm.cpp +608 -0
- package/cpp/speculative.cpp +270 -0
- package/cpp/speculative.h +28 -0
- package/cpp/unicode.cpp +11 -0
- package/ios/RNLlama.mm +43 -20
- package/ios/RNLlamaContext.h +9 -3
- package/ios/RNLlamaContext.mm +146 -33
- package/jest/mock.js +0 -1
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/grammar.js +4 -2
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +52 -15
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/grammar.js +2 -1
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +51 -15
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +122 -8
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/grammar.d.ts +5 -6
- package/lib/typescript/grammar.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +15 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +2 -1
- package/src/NativeRNLlama.ts +135 -13
- package/src/grammar.ts +10 -8
- package/src/index.ts +104 -28
package/cpp/ggml-backend.h
CHANGED
@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#ifdef LM_GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LM_GGML_BACKEND_BUILD
+#            define LM_GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define LM_GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define LM_GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define LM_GGML_BACKEND_API extern
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
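
The new `LM_GGML_BACKEND_API` macro selects `__declspec(dllexport)`/`dllimport` on MSVC-style Windows builds, default symbol visibility elsewhere, and plain `extern` for static builds. A minimal sketch of how a dynamically loadable backend might declare its exported entry point with it; `lm_ggml_backend_foo_reg` is a hypothetical name, and the `LM_GGML_BACKEND_SHARED`/`LM_GGML_BACKEND_BUILD` defines are assumed to be set by the build system:

```c
// Hypothetical "foo" backend header (sketch, not part of this package).
// Build the backend itself with -DLM_GGML_BACKEND_SHARED -DLM_GGML_BACKEND_BUILD
// so the symbol is exported; consumers define only LM_GGML_BACKEND_SHARED.
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// Registration entry point handed to the ggml backend registry.
LM_GGML_BACKEND_API lm_ggml_backend_reg_t lm_ggml_backend_foo_reg(void);

#ifdef __cplusplus
}
#endif
```
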
@@ -72,7 +86,7 @@ extern "C" {
     LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
 
-    // "offset" refers to the offset
+    // "offset" refers to the offset in tensor->data for setting/getting data
     LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
     LM_GGML_API void lm_ggml_backend_tensor_memset( struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
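
The clarified comment pins down that `offset` is a byte offset into `tensor->data`. As a small illustration (a sketch assuming a contiguous, non-quantized F32 tensor; `set_row_f32` is not part of the package):

```c
#include <stdint.h>
#include "ggml.h"
#include "ggml-backend.h"

// Copy one row of host data into row `row` of a 2-D F32 tensor stored in a
// backend buffer. nb[1] is the row stride in bytes, so `row * t->nb[1]` is the
// byte offset into tensor->data that the comment above refers to.
static void set_row_f32(struct lm_ggml_tensor * t, const float * row_data, int64_t row) {
    const size_t row_bytes = (size_t) t->ne[0] * sizeof(float);
    lm_ggml_backend_tensor_set(t, row_data, (size_t) row * t->nb[1], row_bytes);
}
```
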
@@ -176,6 +190,14 @@ extern "C" {
     typedef void (*lm_ggml_backend_set_n_threads_t)(lm_ggml_backend_t backend, int n_threads);
     // Get additional buffer types provided by the device (returns a NULL-terminated array)
     typedef lm_ggml_backend_buffer_type_t * (*lm_ggml_backend_dev_get_extra_bufts_t)(lm_ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void (*lm_ggml_backend_set_abort_callback_t)(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct lm_ggml_backend_feature {
+        const char * name;
+        const char * value;
+    };
+    typedef struct lm_ggml_backend_feature * (*lm_ggml_backend_get_features_t)(lm_ggml_backend_reg_t reg);
 
     //
     // Backend registry
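
The feature list is reported as a NULL-terminated array of name/value pairs. A minimal sketch of consuming it, assuming a `lm_ggml_backend_get_features_t` function pointer has already been resolved from the registry (how that resolution happens is outside this hunk):

```c
#include <stdio.h>
#include "ggml-backend.h"

// Print every feature flag a backend registry reports. The returned array is
// NULL-terminated, i.e. iteration stops at the entry whose name is NULL.
static void print_backend_features(lm_ggml_backend_get_features_t get_features,
                                   lm_ggml_backend_reg_t reg) {
    for (struct lm_ggml_backend_feature * f = get_features(reg); f && f->name; f++) {
        printf("%s = %s\n", f->name, f->value);
    }
}
```
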
@@ -200,6 +222,13 @@ extern "C" {
     // = lm_ggml_backend_dev_init(lm_ggml_backend_dev_by_type(GPU) OR lm_ggml_backend_dev_by_type(CPU), NULL)
     LM_GGML_API lm_ggml_backend_t lm_ggml_backend_init_best(void);
 
+    // Load a backend from a dynamic library and register it
+    LM_GGML_API lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path);
+    // Unload a backend if loaded dynamically and unregister it
+    LM_GGML_API void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg);
+    // Load all known backends from dynamic libraries
+    LM_GGML_API void lm_ggml_backend_load_all(void);
+
     //
     // Backend scheduler
     //
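
With the new dynamic-loading entry points, a host application can discover backend shared libraries at runtime and then pick the best available device. A minimal sketch (error handling trimmed; `lm_ggml_backend_free` is the usual ggml-backend destructor declared elsewhere in this header):

```c
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    // Scan for dynamically built backends and register them.
    lm_ggml_backend_load_all();

    // Prefer a GPU device if one was registered, otherwise fall back to CPU.
    lm_ggml_backend_t backend = lm_ggml_backend_init_best();
    if (backend == NULL) {
        fprintf(stderr, "no usable backend found\n");
        return 1;
    }

    // ... build and compute graphs with `backend` ...

    lm_ggml_backend_free(backend);
    return 0;
}
```
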
@@ -228,14 +257,20 @@ extern "C" {
         lm_ggml_backend_sched_reserve(sched, reserve_graph);
 
         // compute
-        graph = build_graph(sched);
-        lm_ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            lm_ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
 
         // if there are graph inputs:
-        lm_ggml_backend_sched_reset(sched);
-        lm_ggml_backend_sched_alloc_graph(sched, graph);
-        lm_ggml_backend_tensor_set(input_tensor, ...);
-        lm_ggml_backend_sched_graph_compute(sched, graph);
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once lm_ggml_free is called)
+        lm_ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        lm_ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        lm_ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        lm_ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via lm_ggml_backend_alloc_ctx_tensors
     }
 */
 
@@ -250,7 +285,7 @@ extern "C" {
     //
     typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);
 
-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
     LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
     LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);
 
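
Since the scheduler now documents that lower-index backends take priority, the order of the `backends` array matters. A small sketch of a GPU-first scheduler; `backend_gpu`/`backend_cpu` are assumed to have been created already, and passing `NULL` for `bufts` (use each backend's default buffer type) plus an arbitrary graph size of 2048 are assumptions borrowed from common upstream ggml usage:

```c
#include <stdbool.h>
#include <stddef.h>
#include "ggml-backend.h"

// Build a scheduler where the GPU backend is preferred and the CPU backend is
// the fallback (index order == priority order per the comment above).
static lm_ggml_backend_sched_t make_sched(lm_ggml_backend_t backend_gpu,
                                          lm_ggml_backend_t backend_cpu) {
    lm_ggml_backend_t backends[] = { backend_gpu, backend_cpu };
    return lm_ggml_backend_sched_new(backends, /*bufts=*/NULL, /*n_backends=*/2,
                                     /*graph_size=*/2048, /*parallel=*/false);
}
```
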
@@ -275,7 +310,9 @@ extern "C" {
     LM_GGML_API enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph);
     LM_GGML_API void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched);
 
-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched);
 
     // Set a callback to be called for each resulting node during graph compute
package/cpp/ggml-common.h
CHANGED
@@ -418,6 +418,12 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
+typedef struct {
+    lm_ggml_half d[4];        // deltas for 4 iq4_nl blocks
+    uint8_t qs[QK4_NL * 2];   // nibbles / quants for 4 iq4_nl blocks
+} block_iq4_nlx4;
+static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(lm_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
+
 #endif // LM_GGML_COMMON_DECL
 #endif // LM_GGML_COMMON_DECL
 
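
The new `block_iq4_nlx4` struct interleaves four `block_iq4_nl` blocks, presumably for the repacked SIMD paths added in `ggml-cpu-aarch64.c`. Assuming the upstream values `QK4_NL == 32` and a 2-byte `lm_ggml_half`, the block is 4·2 + 32·2 = 72 bytes, exactly the size of four plain `block_iq4_nl` blocks (4 × 18 bytes), only reordered. A sanity-check sketch (not part of the package; the `LM_GGML_COMMON_DECL_C` include convention is assumed from upstream ggml):

```c
#include <assert.h>              // static_assert (C11)

#define LM_GGML_COMMON_DECL_C    // expose the C declarations from ggml-common.h
#include "ggml-common.h"

// Four iq4_nl blocks repack into one iq4_nlx4 block with no padding:
// 4 * (2-byte delta + 16 bytes of nibbles) == 4 * 2 + 64 == 72 bytes.
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(block_iq4_nl),
              "iq4_nlx4 must be exactly four repacked iq4_nl blocks");
```
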