cui-llama.rn 1.2.6 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
@@ -3,6 +3,20 @@
3
3
  #include "ggml.h"
4
4
  #include "ggml-alloc.h"
5
5
 
6
+ #ifdef LM_GGML_BACKEND_SHARED
7
+ # if defined(_WIN32) && !defined(__MINGW32__)
8
+ # ifdef LM_GGML_BACKEND_BUILD
9
+ # define LM_GGML_BACKEND_API __declspec(dllexport) extern
10
+ # else
11
+ # define LM_GGML_BACKEND_API __declspec(dllimport) extern
12
+ # endif
13
+ # else
14
+ # define LM_GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
15
+ # endif
16
+ #else
17
+ # define LM_GGML_BACKEND_API extern
18
+ #endif
19
+
6
20
  #ifdef __cplusplus
7
21
  extern "C" {
8
22
  #endif
@@ -72,7 +86,7 @@ extern "C" {
72
86
  LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
73
87
  LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
74
88
 
75
- // "offset" refers to the offset of the tensor data for setting/getting data
89
+ // "offset" refers to the offset in tensor->data for setting/getting data
76
90
  LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
77
91
  LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
78
92
  LM_GGML_API void lm_ggml_backend_tensor_memset( struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
@@ -176,6 +190,14 @@ extern "C" {
176
190
  typedef void (*lm_ggml_backend_set_n_threads_t)(lm_ggml_backend_t backend, int n_threads);
177
191
  // Get additional buffer types provided by the device (returns a NULL-terminated array)
178
192
  typedef lm_ggml_backend_buffer_type_t * (*lm_ggml_backend_dev_get_extra_bufts_t)(lm_ggml_backend_dev_t device);
193
+ // Set the abort callback for the backend
194
+ typedef void (*lm_ggml_backend_set_abort_callback_t)(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * abort_callback_data);
195
+ // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
196
+ struct lm_ggml_backend_feature {
197
+ const char * name;
198
+ const char * value;
199
+ };
200
+ typedef struct lm_ggml_backend_feature * (*lm_ggml_backend_get_features_t)(lm_ggml_backend_reg_t reg);
179
201
 
180
202
  //
181
203
  // Backend registry
@@ -200,6 +222,13 @@ extern "C" {
200
222
  // = lm_ggml_backend_dev_init(lm_ggml_backend_dev_by_type(GPU) OR lm_ggml_backend_dev_by_type(CPU), NULL)
201
223
  LM_GGML_API lm_ggml_backend_t lm_ggml_backend_init_best(void);
202
224
 
225
+ // Load a backend from a dynamic library and register it
226
+ LM_GGML_API lm_ggml_backend_reg_t lm_ggml_backend_load(const char * path);
227
+ // Unload a backend if loaded dynamically and unregister it
228
+ LM_GGML_API void lm_ggml_backend_unload(lm_ggml_backend_reg_t reg);
229
+ // Load all known backends from dynamic libraries
230
+ LM_GGML_API void lm_ggml_backend_load_all(void);
231
+
203
232
  //
204
233
  // Backend scheduler
205
234
  //
@@ -228,14 +257,20 @@ extern "C" {
228
257
  lm_ggml_backend_sched_reserve(sched, reserve_graph);
229
258
 
230
259
  // compute
231
- graph = build_graph(sched);
232
- lm_ggml_backend_sched_graph_compute(sched, graph);
260
+ graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
261
+ for (int i = 0; i < 10; ++i) {
262
+ lm_ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
263
+ }
233
264
 
234
265
  // if there are graph inputs:
235
- lm_ggml_backend_sched_reset(sched);
236
- lm_ggml_backend_sched_alloc_graph(sched, graph);
237
- lm_ggml_backend_tensor_set(input_tensor, ...);
238
- lm_ggml_backend_sched_graph_compute(sched, graph);
266
+ graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once lm_ggml_free is called)
267
+ lm_ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
268
+ lm_ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
269
+ lm_ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
270
+ lm_ggml_backend_sched_graph_compute(sched, graph); // execute the graph
271
+
272
+ // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
273
+ // allocate them statically via lm_ggml_backend_alloc_ctx_tensors
239
274
  }
240
275
  */
241
276
 
@@ -250,7 +285,7 @@ extern "C" {
250
285
  //
251
286
  typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);
252
287
 
253
- // Initialize a backend scheduler
288
+ // Initialize a backend scheduler, backends with low index are given priority over backends with high index
254
289
  LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
255
290
  LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);
256
291
 
@@ -275,7 +310,9 @@ extern "C" {
275
310
  LM_GGML_API enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph);
276
311
  LM_GGML_API void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched);
277
312
 
278
- // Reset all assignments and allocators - must be called before changing the node backends
313
+ // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
314
+ // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
315
+ // The correct way to use this API is to discard the deallocated tensors and create new ones.
279
316
  LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched);
280
317
 
281
318
  // Set a callback to be called for each resulting node during graph compute
package/cpp/ggml-common.h CHANGED
@@ -418,6 +418,12 @@ typedef struct {
418
418
  } block_iq4_xs;
419
419
  static_assert(sizeof(block_iq4_xs) == sizeof(lm_ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
420
420
 
421
+ typedef struct {
422
+ lm_ggml_half d[4]; // deltas for 4 iq4_nl blocks
423
+ uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
424
+ } block_iq4_nlx4;
425
+ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(lm_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
426
+
421
427
  #endif // LM_GGML_COMMON_DECL
422
428
  #endif // LM_GGML_COMMON_DECL
423
429