cui-llama.rn 1.2.6 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +20 -5
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +222 -34
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/common.cpp +1682 -2114
  9. package/cpp/common.h +600 -613
  10. package/cpp/ggml-aarch64.c +129 -3478
  11. package/cpp/ggml-aarch64.h +19 -39
  12. package/cpp/ggml-alloc.c +1040 -1040
  13. package/cpp/ggml-alloc.h +76 -76
  14. package/cpp/ggml-backend-impl.h +216 -216
  15. package/cpp/ggml-backend-reg.cpp +195 -0
  16. package/cpp/ggml-backend.cpp +1997 -2661
  17. package/cpp/ggml-backend.h +328 -314
  18. package/cpp/ggml-common.h +1853 -1853
  19. package/cpp/ggml-cpp.h +38 -38
  20. package/cpp/ggml-cpu-aarch64.c +3560 -0
  21. package/cpp/ggml-cpu-aarch64.h +30 -0
  22. package/cpp/ggml-cpu-impl.h +371 -614
  23. package/cpp/ggml-cpu-quants.c +10822 -0
  24. package/cpp/ggml-cpu-quants.h +63 -0
  25. package/cpp/ggml-cpu.c +13975 -13720
  26. package/cpp/ggml-cpu.cpp +663 -0
  27. package/cpp/ggml-cpu.h +177 -150
  28. package/cpp/ggml-impl.h +550 -296
  29. package/cpp/ggml-metal.h +66 -66
  30. package/cpp/ggml-metal.m +4294 -3933
  31. package/cpp/ggml-quants.c +5247 -15739
  32. package/cpp/ggml-quants.h +100 -147
  33. package/cpp/ggml-threading.cpp +12 -0
  34. package/cpp/ggml-threading.h +12 -0
  35. package/cpp/ggml.c +8180 -8390
  36. package/cpp/ggml.h +2411 -2441
  37. package/cpp/llama-grammar.cpp +1138 -1138
  38. package/cpp/llama-grammar.h +144 -144
  39. package/cpp/llama-impl.h +181 -181
  40. package/cpp/llama-sampling.cpp +2348 -2345
  41. package/cpp/llama-sampling.h +48 -48
  42. package/cpp/llama-vocab.cpp +1984 -1984
  43. package/cpp/llama-vocab.h +170 -170
  44. package/cpp/llama.cpp +22132 -22046
  45. package/cpp/llama.h +1253 -1255
  46. package/cpp/log.cpp +401 -401
  47. package/cpp/log.h +121 -121
  48. package/cpp/rn-llama.hpp +83 -19
  49. package/cpp/sampling.cpp +466 -466
  50. package/cpp/sgemm.cpp +1884 -1276
  51. package/ios/RNLlama.mm +43 -20
  52. package/ios/RNLlamaContext.h +9 -3
  53. package/ios/RNLlamaContext.mm +133 -33
  54. package/jest/mock.js +0 -1
  55. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  56. package/lib/commonjs/index.js +52 -15
  57. package/lib/commonjs/index.js.map +1 -1
  58. package/lib/module/NativeRNLlama.js.map +1 -1
  59. package/lib/module/index.js +51 -15
  60. package/lib/module/index.js.map +1 -1
  61. package/lib/typescript/NativeRNLlama.d.ts +29 -5
  62. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  63. package/lib/typescript/index.d.ts +12 -5
  64. package/lib/typescript/index.d.ts.map +1 -1
  65. package/package.json +1 -1
  66. package/src/NativeRNLlama.ts +41 -6
  67. package/src/index.ts +82 -27
  68. package/cpp/json-schema-to-grammar.cpp +0 -1045
  69. package/cpp/json-schema-to-grammar.h +0 -8
  70. package/cpp/json.hpp +0 -24766
@@ -0,0 +1,663 @@
1
+ #include "ggml-backend.h"
2
+ #include "ggml-backend-impl.h"
3
+ #include "ggml-cpu.h"
4
+ #include "ggml-cpu-aarch64.h"
5
+ #include "ggml-impl.h"
6
#include <cctype>
#include <cstdio>
#include <cstring>
#include <new>
#include <string>
#include <vector>
9
+
10
+ #if defined(__APPLE__)
11
+ #include <sys/types.h>
12
+ #include <sys/sysctl.h>
13
+ #endif
14
+
15
+ #if defined(_WIN32)
16
+ #define WIN32_LEAN_AND_MEAN
17
+ #ifndef NOMINMAX
18
+ #define NOMINMAX
19
+ #endif
20
+ #include <windows.h>
21
+ #endif
22
+
23
+ // ggml-backend interface
24
+
25
+ #ifdef LM_GGML_USE_CPU_HBM
26
+
27
+ // buffer type HBM
28
+
29
+ #include <hbwmalloc.h>
30
+
31
+ static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
32
+ return "CPU_HBM";
33
+
34
+ LM_GGML_UNUSED(buft);
35
+ }
36
+
37
+ static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
38
+ hbw_free(buffer->context);
39
+ }
40
+
41
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
42
+ void * ptr;
43
+ int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
44
+ if (result != 0) {
45
+ LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
46
+ return NULL;
47
+ }
48
+
49
+ lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
50
+ buffer->buft = buft;
51
+ buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer;
52
+
53
+ return buffer;
54
+ }
55
+
56
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
57
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = {
58
+ /* .iface = */ {
59
+ /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name,
60
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
61
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
62
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
63
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
64
+ /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
65
+ },
66
+ /* .context = */ NULL,
67
+ };
68
+
69
+ return &lm_ggml_backend_cpu_buffer_type_hbm;
70
+ }
71
+ #endif
72
+
73
+ // buffer type AARCH64
74
+
75
+ static void lm_ggml_backend_cpu_aarch64_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
76
+ tensor->extra = (void *)lm_ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
77
+
78
+ LM_GGML_UNUSED(buffer);
79
+ }
80
+
81
+ static void lm_ggml_backend_cpu_aarch64_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
82
+ LM_GGML_ASSERT(offset == 0);
83
+ LM_GGML_ASSERT(size == lm_ggml_nbytes(tensor));
84
+
85
+ enum lm_ggml_type repack_type = (enum lm_ggml_type)(intptr_t)tensor->extra;
86
+
87
+ lm_ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
88
+
89
+ LM_GGML_UNUSED(buffer);
90
+ }
91
+
92
+ static const char * lm_ggml_backend_cpu_aarch64_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
93
+ return "CPU_AARCH64";
94
+
95
+ LM_GGML_UNUSED(buft);
96
+ }
97
+
98
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
99
+ auto * buffer = lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_cpu_buffer_type(), size);
100
+
101
+ if (buffer == NULL) {
102
+ return NULL;
103
+ }
104
+
105
+ buffer->buft = buft;
106
+ buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
107
+ buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
108
+
109
+ return buffer;
110
+ }
111
+
112
+ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_aarch64_buffer_type(void) {
113
+ static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_aarch64 = {
114
+ /* .iface = */ {
115
+ /* .get_name = */ lm_ggml_backend_cpu_aarch64_buffer_type_get_name,
116
+ /* .alloc_buffer = */ lm_ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
117
+ /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type()->iface.get_alignment,
118
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
119
+ /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
120
+ /* .is_host = */ NULL,
121
+ },
122
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
123
+ /* .context = */ NULL,
124
+ };
125
+
126
+ return &lm_ggml_backend_cpu_buffer_type_aarch64;
127
+ }
128
+
129
+ bool lm_ggml_backend_cpu_buft_is_aarch64(lm_ggml_backend_buffer_type_t buft) {
130
+ return buft == lm_ggml_backend_cpu_aarch64_buffer_type();
131
+ }
132
+
133
+ static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_get_extra_bufts(lm_ggml_backend_dev_t device) {
134
+ static std::vector<lm_ggml_backend_buffer_type_t> bufts = []() {
135
+ std::vector<lm_ggml_backend_buffer_type_t> bufts;
136
+
137
+ #ifdef LM_GGML_USE_CPU_HBM
138
+ bufts.push_back(lm_ggml_backend_cpu_hbm_buffer_type());
139
+ #endif
140
+
141
+ #ifdef LM_GGML_USE_CPU_AARCH64
142
+ bufts.push_back(lm_ggml_backend_cpu_aarch64_buffer_type());
143
+ #endif
144
+
145
+ bufts.push_back(NULL);
146
+
147
+ return bufts;
148
+ }();
149
+
150
+ return bufts.data();
151
+
152
+ LM_GGML_UNUSED(device);
153
+ }
154
+
155
+ // CPU backend - backend (stream)
156
+
157
+ struct lm_ggml_backend_cpu_context {
158
+ int n_threads;
159
+ lm_ggml_threadpool_t threadpool;
160
+
161
+ uint8_t * work_data;
162
+ size_t work_size;
163
+
164
+ lm_ggml_abort_callback abort_callback;
165
+ void * abort_callback_data;
166
+ };
167
+
168
+ static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
169
+ return "CPU";
170
+
171
+ LM_GGML_UNUSED(backend);
172
+ }
173
+
174
+ static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
175
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
176
+ delete[] cpu_ctx->work_data;
177
+ delete cpu_ctx;
178
+ delete backend;
179
+ }
180
+
181
+ struct lm_ggml_backend_plan_cpu {
182
+ struct lm_ggml_cplan cplan;
183
+ struct lm_ggml_cgraph cgraph;
184
+ };
185
+
186
+ static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
187
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
188
+
189
+ struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
190
+
191
+ cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
192
+ cpu_plan->cgraph = *cgraph; // FIXME: deep copy
193
+
194
+ if (cpu_plan->cplan.work_size > 0) {
195
+ cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
196
+ if (cpu_plan->cplan.work_data == NULL) {
197
+ delete cpu_plan;
198
+ return NULL;
199
+ }
200
+ }
201
+
202
+ cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
203
+ cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
204
+
205
+ return cpu_plan;
206
+ }
207
+
208
+ static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
209
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
210
+
211
+ delete[] cpu_plan->cplan.work_data;
212
+ delete cpu_plan;
213
+
214
+ LM_GGML_UNUSED(backend);
215
+ }
216
+
217
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
218
+ struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
219
+
220
+ return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
221
+
222
+ LM_GGML_UNUSED(backend);
223
+ }
224
+
225
+ static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
226
+ struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
227
+
228
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
229
+
230
+ if (cpu_ctx->work_size < cplan.work_size) {
231
+ delete[] cpu_ctx->work_data;
232
+ cpu_ctx->work_data = new uint8_t[cplan.work_size];
233
+ if (cpu_ctx->work_data == NULL) {
234
+ cpu_ctx->work_size = 0;
235
+ return LM_GGML_STATUS_ALLOC_FAILED;
236
+ }
237
+ cpu_ctx->work_size = cplan.work_size;
238
+ }
239
+ cplan.work_data = (uint8_t *)cpu_ctx->work_data;
240
+
241
+ cplan.abort_callback = cpu_ctx->abort_callback;
242
+ cplan.abort_callback_data = cpu_ctx->abort_callback_data;
243
+
244
+ return lm_ggml_graph_compute(cgraph, &cplan);
245
+ }
246
+
247
+ static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
248
+ /* .get_name = */ lm_ggml_backend_cpu_get_name,
249
+ /* .free = */ lm_ggml_backend_cpu_free,
250
+ /* .set_tensor_async = */ NULL,
251
+ /* .get_tensor_async = */ NULL,
252
+ /* .cpy_tensor_async = */ NULL,
253
+ /* .synchronize = */ NULL,
254
+ /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create,
255
+ /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free,
256
+ /* .graph_plan_update = */ NULL,
257
+ /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
258
+ /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
259
+ /* .event_record = */ NULL,
260
+ /* .event_wait = */ NULL,
261
+ };
262
+
263
+ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
264
+ static lm_ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
265
+ return &guid;
266
+ }
267
+
268
+ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
269
+ // initialize CPU backend now to avoid slowing the first graph computation
270
+ lm_ggml_cpu_init();
271
+
272
+ struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
273
+ if (ctx == NULL) {
274
+ return NULL;
275
+ }
276
+
277
+ ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
278
+ ctx->threadpool = NULL;
279
+ ctx->work_data = NULL;
280
+ ctx->work_size = 0;
281
+ ctx->abort_callback = NULL;
282
+ ctx->abort_callback_data = NULL;
283
+
284
+ lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
285
+ /* .guid = */ lm_ggml_backend_cpu_guid(),
286
+ /* .interface = */ lm_ggml_backend_cpu_i,
287
+ /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
288
+ /* .context = */ ctx,
289
+ };
290
+
291
+ if (cpu_backend == NULL) {
292
+ delete ctx;
293
+ return NULL;
294
+ }
295
+
296
+ return cpu_backend;
297
+ }
298
+
299
+ bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
300
+ return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
301
+ }
302
+
303
+ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) {
304
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
305
+
306
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
307
+ ctx->n_threads = n_threads;
308
+ }
309
+
310
+ void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
311
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
312
+
313
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
314
+
315
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
316
+ // already had a different threadpool, pause/suspend it before switching
317
+ lm_ggml_threadpool_pause(ctx->threadpool);
318
+ }
319
+ ctx->threadpool = threadpool;
320
+ }
321
+
322
+ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
323
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
324
+
325
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
326
+ ctx->abort_callback = abort_callback;
327
+ ctx->abort_callback_data = abort_callback_data;
328
+ }
329
+
330
+ // CPU backend - device
331
+
332
// Device-level state of the CPU backend: a human-readable CPU description.
//
// The constructor tries to replace the default "CPU" with the processor brand
// string reported by the operating system:
//   - macOS:   sysctl "machdep.cpu.brand_string"
//   - Linux:   first "model name" line of /proc/cpuinfo
//   - Windows: registry value "ProcessorNameString"
// Whenever a query fails or yields an empty string, `description` keeps its
// default value instead of ending up empty or NUL-filled.
struct lm_ggml_backend_cpu_device_context {
    std::string description = "CPU";

    lm_ggml_backend_cpu_device_context() {
#ifdef __APPLE__
        size_t len = 0;
        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0) && len > 0) {
            std::string brand(len, '\0');
            if (!sysctlbyname("machdep.cpu.brand_string", &brand[0], &len, NULL, 0)) { // NOLINT
                // drop the terminating NUL (and anything after it) from the stored string
                const size_t nul = brand.find('\0');
                if (nul != std::string::npos) {
                    brand.resize(nul);
                }
                if (!brand.empty()) {
                    description = brand;
                }
            }
        }
#elif defined(__linux__)
        FILE * f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char buf[1024];
            while (fgets(buf, sizeof(buf), f)) {
                if (strncmp(buf, "model name", 10) != 0) {
                    continue;
                }
                char * p = strchr(buf, ':');
                if (!p) {
                    continue;
                }
                p++;
                // <cctype> functions require a value representable as unsigned char
                while (std::isspace((unsigned char)*p)) {
                    p++;
                }
                // trim trailing whitespace without ever indexing before `p`
                size_t n = strlen(p);
                while (n > 0 && std::isspace((unsigned char)p[n - 1])) {
                    n--;
                }
                if (n > 0) {
                    description.assign(p, n);
                }
                break;
            }
            fclose(f);
        }
#elif defined(_WIN32)
        // use the ANSI registry API with narrow literals throughout so the code
        // builds and behaves the same whether or not UNICODE is defined
        HKEY hKey;
        if (RegOpenKeyExA(HKEY_LOCAL_MACHINE,
                          "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0",
                          0,
                          KEY_READ,
                          &hKey) == ERROR_SUCCESS) {
            DWORD cpu_brand_size = 0;
            if (RegQueryValueExA(hKey,
                                 "ProcessorNameString",
                                 NULL,
                                 NULL,
                                 NULL,
                                 &cpu_brand_size) == ERROR_SUCCESS && cpu_brand_size > 0) {
                std::string brand(cpu_brand_size, '\0');
                if (RegQueryValueExA(hKey,
                                     "ProcessorNameString",
                                     NULL,
                                     NULL,
                                     (LPBYTE)&brand[0], // NOLINT
                                     &cpu_brand_size) == ERROR_SUCCESS) {
                    const size_t nul = brand.find('\0');
                    if (nul != std::string::npos) {
                        brand.resize(nul);
                    }
                    if (!brand.empty()) {
                        description = brand;
                    }
                }
            }
            RegCloseKey(hKey);
        }
#endif
    }
};
395
+
396
+ static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
397
+ return "CPU";
398
+
399
+ LM_GGML_UNUSED(dev);
400
+ }
401
+
402
+ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
403
+ struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
404
+
405
+ return ctx->description.c_str();
406
+ }
407
+
408
+ static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
409
+ // TODO
410
+ *free = 0;
411
+ *total = 0;
412
+
413
+ LM_GGML_UNUSED(dev);
414
+ }
415
+
416
+ static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
417
+ return LM_GGML_BACKEND_DEVICE_TYPE_CPU;
418
+
419
+ LM_GGML_UNUSED(dev);
420
+ }
421
+
422
+ static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
423
+ props->name = lm_ggml_backend_cpu_device_get_name(dev);
424
+ props->description = lm_ggml_backend_cpu_device_get_description(dev);
425
+ props->type = lm_ggml_backend_cpu_device_get_type(dev);
426
+ lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
427
+ props->caps = {
428
+ /* .async = */ false,
429
+ /* .host_buffer = */ false,
430
+ /* .buffer_from_host_ptr = */ true,
431
+ /* .events = */ false,
432
+ };
433
+ }
434
+
435
+ static lm_ggml_backend_t lm_ggml_backend_cpu_device_init_backend(lm_ggml_backend_dev_t dev, const char * params) {
436
+ return lm_ggml_backend_cpu_init();
437
+
438
+ LM_GGML_UNUSED(dev);
439
+ LM_GGML_UNUSED(params);
440
+ }
441
+
442
+ static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
443
+ return lm_ggml_backend_cpu_buffer_type();
444
+
445
+ LM_GGML_UNUSED(dev);
446
+ }
447
+
448
+ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_host_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
449
+ return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
450
+
451
+ LM_GGML_UNUSED(dev);
452
+ LM_GGML_UNUSED(max_tensor_size);
453
+ }
454
+
455
+ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
456
+ const struct lm_ggml_tensor * src0 = op->src[0];
457
+ const struct lm_ggml_tensor * src1 = op->src[1];
458
+
459
+ if (src0 && src0->buffer && lm_ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
460
+ if (op->op != LM_GGML_OP_MUL_MAT || src0->type != LM_GGML_TYPE_Q4_0 || lm_ggml_aarch64_get_optimal_repack_type(src0) == LM_GGML_TYPE_Q4_0) {
461
+ return false;
462
+ }
463
+ }
464
+
465
+ for (int i = 1; i < LM_GGML_MAX_SRC; i++) {
466
+ if (op->src[i] && op->src[i]->buffer && lm_ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
467
+ return false;
468
+ }
469
+ }
470
+
471
+ switch (op->op) {
472
+ case LM_GGML_OP_CPY:
473
+ return
474
+ op->type != LM_GGML_TYPE_IQ2_XXS &&
475
+ op->type != LM_GGML_TYPE_IQ2_XS &&
476
+ op->type != LM_GGML_TYPE_IQ1_S &&
477
+ op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
478
+ case LM_GGML_OP_MUL_MAT:
479
+ return src1->type == LM_GGML_TYPE_F32 || src1->type == lm_ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
480
+ case LM_GGML_OP_ROPE_BACK:
481
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
482
+ case LM_GGML_OP_IM2COL_BACK:
483
+ return src0->type == LM_GGML_TYPE_F32 && src1->type == LM_GGML_TYPE_F32;
484
+ case LM_GGML_OP_OUT_PROD:
485
+ return (src0->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(src0->type)) && src1->type == LM_GGML_TYPE_F32;
486
+ default:
487
+ return true;
488
+ }
489
+
490
+ LM_GGML_UNUSED(dev);
491
+ }
492
+
493
+ static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
494
+ return lm_ggml_backend_buft_is_host(buft) || lm_ggml_backend_cpu_buft_is_aarch64(buft);
495
+
496
+ LM_GGML_UNUSED(dev);
497
+ }
498
+
499
+ static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
500
+ /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
501
+ /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
502
+ /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
503
+ /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
504
+ /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
505
+ /* .init_backend = */ lm_ggml_backend_cpu_device_init_backend,
506
+ /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
507
+ /* .get_host_buffer_type = */ NULL,
508
+ /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_host_ptr,
509
+ /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
510
+ /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
511
+ /* .offload_op = */ NULL,
512
+ /* .event_new = */ NULL,
513
+ /* .event_free = */ NULL,
514
+ /* .event_synchronize = */ NULL,
515
+ };
516
+
517
+ // CPU backend - backend (reg)
518
+
519
+ static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
520
+ return "CPU";
521
+
522
+ LM_GGML_UNUSED(reg);
523
+ }
524
+
525
+ static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
526
+ return 1;
527
+
528
+ LM_GGML_UNUSED(reg);
529
+ }
530
+
531
+ static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
532
+ LM_GGML_ASSERT(index == 0);
533
+
534
+ static lm_ggml_backend_cpu_device_context ctx;
535
+ static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
536
+ /* .iface = */ lm_ggml_backend_cpu_device_i,
537
+ /* .reg = */ reg,
538
+ /* .context = */ &ctx,
539
+ };
540
+
541
+ return &lm_ggml_backend_cpu_device;
542
+ }
543
+
544
// One entry of a backend feature list: a feature name paired with its value,
// both as C strings. Feature lists built in this file end with an entry whose
// members are both null.
struct lm_ggml_backend_feature {
    const char * name;   // feature identifier, e.g. "AVX2"
    const char * value;  // feature value as text, e.g. "1"
};
548
+
549
+ // Not used yet
550
+ // This is intended to replace the lm_ggml_cpu_has_* functions when loading the CPU backend dynamically,
551
+ // and additionally to allow other backends to expose their own list of features that applications can query using the same API.
552
+ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backend_reg_t reg) {
553
+ static std::vector<lm_ggml_backend_feature> features = []() {
554
+ std::vector<lm_ggml_backend_feature> features;
555
+ if (lm_ggml_cpu_has_sse3()) {
556
+ features.push_back({ "SSE3", "1" });
557
+ }
558
+ if (lm_ggml_cpu_has_ssse3()) {
559
+ features.push_back({ "SSSE3", "1" });
560
+ }
561
+ if (lm_ggml_cpu_has_avx()) {
562
+ features.push_back({ "AVX", "1" });
563
+ }
564
+ if (lm_ggml_cpu_has_avx2()) {
565
+ features.push_back({ "AVX2", "1" });
566
+ }
567
+ if (lm_ggml_cpu_has_f16c()) {
568
+ features.push_back({ "F16C", "1" });
569
+ }
570
+ if (lm_ggml_cpu_has_fma()) {
571
+ features.push_back({ "FMA", "1" });
572
+ }
573
+ if (lm_ggml_cpu_has_avx_vnni()) {
574
+ features.push_back({ "AVX_VNNI", "1" });
575
+ }
576
+ if (lm_ggml_cpu_has_avx512()) {
577
+ features.push_back({ "AVX512", "1" });
578
+ }
579
+ if (lm_ggml_cpu_has_avx512_vbmi()) {
580
+ features.push_back({ "AVX512_VBMI", "1" });
581
+ }
582
+ if (lm_ggml_cpu_has_avx512_vnni()) {
583
+ features.push_back({ "AVX512_VNNI", "1" });
584
+ }
585
+ if (lm_ggml_cpu_has_avx512_bf16()) {
586
+ features.push_back({ "AVX512_BF16", "1" });
587
+ }
588
+ if (lm_ggml_cpu_has_amx_int8()) {
589
+ features.push_back({ "AMX_INT8", "1" });
590
+ }
591
+ if (lm_ggml_cpu_has_neon()) {
592
+ features.push_back({ "NEON", "1" });
593
+ }
594
+ if (lm_ggml_cpu_has_arm_fma()) {
595
+ features.push_back({ "ARM_FMA", "1" });
596
+ }
597
+ if (lm_ggml_cpu_has_fp16_va()) {
598
+ features.push_back({ "FP16_VA", "1" });
599
+ }
600
+ if (lm_ggml_cpu_has_matmul_int8()) {
601
+ features.push_back({ "MATMUL_INT8", "1" });
602
+ }
603
+ if (lm_ggml_cpu_has_sve()) {
604
+ features.push_back({ "SVE", "1" });
605
+ }
606
+ if (lm_ggml_cpu_get_sve_cnt() > 0) {
607
+ static std::string sve_cnt = std::to_string(lm_ggml_cpu_get_sve_cnt());
608
+ features.push_back({ "SVE_CNT", sve_cnt.c_str() });
609
+ }
610
+ if (lm_ggml_cpu_has_riscv_v()) {
611
+ features.push_back({ "RISCV_V", "1" });
612
+ }
613
+ if (lm_ggml_cpu_has_vsx()) {
614
+ features.push_back({ "VSX", "1" });
615
+ }
616
+ if (lm_ggml_cpu_has_wasm_simd()) {
617
+ features.push_back({ "WASM_SIMD", "1" });
618
+ }
619
+ if (lm_ggml_cpu_has_llamafile()) {
620
+ features.push_back({ "LLAMAFILE", "1" });
621
+ }
622
+
623
+ features.push_back({ nullptr, nullptr });
624
+
625
+ return features;
626
+ }();
627
+
628
+ return features.data();
629
+
630
+ LM_GGML_UNUSED(reg);
631
+ }
632
+
633
+ static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
634
+ if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
635
+ return (void *)lm_ggml_backend_cpu_set_n_threads;
636
+ }
637
+ if (strcmp(name, "lm_ggml_backend_dev_get_extra_bufts") == 0) {
638
+ return (void *)lm_ggml_backend_cpu_get_extra_bufts;
639
+ }
640
+
641
+ return NULL;
642
+
643
+ LM_GGML_UNUSED(reg);
644
+ }
645
+
646
+ static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
647
+ /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
648
+ /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
649
+ /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
650
+ /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
651
+ };
652
+
653
+ lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
654
+ // init CPU feature detection
655
+ lm_ggml_cpu_init();
656
+
657
+ static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
658
+ /* .iface = */ lm_ggml_backend_cpu_reg_i,
659
+ /* .context = */ NULL,
660
+ };
661
+
662
+ return &lm_ggml_backend_cpu_reg;
663
+ }