cui-llama.rn 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/android/src/main/CMakeLists.txt +2 -2
- package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
- package/android/src/main/java/com/rnllama/RNLlama.java +39 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +5 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +5 -0
- package/cpp/common.cpp +36 -1
- package/cpp/common.h +5 -1
- package/cpp/ggml-aarch64.c +2 -11
- package/cpp/ggml-alloc.h +1 -1
- package/cpp/ggml-backend-impl.h +151 -78
- package/cpp/{ggml-backend.c → ggml-backend.cpp} +565 -269
- package/cpp/ggml-backend.h +147 -62
- package/cpp/ggml-impl.h +15 -0
- package/cpp/ggml-metal.h +8 -9
- package/cpp/ggml-metal.m +2428 -2111
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +0 -4
- package/cpp/ggml.c +799 -1121
- package/cpp/ggml.h +79 -72
- package/cpp/llama-vocab.cpp +189 -106
- package/cpp/llama-vocab.h +18 -9
- package/cpp/llama.cpp +736 -341
- package/cpp/llama.h +9 -4
- package/cpp/unicode-data.cpp +6 -4
- package/cpp/unicode-data.h +4 -4
- package/cpp/unicode.cpp +14 -7
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +4 -0
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +3 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +6 -0
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +2 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +7 -0
- package/src/index.ts +5 -0
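The largest change in this release is the port of `ggml-backend.c` to C++ (`ggml-backend.cpp`), which replaces the old string-based backend registry with a device and backend-registry interface (`lm_ggml_backend_dev_*`, `lm_ggml_backend_reg_*`). As an illustrative sketch only — this program is not part of the package — the new API shown in the diff below can be driven roughly like this:

```cpp
// Hypothetical consumer of the new registry/device API (not shipped in this
// package); it only uses functions that appear in the diff below.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    // enumerate every device contributed by the compiled-in backends
    for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i,
               lm_ggml_backend_dev_name(dev),
               lm_ggml_backend_dev_description(dev));
    }

    // pick the best available device: a full GPU if present, else the CPU
    lm_ggml_backend_t backend = lm_ggml_backend_init_best();
    if (backend == NULL) {
        return 1;
    }
    // ... allocate buffers and compute graphs with `backend` ...
    lm_ggml_backend_free(backend);
    return 0;
}
```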
--- package/cpp/ggml-backend.c
+++ package/cpp/ggml-backend.cpp
@@ -1,3 +1,13 @@
+// Note: porting this file to C++ is a work in progress
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
 #include "ggml-impl.h"
@@ -8,9 +18,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <string>
+#include <vector>
 
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
 
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 // backend buffer type
 
@@ -18,7 +33,7 @@ const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name(buft);
 }
 
-LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -34,7 +49,7 @@ size_t lm_ggml_backend_buft_get_max_size(lm_ggml_backend_buffer_type_t buft) {
     return SIZE_MAX;
 }
 
-LM_GGML_CALL size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
+size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to lm_ggml_nbytes
     if (buft->iface.get_alloc_size) {
         size_t size = buft->iface.get_alloc_size(buft, tensor);
@@ -51,16 +66,18 @@ bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) {
     return false;
 }
 
-// backend buffer
+lm_ggml_backend_dev_t lm_ggml_backend_buft_get_device(lm_ggml_backend_buffer_type_t buft) {
+    return buft->device;
+}
 
-LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
-        lm_ggml_backend_buffer_type_t buft,
-        struct lm_ggml_backend_buffer_i iface,
-        lm_ggml_backend_buffer_context_t context,
-        size_t size) {
-    lm_ggml_backend_buffer_t buffer = malloc(sizeof(struct lm_ggml_backend_buffer));
+// backend buffer
 
-    (*buffer) = (struct lm_ggml_backend_buffer) {
+lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init(
+        lm_ggml_backend_buffer_type_t buft,
+        struct lm_ggml_backend_buffer_i iface,
+        void * context,
+        size_t size) {
+    lm_ggml_backend_buffer_t buffer = new lm_ggml_backend_buffer {
         /* .interface = */ iface,
         /* .buft = */ buft,
         /* .context = */ context,
@@ -83,7 +100,7 @@ void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
-    free(buffer);
+    delete buffer;
 }
 
 size_t lm_ggml_backend_buffer_get_size(lm_ggml_backend_buffer_t buffer) {
@@ -98,14 +115,14 @@ void * lm_ggml_backend_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
     return base;
 }
 
-LM_GGML_CALL void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
+void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
     }
 }
 
-size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) {
+size_t lm_ggml_backend_buffer_get_alignment(lm_ggml_backend_buffer_t buffer) {
     return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer));
 }
 
@@ -218,7 +235,7 @@ void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm
     }
 }
 
-LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -232,7 +249,7 @@ LM_GGML_CALL void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, con
     buf->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
-LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -246,7 +263,7 @@ LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tenso
     buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
 
-LM_GGML_API
+LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     LM_GGML_ASSERT(buf != NULL && "tensor buffer not set");
@@ -299,20 +316,39 @@ enum lm_ggml_status lm_ggml_backend_graph_compute_async(lm_ggml_backend_t backen
 }
 
 bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
+    // helper to ease transition to device interface
+    if (backend->device) {
+        return lm_ggml_backend_dev_supports_op(backend->device, op);
+    }
+
     return backend->iface.supports_op(backend, op);
 }
 
 bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
+    // helper to ease transition to device interface
+    if (backend->device) {
+        return lm_ggml_backend_dev_supports_buft(backend->device, buft);
+    }
+
     return backend->iface.supports_buft(backend, buft);
 }
 
bool lm_ggml_backend_offload_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
+    // helper to ease transition to device interface
+    if (backend->device) {
+        return lm_ggml_backend_dev_offload_op(backend->device, op);
+    }
+
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);
     }
     return false;
 }
 
+lm_ggml_backend_dev_t lm_ggml_backend_get_device(lm_ggml_backend_t backend) {
+    return backend->device;
+}
+
 // backend copy
 
 static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b) {
@@ -375,30 +411,31 @@ void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend_src, lm_ggml_ba
 
 // events
 
-lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_t backend) {
-    if (backend->iface.event_new == NULL) {
+lm_ggml_backend_event_t lm_ggml_backend_event_new(lm_ggml_backend_dev_t device) {
+    // null device is allowed for the transition period to the device interface
+    if (device == NULL || device->iface.event_new == NULL) {
         return NULL;
     }
-    return backend->iface.event_new(backend);
+    return device->iface.event_new(device);
 }
 
 void lm_ggml_backend_event_free(lm_ggml_backend_event_t event) {
     if (event == NULL) {
         return;
     }
-    event->backend->iface.event_free(event);
+    event->device->iface.event_free(event->device, event);
 }
 
-void lm_ggml_backend_event_record(lm_ggml_backend_event_t event) {
-    LM_GGML_ASSERT(event->backend->iface.event_record != NULL);
+void lm_ggml_backend_event_record(lm_ggml_backend_event_t event, lm_ggml_backend_t backend) {
+    LM_GGML_ASSERT(backend->iface.event_record != NULL);
 
-    event->backend->iface.event_record(event);
+    backend->iface.event_record(backend, event);
 }
 
 void lm_ggml_backend_event_synchronize(lm_ggml_backend_event_t event) {
-    LM_GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
+    LM_GGML_ASSERT(event->device->iface.event_synchronize);
 
-    event->backend->iface.event_synchronize(event);
+    event->device->iface.event_synchronize(event->device, event);
 }
 
 void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event_t event) {
@@ -407,170 +444,246 @@ void lm_ggml_backend_event_wait(lm_ggml_backend_t backend, lm_ggml_backend_event
     backend->iface.event_wait(backend, event);
 }
 
-// backend registry
+// Backend device
 
-#define LM_GGML_REG_MAX_BACKENDS 64
+const char * lm_ggml_backend_dev_name(lm_ggml_backend_dev_t device) {
+    return device->iface.get_name(device);
+}
 
-struct lm_ggml_backend_reg {
-    char name[128];
-    lm_ggml_backend_init_fn init_fn;
-    lm_ggml_backend_buffer_type_t default_buffer_type;
-    void * user_data;
-};
+const char * lm_ggml_backend_dev_description(lm_ggml_backend_dev_t device) {
+    return device->iface.get_description(device);
+}
+
+void lm_ggml_backend_dev_memory(lm_ggml_backend_dev_t device, size_t * free, size_t * total) {
+    device->iface.get_memory(device, free, total);
+}
 
-static struct lm_ggml_backend_reg lm_ggml_backend_registry[LM_GGML_REG_MAX_BACKENDS];
-static size_t lm_ggml_backend_registry_count = 0;
+enum lm_ggml_backend_dev_type lm_ggml_backend_dev_type(lm_ggml_backend_dev_t device) {
+    return device->iface.get_type(device);
+}
 
-LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data);
+void lm_ggml_backend_dev_get_props(lm_ggml_backend_dev_t device, struct lm_ggml_backend_dev_props * props) {
+    memset(props, 0, sizeof(*props));
+    device->iface.get_props(device, props);
+}
 
-LM_GGML_CALL static void lm_ggml_backend_registry_init(void) {
-    static bool initialized = false;
+lm_ggml_backend_reg_t lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_t device) {
+    return device->reg;
+}
 
-    if (initialized) {
-        return;
+lm_ggml_backend_t lm_ggml_backend_dev_init(lm_ggml_backend_dev_t device, const char * params) {
+    return device->iface.init_backend(device, params);
+}
+
+lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_buffer_type(lm_ggml_backend_dev_t device) {
+    return device->iface.get_buffer_type(device);
+}
+
+lm_ggml_backend_buffer_type_t lm_ggml_backend_dev_host_buffer_type(lm_ggml_backend_dev_t device) {
+    if (device->iface.get_host_buffer_type == NULL) {
+        return NULL;
     }
 
-    initialized = true;
+    return device->iface.get_host_buffer_type(device);
+}
 
-    lm_ggml_backend_register("CPU", lm_ggml_backend_reg_cpu_init, lm_ggml_backend_cpu_buffer_type(), NULL);
+lm_ggml_backend_buffer_t lm_ggml_backend_dev_buffer_from_host_ptr(lm_ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
+    return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
+}
 
-    // add forward decls here to avoid including the backend headers
-#ifdef LM_GGML_USE_CUDA
-    extern LM_GGML_CALL void lm_ggml_backend_cuda_reg_devices(void);
-    lm_ggml_backend_cuda_reg_devices();
-#endif
+bool lm_ggml_backend_dev_supports_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
+    return device->iface.supports_op(device, op);
+}
+
+bool lm_ggml_backend_dev_supports_buft(lm_ggml_backend_dev_t device, lm_ggml_backend_buffer_type_t buft) {
+    return device->iface.supports_buft(device, buft);
+}
+
+bool lm_ggml_backend_dev_offload_op(lm_ggml_backend_dev_t device, const struct lm_ggml_tensor * op) {
+    if (device->iface.offload_op != NULL) {
+        return device->iface.offload_op(device, op);
+    }
+
+    return false;
+}
 
-#ifdef LM_GGML_USE_SYCL
-    extern void lm_ggml_backend_sycl_reg_devices(void);
-    lm_ggml_backend_sycl_reg_devices();
+// Backend (reg)
+
+const char * lm_ggml_backend_reg_name(lm_ggml_backend_reg_t reg) {
+    return reg->iface.get_name(reg);
+}
+
+size_t lm_ggml_backend_reg_dev_count(lm_ggml_backend_reg_t reg) {
+    return reg->iface.get_device_count(reg);
+}
+
+lm_ggml_backend_dev_t lm_ggml_backend_reg_dev_get(lm_ggml_backend_reg_t reg, size_t index) {
+    return reg->iface.get_device(reg, index);
+}
+
+void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
+    if (!reg->iface.get_proc_address) {
+        return NULL;
+    }
+    return reg->iface.get_proc_address(reg, name);
+}
+
+// Backend registry
+
+#ifdef LM_GGML_USE_CUDA
+#include "ggml-cuda.h"
 #endif
 
 #ifdef LM_GGML_USE_METAL
-    extern LM_GGML_CALL lm_ggml_backend_t lm_ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void);
-    lm_ggml_backend_register("Metal", lm_ggml_backend_reg_metal_init, lm_ggml_backend_metal_buffer_type(), NULL);
+#include "ggml-metal.h"
 #endif
 
-#ifdef LM_GGML_USE_VULKAN
-    extern LM_GGML_CALL int lm_ggml_backend_vk_reg_devices(void);
-    lm_ggml_backend_vk_reg_devices();
+#ifdef LM_GGML_USE_BLAS
+#include "ggml-blas.h"
 #endif
 
-#ifdef LM_GGML_USE_KOMPUTE
-    extern LM_GGML_CALL void lm_ggml_backend_kompute_reg_devices(void);
-    lm_ggml_backend_kompute_reg_devices();
-#endif
+struct lm_ggml_backend_registry {
+    std::vector<lm_ggml_backend_reg_t> backends;
+    std::vector<lm_ggml_backend_dev_t> devices;
 
-#ifdef LM_GGML_USE_CANN
-    extern LM_GGML_CALL void lm_ggml_backend_cann_reg_devices(void);
-    lm_ggml_backend_cann_reg_devices();
+    lm_ggml_backend_registry() {
+#ifdef LM_GGML_USE_CUDA
+        register_backend(lm_ggml_backend_cuda_reg());
+#endif
+#ifdef LM_GGML_USE_METAL
+        register_backend(lm_ggml_backend_metal_reg());
+#endif
+#ifdef LM_GGML_USE_BLAS
+        register_backend(lm_ggml_backend_blas_reg());
 #endif
-}
-
-LM_GGML_CALL void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
-    LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_REG_MAX_BACKENDS);
 
-    size_t id = lm_ggml_backend_registry_count;
+        // TODO: sycl, vulkan, kompute, cann
 
-    lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) {
-        /* .name = */ {0},
-        /* .fn = */ init_fn,
-        /* .default_buffer_type = */ default_buffer_type,
-        /* .user_data = */ user_data,
-    };
+        register_backend(lm_ggml_backend_cpu_reg());
+    }
 
-    snprintf(lm_ggml_backend_registry[id].name, sizeof(lm_ggml_backend_registry[id].name), "%s", name);
+    void register_backend(lm_ggml_backend_reg_t reg) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
+            __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
+#endif
+        backends.push_back(reg);
+        for (size_t i = 0; i < lm_ggml_backend_reg_dev_count(reg); i++) {
+            register_device(lm_ggml_backend_reg_dev_get(reg, i));
+        }
+    }
 
+    void register_device(lm_ggml_backend_dev_t device) {
 #ifndef NDEBUG
-    fprintf(stderr, "%s: registered backend %s\n", __func__, name);
+        fprintf(stderr, "%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
 #endif
+        devices.push_back(device);
+    }
+};
 
-    lm_ggml_backend_registry_count++;
+static lm_ggml_backend_registry & get_reg() {
+    static lm_ggml_backend_registry reg;
+    return reg;
 }
 
-size_t lm_ggml_backend_reg_get_count(void) {
-    lm_ggml_backend_registry_init();
+// Internal API
+void lm_ggml_backend_register(lm_ggml_backend_reg_t reg) {
+    get_reg().register_backend(reg);
+}
 
-    return lm_ggml_backend_registry_count;
+void lm_ggml_backend_device_register(lm_ggml_backend_dev_t device) {
+    get_reg().register_device(device);
 }
 
-size_t lm_ggml_backend_reg_find_by_name(const char * name) {
-    lm_ggml_backend_registry_init();
+// Backend (reg) enumeration
+size_t lm_ggml_backend_reg_count() {
+    return get_reg().backends.size();
+}
 
-    for (size_t i = 0; i < lm_ggml_backend_registry_count; i++) {
-        // TODO: case insensitive in a portable way
-        if (strcmp(lm_ggml_backend_registry[i].name, name) == 0) {
-            return i;
+lm_ggml_backend_reg_t lm_ggml_backend_reg_get(size_t index) {
+    LM_GGML_ASSERT(index < lm_ggml_backend_reg_count());
+    return get_reg().backends[index];
+}
+
+lm_ggml_backend_reg_t lm_ggml_backend_reg_by_name(const char * name) {
+    for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
+        lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_get(i);
+        if (strcmp(lm_ggml_backend_reg_name(reg), name) == 0) {
+            return reg;
         }
     }
-
-    // not found
-    return SIZE_MAX;
+    return NULL;
 }
 
-// init from backend:params string
-lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend_str) {
-    lm_ggml_backend_registry_init();
-
-    const char * params = strchr(backend_str, ':');
-    char backend_name[128];
-    if (params == NULL) {
-        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
-        params = "";
-    } else {
-        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
-        params++;
-    }
+// Device enumeration
+size_t lm_ggml_backend_dev_count() {
+    return get_reg().devices.size();
+}
 
-    size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name);
+lm_ggml_backend_dev_t lm_ggml_backend_dev_get(size_t index) {
+    LM_GGML_ASSERT(index < lm_ggml_backend_dev_count());
+    return get_reg().devices[index];
+}
 
-    if (backend_i == SIZE_MAX) {
-        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
-        return NULL;
+lm_ggml_backend_dev_t lm_ggml_backend_dev_by_name(const char * name) {
+    for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
+        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+        if (strcmp(lm_ggml_backend_dev_name(dev), name) == 0) {
+            return dev;
+        }
     }
-
-    return lm_ggml_backend_reg_init_backend(backend_i, params);
+    return NULL;
 }
 
-const char * lm_ggml_backend_reg_get_name(size_t i) {
-    lm_ggml_backend_registry_init();
-
-    LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
-    return lm_ggml_backend_registry[i].name;
+lm_ggml_backend_dev_t lm_ggml_backend_dev_by_type(enum lm_ggml_backend_dev_type type) {
+    for (size_t i = 0; i < lm_ggml_backend_dev_count(); i++) {
+        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+        if (lm_ggml_backend_dev_type(dev) == type) {
+            return dev;
+        }
+    }
+    return NULL;
 }
 
-lm_ggml_backend_t lm_ggml_backend_reg_init_backend(size_t i, const char * params) {
-    lm_ggml_backend_registry_init();
-
-    LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
-    return lm_ggml_backend_registry[i].init_fn(params, lm_ggml_backend_registry[i].user_data);
+// Convenience functions
+lm_ggml_backend_t lm_ggml_backend_init_by_name(const char * name, const char * params) {
+    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_name(name);
+    if (!dev) {
+        return NULL;
+    }
+    return lm_ggml_backend_dev_init(dev, params);
 }
 
-lm_ggml_backend_buffer_type_t lm_ggml_backend_reg_get_default_buffer_type(size_t i) {
-    lm_ggml_backend_registry_init();
-
-    LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
-    return lm_ggml_backend_registry[i].default_buffer_type;
+lm_ggml_backend_t lm_ggml_backend_init_by_type(enum lm_ggml_backend_dev_type type, const char * params) {
+    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(type);
+    if (!dev) {
+        return NULL;
+    }
+    return lm_ggml_backend_dev_init(dev, params);
 }
 
-lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
-    lm_ggml_backend_registry_init();
-
-    LM_GGML_ASSERT(i < lm_ggml_backend_registry_count);
-    return lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_registry[i].default_buffer_type, size);
+lm_ggml_backend_t lm_ggml_backend_init_best(void) {
+    lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
+    if (!dev) {
+        dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
+    }
+    if (!dev) {
+        return NULL;
+    }
+    return lm_ggml_backend_dev_init(dev, NULL);
 }
 
 // backend CPU
 
 static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
 
-LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend_buffer_t buffer) {
+static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
     return "CPU";
 
     LM_GGML_UNUSED(buffer);
 }
 
-LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
+static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) {
     uintptr_t data = (uintptr_t)buffer->context;
 
     // align the buffer
@@ -581,29 +694,29 @@ LM_GGML_CALL static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_b
     return (void *)data;
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
     memset((char *)tensor->data + offset, value, size);
 
     LM_GGML_UNUSED(buffer);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     LM_GGML_UNUSED(buffer);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     LM_GGML_UNUSED(buffer);
 }
 
-LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
+static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) {
     if (lm_ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, lm_ggml_nbytes(src));
         return true;
@@ -613,12 +726,12 @@ LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_b
     LM_GGML_UNUSED(buffer);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
+static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);
 }
 
-static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = {
-    /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
+static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
     /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer,
     /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -630,9 +743,8 @@ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = {
     /* .reset = */ NULL,
 };
 
-// for buffers from ptr, free is not called
-static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
-    /* .get_name = */ lm_ggml_backend_cpu_buffer_name,
+static const struct lm_ggml_backend_buffer_i lm_ggml_backend_cpu_buffer_from_ptr_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_buffer_get_name,
     /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
     /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -644,13 +756,13 @@ static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
     /* .reset = */ NULL,
 };
 
-LM_GGML_CALL static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
     return "CPU";
 
     LM_GGML_UNUSED(buft);
 }
 
-LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
     if (data == NULL) {
@@ -658,24 +770,24 @@ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_all
         return NULL;
     }
 
-    return lm_ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
+    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
 }
 
-LM_GGML_CALL static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
+static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
 
     LM_GGML_UNUSED(buft);
 }
 
-LM_GGML_CALL static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
+static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
     return true;
 
     LM_GGML_UNUSED(buft);
 }
 
-LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
+lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) {
     static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = {
-        /* .iface = */ {
+        /* .iface    = */ {
             /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment,
@@ -683,6 +795,7 @@ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void)
             /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes
             /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host,
         },
+        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
     };
 
@@ -695,23 +808,23 @@ LM_GGML_CALL lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void)
 
 #include <hbwmalloc.h>
 
-LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
+static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
     return "CPU_HBM";
 
     LM_GGML_UNUSED(buft);
 }
 
-LM_GGML_CALL static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
+static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) {
     return "CPU_HBM";
 
     LM_GGML_UNUSED(buf);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
 
-LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
     //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -749,27 +862,27 @@ struct lm_ggml_backend_cpu_context {
     int n_threads;
     lm_ggml_threadpool_t threadpool;
 
-    void * work_data;
+    uint8_t * work_data;
     size_t work_size;
 
     lm_ggml_abort_callback abort_callback;
     void * abort_callback_data;
 };
 
-LM_GGML_CALL static const char * lm_ggml_backend_cpu_name(lm_ggml_backend_t backend) {
+static const char * lm_ggml_backend_cpu_get_name(lm_ggml_backend_t backend) {
     return "CPU";
 
     LM_GGML_UNUSED(backend);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
+static void lm_ggml_backend_cpu_free(lm_ggml_backend_t backend) {
     struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
-    free(cpu_ctx->work_data);
-    free(cpu_ctx);
-    free(backend);
+    delete[] cpu_ctx->work_data;
+    delete cpu_ctx;
+    delete backend;
 }
 
-LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
+static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_get_default_buffer_type(lm_ggml_backend_t backend) {
     return lm_ggml_backend_cpu_buffer_type();
 
     LM_GGML_UNUSED(backend);
@@ -780,18 +893,18 @@ struct lm_ggml_backend_plan_cpu {
     struct lm_ggml_cgraph cgraph;
 };
 
-LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
+static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) {
     struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
 
-    struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu));
+    struct lm_ggml_backend_plan_cpu * cpu_plan = new lm_ggml_backend_plan_cpu;
 
     cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
         if (cpu_plan->cplan.work_data == NULL) {
-            free(cpu_plan);
+            delete cpu_plan;
             return NULL;
         }
     }
@@ -802,16 +915,16 @@ LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_
     return cpu_plan;
 }
 
-LM_GGML_CALL static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
+static void lm_ggml_backend_cpu_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
     struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
 
-    free(cpu_plan->cplan.work_data);
-    free(cpu_plan);
+    delete[] cpu_plan->cplan.work_data;
+    delete cpu_plan;
 
     LM_GGML_UNUSED(backend);
 }
 
-LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
+static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) {
     struct lm_ggml_backend_plan_cpu * cpu_plan = (struct lm_ggml_backend_plan_cpu *)plan;
 
     return lm_ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -819,21 +932,21 @@ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(l
     LM_GGML_UNUSED(backend);
 }
 
-LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
+static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
     struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
 
     struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
-        free(cpu_ctx->work_data);
-        cpu_ctx->work_data = malloc(cplan.work_size);
+        delete[] cpu_ctx->work_data;
+        cpu_ctx->work_data = new uint8_t[cplan.work_size];
         if (cpu_ctx->work_data == NULL) {
             cpu_ctx->work_size = 0;
             return LM_GGML_STATUS_ALLOC_FAILED;
         }
         cpu_ctx->work_size = cplan.work_size;
     }
-    cplan.work_data = cpu_ctx->work_data;
+    cplan.work_data = (uint8_t *)cpu_ctx->work_data;
 
     cplan.abort_callback = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;
@@ -841,35 +954,8 @@ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggm
     return lm_ggml_graph_compute(cgraph, &cplan);
 }
 
-LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-    switch (op->op) {
-        case LM_GGML_OP_CPY:
-            return
-                op->type != LM_GGML_TYPE_IQ2_XXS &&
-                op->type != LM_GGML_TYPE_IQ2_XS &&
-                op->type != LM_GGML_TYPE_IQ1_S &&
-                op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
-        case LM_GGML_OP_MUL_MAT:
-            return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
-        case LM_GGML_OP_ROPE_BACK:
-            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
-        case LM_GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
-        default:
-            return true;
-    }
-
-    LM_GGML_UNUSED(backend);
-}
-
-LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft) {
-    return lm_ggml_backend_buft_is_host(buft);
-
-    LM_GGML_UNUSED(backend);
-}
-
-static struct lm_ggml_backend_i cpu_backend_i = {
-    /* .get_name = */ lm_ggml_backend_cpu_name,
+static const struct lm_ggml_backend_i lm_ggml_backend_cpu_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_get_name,
     /* .free = */ lm_ggml_backend_cpu_free,
     /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type,
     /* .set_tensor_async = */ NULL,
@@ -881,14 +967,11 @@ static struct lm_ggml_backend_i cpu_backend_i = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ lm_ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ lm_ggml_backend_cpu_graph_compute,
-    /* .supports_op = */ lm_ggml_backend_cpu_supports_op,
-    /* .supports_buft = */ lm_ggml_backend_cpu_supports_buft,
+    /* .supports_op = */ NULL,
+    /* .supports_buft = */ NULL,
     /* .offload_op = */ NULL,
-    /* .event_new = */ NULL,
-    /* .event_free = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
-    /* .event_synchronize = */ NULL,
 };
 
 static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
@@ -897,7 +980,7 @@ static lm_ggml_guid_t lm_ggml_backend_cpu_guid(void) {
 }
 
 lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
-    struct lm_ggml_backend_cpu_context * ctx = malloc(sizeof(struct lm_ggml_backend_cpu_context));
+    struct lm_ggml_backend_cpu_context * ctx = new lm_ggml_backend_cpu_context;
     if (ctx == NULL) {
         return NULL;
     }
@@ -909,21 +992,22 @@ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
     ctx->abort_callback = NULL;
     ctx->abort_callback_data = NULL;
 
-    lm_ggml_backend_t cpu_backend = malloc(sizeof(struct lm_ggml_backend));
+    lm_ggml_backend_t cpu_backend = new lm_ggml_backend {
+        /* .guid = */ lm_ggml_backend_cpu_guid(),
+        /* .interface = */ lm_ggml_backend_cpu_i,
+        /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
+        /* .context = */ ctx,
+    };
+
     if (cpu_backend == NULL) {
-        free(ctx);
+        delete ctx;
         return NULL;
     }
 
-    *cpu_backend = (struct lm_ggml_backend) {
-        /* .guid = */ lm_ggml_backend_cpu_guid(),
-        /* .interface = */ cpu_backend_i,
-        /* .context = */ ctx
-    };
     return cpu_backend;
 }
 
-LM_GGML_CALL bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
+bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) {
     return backend != NULL && lm_ggml_guid_matches(backend->guid, lm_ggml_backend_cpu_guid());
 }
 
@@ -954,16 +1038,233 @@ void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_gg
     ctx->abort_callback_data = abort_callback_data;
 }
 
-LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     LM_GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
-    return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
+    return lm_ggml_backend_buffer_init(lm_ggml_backend_cpu_buffer_type(), lm_ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
 }
 
-LM_GGML_CALL static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+////////////////////////
+
+struct lm_ggml_backend_cpu_device_context {
+    std::string description = "CPU";
+
+    lm_ggml_backend_cpu_device_context() {
+#ifdef __APPLE__
+        size_t len = 0;
+        if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
+            description.resize(len);
+            sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
+        }
+#elif defined(__linux__)
+        FILE * f = fopen("/proc/cpuinfo", "r");
+        if (f) {
+            char buf[1024];
+            while (fgets(buf, sizeof(buf), f)) {
+                if (strncmp(buf, "model name", 10) == 0) {
+                    char * p = strchr(buf, ':');
+                    if (p) {
+                        p++;
+                        while (std::isspace(*p)) {
+                            p++;
+                        }
+                        while (std::isspace(p[strlen(p) - 1])) {
+                            p[strlen(p) - 1] = '\0';
+                        }
+                        description = p;
+                        break;
+                    }
+                }
+            }
+            fclose(f);
+        }
+#elif defined(_WIN32)
+        HKEY hKey;
+        if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                        TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                        0,
+                        KEY_READ,
+                        &hKey) == ERROR_SUCCESS) {
+            DWORD cpu_brand_size = 0;
+            if (RegQueryValueExA(hKey,
+                                TEXT("ProcessorNameString"),
+                                NULL,
+                                NULL,
+                                NULL,
+                                &cpu_brand_size) == ERROR_SUCCESS) {
+                description.resize(cpu_brand_size);
+                if (RegQueryValueExA(hKey,
+                                    TEXT("ProcessorNameString"),
+                                    NULL,
+                                    NULL,
+                                    (LPBYTE)&description[0], // NOLINT
+                                    &cpu_brand_size) == ERROR_SUCCESS) {
+                    if (description.find('\0') != std::string::npos) {
+                        description.resize(description.find('\0'));
+                    }
+                }
+            }
+            RegCloseKey(hKey);
+        }
+#endif
+    }
+};
+
+static const char * lm_ggml_backend_cpu_device_get_name(lm_ggml_backend_dev_t dev) {
+    return "CPU";
+
+    LM_GGML_UNUSED(dev);
+}
+
+static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_dev_t dev) {
+    struct lm_ggml_backend_cpu_device_context * ctx = (struct lm_ggml_backend_cpu_device_context *)dev->context;
+
+    return ctx->description.c_str();
+}
+
+static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    LM_GGML_UNUSED(dev);
+}
+
+static enum lm_ggml_backend_dev_type lm_ggml_backend_cpu_device_get_type(lm_ggml_backend_dev_t dev) {
+    return LM_GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
+
+    LM_GGML_UNUSED(dev);
+}
+
+static void lm_ggml_backend_cpu_device_get_props(lm_ggml_backend_dev_t dev, struct lm_ggml_backend_dev_props * props) {
+    props->name = lm_ggml_backend_cpu_device_get_name(dev);
+    props->description = lm_ggml_backend_cpu_device_get_description(dev);
+    props->type = lm_ggml_backend_cpu_device_get_type(dev);
+    lm_ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async = */ false,
+        /* .host_buffer = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events = */ false,
+    };
+}
+
+static lm_ggml_backend_t lm_ggml_backend_cpu_device_init(lm_ggml_backend_dev_t dev, const char * params) {
     return lm_ggml_backend_cpu_init();
 
+    LM_GGML_UNUSED(dev);
     LM_GGML_UNUSED(params);
-    LM_GGML_UNUSED(user_data);
+}
+
+static lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_device_get_buffer_type(lm_ggml_backend_dev_t dev) {
+    return lm_ggml_backend_cpu_buffer_type();
+
+    LM_GGML_UNUSED(dev);
+}
+
+static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_device_buffer_from_ptr(lm_ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return lm_ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    LM_GGML_UNUSED(dev);
+    LM_GGML_UNUSED(max_tensor_size);
+}
+
+static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, const struct lm_ggml_tensor * op) {
+    switch (op->op) {
+        case LM_GGML_OP_CPY:
+            return
+                op->type != LM_GGML_TYPE_IQ2_XXS &&
+                op->type != LM_GGML_TYPE_IQ2_XS &&
+                op->type != LM_GGML_TYPE_IQ1_S &&
+                op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
+        case LM_GGML_OP_MUL_MAT:
+            return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+        case LM_GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case LM_GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
+        case LM_GGML_OP_OUT_PROD:
+            return (op->src[0]->type == LM_GGML_TYPE_F32 || lm_ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == LM_GGML_TYPE_F32;
+        default:
+            return true;
+    }
+
+    LM_GGML_UNUSED(dev);
+}
+
+static bool lm_ggml_backend_cpu_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
+    return lm_ggml_backend_buft_is_host(buft);
+
+    LM_GGML_UNUSED(dev);
+}
+
+static const struct lm_ggml_backend_device_i lm_ggml_backend_cpu_device_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_device_get_name,
+    /* .get_description = */ lm_ggml_backend_cpu_device_get_description,
+    /* .get_memory = */ lm_ggml_backend_cpu_device_get_memory,
+    /* .get_type = */ lm_ggml_backend_cpu_device_get_type,
+    /* .get_props = */ lm_ggml_backend_cpu_device_get_props,
+    /* .init_backend = */ lm_ggml_backend_cpu_device_init,
+    /* .get_buffer_type = */ lm_ggml_backend_cpu_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ lm_ggml_backend_cpu_device_buffer_from_ptr,
+    /* .supports_op = */ lm_ggml_backend_cpu_device_supports_op,
+    /* .supports_buft = */ lm_ggml_backend_cpu_device_supports_buft,
+    /* .offload_op = */ NULL,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+////////////////////////
+
+static const char * lm_ggml_backend_cpu_reg_get_name(lm_ggml_backend_reg_t reg) {
+    return "CPU";
+
+    LM_GGML_UNUSED(reg);
+}
+
+static size_t lm_ggml_backend_cpu_reg_get_device_count(lm_ggml_backend_reg_t reg) {
+    return 1;
+
+    LM_GGML_UNUSED(reg);
+}
+
+static lm_ggml_backend_dev_t lm_ggml_backend_cpu_reg_get_device(lm_ggml_backend_reg_t reg, size_t index) {
+    LM_GGML_ASSERT(index == 0);
+
+    static lm_ggml_backend_cpu_device_context ctx;
+    static lm_ggml_backend_device lm_ggml_backend_cpu_device = {
+        /* .iface = */ lm_ggml_backend_cpu_device_i,
+        /* .reg = */ reg,
+        /* .context = */ &ctx,
+    };
+
+    return &lm_ggml_backend_cpu_device;
+}
+
+static void * lm_ggml_backend_cpu_get_proc_address(lm_ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "lm_ggml_backend_set_n_threads") == 0) {
+        return (void *)lm_ggml_backend_cpu_set_n_threads;
+    }
+    return NULL;
+
+    LM_GGML_UNUSED(reg);
+}
+
+static const struct lm_ggml_backend_reg_i lm_ggml_backend_cpu_reg_i = {
+    /* .get_name = */ lm_ggml_backend_cpu_reg_get_name,
+    /* .get_device_count = */ lm_ggml_backend_cpu_reg_get_device_count,
+    /* .get_device = */ lm_ggml_backend_cpu_reg_get_device,
+    /* .get_proc_address = */ lm_ggml_backend_cpu_get_proc_address,
+};
+
+lm_ggml_backend_reg_t lm_ggml_backend_cpu_reg(void) {
+    static struct lm_ggml_backend_reg lm_ggml_backend_cpu_reg = {
+        /* .iface = */ lm_ggml_backend_cpu_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &lm_ggml_backend_cpu_reg;
 }
 
 // multi-buffer buffer
@@ -973,16 +1274,14 @@ struct lm_ggml_backend_multi_buffer_context {
     size_t n_buffers;
 };
 
-typedef struct lm_ggml_backend_multi_buffer_context * lm_ggml_backend_multi_buffer_context_t;
-
-LM_GGML_CALL static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
-    lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
+static const char * lm_ggml_backend_multi_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
+    lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
 
     return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
-    lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
+static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+    lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         lm_ggml_backend_buffer_free(ctx->buffers[i]);
     }
@@ -991,32 +1290,28 @@ LM_GGML_CALL static void lm_ggml_backend_multi_buffer_free_buffer(lm_ggml_backen
     free(ctx);
 }
 
-LM_GGML_CALL static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
-    lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
+static void lm_ggml_backend_multi_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) {
+    lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         lm_ggml_backend_buffer_clear(ctx->buffers[i], value);
     }
 }
 
-static struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_context_interface(void) {
-    static struct lm_ggml_backend_buffer_i multi_backend_buffer_i = {
-        /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
-        /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
-        /* .get_base = */ NULL,
-        /* .init_tensor = */ NULL,
-        /* .memset_tensor = */ NULL,
-        /* .set_tensor = */ NULL,
-        /* .get_tensor = */ NULL,
-        /* .cpy_tensor = */ NULL,
-        /* .clear = */ lm_ggml_backend_multi_buffer_clear,
-        /* .reset = */ NULL,
-    };
-
-    return multi_backend_buffer_i;
-}
+static const struct lm_ggml_backend_buffer_i lm_ggml_backend_multi_buffer_i = {
+    /* .get_name = */ lm_ggml_backend_multi_buffer_get_name,
+    /* .free_buffer = */ lm_ggml_backend_multi_buffer_free_buffer,
+    /* .get_base = */ NULL,
+    /* .init_tensor = */ NULL,
+    /* .memset_tensor = */ NULL,
+    /* .set_tensor = */ NULL,
+    /* .get_tensor = */ NULL,
+    /* .cpy_tensor = */ NULL,
+    /* .clear = */ lm_ggml_backend_multi_buffer_clear,
+    /* .reset = */ NULL,
+};
 
-LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
-    lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
+lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(lm_ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) malloc(sizeof(struct lm_ggml_backend_multi_buffer_context));
     ctx->n_buffers = n_buffers;
     ctx->buffers = (lm_ggml_backend_buffer_t *) malloc(n_buffers * sizeof(lm_ggml_backend_buffer_t));
 
@@ -1028,16 +1323,16 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_multi_buffer_alloc_buffer(
         total_size += lm_ggml_backend_buffer_get_size(buffers[i]);
     }
 
-    return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+    return lm_ggml_backend_buffer_init(buffers[0]->buft, lm_ggml_backend_multi_buffer_i, ctx, total_size);
 }
 
-LM_GGML_CALL bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
+bool lm_ggml_backend_buffer_is_multi_buffer(lm_ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == lm_ggml_backend_multi_buffer_get_name;
 }
 
-LM_GGML_CALL void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
+void lm_ggml_backend_multi_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) {
     LM_GGML_ASSERT(lm_ggml_backend_buffer_is_multi_buffer(buffer));
-    lm_ggml_backend_multi_buffer_context_t ctx = (lm_ggml_backend_multi_buffer_context_t) buffer->context;
+    lm_ggml_backend_multi_buffer_context * ctx = (lm_ggml_backend_multi_buffer_context *) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         lm_ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
     }
@@ -1592,7 +1887,8 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
                 i_split++;
                 if (i_split >= sched->splits_capacity) {
                     sched->splits_capacity *= 2;
-                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
+                    sched->splits = (lm_ggml_backend_sched_split *)
+                        realloc(sched->splits, sched->splits_capacity * sizeof(struct lm_ggml_backend_sched_split));
                     LM_GGML_ASSERT(sched->splits != NULL);
                 }
                 split = &sched->splits[i_split];
@@ -1678,11 +1974,11 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
+    int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
-        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
-        sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
+        sched->graph.nodes = (lm_ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
+        sched->graph.leafs = (lm_ggml_tensor **) realloc(sched->graph.leafs, graph_size * sizeof(struct lm_ggml_tensor *));
         LM_GGML_ASSERT(sched->graph.nodes != NULL);
         LM_GGML_ASSERT(sched->graph.leafs != NULL);
     }
@@ -1881,7 +2177,7 @@ static enum lm_ggml_status lm_ggml_backend_sched_compute_splits(lm_ggml_backend_
             // record the event of this copy
             if (split->n_inputs > 0) {
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
-                    lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
+                    lm_ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy], split_backend);
                 }
             }
         }
@@ -1901,7 +2197,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
     LM_GGML_ASSERT(n_backends <= LM_GGML_SCHED_MAX_BACKENDS);
     LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct lm_ggml_backend_sched * sched = calloc(1, sizeof(struct lm_ggml_backend_sched));
+    struct lm_ggml_backend_sched * sched = (lm_ggml_backend_sched *) calloc(1, sizeof(struct lm_ggml_backend_sched));
 
     sched->debug = getenv("LM_GGML_SCHED_DEBUG") != NULL;
     sched->n_backends = n_backends;
@@ -1910,21 +2206,21 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
     // initialize hash table
     // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
     sched->hash_set = lm_ggml_hash_set_new(graph_size);
-    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
-    sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
+    sched->hv_tensor_backend_ids = (int *) malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies = (lm_ggml_tensor **) malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct lm_ggml_tensor *));
 
     const size_t lm_ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
     const size_t nodes_size = graph_size + lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
-    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
-    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
-    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
+    sched->node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
     sched->context_buffer_size = lm_ggml_sched_max_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct lm_ggml_tensor) + lm_ggml_graph_overhead_custom(graph_size, false);
-    sched->context_buffer = malloc(sched->context_buffer_size);
+    sched->context_buffer = (char *) malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
+    sched->splits = (lm_ggml_backend_sched_split *) calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
@@ -1933,7 +2229,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
         LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
-                sched->events[b][c] = lm_ggml_backend_event_new(backends[b]);
+                sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
             }
         }
     }
@@ -2169,8 +2465,8 @@ static void graph_copy_init_tensor(struct lm_ggml_hash_set * hash_set, struct lm
 
 struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t backend, struct lm_ggml_cgraph * graph) {
     struct lm_ggml_hash_set hash_set = lm_ggml_hash_set_new(graph->visited_hash_set.size);
-    struct lm_ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
-    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
+    struct lm_ggml_tensor ** node_copies = (lm_ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
 
     struct lm_ggml_init_params params = {
         /* .mem_size = */ lm_ggml_tensor_overhead()*hash_set.size + lm_ggml_graph_overhead_custom(graph->size, false),
@@ -2188,7 +2484,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
         free(node_init);
         lm_ggml_free(ctx_allocated);
         lm_ggml_free(ctx_unallocated);
-        return (struct lm_ggml_backend_graph_copy) {
+        return {
             /* .buffer = */ NULL,
             /* .ctx_allocated = */ NULL,
             /* .ctx_unallocated = */ NULL,
@@ -2211,7 +2507,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
         free(node_init);
         lm_ggml_free(ctx_allocated);
         lm_ggml_free(ctx_unallocated);
-        return (struct lm_ggml_backend_graph_copy) {
+        return {
             /* .buffer = */ NULL,
             /* .ctx_allocated = */ NULL,
             /* .ctx_unallocated = */ NULL,
@@ -2240,7 +2536,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
     free(node_copies);
     free(node_init);
 
-    return (struct lm_ggml_backend_graph_copy) {
+    return {
        /* .buffer = */ buffer,
        /* .ctx_allocated = */ ctx_allocated,
        /* .ctx_unallocated = */ ctx_unallocated,
|