cui-llama.rn 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnllama/LlamaContext.java +5 -2
- package/android/src/main/jni.cpp +7 -7
- package/cpp/common.cpp +81 -63
- package/cpp/common.h +79 -62
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend.cpp +59 -24
- package/cpp/ggml-impl.h +8 -0
- package/cpp/ggml.c +65 -23
- package/cpp/ggml.h +1 -0
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +366 -24
- package/cpp/llama-sampling.h +3 -2
- package/cpp/llama-vocab.cpp +33 -9
- package/cpp/llama-vocab.h +30 -11
- package/cpp/llama.cpp +471 -387
- package/cpp/llama.h +52 -21
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +110 -119
- package/cpp/sampling.h +20 -20
- package/package.json +1 -1
package/cpp/ggml-backend.cpp
CHANGED
@@ -329,7 +329,6 @@ bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_bu
     if (backend->device) {
        return lm_ggml_backend_dev_supports_buft(backend->device, buft);
     }
-
     return backend->iface.supports_buft(backend, buft);
 }
 
@@ -379,7 +378,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten
         lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
     } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
+        LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = lm_ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -538,10 +537,30 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
 #include "ggml-metal.h"
 #endif
 
+#ifdef LM_GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef LM_GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #ifdef LM_GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
 
+#ifdef LM_GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
+#ifndef __AMX_INT8__
+#undef LM_GGML_USE_AMX
+#endif
+
+#ifdef LM_GGML_USE_AMX
+#  include "ggml-amx.h"
+#endif
+
 struct lm_ggml_backend_registry {
     std::vector<lm_ggml_backend_reg_t> backends;
     std::vector<lm_ggml_backend_dev_t> devices;
@@ -553,18 +572,30 @@ struct lm_ggml_backend_registry {
 #ifdef LM_GGML_USE_METAL
         register_backend(lm_ggml_backend_metal_reg());
 #endif
+#ifdef LM_GGML_USE_SYCL
+        register_backend(lm_ggml_backend_sycl_reg());
+#endif
+#ifdef LM_GGML_USE_VULKAN
+        register_backend(lm_ggml_backend_vk_reg());
+#endif
 #ifdef LM_GGML_USE_BLAS
         register_backend(lm_ggml_backend_blas_reg());
 #endif
+#ifdef LM_GGML_USE_RPC
+        register_backend(lm_ggml_backend_rpc_reg());
+#endif
+#ifdef LM_GGML_USE_AMX
+        register_backend(lm_ggml_backend_amx_reg());
+#endif
 
-        // TODO:
+        // TODO: kompute, cann
 
         register_backend(lm_ggml_backend_cpu_reg());
     }
 
     void register_backend(lm_ggml_backend_reg_t reg) {
 #ifndef NDEBUG
+        LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
 #endif
         backends.push_back(reg);
 
@@ -575,7 +606,7 @@ struct lm_ggml_backend_registry {
 
     void register_device(lm_ggml_backend_dev_t device) {
 #ifndef NDEBUG
+        LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
 #endif
         devices.push_back(device);
     }
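This release wires the SYCL, Vulkan, RPC, and AMX backends into the registry behind their respective compile-time flags, so devices from every enabled backend show up through the common device API. A minimal sketch of enumerating them, assuming the upstream-style entry points lm_ggml_backend_dev_count() and lm_ggml_backend_dev_get() are exposed by this package's ggml-backend.h (the name/description accessors appear in the diff above):

```c
#include <stdio.h>
#include "ggml-backend.h"

// Sketch: list the devices collected by the backend registry at startup.
// lm_ggml_backend_dev_count()/lm_ggml_backend_dev_get() are assumed here,
// mirroring upstream ggml; the accessors below are used in the diff itself.
static void list_devices(void) {
    for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
        lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i,
               lm_ggml_backend_dev_name(dev),
               lm_ggml_backend_dev_description(dev));
    }
}
```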
@@ -675,8 +706,6 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -695,7 +724,7 @@ static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffe
 }
 
 static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
+    lm_ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -763,14 +792,19 @@ static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buf
 }
 
 static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
+    auto alloc_size = size;
+    if (alloc_size == 0) {
+        alloc_size = 1;
+    }
+
+    void * data = lm_ggml_aligned_malloc(alloc_size);
+
     if (data == NULL) {
+        LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
         return NULL;
     }
 
-    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data,
+    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, alloc_size);
 }
 
 static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
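With this change the CPU buffer type allocates through lm_ggml_aligned_malloc, records the real allocation size in the buffer, and releases it through lm_ggml_aligned_free in free_buffer; a zero-byte request is rounded up to one byte inside the alloc function. A hedged sketch of allocating and freeing such a buffer, assuming the usual upstream-style entry points (lm_ggml_backend_cpu_buffer_type, lm_ggml_backend_buft_alloc_buffer, lm_ggml_backend_buffer_get_size, lm_ggml_backend_buffer_free) are exposed by this package:

```c
#include <stdio.h>
#include "ggml-backend.h"

// Sketch: allocate a small CPU backend buffer (now backed by lm_ggml_aligned_malloc)
// and release it again (which ends up in lm_ggml_aligned_free via free_buffer).
static void cpu_buffer_example(void) {
    lm_ggml_backend_buffer_type_t buft = lm_ggml_backend_cpu_buffer_type();
    lm_ggml_backend_buffer_t buf = lm_ggml_backend_buft_alloc_buffer(buft, 4096);
    if (buf == NULL) {
        fprintf(stderr, "allocation failed\n");
        return;
    }
    printf("buffer size: %zu\n", lm_ggml_backend_buffer_get_size(buf));
    lm_ggml_backend_buffer_free(buf);
}
```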
@@ -829,7 +863,7 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer
     void * ptr;
     int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {
+        LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
         return NULL;
     }
 
@@ -1452,7 +1486,7 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
     }
 
 #ifndef NDEBUG
+    LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
 #endif
 
@@ -1541,13 +1575,13 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
+            LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
+            LM_GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct lm_ggml_tensor * node = graph->nodes[i];
@@ -1555,7 +1589,7 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
             continue;
         }
         lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
+        LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
             fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
             struct lm_ggml_tensor * src = node->src[j];
@@ -1563,10 +1597,10 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
                 continue;
             }
             lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
+            LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
+        LM_GGML_LOG_DEBUG("\n");
     }
 }
 
@@ -2080,11 +2114,11 @@ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         lm_ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
+        LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+            LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -2227,6 +2261,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
         LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
@@ -2478,7 +2513,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
     struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+        LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         lm_ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2501,7 +2536,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
     // allocate nodes
     lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
+        LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         lm_ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
package/cpp/ggml-impl.h
CHANGED
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct lm_ggml_cgraph {
 
 struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * lm_ggml_aligned_malloc(size_t size);
+void lm_ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
package/cpp/ggml.c
CHANGED
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef LM_GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t lm_ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -326,8 +324,9 @@ struct lm_ggml_logger_state {
 static struct lm_ggml_logger_state g_logger_state = {lm_ggml_log_callback_default, NULL};
 
 static void lm_ggml_log_internal_v(enum lm_ggml_log_level level, const char * format, va_list args) {
-    if (format == NULL)
+    if (format == NULL) {
         return;
+    }
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
@@ -386,22 +385,40 @@ void lm_ggml_log_callback_default(enum lm_ggml_log_level level, const char * tex
 //#define LM_GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * lm_ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define LM_GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * lm_ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         LM_GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for lm_ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef LM_GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory,
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif LM_GGML_USE_METAL
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory,
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +436,26 @@ inline static void * lm_ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
+
+void lm_ggml_aligned_free(void * ptr, size_t size) {
+    LM_GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif LM_GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * lm_ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3882,7 +3911,7 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     *ctx = (struct lm_ggml_context) {
         /*.mem_size         =*/ mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer :
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : lm_ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
         /*.no_alloc_save    =*/ params.no_alloc,
@@ -3922,7 +3951,7 @@ void lm_ggml_free(struct lm_ggml_context * ctx) {
                 __func__, i, lm_ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
+                lm_ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
             }
 
             found = true;
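The context changes above mean that when params.mem_buffer is NULL the pool is now obtained from lm_ggml_aligned_malloc and returned via lm_ggml_aligned_free(ctx->mem_buffer, ctx->mem_size), while a caller-supplied buffer is never freed by the library. A hedged sketch of the two ownership modes, using the standard lm_ggml_init / lm_ggml_free API (sizes are arbitrary):

```c
#include <stdlib.h>
#include "ggml.h"

static void context_ownership_example(void) {
    // 1) library-owned pool: allocated via lm_ggml_aligned_malloc, freed by lm_ggml_free
    struct lm_ggml_init_params owned = {
        /*.mem_size   =*/ 16 * 1024 * 1024,  // 16 MiB, arbitrary
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct lm_ggml_context * ctx = lm_ggml_init(owned);
    lm_ggml_free(ctx);

    // 2) caller-owned pool: lm_ggml_free leaves the buffer alone, the caller releases it
    void * pool = malloc(16 * 1024 * 1024);
    struct lm_ggml_init_params borrowed = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ pool,
        /*.no_alloc   =*/ false,
    };
    struct lm_ggml_context * ctx2 = lm_ggml_init(borrowed);
    lm_ggml_free(ctx2);
    free(pool);
}
```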
@@ -15708,6 +15737,9 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16(
     lm_ggml_vec_dot_t  const kq_vec_dot = type_traits[k->type].vec_dot;
     lm_ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
 
+    LM_GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+    LM_GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
@@ -19621,9 +19653,10 @@ static void lm_ggml_thread_cpumask_next(const bool * global_mask, bool * local_m
 void lm_ggml_threadpool_free(struct lm_ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef LM_GGML_USE_OPENMP
     struct lm_ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     lm_ggml_mutex_lock(&threadpool->mutex);
 
@@ -19643,8 +19676,9 @@ void lm_ggml_threadpool_free(struct lm_ggml_threadpool* threadpool) {
     lm_ggml_cond_destroy(&threadpool->cond);
 #endif // LM_GGML_USE_OPENMP
 
+    const size_t workers_size = sizeof(struct lm_ggml_compute_state) * n_threads;
+    lm_ggml_aligned_free(threadpool->workers, workers_size);
+    lm_ggml_aligned_free(threadpool, sizeof(struct lm_ggml_threadpool));
 }
 
 #ifndef LM_GGML_USE_OPENMP
@@ -20076,7 +20110,7 @@ static struct lm_ggml_threadpool * lm_ggml_threadpool_new_impl(
                  struct lm_ggml_cplan * cplan) {
 
     struct lm_ggml_threadpool * threadpool =
+        lm_ggml_aligned_malloc(sizeof(struct lm_ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan = cplan;
@@ -20097,7 +20131,7 @@
 
     // Allocate and init workers state
     const size_t workers_size = sizeof(struct lm_ggml_compute_state) * tpp->n_threads;
-    struct lm_ggml_compute_state * workers =
+    struct lm_ggml_compute_state * workers = lm_ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
@@ -23235,6 +23269,14 @@ int lm_ggml_cpu_has_avx512_bf16(void) {
 #endif
 }
 
+int lm_ggml_cpu_has_amx_int8(void) {
+#if defined(__AMX_INT8__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int lm_ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
package/cpp/ggml.h
CHANGED
@@ -2489,6 +2489,7 @@ extern "C" {
     LM_GGML_API int lm_ggml_cpu_has_avx512_vbmi(void);
     LM_GGML_API int lm_ggml_cpu_has_avx512_vnni(void);
     LM_GGML_API int lm_ggml_cpu_has_avx512_bf16(void);
+    LM_GGML_API int lm_ggml_cpu_has_amx_int8   (void);
     LM_GGML_API int lm_ggml_cpu_has_fma        (void);
     LM_GGML_API int lm_ggml_cpu_has_neon       (void);
     LM_GGML_API int lm_ggml_cpu_has_sve        (void);