cui-llama.rn 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -329,7 +329,6 @@ bool lm_ggml_backend_supports_buft(lm_ggml_backend_t backend, lm_ggml_backend_bu
     if (backend->device) {
         return lm_ggml_backend_dev_supports_buft(backend->device, buft);
     }
-
     return backend->iface.supports_buft(backend, buft);
 }
 
@@ -379,7 +378,7 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten
         lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src));
     } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
+        LM_GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = lm_ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -538,10 +537,30 @@ void * lm_ggml_backend_reg_get_proc_address(lm_ggml_backend_reg_t reg, const cha
 #include "ggml-metal.h"
 #endif
 
+#ifdef LM_GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
+#ifdef LM_GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #ifdef LM_GGML_USE_BLAS
 #include "ggml-blas.h"
 #endif
 
+#ifdef LM_GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
+#ifndef __AMX_INT8__
+#undef LM_GGML_USE_AMX
+#endif
+
+#ifdef LM_GGML_USE_AMX
+# include "ggml-amx.h"
+#endif
+
 struct lm_ggml_backend_registry {
     std::vector<lm_ggml_backend_reg_t> backends;
     std::vector<lm_ggml_backend_dev_t> devices;
@@ -553,18 +572,30 @@ struct lm_ggml_backend_registry {
 #ifdef LM_GGML_USE_METAL
         register_backend(lm_ggml_backend_metal_reg());
 #endif
+#ifdef LM_GGML_USE_SYCL
+        register_backend(lm_ggml_backend_sycl_reg());
+#endif
+#ifdef LM_GGML_USE_VULKAN
+        register_backend(lm_ggml_backend_vk_reg());
+#endif
 #ifdef LM_GGML_USE_BLAS
         register_backend(lm_ggml_backend_blas_reg());
 #endif
+#ifdef LM_GGML_USE_RPC
+        register_backend(lm_ggml_backend_rpc_reg());
+#endif
+#ifdef LM_GGML_USE_AMX
+        register_backend(lm_ggml_backend_amx_reg());
+#endif
 
-        // TODO: sycl, vulkan, kompute, cann
+        // TODO: kompute, cann
 
         register_backend(lm_ggml_backend_cpu_reg());
     }
 
     void register_backend(lm_ggml_backend_reg_t reg) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
+        LM_GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, lm_ggml_backend_reg_name(reg), lm_ggml_backend_reg_dev_count(reg));
 #endif
         backends.push_back(reg);
@@ -575,7 +606,7 @@ struct lm_ggml_backend_registry {
 
     void register_device(lm_ggml_backend_dev_t device) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
+        LM_GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, lm_ggml_backend_dev_name(device), lm_ggml_backend_dev_description(device));
 #endif
         devices.push_back(device);
     }
@@ -675,8 +706,6 @@ lm_ggml_backend_t lm_ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * lm_ggml_backend_cpu_buffer_get_name(lm_ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -695,7 +724,7 @@ static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffe
 }
 
 static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    lm_ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void lm_ggml_backend_cpu_buffer_memset_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -763,14 +792,19 @@ static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buf
 }
 
 static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use LM_GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    auto alloc_size = size;
+    if (alloc_size == 0) {
+        alloc_size = 1;
+    }
+
+    void * data = lm_ggml_aligned_malloc(alloc_size);
+
     if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        LM_GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
         return NULL;
     }
 
-    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, size);
+    return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_cpu_buffer_i, data, alloc_size);
 }
 
 static size_t lm_ggml_backend_cpu_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
@@ -829,7 +863,7 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer
     void * ptr;
     int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {
-        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        LM_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
         return NULL;
     }
 
@@ -1452,7 +1486,7 @@ static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sch
     }
 
 #ifndef NDEBUG
-    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+    LM_GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, lm_ggml_op_desc(tensor), lm_ggml_backend_buffer_name(buffer), tensor->name);
 #endif
 
@@ -1541,13 +1575,13 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
+            LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
-            fprintf(stderr, "\n");
+            LM_GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct lm_ggml_tensor * node = graph->nodes[i];
@@ -1555,7 +1589,7 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
             continue;
         }
         lm_ggml_backend_t tensor_backend = lm_ggml_backend_sched_get_tensor_backend(sched, node);
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
+        LM_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name,
             fmt_size(lm_ggml_nbytes(node)), tensor_backend ? lm_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < LM_GGML_MAX_SRC; j++) {
             struct lm_ggml_tensor * src = node->src[j];
@@ -1563,10 +1597,10 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
                 continue;
             }
             lm_ggml_backend_t src_backend = lm_ggml_backend_sched_get_tensor_backend(sched, src);
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+            LM_GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
-        fprintf(stderr, "\n");
+        LM_GGML_LOG_DEBUG("\n");
     }
 }
 
@@ -2080,11 +2114,11 @@ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         lm_ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+        LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+            LM_GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -2227,6 +2261,7 @@ lm_ggml_backend_sched_t lm_ggml_backend_sched_new(
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : lm_ggml_backend_get_default_buffer_type(backends[b]);
         LM_GGML_ASSERT(lm_ggml_backend_supports_buft(backends[b], sched->bufts[b]));
+
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = lm_ggml_backend_event_new(backends[b]->device);
@@ -2478,7 +2513,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
     struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params);
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-        fprintf(stderr, "failed to allocate context for graph copy\n");
+        LM_GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         lm_ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2501,7 +2536,7 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b
     // allocate nodes
     lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        LM_GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         lm_ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
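
Across these ggml-backend.cpp hunks the direct fprintf(stderr, ...) calls are replaced with the LM_GGML_LOG_DEBUG / LM_GGML_LOG_ERROR macros, so scheduler and allocation messages now go through ggml's pluggable log callback instead of always hitting stderr. As a rough sketch (not part of the diff), a host application could supply a callback with the same signature as lm_ggml_log_callback_default to silence the debug-level output; the lm_ggml_log_set() installer mentioned below is assumed from upstream ggml's ggml_log_set and may differ in this package:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical filter: drop DEBUG-level messages, forward everything else to stderr.
    static void my_log_callback(enum lm_ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == LM_GGML_LOG_LEVEL_DEBUG) {
            return;
        }
        fputs(text, stderr);
    }

    // Assumed installer, mirroring upstream ggml_log_set:
    // lm_ggml_log_set(my_log_callback, NULL);
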
package/cpp/ggml-impl.h CHANGED
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct lm_ggml_cgraph {
 
 struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * lm_ggml_aligned_malloc(size_t size);
+void lm_ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
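
The LM_GGML_ALIGNED_MALLOC / LM_GGML_ALIGNED_FREE macros are superseded by these two exported helpers, and the free side now takes the original allocation size because the new macOS path releases memory with vm_deallocate, which needs the region length. A minimal usage sketch (not part of the diff, assuming the internal header ggml-impl.h is visible to the caller):

    #include <string.h>
    #include "ggml-impl.h"  // declares lm_ggml_aligned_malloc / lm_ggml_aligned_free

    static void scratch_example(void) {
        const size_t size = 1024;
        // At least TENSOR_ALIGNMENT (32) bytes of alignment; page-aligned on Apple/Metal builds.
        void * buf = lm_ggml_aligned_malloc(size);
        if (buf == NULL) {
            return;  // allocation failed (a size of 0 also returns NULL with a warning)
        }
        memset(buf, 0, size);
        // Pass the same size back so the vm_allocate-backed path can be released correctly.
        lm_ggml_aligned_free(buf, size);
    }
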
package/cpp/ggml.c CHANGED
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef LM_GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t lm_ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -326,8 +324,9 @@ struct lm_ggml_logger_state {
 static struct lm_ggml_logger_state g_logger_state = {lm_ggml_log_callback_default, NULL};
 
 static void lm_ggml_log_internal_v(enum lm_ggml_log_level level, const char * format, va_list args) {
-    if (format == NULL)
+    if (format == NULL) {
         return;
+    }
     va_list args_copy;
     va_copy(args_copy, args);
     char buffer[128];
@@ -386,22 +385,40 @@ void lm_ggml_log_callback_default(enum lm_ggml_log_level level, const char * tex
 //#define LM_GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * lm_ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define LM_GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, LM_GGML_MEM_ALIGN)
-#define LM_GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * lm_ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         LM_GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for lm_ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef LM_GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif LM_GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, LM_GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +436,26 @@ inline static void * lm_ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define LM_GGML_ALIGNED_MALLOC(size) lm_ggml_aligned_malloc(size)
-#ifdef LM_GGML_USE_CPU_HBM
-#define LM_GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+
+void lm_ggml_aligned_free(void * ptr, size_t size) {
+    LM_GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif LM_GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
 #else
-#define LM_GGML_ALIGNED_FREE(ptr) free(ptr)
-#endif
+    free(ptr);
 #endif
+}
+
 
 inline static void * lm_ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3882,7 +3911,7 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     *ctx = (struct lm_ggml_context) {
         /*.mem_size         =*/ mem_size,
-        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : LM_GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : lm_ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
         /*.no_alloc_save    =*/ params.no_alloc,
@@ -3922,7 +3951,7 @@ void lm_ggml_free(struct lm_ggml_context * ctx) {
                 __func__, i, lm_ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
-                LM_GGML_ALIGNED_FREE(ctx->mem_buffer);
+                lm_ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
             }
 
             found = true;
@@ -15708,6 +15737,9 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16(
     lm_ggml_vec_dot_t  const kq_vec_dot = type_traits[k->type].vec_dot;
     lm_ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
 
+    LM_GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+    LM_GGML_ASSERT(v_to_float   && "fattn: unsupported V-type");
+
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
@@ -19621,9 +19653,10 @@ static void lm_ggml_thread_cpumask_next(const bool * global_mask, bool * local_m
 void lm_ggml_threadpool_free(struct lm_ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef LM_GGML_USE_OPENMP
     struct lm_ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     lm_ggml_mutex_lock(&threadpool->mutex);
 
@@ -19643,8 +19676,9 @@ void lm_ggml_threadpool_free(struct lm_ggml_threadpool* threadpool) {
     lm_ggml_cond_destroy(&threadpool->cond);
 #endif // LM_GGML_USE_OPENMP
 
-    LM_GGML_ALIGNED_FREE(threadpool->workers);
-    LM_GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct lm_ggml_compute_state) * n_threads;
+    lm_ggml_aligned_free(threadpool->workers, workers_size);
+    lm_ggml_aligned_free(threadpool, sizeof(struct lm_ggml_threadpool));
 }
 
 #ifndef LM_GGML_USE_OPENMP
@@ -20076,7 +20110,7 @@ static struct lm_ggml_threadpool * lm_ggml_threadpool_new_impl(
                 struct lm_ggml_cplan * cplan) {
 
     struct lm_ggml_threadpool * threadpool =
-        LM_GGML_ALIGNED_MALLOC(sizeof(struct lm_ggml_threadpool));
+        lm_ggml_aligned_malloc(sizeof(struct lm_ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan  = cplan;
@@ -20097,7 +20131,7 @@ static struct lm_ggml_threadpool * lm_ggml_threadpool_new_impl(
 
     // Allocate and init workers state
     const size_t workers_size = sizeof(struct lm_ggml_compute_state) * tpp->n_threads;
-    struct lm_ggml_compute_state * workers = LM_GGML_ALIGNED_MALLOC(workers_size);
+    struct lm_ggml_compute_state * workers = lm_ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
@@ -23235,6 +23269,14 @@ int lm_ggml_cpu_has_avx512_bf16(void) {
 #endif
 }
 
+int lm_ggml_cpu_has_amx_int8(void) {
+#if defined(__AMX_INT8__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int lm_ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
package/cpp/ggml.h CHANGED
@@ -2489,6 +2489,7 @@ extern "C" {
     LM_GGML_API int lm_ggml_cpu_has_avx512_vbmi(void);
     LM_GGML_API int lm_ggml_cpu_has_avx512_vnni(void);
     LM_GGML_API int lm_ggml_cpu_has_avx512_bf16(void);
+    LM_GGML_API int lm_ggml_cpu_has_amx_int8   (void);
     LM_GGML_API int lm_ggml_cpu_has_fma        (void);
     LM_GGML_API int lm_ggml_cpu_has_neon       (void);
     LM_GGML_API int lm_ggml_cpu_has_sve        (void);
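
ggml.h now exposes lm_ggml_cpu_has_amx_int8() next to the existing feature probes; per the ggml.c hunk above it returns 1 only when the library was built with __AMX_INT8__. A small sketch (not part of the diff) of reporting it alongside the other checks:

    #include <stdio.h>
    #include "ggml.h"

    static void print_cpu_features(void) {
        printf("AMX_INT8: %d\n", lm_ggml_cpu_has_amx_int8());  // new in this release
        printf("FMA:      %d\n", lm_ggml_cpu_has_fma());
        printf("NEON:     %d\n", lm_ggml_cpu_has_neon());
    }
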
@@ -611,7 +611,7 @@ private:
             }
             return join_seq();
         };
-        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+        return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
     }
 
     /*