whisper.rn 0.4.0-rc.10 → 0.4.0-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
package/cpp/ggml-cpu.cpp CHANGED
@@ -2,11 +2,22 @@
  #include "ggml-backend-impl.h"
  #include "ggml-cpu.h"
  #include "ggml-cpu-aarch64.h"
+ #include "ggml-cpu-traits.h"
  #include "ggml-impl.h"
+ #include "amx/amx.h"
+
  #include <cctype>
  #include <string>
  #include <vector>
 
+ #ifdef WSP_GGML_USE_CPU_HBM
+ #include "ggml-cpu-hbm.h"
+ #endif
+
+ #ifdef WSP_GGML_USE_CPU_KLEIDIAI
+ #include "kleidiai/kleidiai.h"
+ #endif
+
  #if defined(__APPLE__)
  #include <sys/types.h>
  #include <sys/sysctl.h>
@@ -22,124 +33,26 @@
 
  // ggml-backend interface
 
- #ifdef WSP_GGML_USE_CPU_HBM
-
- // buffer type HBM
-
- #include <hbwmalloc.h>
-
- static const char * wsp_ggml_backend_cpu_hbm_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
-     return "CPU_HBM";
-
-     WSP_GGML_UNUSED(buft);
- }
-
- static void wsp_ggml_backend_cpu_hbm_buffer_free_buffer(wsp_ggml_backend_buffer_t buffer) {
-     hbw_free(buffer->context);
- }
-
- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
-     void * ptr;
-     int result = hbw_posix_memalign(&ptr, wsp_ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-     if (result != 0) {
-         WSP_GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-         return NULL;
-     }
-
-     wsp_ggml_backend_buffer_t buffer = wsp_ggml_backend_cpu_buffer_from_ptr(ptr, size);
-     buffer->buft = buft;
-     buffer->iface.free_buffer = wsp_ggml_backend_cpu_hbm_buffer_free_buffer;
-
-     return buffer;
- }
-
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_hbm_buffer_type(void) {
-     static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type_hbm = {
-         /* .iface    = */ {
-             /* .get_name       = */ wsp_ggml_backend_cpu_hbm_buffer_type_get_name,
-             /* .alloc_buffer   = */ wsp_ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-             /* .get_alignment  = */ wsp_ggml_backend_cpu_buffer_type_get_alignment,
-             /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
-             /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
-             /* .is_host        = */ wsp_ggml_backend_cpu_buffer_type_is_host,
-         },
-         /* .context  = */ NULL,
-     };
-
-     return &wsp_ggml_backend_cpu_buffer_type_hbm;
- }
- #endif
-
- // buffer type AARCH64
-
- static void wsp_ggml_backend_cpu_aarch64_buffer_init_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor) {
-     tensor->extra = (void *)wsp_ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
-
-     WSP_GGML_UNUSED(buffer);
- }
-
- static void wsp_ggml_backend_cpu_aarch64_buffer_set_tensor(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-     WSP_GGML_ASSERT(offset == 0);
-     WSP_GGML_ASSERT(size == wsp_ggml_nbytes(tensor));
-
-     enum wsp_ggml_type repack_type = (enum wsp_ggml_type)(intptr_t)tensor->extra;
-
-     wsp_ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
-
-     WSP_GGML_UNUSED(buffer);
- }
-
- static const char * wsp_ggml_backend_cpu_aarch64_buffer_type_get_name(wsp_ggml_backend_buffer_type_t buft) {
-     return "CPU_AARCH64";
-
-     WSP_GGML_UNUSED(buft);
- }
-
- static wsp_ggml_backend_buffer_t wsp_ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(wsp_ggml_backend_buffer_type_t buft, size_t size) {
-     auto * buffer = wsp_ggml_backend_buft_alloc_buffer(wsp_ggml_backend_cpu_buffer_type(), size);
-
-     if (buffer == NULL) {
-         return NULL;
-     }
-
-     buffer->buft = buft;
-     buffer->iface.init_tensor = wsp_ggml_backend_cpu_aarch64_buffer_init_tensor;
-     buffer->iface.set_tensor = wsp_ggml_backend_cpu_aarch64_buffer_set_tensor;
-
-     return buffer;
- }
-
- wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_aarch64_buffer_type(void) {
-     static struct wsp_ggml_backend_buffer_type wsp_ggml_backend_cpu_buffer_type_aarch64 = {
-         /* .iface    = */ {
-             /* .get_name       = */ wsp_ggml_backend_cpu_aarch64_buffer_type_get_name,
-             /* .alloc_buffer   = */ wsp_ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
-             /* .get_alignment  = */ wsp_ggml_backend_cpu_buffer_type()->iface.get_alignment,
-             /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
-             /* .get_alloc_size = */ NULL, // defaults to wsp_ggml_nbytes
-             /* .is_host        = */ NULL,
-         },
-         /* .device   = */ wsp_ggml_backend_reg_dev_get(wsp_ggml_backend_cpu_reg(), 0),
-         /* .context  = */ NULL,
-     };
-
-     return &wsp_ggml_backend_cpu_buffer_type_aarch64;
- }
-
- bool wsp_ggml_backend_cpu_buft_is_aarch64(wsp_ggml_backend_buffer_type_t buft) {
-     return buft == wsp_ggml_backend_cpu_aarch64_buffer_type();
- }
-
- static wsp_ggml_backend_buffer_type_t * wsp_ggml_backend_cpu_get_extra_bufts(wsp_ggml_backend_dev_t device) {
+ std::vector<wsp_ggml_backend_buffer_type_t>& wsp_ggml_backend_cpu_get_extra_buffers_type() {
      static std::vector<wsp_ggml_backend_buffer_type_t> bufts = []() {
          std::vector<wsp_ggml_backend_buffer_type_t> bufts;
 
- #ifdef WSP_GGML_USE_CPU_HBM
-         bufts.push_back(wsp_ggml_backend_cpu_hbm_buffer_type());
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+         if (wsp_ggml_backend_amx_buffer_type()) {
+             bufts.push_back(wsp_ggml_backend_amx_buffer_type());
+         }
+ #endif
+
+ #ifdef WSP_GGML_USE_CPU_KLEIDIAI
+         if (wsp_ggml_backend_cpu_kleidiai_buffer_type()) {
+             bufts.push_back(wsp_ggml_backend_cpu_kleidiai_buffer_type());
+         }
  #endif
 
  #ifdef WSP_GGML_USE_CPU_AARCH64
-         bufts.push_back(wsp_ggml_backend_cpu_aarch64_buffer_type());
+         if (wsp_ggml_backend_cpu_aarch64_buffer_type()) {
+             bufts.push_back(wsp_ggml_backend_cpu_aarch64_buffer_type());
+         }
  #endif
 
          bufts.push_back(NULL);
@@ -147,11 +60,22 @@ static wsp_ggml_backend_buffer_type_t * wsp_ggml_backend_cpu_get_extra_bufts(wsp
          return bufts;
      }();
 
-     return bufts.data();
+     return bufts;
+ }
+
+ static wsp_ggml_backend_buffer_type_t * wsp_ggml_backend_cpu_device_get_extra_buffers_type(wsp_ggml_backend_dev_t device) {
+     return wsp_ggml_backend_cpu_get_extra_buffers_type().data();
 
      WSP_GGML_UNUSED(device);
  }
 
+ static bool wsp_ggml_backend_cpu_is_extra_buffer_type(wsp_ggml_backend_buffer_type_t buft) {
+     for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
+         if (extra && extra == buft) return true;
+     }
+     return false;
+ }
+
  // CPU backend - backend (stream)
 
  struct wsp_ggml_backend_cpu_context {
@@ -370,14 +294,14 @@ struct wsp_ggml_backend_cpu_device_context {
                        &hKey) == ERROR_SUCCESS) {
              DWORD cpu_brand_size = 0;
              if (RegQueryValueExA(hKey,
-                                 TEXT("ProcessorNameString"),
+                                 "ProcessorNameString",
                                  NULL,
                                  NULL,
                                  NULL,
                                  &cpu_brand_size) == ERROR_SUCCESS) {
                  description.resize(cpu_brand_size);
                  if (RegQueryValueExA(hKey,
-                                     TEXT("ProcessorNameString"),
+                                     "ProcessorNameString",
                                      NULL,
                                      NULL,
                                      (LPBYTE)&description[0], // NOLINT
@@ -456,14 +380,23 @@ static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev,
      const struct wsp_ggml_tensor * src0 = op->src[0];
      const struct wsp_ggml_tensor * src1 = op->src[1];
 
-     if (src0 && src0->buffer && wsp_ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-         if (op->op != WSP_GGML_OP_MUL_MAT || src0->type != WSP_GGML_TYPE_Q4_0 || wsp_ggml_aarch64_get_optimal_repack_type(src0) == WSP_GGML_TYPE_Q4_0) {
-             return false;
+     if (op->op == WSP_GGML_OP_NONE || op->op == WSP_GGML_OP_RESHAPE || op->op == WSP_GGML_OP_VIEW || op->op == WSP_GGML_OP_PERMUTE || op->op == WSP_GGML_OP_TRANSPOSE) {
+         return true;
+     }
+
+     // extra_buffer_op?
+     for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
+         if (extra) {
+             auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+             if (buf_extra && buf_extra->supports_op(dev, op)) {
+                 return true;
+             }
          }
      }
 
-     for (int i = 1; i < WSP_GGML_MAX_SRC; i++) {
-         if (op->src[i] && op->src[i]->buffer && wsp_ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
+     // the other case need host buffer.
+     for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
+         if (op->src[i] && op->src[i]->buffer && !wsp_ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
              return false;
          }
      }
@@ -471,28 +404,37 @@ static bool wsp_ggml_backend_cpu_device_supports_op(wsp_ggml_backend_dev_t dev,
      switch (op->op) {
          case WSP_GGML_OP_CPY:
              return
+                 op->type != WSP_GGML_TYPE_IQ3_XXS &&
+                 op->type != WSP_GGML_TYPE_IQ3_S   &&
                  op->type != WSP_GGML_TYPE_IQ2_XXS &&
                  op->type != WSP_GGML_TYPE_IQ2_XS  &&
+                 op->type != WSP_GGML_TYPE_IQ2_S   &&
                  op->type != WSP_GGML_TYPE_IQ1_S   &&
                  op->type != WSP_GGML_TYPE_IQ1_M; // missing type_traits.from_float
          case WSP_GGML_OP_MUL_MAT:
              return src1->type == WSP_GGML_TYPE_F32 || src1->type == wsp_ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
-         case WSP_GGML_OP_ROPE_BACK:
-             return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+         case WSP_GGML_OP_SOFT_MAX_BACK: {
+             if (op->src[0]->type != WSP_GGML_TYPE_F32 || op->src[1]->type != WSP_GGML_TYPE_F32) {
+                 return false;
+             }
+             float max_bias = 0.0f;
+
+             memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
+
+             return max_bias == 0.0f;
+         }
          case WSP_GGML_OP_IM2COL_BACK:
              return src0->type == WSP_GGML_TYPE_F32 && src1->type == WSP_GGML_TYPE_F32;
          case WSP_GGML_OP_OUT_PROD:
-             return (src0->type == WSP_GGML_TYPE_F32 || wsp_ggml_is_quantized(src0->type)) && src1->type == WSP_GGML_TYPE_F32;
+             return (src0->type == WSP_GGML_TYPE_F32 || (wsp_ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
+                 src1->type == WSP_GGML_TYPE_F32 && op->type == WSP_GGML_TYPE_F32;
          default:
              return true;
      }
-
-     WSP_GGML_UNUSED(dev);
  }
 
  static bool wsp_ggml_backend_cpu_device_supports_buft(wsp_ggml_backend_dev_t dev, wsp_ggml_backend_buffer_type_t buft) {
-     return wsp_ggml_backend_buft_is_host(buft) || wsp_ggml_backend_cpu_buft_is_aarch64(buft);
-
+     return wsp_ggml_backend_buft_is_host(buft) || wsp_ggml_backend_cpu_is_extra_buffer_type(buft);
      WSP_GGML_UNUSED(dev);
  }
 
@@ -541,16 +483,12 @@ static wsp_ggml_backend_dev_t wsp_ggml_backend_cpu_reg_get_device(wsp_ggml_backe
      return &wsp_ggml_backend_cpu_device;
  }
 
- struct wsp_ggml_backend_feature {
-     const char * name;
-     const char * value;
- };
-
- // Not used yet
  // This is intended to replace the the wsp_ggml_cpu_has_* functions when loading the CPU backend dynamically,
- // and additionally to allow other backends to expose their own list of features that applications can query using the same API.
+ // and additionally to allow other backends to expose their own list of features that applications can query using the same API
  static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_backend_reg_t reg) {
      static std::vector<wsp_ggml_backend_feature> features = []() {
+         wsp_ggml_cpu_init();
+
          std::vector<wsp_ggml_backend_feature> features;
          if (wsp_ggml_cpu_has_sse3()) {
              features.push_back({ "SSE3", "1" });
@@ -561,6 +499,9 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
          if (wsp_ggml_cpu_has_avx()) {
              features.push_back({ "AVX", "1" });
          }
+         if (wsp_ggml_cpu_has_avx_vnni()) {
+             features.push_back({ "AVX_VNNI", "1" });
+         }
          if (wsp_ggml_cpu_has_avx2()) {
              features.push_back({ "AVX2", "1" });
          }
@@ -570,9 +511,6 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
          if (wsp_ggml_cpu_has_fma()) {
              features.push_back({ "FMA", "1" });
          }
-         if (wsp_ggml_cpu_has_avx_vnni()) {
-             features.push_back({ "AVX_VNNI", "1" });
-         }
          if (wsp_ggml_cpu_has_avx512()) {
              features.push_back({ "AVX512", "1" });
          }
@@ -603,22 +541,46 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
          if (wsp_ggml_cpu_has_sve()) {
              features.push_back({ "SVE", "1" });
          }
+         if (wsp_ggml_cpu_has_dotprod()) {
+             features.push_back({ "DOTPROD", "1" });
+         }
          if (wsp_ggml_cpu_get_sve_cnt() > 0) {
              static std::string sve_cnt = std::to_string(wsp_ggml_cpu_get_sve_cnt());
              features.push_back({ "SVE_CNT", sve_cnt.c_str() });
          }
+         if (wsp_ggml_cpu_has_sme()) {
+             features.push_back({ "SME", "1" });
+         }
          if (wsp_ggml_cpu_has_riscv_v()) {
              features.push_back({ "RISCV_V", "1" });
          }
          if (wsp_ggml_cpu_has_vsx()) {
              features.push_back({ "VSX", "1" });
          }
+         if (wsp_ggml_cpu_has_vxe()) {
+             features.push_back({ "VXE", "1" });
+         }
          if (wsp_ggml_cpu_has_wasm_simd()) {
              features.push_back({ "WASM_SIMD", "1" });
          }
          if (wsp_ggml_cpu_has_llamafile()) {
              features.push_back({ "LLAMAFILE", "1" });
          }
+     #ifdef WSP_GGML_USE_ACCELERATE
+         features.push_back({ "ACCELERATE", "1" });
+     #endif
+     #ifdef WSP_GGML_USE_CPU_HBM
+         features.push_back({ "CPU_HBM", "1" });
+     #endif
+     #ifdef WSP_GGML_USE_OPENMP
+         features.push_back({ "OPENMP", "1" });
+     #endif
+     #ifdef WSP_GGML_USE_CPU_KLEIDIAI
+         features.push_back({ "KLEIDIAI", "1" });
+     #endif
+     #ifdef WSP_GGML_USE_CPU_AARCH64
+         features.push_back({ "AARCH64_REPACK", "1" });
+     #endif
 
          features.push_back({ nullptr, nullptr });
 
@@ -632,10 +594,35 @@ static wsp_ggml_backend_feature * wsp_ggml_backend_cpu_get_features(wsp_ggml_bac
 
  static void * wsp_ggml_backend_cpu_get_proc_address(wsp_ggml_backend_reg_t reg, const char * name) {
      if (strcmp(name, "wsp_ggml_backend_set_n_threads") == 0) {
-         return (void *)wsp_ggml_backend_cpu_set_n_threads;
+         wsp_ggml_backend_set_n_threads_t fct = wsp_ggml_backend_cpu_set_n_threads;
+         return (void *)fct;
      }
      if (strcmp(name, "wsp_ggml_backend_dev_get_extra_bufts") == 0) {
-         return (void *)wsp_ggml_backend_cpu_get_extra_bufts;
+         wsp_ggml_backend_dev_get_extra_bufts_t fct = wsp_ggml_backend_cpu_device_get_extra_buffers_type;
+         return (void *)fct;
+     }
+     if (strcmp(name, "wsp_ggml_backend_get_features") == 0) {
+         return (void *)wsp_ggml_backend_cpu_get_features;
+     }
+     if (strcmp(name, "wsp_ggml_backend_set_abort_callback") == 0) {
+         return (void *)wsp_ggml_backend_cpu_set_abort_callback;
+     }
+     if (strcmp(name, "wsp_ggml_backend_cpu_numa_init") == 0) {
+         return (void *)wsp_ggml_numa_init;
+     }
+     if (strcmp(name, "wsp_ggml_backend_cpu_is_numa") == 0) {
+         return (void *)wsp_ggml_is_numa;
+     }
+
+     // threadpool - TODO: move to ggml-base
+     if (strcmp(name, "wsp_ggml_threadpool_new") == 0) {
+         return (void *)wsp_ggml_threadpool_new;
+     }
+     if (strcmp(name, "wsp_ggml_threadpool_free") == 0) {
+         return (void *)wsp_ggml_threadpool_free;
+     }
+     if (strcmp(name, "wsp_ggml_backend_cpu_set_threadpool") == 0) {
+         return (void *)wsp_ggml_backend_cpu_set_threadpool;
      }
 
      return NULL;
@@ -655,9 +642,12 @@ wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void) {
      wsp_ggml_cpu_init();
 
      static struct wsp_ggml_backend_reg wsp_ggml_backend_cpu_reg = {
-         /* .iface   = */ wsp_ggml_backend_cpu_reg_i,
-         /* .context = */ NULL,
+         /* .api_version = */ WSP_GGML_BACKEND_API_VERSION,
+         /* .iface       = */ wsp_ggml_backend_cpu_reg_i,
+         /* .context     = */ NULL,
      };
 
      return &wsp_ggml_backend_cpu_reg;
  }
+
+ WSP_GGML_BACKEND_DL_IMPL(wsp_ggml_backend_cpu_reg)
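
Note on the ggml-cpu.cpp changes above: the CPU backend now resolves several more entry points through wsp_ggml_backend_cpu_get_proc_address (feature listing, abort callback, NUMA init, threadpool helpers) instead of only set_n_threads and the extra-buffer-type query. A minimal usage sketch follows; it is not part of the diff and assumes the wsp_-prefixed equivalents of ggml's public registry API (wsp_ggml_backend_reg_get_proc_address, the wsp_ggml_backend_get_features_t typedef, and struct wsp_ggml_backend_feature from ggml-backend.h) are available to the caller.

    // Sketch only: enumerate the CPU features exposed by the new
    // "wsp_ggml_backend_get_features" proc-address entry added in this release.
    #include <cstdio>
    #include "ggml-backend.h"   // assumed to declare the registry API used below
    #include "ggml-cpu.h"

    static void print_cpu_features(void) {
        wsp_ggml_backend_reg_t reg = wsp_ggml_backend_cpu_reg();

        // assumption: wsp_ggml_backend_reg_get_proc_address / wsp_ggml_backend_get_features_t
        // mirror ggml's ggml_backend_reg_get_proc_address / ggml_backend_get_features_t
        auto get_features = (wsp_ggml_backend_get_features_t)
            wsp_ggml_backend_reg_get_proc_address(reg, "wsp_ggml_backend_get_features");
        if (get_features == nullptr) {
            return; // entry point not provided by this backend
        }

        // the list is terminated by a { nullptr, nullptr } entry (see the diff above)
        for (struct wsp_ggml_backend_feature * f = get_features(reg); f->name != nullptr; ++f) {
            printf("%s = %s\n", f->name, f->value);
        }
    }
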
package/cpp/ggml-cpu.h CHANGED
@@ -7,31 +7,8 @@
  extern "C" {
  #endif
 
- // Scheduling priorities
- enum wsp_ggml_sched_priority {
-     WSP_GGML_SCHED_PRIO_NORMAL,
-     WSP_GGML_SCHED_PRIO_MEDIUM,
-     WSP_GGML_SCHED_PRIO_HIGH,
-     WSP_GGML_SCHED_PRIO_REALTIME
- };
-
- // Threadpool params
- // Use wsp_ggml_threadpool_params_default() or wsp_ggml_threadpool_params_init() to populate the defaults
- struct wsp_ggml_threadpool_params {
-     bool cpumask[WSP_GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-     int n_threads;                        // number of threads
-     enum wsp_ggml_sched_priority prio;    // thread priority
-     uint32_t poll;                        // polling level (0 - no polling, 100 - aggressive polling)
-     bool strict_cpu;                      // strict cpu placement
-     bool paused;                          // start in paused state
- };
-
- struct wsp_ggml_threadpool; // forward declaration, see ggml.c
-
- typedef struct wsp_ggml_threadpool * wsp_ggml_threadpool_t;
-
  // the compute plan that needs to be prepared for wsp_ggml_graph_compute()
- // since https://github.com/ggerganov/ggml/issues/287
+ // since https://github.com/ggml-org/ggml/issues/287
  struct wsp_ggml_cplan {
      size_t work_size; // size of work buffer, calculated by `wsp_ggml_graph_plan()`
      uint8_t * work_data; // work buffer, to be allocated by caller before calling to `wsp_ggml_graph_compute()`
@@ -75,14 +52,11 @@ extern "C" {
      WSP_GGML_BACKEND_API float wsp_ggml_get_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3);
      WSP_GGML_BACKEND_API void  wsp_ggml_set_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
-     WSP_GGML_BACKEND_API struct wsp_ggml_threadpool_params wsp_ggml_threadpool_params_default(int n_threads);
-     WSP_GGML_BACKEND_API void                              wsp_ggml_threadpool_params_init   (struct wsp_ggml_threadpool_params * p, int n_threads);
-     WSP_GGML_BACKEND_API bool                              wsp_ggml_threadpool_params_match  (const struct wsp_ggml_threadpool_params * p0, const struct wsp_ggml_threadpool_params * p1);
-     WSP_GGML_BACKEND_API struct wsp_ggml_threadpool *      wsp_ggml_threadpool_new           (struct wsp_ggml_threadpool_params * params);
-     WSP_GGML_BACKEND_API void                              wsp_ggml_threadpool_free          (struct wsp_ggml_threadpool * threadpool);
-     WSP_GGML_BACKEND_API int                               wsp_ggml_threadpool_get_n_threads (struct wsp_ggml_threadpool * threadpool);
-     WSP_GGML_BACKEND_API void                              wsp_ggml_threadpool_pause         (struct wsp_ggml_threadpool * threadpool);
-     WSP_GGML_BACKEND_API void                              wsp_ggml_threadpool_resume        (struct wsp_ggml_threadpool * threadpool);
+     WSP_GGML_BACKEND_API struct wsp_ggml_threadpool * wsp_ggml_threadpool_new           (struct wsp_ggml_threadpool_params * params);
+     WSP_GGML_BACKEND_API void                         wsp_ggml_threadpool_free          (struct wsp_ggml_threadpool * threadpool);
+     WSP_GGML_BACKEND_API int                          wsp_ggml_threadpool_get_n_threads (struct wsp_ggml_threadpool * threadpool);
+     WSP_GGML_BACKEND_API void                         wsp_ggml_threadpool_pause         (struct wsp_ggml_threadpool * threadpool);
+     WSP_GGML_BACKEND_API void                         wsp_ggml_threadpool_resume        (struct wsp_ggml_threadpool * threadpool);
 
      // wsp_ggml_graph_plan() has to be called before wsp_ggml_graph_compute()
      // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -104,10 +78,10 @@ extern "C" {
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_sse3       (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_ssse3      (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx        (void);
+     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx_vnni   (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx2       (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_f16c       (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_fma        (void);
-     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx_vnni   (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx512     (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx512_vbmi(void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_avx512_vnni(void);
@@ -117,35 +91,28 @@ extern "C" {
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_neon       (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_arm_fma    (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_fp16_va    (void);
+     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_dotprod    (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_matmul_int8(void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_sve        (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_get_sve_cnt    (void); // sve vector length in bytes
+     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_sme        (void);
      // other
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_riscv_v    (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vsx        (void);
+     WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_vxe        (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_wasm_simd  (void);
      WSP_GGML_BACKEND_API int wsp_ggml_cpu_has_llamafile  (void);
 
      // Internal types and functions exposed for tests and benchmarks
 
-     typedef void (*wsp_ggml_from_float_to_mat_t)
-                                         (const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
      typedef void (*wsp_ggml_vec_dot_t)  (int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT x, size_t bx,
                                           const void * WSP_GGML_RESTRICT y, size_t by, int nrc);
-     typedef void (*wsp_ggml_gemv_t)     (int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT x,
-                                          const void * WSP_GGML_RESTRICT y, int nr, int nc);
-     typedef void (*wsp_ggml_gemm_t)     (int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT x,
-                                          const void * WSP_GGML_RESTRICT y, int nr, int nc);
 
      struct wsp_ggml_type_traits_cpu {
          wsp_ggml_from_float_t        from_float;
-         wsp_ggml_from_float_to_mat_t from_float_to_mat;
          wsp_ggml_vec_dot_t           vec_dot;
          enum wsp_ggml_type           vec_dot_type;
          int64_t                      nrows; // number of rows to process simultaneously
-         int64_t                      ncols; // number of columns to process simultaneously
-         wsp_ggml_gemv_t              gemv;
-         wsp_ggml_gemm_t              gemm;
      };
 
      WSP_GGML_BACKEND_API const struct wsp_ggml_type_traits_cpu * wsp_ggml_get_type_traits_cpu(enum wsp_ggml_type type);
@@ -165,13 +132,6 @@ extern "C" {
 
      WSP_GGML_BACKEND_API wsp_ggml_backend_reg_t wsp_ggml_backend_cpu_reg(void);
 
- #ifdef WSP_GGML_USE_CPU_HBM
-     WSP_GGML_BACKEND_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_hbm_buffer_type(void);
- #endif
-
-     WSP_GGML_BACKEND_API wsp_ggml_backend_buffer_type_t wsp_ggml_backend_cpu_aarch64_buffer_type(void);
-     WSP_GGML_BACKEND_API bool wsp_ggml_backend_cpu_buft_is_aarch64(wsp_ggml_backend_buffer_type_t buft);
-
  #ifdef __cplusplus
  }
  #endif
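
Note on the ggml-cpu.h changes above: the threadpool/scheduling types and the gemv/gemm typedefs have moved out of this header, and wsp_ggml_type_traits_cpu is now reduced to from_float, vec_dot, vec_dot_type and nrows. A small sketch (not part of the package) of how the remaining traits are typically consulted, mirroring the WSP_GGML_OP_MUL_MAT case in the updated supports_op above:

    // Sketch: decide whether a CPU mul_mat can run directly on these operand types.
    #include "ggml.h"
    #include "ggml-cpu.h"

    static bool cpu_mul_mat_types_ok(enum wsp_ggml_type src0_type, enum wsp_ggml_type src1_type) {
        const struct wsp_ggml_type_traits_cpu * traits = wsp_ggml_get_type_traits_cpu(src0_type);
        // src1 must either be F32 or already match the vec_dot_type the kernel expects
        return src1_type == WSP_GGML_TYPE_F32 || src1_type == traits->vec_dot_type;
    }
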
package/cpp/ggml-impl.h CHANGED
@@ -3,6 +3,8 @@
  // GGML internal header
 
  #include "ggml.h"
+ #include "gguf.h"
+
  #include <assert.h>
  #include <math.h>
  #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -14,7 +16,7 @@
  #include <arm_sve.h>
  #endif // __ARM_FEATURE_SVE
 
- #if defined(__ARM_NEON)
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
  //
  //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
@@ -30,11 +32,13 @@
  extern "C" {
  #endif
 
- #undef MIN
- #undef MAX
+ #ifndef MIN
+ #    define MIN(a, b) ((a) < (b) ? (a) : (b))
+ #endif
 
- #define MIN(a, b) ((a) < (b) ? (a) : (b))
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #ifndef MAX
+ #    define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #endif
 
  // required for mmap as gguf only guarantees 32-byte alignment
  #define TENSOR_ALIGNMENT 32
@@ -72,8 +76,8 @@ static inline int wsp_ggml_up(int n, int m) {
  //
 
  WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
- void wsp_ggml_log_internal        (enum wsp_ggml_log_level level, const char * format, ...);
- void wsp_ggml_log_callback_default(enum wsp_ggml_log_level level, const char * text, void * user_data);
+ WSP_GGML_API void wsp_ggml_log_internal        (enum wsp_ggml_log_level level, const char * format, ...);
+ WSP_GGML_API void wsp_ggml_log_callback_default(enum wsp_ggml_log_level level, const char * text, void * user_data);
 
  #define WSP_GGML_LOG(...)       wsp_ggml_log_internal(WSP_GGML_LOG_LEVEL_NONE , __VA_ARGS__)
  #define WSP_GGML_LOG_INFO(...)  wsp_ggml_log_internal(WSP_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -295,24 +299,27 @@ struct wsp_ggml_cgraph {
      enum wsp_ggml_cgraph_eval_order order;
  };
 
+ // returns a slice of cgraph with nodes [i0, i1)
+ // the slice does not have leafs or gradients
+ // if you need the gradients, get them from the original graph
  struct wsp_ggml_cgraph wsp_ggml_graph_view(struct wsp_ggml_cgraph * cgraph, int i0, int i1);
 
  // Memory allocation
 
- void * wsp_ggml_aligned_malloc(size_t size);
- void wsp_ggml_aligned_free(void * ptr, size_t size);
+ WSP_GGML_API void * wsp_ggml_aligned_malloc(size_t size);
+ WSP_GGML_API void wsp_ggml_aligned_free(void * ptr, size_t size);
 
  // FP16 to FP32 conversion
 
  #if defined(__ARM_NEON)
- #ifdef _MSC_VER
+ #if defined(_MSC_VER) || (defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
  typedef uint16_t wsp_ggml_fp16_internal_t;
  #else
  typedef __fp16 wsp_ggml_fp16_internal_t;
  #endif
  #endif
 
- #if defined(__ARM_NEON) && !defined(_MSC_VER)
+ #if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
  #define WSP_GGML_COMPUTE_FP16_TO_FP32(x) wsp_ggml_compute_fp16_to_fp32(x)
  #define WSP_GGML_COMPUTE_FP32_TO_FP16(x) wsp_ggml_compute_fp32_to_fp16(x)
 
@@ -549,3 +556,12 @@ static inline wsp_ggml_bf16_t wsp_ggml_compute_fp32_to_bf16(float s) {
  #ifdef __cplusplus
  }
  #endif
+
+ #ifdef __cplusplus
+ #include <vector>
+
+ // expose GGUF internals for test code
+ WSP_GGML_API size_t wsp_gguf_type_size(enum wsp_gguf_type type);
+ WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_gguf_init_params params);
+ WSP_GGML_API void wsp_gguf_write_to_buf(const struct wsp_gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
+ #endif // __cplusplus
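
Note on the ggml-impl.h changes above: MIN and MAX are now defined only when not already present, so a translation unit that brings its own definitions (for example through its platform headers) no longer has them clobbered. A trivial illustration, not taken from the package:

    // Sketch: a pre-existing MIN is kept; ggml-impl.h only fills in what is missing.
    #define MIN(a, b) ((a) < (b) ? (a) : (b))  // project-local definition
    #include "ggml-impl.h"                     // leaves MIN alone, defines MAX if absent
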
@@ -102,6 +102,21 @@ typedef struct {
      uint64_t nb3;
  } wsp_ggml_metal_kargs_cpy;
 
+ typedef struct {
+     int64_t  ne10;
+     int64_t  ne11;
+     int64_t  ne12;
+     uint64_t nb10;
+     uint64_t nb11;
+     uint64_t nb12;
+     uint64_t nb13;
+     uint64_t nb1;
+     uint64_t nb2;
+     uint64_t nb3;
+     uint64_t offs;
+     bool     inplace;
+ } wsp_ggml_metal_kargs_set;
+
  typedef struct {
      int32_t  ne00;
      int32_t  ne01;
@@ -192,6 +207,30 @@ typedef struct {
      int16_t  r3;
  } wsp_ggml_metal_kargs_mul_mv;
 
+ typedef struct {
+     int32_t  ne00;
+     int32_t  ne01;
+     int32_t  ne02;
+     uint64_t nb00;
+     uint64_t nb01;
+     uint64_t nb02;
+     uint64_t nb03;
+     int32_t  ne10;
+     int32_t  ne11;
+     int32_t  ne12;
+     uint64_t nb10;
+     uint64_t nb11;
+     uint64_t nb12;
+     uint64_t nb13;
+     int32_t  ne0;
+     int32_t  ne1;
+     int16_t  r2;
+     int16_t  r3;
+     int16_t  nsg;
+     int16_t  nxpsg;
+     int16_t  r1ptg;
+ } wsp_ggml_metal_kargs_mul_mv_ext;
+
  typedef struct {
      int32_t  nei0;
      int32_t  nei1;