whispercpp 1.2.0.2 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (135) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
@@ -0,0 +1,8 @@
1
+ #pragma once
2
+
3
+ #include "ggml-cpu-traits.h"
4
+ #include "ggml.h"
5
+
6
+ // GGML internal header
7
+
8
+ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
@@ -0,0 +1,55 @@
1
+ #ifdef GGML_USE_CPU_HBM
2
+
3
+ #include "ggml-backend.h"
4
+ #include "ggml-backend-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "ggml-impl.h"
7
+
8
+ #include "ggml-cpu-hbm.h"
9
+
10
+ // buffer type HBM
11
+
12
+ #include <hbwmalloc.h>
13
+
14
+ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
15
+ return "CPU_HBM";
16
+
17
+ GGML_UNUSED(buft);
18
+ }
19
+
20
+ static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
21
+ hbw_free(buffer->context);
22
+ }
23
+
24
+ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
25
+ size_t size) {
26
+ void * ptr;
27
+ int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
28
+ if (result != 0) {
29
+ GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
30
+ return NULL;
31
+ }
32
+
33
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
34
+ buffer->buft = buft;
35
+ buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
36
+
37
+ return buffer;
38
+ }
39
+
40
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
41
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
42
+ /* .iface = */ {
43
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
44
+ /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
45
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
46
+ /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
47
+ /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
48
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
49
+ },
50
+ /* .context = */ nullptr,
51
+ };
52
+
53
+ return &ggml_backend_cpu_buffer_type_hbm;
54
+ }
55
+ #endif
@@ -0,0 +1,8 @@
1
+ #pragma once
2
+
3
+ #include "ggml-backend.h"
4
+ #include "ggml.h"
5
+
6
+ // GGML CPU internal header
7
+
8
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@@ -0,0 +1,386 @@
1
+ #pragma once
2
+
3
+ // GGML CPU internal header
4
+
5
+ #include "ggml.h"
6
+ #include "ggml-impl.h"
7
+ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
8
+ //#include <stddef.h>
9
+ #include <stdbool.h>
10
+ #include <string.h> // memcpy
11
+ #include <math.h> // fabsf
12
+
13
+
14
+ #ifdef __cplusplus
15
+ extern "C" {
16
+ #endif
17
+
18
// Parameters describing one thread's share of a compute call.
struct ggml_compute_params {
    int ith; // thread index
    int nth; // number of threads

    // work buffer for all threads
    size_t wsize;
    void * wdata;

    struct ggml_threadpool * threadpool;
};
28
+
29
+
30
// Helpers that present a value as __m512bh / __m512i.
// Under MSVC the argument is passed through unchanged; elsewhere it is cast
// explicitly (presumably because MSVC rejects these vector casts — TODO confirm).
// Both the argument and the whole expansion are parenthesized so the macros
// behave like a single expression wherever they are used.
#if defined(_MSC_VER)

#define m512bh(p) (p)
#define m512i(p)  (p)

#else

#define m512bh(p) ((__m512bh)(p))
#define m512i(p)  ((__m512i)(p))

#endif
41
+
42
+ // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
43
+ #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
44
+ #ifndef __FMA__
45
+ #define __FMA__
46
+ #endif
47
+ #ifndef __F16C__
48
+ #define __F16C__
49
+ #endif
50
+ #endif
51
+
52
+ // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
53
+ #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
54
+ #ifndef __SSE3__
55
+ #define __SSE3__
56
+ #endif
57
+ #ifndef __SSSE3__
58
+ #define __SSSE3__
59
+ #endif
60
+ #endif
61
+
62
+ #if defined(__ARM_FEATURE_SVE)
63
+ #include <arm_sve.h>
64
+ #include <sys/prctl.h>
65
+ #endif
66
+
67
+ // 16-bit float
68
+ // on Arm, we use __fp16
69
+ // on x86, we use uint16_t
70
+ #if defined(__ARM_NEON)
71
+
72
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
73
+ //
74
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
75
+ //
76
+ #include <arm_neon.h>
77
+
78
+ #ifdef _MSC_VER
79
+
80
+ typedef uint16_t ggml_fp16_internal_t;
81
+
82
+ #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
83
+
84
+ #else
85
+
86
+ typedef __fp16 ggml_fp16_internal_t;
87
+
88
+ #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
89
+
90
+ #endif // _MSC_VER
91
+
92
+ #if !defined(__aarch64__)
93
+
94
+ // 32-bit ARM compatibility
95
+
96
+ // vaddlvq_s16
97
+ // vpaddq_s16
98
+ // vpaddq_s32
99
+ // vaddvq_s32
100
+ // vaddvq_f32
101
+ // vmaxvq_f32
102
+ // vcvtnq_s32_f32
103
+ // vzip1_u8
104
+ // vzip2_u8
105
+
106
+ inline static int32_t vaddlvq_s16(int16x8_t v) {
107
+ int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
108
+ return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
109
+ }
110
+
111
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
112
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
113
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
114
+ return vcombine_s16(a0, b0);
115
+ }
116
+
117
+ inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
118
+ int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
119
+ int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
120
+ return vcombine_s32(a0, b0);
121
+ }
122
+
123
+ inline static int32_t vaddvq_s32(int32x4_t v) {
124
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
125
+ }
126
+
127
+ inline static float vaddvq_f32(float32x4_t v) {
128
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
129
+ }
130
+
131
+ inline static float vmaxvq_f32(float32x4_t v) {
132
+ return
133
+ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
134
+ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
135
+ }
136
+
137
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
138
+ int32x4_t res;
139
+
140
+ res[0] = roundf(vgetq_lane_f32(v, 0));
141
+ res[1] = roundf(vgetq_lane_f32(v, 1));
142
+ res[2] = roundf(vgetq_lane_f32(v, 2));
143
+ res[3] = roundf(vgetq_lane_f32(v, 3));
144
+
145
+ return res;
146
+ }
147
+
148
+ inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
149
+ uint8x8_t res;
150
+
151
+ res[0] = a[0]; res[1] = b[0];
152
+ res[2] = a[1]; res[3] = b[1];
153
+ res[4] = a[2]; res[5] = b[2];
154
+ res[6] = a[3]; res[7] = b[3];
155
+
156
+ return res;
157
+ }
158
+
159
+ inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
160
+ uint8x8_t res;
161
+
162
+ res[0] = a[4]; res[1] = b[4];
163
+ res[2] = a[5]; res[3] = b[5];
164
+ res[4] = a[6]; res[5] = b[6];
165
+ res[6] = a[7]; res[7] = b[7];
166
+
167
+ return res;
168
+ }
169
+
170
+ // vld1q_s16_x2
171
+ // vld1q_u8_x2
172
+ // vld1q_u8_x4
173
+ // vld1q_s8_x2
174
+ // vld1q_s8_x4
175
+ // TODO: double-check these work correctly
176
+
177
+ typedef struct ggml_int16x8x2_t {
178
+ int16x8_t val[2];
179
+ } ggml_int16x8x2_t;
180
+
181
+ inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
182
+ ggml_int16x8x2_t res;
183
+
184
+ res.val[0] = vld1q_s16(ptr + 0);
185
+ res.val[1] = vld1q_s16(ptr + 8);
186
+
187
+ return res;
188
+ }
189
+
190
+ typedef struct ggml_uint8x16x2_t {
191
+ uint8x16_t val[2];
192
+ } ggml_uint8x16x2_t;
193
+
194
+ inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
195
+ ggml_uint8x16x2_t res;
196
+
197
+ res.val[0] = vld1q_u8(ptr + 0);
198
+ res.val[1] = vld1q_u8(ptr + 16);
199
+
200
+ return res;
201
+ }
202
+
203
+ typedef struct ggml_uint8x16x4_t {
204
+ uint8x16_t val[4];
205
+ } ggml_uint8x16x4_t;
206
+
207
+ inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
208
+ ggml_uint8x16x4_t res;
209
+
210
+ res.val[0] = vld1q_u8(ptr + 0);
211
+ res.val[1] = vld1q_u8(ptr + 16);
212
+ res.val[2] = vld1q_u8(ptr + 32);
213
+ res.val[3] = vld1q_u8(ptr + 48);
214
+
215
+ return res;
216
+ }
217
+
218
+ typedef struct ggml_int8x16x2_t {
219
+ int8x16_t val[2];
220
+ } ggml_int8x16x2_t;
221
+
222
+ inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
223
+ ggml_int8x16x2_t res;
224
+
225
+ res.val[0] = vld1q_s8(ptr + 0);
226
+ res.val[1] = vld1q_s8(ptr + 16);
227
+
228
+ return res;
229
+ }
230
+
231
+ typedef struct ggml_int8x16x4_t {
232
+ int8x16_t val[4];
233
+ } ggml_int8x16x4_t;
234
+
235
+ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
236
+ ggml_int8x16x4_t res;
237
+
238
+ res.val[0] = vld1q_s8(ptr + 0);
239
+ res.val[1] = vld1q_s8(ptr + 16);
240
+ res.val[2] = vld1q_s8(ptr + 32);
241
+ res.val[3] = vld1q_s8(ptr + 48);
242
+
243
+ return res;
244
+ }
245
+
246
+ // NOTE: not tested
247
+ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
248
+ int8x16_t res;
249
+
250
+ res[ 0] = a[b[ 0]];
251
+ res[ 1] = a[b[ 1]];
252
+ res[ 2] = a[b[ 2]];
253
+ res[ 3] = a[b[ 3]];
254
+ res[ 4] = a[b[ 4]];
255
+ res[ 5] = a[b[ 5]];
256
+ res[ 6] = a[b[ 6]];
257
+ res[ 7] = a[b[ 7]];
258
+ res[ 8] = a[b[ 8]];
259
+ res[ 9] = a[b[ 9]];
260
+ res[10] = a[b[10]];
261
+ res[11] = a[b[11]];
262
+ res[12] = a[b[12]];
263
+ res[13] = a[b[13]];
264
+ res[14] = a[b[14]];
265
+ res[15] = a[b[15]];
266
+
267
+ return res;
268
+ }
269
+
270
+ // NOTE: not tested
271
+ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
272
+ uint8x16_t res;
273
+
274
+ res[ 0] = a[b[ 0]];
275
+ res[ 1] = a[b[ 1]];
276
+ res[ 2] = a[b[ 2]];
277
+ res[ 3] = a[b[ 3]];
278
+ res[ 4] = a[b[ 4]];
279
+ res[ 5] = a[b[ 5]];
280
+ res[ 6] = a[b[ 6]];
281
+ res[ 7] = a[b[ 7]];
282
+ res[ 8] = a[b[ 8]];
283
+ res[ 9] = a[b[ 9]];
284
+ res[10] = a[b[10]];
285
+ res[11] = a[b[11]];
286
+ res[12] = a[b[12]];
287
+ res[13] = a[b[13]];
288
+ res[14] = a[b[14]];
289
+ res[15] = a[b[15]];
290
+
291
+ return res;
292
+ }
293
+
294
+ #else
295
+
296
+ #define ggml_int16x8x2_t int16x8x2_t
297
+ #define ggml_uint8x16x2_t uint8x16x2_t
298
+ #define ggml_uint8x16x4_t uint8x16x4_t
299
+ #define ggml_int8x16x2_t int8x16x2_t
300
+ #define ggml_int8x16x4_t int8x16x4_t
301
+
302
+ #define ggml_vld1q_s16_x2 vld1q_s16_x2
303
+ #define ggml_vld1q_u8_x2 vld1q_u8_x2
304
+ #define ggml_vld1q_u8_x4 vld1q_u8_x4
305
+ #define ggml_vld1q_s8_x2 vld1q_s8_x2
306
+ #define ggml_vld1q_s8_x4 vld1q_s8_x4
307
+ #define ggml_vqtbl1q_s8 vqtbl1q_s8
308
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
309
+
310
+ #endif // !defined(__aarch64__)
311
+
312
+ #if !defined(__ARM_FEATURE_DOTPROD)
313
+
314
+ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
315
+ const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
316
+ const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
317
+
318
+ return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
319
+ }
320
+
321
+ #else
322
+
323
+ #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
324
+
325
+ #endif // !defined(__ARM_FEATURE_DOTPROD)
326
+
327
+ #endif // defined(__ARM_NEON)
328
+
329
+ #ifdef __wasm_simd128__
330
+ #include <wasm_simd128.h>
331
+ #else
332
+ #ifdef __POWER9_VECTOR__
333
+ #include <altivec.h>
334
+ #undef bool
335
+ #define bool _Bool
336
+ #else
337
+ #if defined(_MSC_VER) || defined(__MINGW32__)
338
+ #include <intrin.h>
339
+ #else
340
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
341
+ #if !defined(__riscv)
342
+ #include <immintrin.h>
343
+ #endif
344
+ #endif
345
+ #endif
346
+ #endif
347
+ #endif
348
+
349
+ #ifdef __riscv_v_intrinsic
350
+ #include <riscv_vector.h>
351
+ #endif
352
+
353
+ #if defined(__loongarch64)
354
+ #if defined(__loongarch_asx)
355
+ #include <lasxintrin.h>
356
+ #endif
357
+ #if defined(__loongarch_sx)
358
+ #include <lsxintrin.h>
359
+ #endif
360
+ #endif
361
+
362
+ #if defined(__loongarch_asx)
363
+
364
+ typedef union {
365
+ int32_t i;
366
+ float f;
367
+ } ft_union;
368
+
369
+ /* float type data load instructions */
370
+ static __m128 __lsx_vreplfr2vr_s(float val) {
371
+ ft_union fi_tmpval = {.f = val};
372
+ return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
373
+ }
374
+
375
+ static __m256 __lasx_xvreplfr2vr_s(float val) {
376
+ ft_union fi_tmpval = {.f = val};
377
+ return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
378
+ }
379
+ #endif
380
+
381
+ // TODO: move to ggml-threading
382
+ void ggml_barrier(struct ggml_threadpool * tp);
383
+
384
+ #ifdef __cplusplus
385
+ }
386
+ #endif