whisper.rn 0.4.0-rc.6 → 0.4.0-rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +5 -5
- package/cpp/coreml/whisper-encoder.mm +1 -1
- package/cpp/ggml-alloc.c +41 -11
- package/cpp/ggml-alloc.h +3 -1
- package/cpp/ggml-backend-impl.h +38 -34
- package/cpp/ggml-backend.c +630 -269
- package/cpp/ggml-backend.h +58 -30
- package/cpp/ggml-impl.h +3 -0
- package/cpp/ggml-metal-whisper.metal +1253 -341
- package/cpp/ggml-metal.h +6 -54
- package/cpp/ggml-metal.m +2004 -1987
- package/cpp/ggml-quants.c +2230 -421
- package/cpp/ggml-quants.h +39 -1
- package/cpp/ggml.c +735 -265
- package/cpp/ggml.h +94 -43
- package/cpp/rn-whisper.cpp +1 -0
- package/cpp/whisper.cpp +118 -86
- package/ios/RNWhisperContext.mm +4 -2
- package/lib/commonjs/version.json +1 -1
- package/lib/module/version.json +1 -1
- package/package.json +1 -1
- package/src/version.json +1 -1
package/cpp/ggml-quants.h
CHANGED
|
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
|
|
|
70
70
|
// 2-bit quantization
|
|
71
71
|
// weight is represented as x = a * q + b
|
|
72
72
|
// 16 blocks of 16 elements each
|
|
73
|
-
// Effectively 2.5625 bits per weight
|
|
73
|
+
// Effectively 2.625 bits per weight
|
|
74
74
|
typedef struct {
|
|
75
75
|
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
|
76
76
|
uint8_t qs[QK_K/4]; // quants
|
|
@@ -165,6 +165,22 @@ typedef struct {
|
|
|
165
165
|
} block_q8_K;
|
|
166
166
|
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
|
167
167
|
|
|
168
|
+
// (Almost) "true" 2-bit quantization.
|
|
169
|
+
// Due to the need to use blocks as per ggml design, it ends up using
|
|
170
|
+
// 2.0625 bpw because of the 16-bit scale for each block of 256.
|
|
171
|
+
typedef struct {
|
|
172
|
+
wsp_ggml_fp16_t d;
|
|
173
|
+
uint16_t qs[QK_K/8];
|
|
174
|
+
} block_iq2_xxs;
|
|
175
|
+
static_assert(sizeof(block_iq2_xxs) == sizeof(wsp_ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
|
176
|
+
|
|
177
|
+
// 2.3125 bpw quants
|
|
178
|
+
typedef struct {
|
|
179
|
+
wsp_ggml_fp16_t d;
|
|
180
|
+
uint16_t qs[QK_K/8];
|
|
181
|
+
uint8_t scales[QK_K/32];
|
|
182
|
+
} block_iq2_xs;
|
|
183
|
+
static_assert(sizeof(block_iq2_xs) == sizeof(wsp_ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
|
168
184
|
|
|
169
185
|
// Quantization
|
|
170
186
|
void wsp_quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
|
@@ -209,6 +225,8 @@ void wsp_dewsp_quantize_row_q4_K(const block_q4_K * restrict x, float * restrict
|
|
|
209
225
|
void wsp_dewsp_quantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
|
210
226
|
void wsp_dewsp_quantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
|
211
227
|
void wsp_dewsp_quantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
|
228
|
+
void wsp_dewsp_quantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
|
|
229
|
+
void wsp_dewsp_quantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
|
|
212
230
|
|
|
213
231
|
// Dot product
|
|
214
232
|
void wsp_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
@@ -222,3 +240,23 @@ void wsp_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict
|
|
|
222
240
|
void wsp_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
223
241
|
void wsp_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
224
242
|
void wsp_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
243
|
+
void wsp_ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
244
|
+
void wsp_ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
245
|
+
|
|
246
|
+
//
|
|
247
|
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
|
248
|
+
//
|
|
249
|
+
size_t wsp_quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
250
|
+
size_t wsp_quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
251
|
+
size_t wsp_quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
252
|
+
size_t wsp_quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
253
|
+
size_t wsp_quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
254
|
+
size_t wsp_quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
255
|
+
size_t wsp_quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
256
|
+
size_t wsp_quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
257
|
+
size_t wsp_quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
258
|
+
size_t wsp_quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
259
|
+
size_t wsp_quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
260
|
+
|
|
261
|
+
void iq2xs_init_impl(int grid_size);
|
|
262
|
+
void iq2xs_free_impl(int grid_size);
|