whisper.rn 0.4.2 → 0.4.3
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +21 -16
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
- package/cpp/ggml-backend.cpp +36 -18
- package/cpp/ggml-backend.h +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/cpp/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/ggml-cpu/common.h +3 -2
- package/cpp/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/ggml-cpu/ggml-cpu.c +95 -17
- package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/ggml-cpu/ops.cpp +775 -74
- package/cpp/ggml-cpu/ops.h +7 -0
- package/cpp/ggml-cpu/quants.c +25 -24
- package/cpp/ggml-cpu/repack.cpp +15 -14
- package/cpp/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/ggml-cpu/vec.cpp +26 -2
- package/cpp/ggml-cpu/vec.h +99 -45
- package/cpp/ggml-cpu.h +2 -0
- package/cpp/ggml-impl.h +125 -183
- package/cpp/ggml-metal-impl.h +27 -0
- package/cpp/ggml-metal.m +298 -41
- package/cpp/ggml-quants.c +6 -6
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +269 -40
- package/cpp/ggml.h +122 -2
- package/cpp/gguf.cpp +5 -1
- package/cpp/whisper.cpp +4 -0
- package/cpp/whisper.h +2 -0
- package/ios/RNWhisper.mm +28 -31
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/package.json +1 -1
package/cpp/ggml-cpu/ops.h
CHANGED
@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Work buffer size for im2col operations in CONV2D
+#define WSP_GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -53,6 +56,7 @@ void wsp_ggml_compute_forward_permute(const struct wsp_ggml_compute_params * par
 void wsp_ggml_compute_forward_transpose(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_get_rows(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_get_rows_back(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+void wsp_ggml_compute_forward_set_rows(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_diag(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_diag_mask_inf(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_diag_mask_zero(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
@@ -64,6 +68,7 @@ void wsp_ggml_compute_forward_clamp(const struct wsp_ggml_compute_params * param
 void wsp_ggml_compute_forward_conv_transpose_1d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_im2col(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_im2col_back_f32(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+void wsp_ggml_compute_forward_conv_2d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_conv_transpose_2d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_conv_2d_dw(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_pool_1d(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
@@ -93,6 +98,7 @@ void wsp_ggml_compute_forward_ssm_scan(const struct wsp_ggml_compute_params * pa
 void wsp_ggml_compute_forward_win_part(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_win_unpart(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_unary(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+void wsp_ggml_compute_forward_glu(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_get_rel_pos(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_add_rel_pos(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_rwkv_wkv6(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
@@ -105,6 +111,7 @@ void wsp_ggml_compute_forward_custom(const struct wsp_ggml_compute_params * para
 void wsp_ggml_compute_forward_cross_entropy_loss(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_cross_entropy_loss_back(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 void wsp_ggml_compute_forward_opt_step_adamw(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
+void wsp_ggml_compute_forward_mul_mat(const struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
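The ops.h changes add forward declarations for four new CPU ops (set_rows, conv_2d, glu, mul_mat) and a fixed 16 MiB scratch cap for the CONV2D im2col path. This diff does not show how ops.cpp consumes WSP_GGML_IM2COL_WORK_SIZE, so the following is only a sketch of the usual pattern for a capped im2col buffer; the kernel, channel, and output sizes are made up for illustration:

#include <stddef.h>
#include <stdio.h>

/* Same value as the new define in ops.h. */
#define WSP_GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)

int main(void) {
    /* Hypothetical CONV2D geometry: 3x3 kernel, 256 input channels,
       one im2col row per output position. */
    const size_t row_bytes = 3 * 3 * 256 * sizeof(float);
    const size_t rows      = 128 * 128; /* output positions */

    /* Materialize only as many rows per pass as fit under the cap. */
    size_t rows_per_chunk = WSP_GGML_IM2COL_WORK_SIZE / row_bytes;
    if (rows_per_chunk == 0) {
        rows_per_chunk = 1;
    }
    const size_t n_chunks = (rows + rows_per_chunk - 1) / rows_per_chunk;

    printf("%zu rows per chunk, %zu passes\n", rows_per_chunk, n_chunks);
    return 0;
}

A fixed cap like this bounds peak scratch memory regardless of convolution size, at the cost of splitting the im2col-plus-matmul work into several passes.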
package/cpp/ggml-cpu/quants.c
CHANGED
@@ -2,6 +2,7 @@
 #include "ggml-common.h"
 
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "ggml-quants.h"
 #include "quants.h"
 
@@ -137,7 +138,7 @@ void wsp_ggml_vec_dot_q4_0_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size
 }
 
 int sumi = sumi0 + sumi1;
-sumf += sumi*
+sumf += sumi*WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d);
 }
 
 *s = sumf;
@@ -174,7 +175,7 @@ void wsp_ggml_vec_dot_q4_1_q8_1_generic(int n, float * WSP_GGML_RESTRICT s, size
 }
 
 int sumi = sumi0 + sumi1;
-sumf += (
+sumf += (WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + WSP_GGML_CPU_FP16_TO_FP32(x[ib].m)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].s);
 }
 
 *s = sumf;
@@ -217,7 +218,7 @@ void wsp_ggml_vec_dot_q5_0_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size
 }
 
 int sumi = sumi0 + sumi1;
-sumf += (
+sumf += (WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
 }
 
 *s = sumf;
@@ -260,7 +261,7 @@ void wsp_ggml_vec_dot_q5_1_q8_1_generic(int n, float * WSP_GGML_RESTRICT s, size
 }
 
 int sumi = sumi0 + sumi1;
-sumf += (
+sumf += (WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + WSP_GGML_CPU_FP16_TO_FP32(x[ib].m)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].s);
 }
 
 *s = sumf;
@@ -290,7 +291,7 @@ void wsp_ggml_vec_dot_q8_0_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size
 sumi += x[ib].qs[j]*y[ib].qs[j];
 }
 
-sumf += sumi*(
+sumf += sumi*(WSP_GGML_CPU_FP16_TO_FP32(x[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(y[ib].d));
 }
 
 *s = sumf;
@@ -342,7 +343,7 @@ void wsp_ggml_vec_dot_tq1_0_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 }
 }
 
-sumf += (float) sum * (
+sumf += (float) sum * (WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
 }
 
 *s = sumf;
@@ -372,7 +373,7 @@ void wsp_ggml_vec_dot_tq2_0_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 }
 }
 
-const float d = y[i].d *
+const float d = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
 
 sumf += (float) sumi * d;
 }
@@ -405,8 +406,8 @@ void wsp_ggml_vec_dot_q2_K_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size
 summs += y[i].bsums[j] * (sc[j] >> 4);
 }
 
-const float dall = y[i].d *
-const float dmin = y[i].d *
+const float dall = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
 int isum = 0;
 int is = 0;
@@ -504,7 +505,7 @@ void wsp_ggml_vec_dot_q3_K_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size
 for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -577,9 +578,9 @@ void wsp_ggml_vec_dot_q4_K_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin =
+const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -657,9 +658,9 @@ void wsp_ggml_vec_dot_q5_K_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin =
+const float dmin = WSP_GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -714,7 +715,7 @@ void wsp_ggml_vec_dot_q6_K_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -739,7 +740,7 @@ void wsp_ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, s
 
 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
 const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
 int32_t bsum = 0;
@@ -778,7 +779,7 @@ void wsp_ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, si
 
 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint16_t * WSP_GGML_RESTRICT q2 = x[i].qs;
 const uint8_t * WSP_GGML_RESTRICT sc = x[i].scales;
 const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
@@ -829,7 +830,7 @@ void wsp_ggml_vec_dot_iq2_s_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 float sumf = 0;
 for (int i = 0; i < nb; i++) {
 
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const int8_t * q8 = y[i].qs;
 const uint8_t * qs = x[i].qs;
 const uint8_t * qh = x[i].qh;
@@ -882,7 +883,7 @@ void wsp_ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, s
 
 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint8_t * WSP_GGML_RESTRICT q3 = x[i].qs;
 const uint8_t * WSP_GGML_RESTRICT gas = x[i].qs + QK_K/4;
 const int8_t * WSP_GGML_RESTRICT q8 = y[i].qs;
@@ -924,7 +925,7 @@ void wsp_ggml_vec_dot_iq3_s_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 
 float sumf = 0.f;
 for (int i = 0; i < nb; ++i) {
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const uint8_t * WSP_GGML_RESTRICT qs = x[i].qs;
 const uint8_t * WSP_GGML_RESTRICT qh = x[i].qh;
 const uint8_t * WSP_GGML_RESTRICT signs = x[i].signs;
@@ -1002,7 +1003,7 @@ void wsp_ggml_vec_dot_iq1_s_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 qs += 4;
 }
 
-sumf +=
+sumf += WSP_GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
 }
 
 *s = sumf;
@@ -1063,7 +1064,7 @@ void wsp_ggml_vec_dot_iq1_m_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 qh += 2;
 }
 
-sumf +=
+sumf += WSP_GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
 }
 
 *s = sumf;
@@ -1087,7 +1088,7 @@ void wsp_ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, si
 float sumf = 0;
 
 for (; ib < nb; ++ib) {
-const float d =
+const float d = WSP_GGML_CPU_FP16_TO_FP32(y[ib].d)*WSP_GGML_CPU_FP16_TO_FP32(x[ib].d);
 int sumi1 = 0, sumi2 = 0;
 for (int j = 0; j < QK4_NL/2; ++j) {
 sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1113,7 +1114,7 @@ void wsp_ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, si
 
 float sumf = 0;
 for (int ibl = 0; ibl < nb; ++ibl) {
-const float d4d8 =
+const float d4d8 = WSP_GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
 uint16_t h = x[ibl].scales_h;
 const uint8_t * qs = x[ibl].qs;
 const int8_t * q8 = y[ibl].qs;
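Every hunk in quants.c makes the same mechanical substitution: the generic (non-SIMD) dot-product fallbacks now convert the fp16 block scales through the CPU backend's WSP_GGML_CPU_FP16_TO_FP32 macro, which is why simd-mappings.h is newly included. Below is a self-contained sketch of the q8_0 x q8_0 case following the hunk above; the block struct is simplified, and the fp16 conversion is a stand-in that handles normal values only (the real macro covers all cases and may compile to hardware conversions):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QK8_0 32

typedef uint16_t wsp_ggml_fp16_t;

/* Stand-in IEEE-754 half -> float, normal values only. */
static float fp16_to_fp32(wsp_ggml_fp16_t h) {
    uint32_t bits = ((uint32_t)(h & 0x8000) << 16)                    /* sign     */
                  | (uint32_t)((((h >> 10) & 0x1F) - 15 + 127) << 23) /* exponent */
                  | ((uint32_t)(h & 0x3FF) << 13);                    /* mantissa */
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}
#define WSP_GGML_CPU_FP16_TO_FP32(x) fp16_to_fp32(x)

/* Simplified q8_0 block: one fp16 scale plus 32 int8 quants. */
typedef struct {
    wsp_ggml_fp16_t d;
    int8_t qs[QK8_0];
} block_q8_0;

/* Generic q8_0 x q8_0 dot product following the loop in the diff:
   integer multiply-accumulate per block, then one scaling by the two
   fp16 block scales converted to fp32. */
static void vec_dot_q8_0_q8_0(int n, float *s, const block_q8_0 *x, const block_q8_0 *y) {
    const int nb = n / QK8_0;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        sumf += sumi * (WSP_GGML_CPU_FP16_TO_FP32(x[ib].d) *
                        WSP_GGML_CPU_FP16_TO_FP32(y[ib].d));
    }
    *s = sumf;
}

int main(void) {
    block_q8_0 x = { 0x3C00 /* fp16 1.0 */, {0} };
    block_q8_0 y = { 0x3C00 /* fp16 1.0 */, {0} };
    for (int j = 0; j < QK8_0; ++j) { x.qs[j] = 2; y.qs[j] = 3; }
    float s;
    vec_dot_q8_0_q8_0(QK8_0, &s, &x, &y);
    printf("%f\n", s); /* 32 * (2*3) * 1.0 * 1.0 = 192 */
    return 0;
}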
package/cpp/ggml-cpu/repack.cpp
CHANGED
@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"
 
 #include "arch-fallback.h"
@@ -72,7 +73,7 @@ void wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic(const float * WSP_GGML_RESTRICT
 const float d = amax / ((1 << 7) - 1);
 id[row_iter] = d ? 1.0f / d : 0.0f;
 
-y[i].d[row_iter] =
+y[i].d[row_iter] = WSP_GGML_CPU_FP32_TO_FP16(d);
 }
 
 for (int j = 0; j < QK8_0 * 4; j++) {
@@ -110,7 +111,7 @@ void wsp_ggml_wsp_quantize_mat_q8_0_4x8_generic(const float * WSP_GGML_RESTRICT
 const float d = amax / ((1 << 7) - 1);
 id[row_iter] = d ? 1.0f / d : 0.0f;
 
-y[i].d[row_iter] =
+y[i].d[row_iter] = WSP_GGML_CPU_FP32_TO_FP16(d);
 }
 
 for (int j = 0; j < QK8_0 * 4; j++) {
@@ -236,7 +237,7 @@ void wsp_ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -280,7 +281,7 @@ void wsp_ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -325,7 +326,7 @@ void wsp_ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -396,13 +397,13 @@ void wsp_ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 sumi2 = sumi2 * scales_1[j];
 sumi += sumi1 + sumi2;
 }
-sumf[j] += sumi *
+sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
 }
 }
 for (int sb = 0; sb < 8; sb++) {
 uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
 for (int j = 0; j < ncols_interleaved; j++) {
-sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
+sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
 }
 }
 }
@@ -449,7 +450,7 @@ void wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, s
 const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
 }
-sumf[j] += sumi *
+sumf[j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -500,7 +501,7 @@ void wsp_ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -555,7 +556,7 @@ void wsp_ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -609,7 +610,7 @@ void wsp_ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -688,7 +689,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 sumi2 = sumi2 * scales_1[j];
 sumi += sumi1 + sumi2;
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
 }
 }
 }
@@ -697,7 +698,7 @@ void wsp_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, siz
 for(int m = 0; m < 4; m++) {
 const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
 for(int j = 0; j < ncols_interleaved; j++) {
-sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
+sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
 }
 }
 }
@@ -753,7 +754,7 @@ void wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, s
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
 }
-sumf[m][j] += sumi *
+sumf[m][j] += sumi * WSP_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * WSP_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
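repack.cpp mirrors quants.c: the generic quantize/gemv/gemm fallbacks switch to the WSP_GGML_CPU_* conversion macros from the newly included simd-mappings.h, with WSP_GGML_CPU_FP32_TO_FP16 on the write side when block scales are stored. A sketch of the per-row scale computation visible in the quantize_mat hunks follows; the fp32 -> fp16 conversion is a simplified truncating stand-in (the real macro rounds and handles every edge case, often in hardware):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint16_t wsp_ggml_fp16_t;

/* Stand-in float -> IEEE-754 half, truncating, normal values only. */
static wsp_ggml_fp16_t fp32_to_fp16(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    const uint16_t sign = (uint16_t)((bits >> 16) & 0x8000);
    const int32_t  exp  = (int32_t)((bits >> 23) & 0xFF) - 127 + 15;
    const uint16_t mant = (uint16_t)((bits >> 13) & 0x3FF);
    if (exp <= 0)  return sign;                      /* flush small values to zero */
    if (exp >= 31) return (uint16_t)(sign | 0x7C00); /* clamp large values to inf  */
    return (uint16_t)(sign | (uint16_t)(exp << 10) | mant);
}
#define WSP_GGML_CPU_FP32_TO_FP16(x) fp32_to_fp16(x)

/* Per-row q8_0 scale, following the pattern in the quantize_mat hunks:
   d maps the row's max magnitude onto the int8 range [-127, 127], id is
   its inverse (guarded against an all-zero row) used to quantize the
   values, and d itself is stored in the block as fp16. */
static void row_scale(const float *row, int n, wsp_ggml_fp16_t *d_fp16, float *id) {
    float amax = 0.0f; /* absolute max of this row */
    for (int j = 0; j < n; j++) {
        const float v = fabsf(row[j]);
        if (v > amax) amax = v;
    }
    const float d = amax / ((1 << 7) - 1);
    *id     = d ? 1.0f / d : 0.0f;
    *d_fp16 = WSP_GGML_CPU_FP32_TO_FP16(d);
}

int main(void) {
    const float row[8] = { -0.5f, 2.0f, 1.0f, -3.0f, 0.25f, 0.0f, 1.5f, -1.0f };
    wsp_ggml_fp16_t d;
    float id;
    row_scale(row, 8, &d, &id);
    printf("d = 0x%04x (fp16), id = %f\n", d, id);
    return 0;
}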