whisper.rn 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnwhisper/RNWhisper.java +24 -18
- package/android/src/main/java/com/rnwhisper/WhisperVadContext.java +1 -57
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
- package/cpp/ggml-backend.cpp +36 -18
- package/cpp/ggml-backend.h +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/cpp/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/ggml-cpu/common.h +3 -2
- package/cpp/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/ggml-cpu/ggml-cpu.c +95 -17
- package/cpp/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/ggml-cpu/ops.cpp +775 -74
- package/cpp/ggml-cpu/ops.h +7 -0
- package/cpp/ggml-cpu/quants.c +25 -24
- package/cpp/ggml-cpu/repack.cpp +15 -14
- package/cpp/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/ggml-cpu/vec.cpp +26 -2
- package/cpp/ggml-cpu/vec.h +99 -45
- package/cpp/ggml-cpu.h +2 -0
- package/cpp/ggml-impl.h +125 -183
- package/cpp/ggml-metal-impl.h +27 -0
- package/cpp/ggml-metal.m +298 -41
- package/cpp/ggml-quants.c +6 -6
- package/cpp/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-whisper.metallib +0 -0
- package/cpp/ggml.c +269 -40
- package/cpp/ggml.h +122 -2
- package/cpp/gguf.cpp +5 -1
- package/cpp/whisper.cpp +4 -0
- package/cpp/whisper.h +2 -0
- package/ios/RNWhisper.mm +35 -38
- package/ios/RNWhisperVadContext.h +1 -1
- package/ios/RNWhisperVadContext.mm +2 -6
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +125 -183
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +27 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +122 -2
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/package.json +1 -1
|
@@ -326,7 +326,7 @@ public class RNWhisper implements LifecycleEventListener {
|
|
|
326
326
|
@Override
|
|
327
327
|
protected Void doInBackground(Void... voids) {
|
|
328
328
|
try {
|
|
329
|
-
onHostDestroy();
|
|
329
|
+
releaseAllContexts();
|
|
330
330
|
} catch (Exception e) {
|
|
331
331
|
exception = e;
|
|
332
332
|
}
|
|
@@ -415,7 +415,8 @@ public class RNWhisper implements LifecycleEventListener {
|
|
|
415
415
|
@Override
|
|
416
416
|
protected WritableArray doInBackground(Void... voids) {
|
|
417
417
|
try {
|
|
418
|
-
return vadContext.detectSpeech(audioDataBase64, options);
|
|
418
|
+
float[] audioData = AudioUtils.decodePcmData(audioDataBase64);
|
|
419
|
+
return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
|
|
419
420
|
} catch (Exception e) {
|
|
420
421
|
exception = e;
|
|
421
422
|
return null;
|
|
@@ -468,7 +469,7 @@ public class RNWhisper implements LifecycleEventListener {
|
|
|
468
469
|
throw new Exception("Failed to load audio file: " + filePathOrBase64);
|
|
469
470
|
}
|
|
470
471
|
|
|
471
|
-
return vadContext.detectSpeechWithAudioData(audioData, options);
|
|
472
|
+
return vadContext.detectSpeechWithAudioData(audioData, audioData.length, options);
|
|
472
473
|
} catch (Exception e) {
|
|
473
474
|
exception = e;
|
|
474
475
|
return null;
|
|
@@ -528,10 +529,7 @@ public class RNWhisper implements LifecycleEventListener {
|
|
|
528
529
|
@Override
|
|
529
530
|
protected Void doInBackground(Void... voids) {
|
|
530
531
|
try {
|
|
531
|
-
for (WhisperVadContext vadContext : vadContexts.values()) {
|
|
532
|
-
vadContext.release();
|
|
533
|
-
}
|
|
534
|
-
vadContexts.clear();
|
|
532
|
+
releaseAllVadContexts();
|
|
535
533
|
} catch (Exception e) {
|
|
536
534
|
exception = e;
|
|
537
535
|
}
|
|
@@ -559,27 +557,35 @@ public class RNWhisper implements LifecycleEventListener {
|
|
|
559
557
|
public void onHostPause() {
|
|
560
558
|
}
|
|
561
559
|
|
|
562
|
-
@Override
|
|
563
|
-
public void onHostDestroy() {
|
|
560
|
+
private void releaseAllContexts() {
|
|
564
561
|
for (WhisperContext context : contexts.values()) {
|
|
565
562
|
context.stopCurrentTranscribe();
|
|
566
563
|
}
|
|
567
|
-
for (AsyncTask task : tasks.keySet()) {
|
|
568
|
-
try {
|
|
569
|
-
task.get();
|
|
570
|
-
} catch (Exception e) {
|
|
571
|
-
Log.e(NAME, "Failed to wait for task", e);
|
|
572
|
-
}
|
|
573
|
-
}
|
|
564
|
+
WhisperContext.abortAllTranscribe(); // graceful abort
|
|
574
565
|
for (WhisperContext context : contexts.values()) {
|
|
575
566
|
context.release();
|
|
576
567
|
}
|
|
568
|
+
contexts.clear();
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
private void releaseAllVadContexts() {
|
|
577
572
|
for (WhisperVadContext vadContext : vadContexts.values()) {
|
|
578
573
|
vadContext.release();
|
|
579
574
|
}
|
|
580
|
-
WhisperContext.abortAllTranscribe(); // graceful abort
|
|
581
|
-
contexts.clear();
|
|
582
575
|
vadContexts.clear();
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
@Override
|
|
579
|
+
public void onHostDestroy() {
|
|
580
|
+
for (AsyncTask task : tasks.keySet()) {
|
|
581
|
+
try {
|
|
582
|
+
task.get();
|
|
583
|
+
} catch (Exception e) {
|
|
584
|
+
Log.e(NAME, "Failed to wait for task", e);
|
|
585
|
+
}
|
|
586
|
+
}
|
|
583
587
|
downloader.clearCache();
|
|
588
|
+
releaseAllContexts();
|
|
589
|
+
releaseAllVadContexts();
|
|
584
590
|
}
|
|
585
591
|
}
|
|
@@ -25,70 +25,14 @@ public class WhisperVadContext {
|
|
|
25
25
|
this.reactContext = reactContext;
|
|
26
26
|
}
|
|
27
27
|
|
|
28
|
-
public WritableArray detectSpeech(String audioDataBase64, ReadableMap options) throws Exception {
|
|
28
|
+
public WritableArray detectSpeechWithAudioData(float[] audioData, int numSamples, ReadableMap options) throws Exception {
|
|
29
29
|
if (vadContext == 0) {
|
|
30
30
|
throw new Exception("VAD context is null");
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
-
// Decode base64 audio data to float array
|
|
34
|
-
byte[] audioBytes = Base64.decode(audioDataBase64, Base64.DEFAULT);
|
|
35
|
-
int numSamples = audioBytes.length / 4; // 4 bytes per float
|
|
36
|
-
float[] audioData = new float[numSamples];
|
|
37
|
-
|
|
38
|
-
for (int i = 0; i < numSamples; i++) {
|
|
39
|
-
int intBits = (audioBytes[i * 4] & 0xFF) |
|
|
40
|
-
((audioBytes[i * 4 + 1] & 0xFF) << 8) |
|
|
41
|
-
((audioBytes[i * 4 + 2] & 0xFF) << 16) |
|
|
42
|
-
((audioBytes[i * 4 + 3] & 0xFF) << 24);
|
|
43
|
-
audioData[i] = Float.intBitsToFloat(intBits);
|
|
44
|
-
}
|
|
45
|
-
|
|
46
33
|
return processVadDetection(audioData, numSamples, options);
|
|
47
34
|
}
|
|
48
35
|
|
|
49
|
-
public WritableArray detectSpeechFile(String filePathOrBase64, ReadableMap options) throws Exception {
|
|
50
|
-
if (vadContext == 0) {
|
|
51
|
-
throw new Exception("VAD context is null");
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
// Follow the same pattern as transcribeFile
|
|
55
|
-
String filePath = filePathOrBase64;
|
|
56
|
-
|
|
57
|
-
// Handle HTTP downloads
|
|
58
|
-
if (filePathOrBase64.startsWith("http://") || filePathOrBase64.startsWith("https://")) {
|
|
59
|
-
// Note: This would require access to the downloader, but for now we'll throw an error
|
|
60
|
-
throw new Exception("HTTP URLs not supported in VAD file detection. Please download the file first.");
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
float[] audioData;
|
|
64
|
-
|
|
65
|
-
// Check for resource identifier (bundled assets)
|
|
66
|
-
int resId = getResourceIdentifier(filePath);
|
|
67
|
-
if (resId > 0) {
|
|
68
|
-
audioData = AudioUtils.decodeWaveFile(reactContext.getResources().openRawResource(resId));
|
|
69
|
-
} else if (filePathOrBase64.startsWith("data:audio/wav;base64,")) {
|
|
70
|
-
// Handle base64 WAV data
|
|
71
|
-
audioData = AudioUtils.decodeWaveData(filePathOrBase64);
|
|
72
|
-
} else {
|
|
73
|
-
// Handle regular file path
|
|
74
|
-
audioData = AudioUtils.decodeWaveFile(new java.io.FileInputStream(new java.io.File(filePath)));
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
if (audioData == null) {
|
|
78
|
-
throw new Exception("Failed to load audio file: " + filePathOrBase64);
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
return processVadDetection(audioData, audioData.length, options);
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
public WritableArray detectSpeechWithAudioData(float[] audioData, ReadableMap options) throws Exception {
|
|
85
|
-
if (vadContext == 0) {
|
|
86
|
-
throw new Exception("VAD context is null");
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
return processVadDetection(audioData, audioData.length, options);
|
|
90
|
-
}
|
|
91
|
-
|
|
92
36
|
private int getResourceIdentifier(String filePath) {
|
|
93
37
|
int identifier = reactContext.getResources().getIdentifier(
|
|
94
38
|
filePath,
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/ggml-backend.cpp
CHANGED
|
@@ -817,8 +817,9 @@ static void wsp_ggml_backend_sched_print_assignments(wsp_ggml_backend_sched_t sc
|
|
|
817
817
|
}
|
|
818
818
|
if (sched->debug > 1) {
|
|
819
819
|
wsp_ggml_backend_t tensor_backend = wsp_ggml_backend_sched_get_tensor_backend(sched, node);
|
|
820
|
-
WSP_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, wsp_ggml_op_name(node->op), node->name,
|
|
821
|
-
fmt_size(wsp_ggml_nbytes(node)), tensor_backend ? wsp_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)
|
|
820
|
+
WSP_GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, wsp_ggml_op_name(node->op), node->name,
|
|
821
|
+
fmt_size(wsp_ggml_nbytes(node)), tensor_backend ? wsp_ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
|
|
822
|
+
graph->use_counts[wsp_ggml_hash_find(&graph->visited_hash_set, node)]);
|
|
822
823
|
for (int j = 0; j < WSP_GGML_MAX_SRC; j++) {
|
|
823
824
|
struct wsp_ggml_tensor * src = node->src[j];
|
|
824
825
|
if (src == NULL) {
|
|
@@ -1826,7 +1827,7 @@ void wsp_ggml_backend_graph_copy_free(struct wsp_ggml_backend_graph_copy copy) {
|
|
|
1826
1827
|
wsp_ggml_free(copy.ctx_unallocated);
|
|
1827
1828
|
}
|
|
1828
1829
|
|
|
1829
|
-
bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data) {
|
|
1830
|
+
bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor * test_node) {
|
|
1830
1831
|
struct wsp_ggml_backend_graph_copy copy = wsp_ggml_backend_graph_copy(backend2, graph);
|
|
1831
1832
|
if (copy.buffer == NULL) {
|
|
1832
1833
|
return false;
|
|
@@ -1837,28 +1838,45 @@ bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggm
|
|
|
1837
1838
|
|
|
1838
1839
|
assert(g1->n_nodes == g2->n_nodes);
|
|
1839
1840
|
|
|
1840
|
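-
for (int i = 0; i < g1->n_nodes; i++) {
|
|
1841
|
-
struct wsp_ggml_tensor * t1 = g1->nodes[i];
|
|
1842
|
-
struct wsp_ggml_tensor * t2 = g2->nodes[i];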
|
|
1841
|
+
if (test_node != nullptr) {
|
|
1842
|
+
// Compute the whole graph and only test the output for a specific tensor
|
|
1843
|
+
wsp_ggml_backend_graph_compute(backend1, g1);
|
|
1844
|
+
wsp_ggml_backend_graph_compute(backend2, g2);
|
|
1843
1845
|
|
|
1844
|
-
assert(t1->op == t2->op && wsp_ggml_are_same_layout(t1, t2));
|
|
1846
|
+
int test_node_idx = -1;
|
|
1847
|
+
for (int i = 0; i < g1->n_nodes; i++) {
|
|
1848
|
+
struct wsp_ggml_tensor * t1 = g1->nodes[i];
|
|
1849
|
+
if (t1 == test_node) {
|
|
1850
|
+
test_node_idx = i;
|
|
1851
|
+
break;
|
|
1852
|
+
}
|
|
1853
|
+
}
|
|
1854
|
+
WSP_GGML_ASSERT(test_node_idx != -1);
|
|
1845
1855
|
|
|
1846
|
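-
struct wsp_ggml_cgraph g1v = wsp_ggml_graph_view(g1, i, i + 1);
|
|
1847
|
-
struct wsp_ggml_cgraph g2v = wsp_ggml_graph_view(g2, i, i + 1);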
|
|
1856
|
+
callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
|
|
1857
|
+
} else {
|
|
1858
|
+
for (int i = 0; i < g1->n_nodes; i++) {
|
|
1859
|
+
struct wsp_ggml_tensor * t1 = g1->nodes[i];
|
|
1860
|
+
struct wsp_ggml_tensor * t2 = g2->nodes[i];
|
|
1848
1861
|
|
|
1849
|
-
wsp_ggml_backend_graph_compute(backend1, &g1v);
|
|
1850
|
-
wsp_ggml_backend_graph_compute(backend2, &g2v);
|
|
1862
|
+
assert(t1->op == t2->op && wsp_ggml_are_same_layout(t1, t2));
|
|
1851
1863
|
|
|
1852
|
-
if (wsp_ggml_is_view_op(t1->op)) {
|
|
1853
|
-
continue;
|
|
1854
|
-
}
|
|
1864
|
+
struct wsp_ggml_cgraph g1v = wsp_ggml_graph_view(g1, i, i + 1);
|
|
1865
|
+
struct wsp_ggml_cgraph g2v = wsp_ggml_graph_view(g2, i, i + 1);
|
|
1855
1866
|
|
|
1856
|
-
// compare results, calculate rms etc
|
|
1857
|
-
if (!callback(i, t1, t2, user_data)) {
|
|
1858
|
-
break;
|
|
1867
|
+
wsp_ggml_backend_graph_compute(backend1, &g1v);
|
|
1868
|
+
wsp_ggml_backend_graph_compute(backend2, &g2v);
|
|
1869
|
+
|
|
1870
|
+
if (wsp_ggml_is_view_op(t1->op)) {
|
|
1871
|
+
continue;
|
|
1872
|
+
}
|
|
1873
|
+
|
|
1874
|
+
// compare results, calculate rms etc
|
|
1875
|
+
if (!callback(i, t1, t2, user_data)) {
|
|
1876
|
+
break;
|
|
1877
|
+
}
|
|
1859
1878
|
}
|
|
1860
1879
|
}
|
|
1861
|
-
|
|
1862
1880
|
wsp_ggml_backend_graph_copy_free(copy);
|
|
1863
1881
|
|
|
1864
1882
|
return true;
|
package/cpp/ggml-backend.h
CHANGED
|
@@ -339,7 +339,7 @@ extern "C" {
|
|
|
339
339
|
typedef bool (*wsp_ggml_backend_eval_callback)(int node_index, struct wsp_ggml_tensor * t1, struct wsp_ggml_tensor * t2, void * user_data);
|
|
340
340
|
|
|
341
341
|
// Compare the output of two backends
|
|
342
|
-
WSP_GGML_API bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data);
|
|
342
|
+
WSP_GGML_API bool wsp_ggml_backend_compare_graph_backend(wsp_ggml_backend_t backend1, wsp_ggml_backend_t backend2, struct wsp_ggml_cgraph * graph, wsp_ggml_backend_eval_callback callback, void * user_data, struct wsp_ggml_tensor * test_node);
|
|
343
343
|
|
|
344
344
|
// Tensor initialization
|
|
345
345
|
WSP_GGML_API enum wsp_ggml_status wsp_ggml_backend_tensor_alloc(wsp_ggml_backend_buffer_t buffer, struct wsp_ggml_tensor * tensor, void * addr);
|
package/cpp/ggml-cpu/amx/mmq.cpp
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
#include "mmq.h"
|
|
9
9
|
#include "ggml-impl.h"
|
|
10
10
|
#include "ggml-cpu-impl.h"
|
|
11
|
+
#include "simd-mappings.h"
|
|
11
12
|
#include "quants.h"
|
|
12
13
|
#include "ggml-quants.h"
|
|
13
14
|
#include <algorithm>
|
|
@@ -453,7 +454,7 @@ void wsp_quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, in
|
|
|
453
454
|
|
|
454
455
|
// Quantize these floats
|
|
455
456
|
const float iscale = 127.f / amax;
|
|
456
|
-
y[i].d = WSP_GGML_FP32_TO_FP16(1 / iscale);
|
|
457
|
+
y[i].d = WSP_GGML_CPU_FP32_TO_FP16(1 / iscale);
|
|
457
458
|
const float id = ( amax != 0.0f ) ? iscale : 0.f;
|
|
458
459
|
const __m512 vscale = _mm512_set1_ps(id);
|
|
459
460
|
|
|
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
|
|
|
1090
1091
|
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
|
|
1091
1092
|
|
|
1092
1093
|
for (int m = 0; m < nr; ++m) {
|
|
1093
|
-
const __m512 vd1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[m * lda].d));
|
|
1094
|
+
const __m512 vd1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
|
1094
1095
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
|
1095
1096
|
|
|
1096
1097
|
__m512 vsum;
|
|
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
|
|
|
1113
1114
|
const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(wsp_ggml_half))));
|
|
1114
1115
|
|
|
1115
1116
|
for (int m = 0; m < nr; ++m) {
|
|
1116
|
-
const __m512 vd1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[m * lda].d));
|
|
1117
|
-
const __m512 vs1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[m * lda].s));
|
|
1117
|
+
const __m512 vd1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
|
1118
|
+
const __m512 vs1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[m * lda].s));
|
|
1118
1119
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
|
1119
1120
|
|
|
1120
1121
|
__m512 vsum;
|
|
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
|
|
|
1137
1138
|
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
|
|
1138
1139
|
|
|
1139
1140
|
for (int m = 0; m < nr; ++m) {
|
|
1140
|
-
const __m512 vd1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[m * lda].d));
|
|
1141
|
+
const __m512 vd1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
|
|
1141
1142
|
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
|
|
1142
1143
|
|
|
1143
1144
|
__m512 vsum;
|
|
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
|
|
|
1437
1438
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
|
1438
1439
|
vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
|
|
1439
1440
|
}
|
|
1440
|
-
vd1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1441
|
+
vd1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1441
1442
|
}
|
|
1442
1443
|
|
|
1443
1444
|
// load b
|
|
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
|
|
|
1498
1499
|
for (int k = 0; k < 8; ++k) {
|
|
1499
1500
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
|
1500
1501
|
}
|
|
1501
|
-
vd1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1502
|
-
vs1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[0 * KB + i].s));
|
|
1502
|
+
vd1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1503
|
+
vs1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
|
|
1503
1504
|
}
|
|
1504
1505
|
|
|
1505
1506
|
// load b
|
|
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
|
|
|
1571
1572
|
va[k] = _mm512_set1_epi32(a_ptr[k]);
|
|
1572
1573
|
va[k] = _mm512_add_epi8(va[k], off);
|
|
1573
1574
|
}
|
|
1574
|
-
vd1 = _mm512_set1_ps(WSP_GGML_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1575
|
+
vd1 = _mm512_set1_ps(WSP_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
|
|
1575
1576
|
}
|
|
1576
1577
|
|
|
1577
1578
|
// load b
|