whisper.rn 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +12 -3
- package/cpp/ggml-alloc.c +49 -18
- package/cpp/ggml-backend-impl.h +0 -3
- package/cpp/ggml-backend-reg.cpp +8 -0
- package/cpp/ggml-backend.cpp +0 -2
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/ggml-cpu/ggml-cpu-impl.h +4 -2
- package/cpp/ggml-cpu/ggml-cpu.c +67 -24
- package/cpp/ggml-cpu/ops.cpp +489 -364
- package/cpp/ggml-cpu/ops.h +4 -4
- package/cpp/ggml-cpu/repack.cpp +143 -29
- package/cpp/ggml-cpu/simd-mappings.h +25 -25
- package/cpp/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/ggml-cpu/unary-ops.h +7 -0
- package/cpp/ggml-cpu/vec.cpp +83 -0
- package/cpp/ggml-cpu/vec.h +20 -8
- package/cpp/ggml-impl.h +67 -2
- package/cpp/ggml-metal/ggml-metal-common.cpp +2 -2
- package/cpp/ggml-metal/ggml-metal-context.m +5 -6
- package/cpp/ggml-metal/ggml-metal-device.cpp +300 -14
- package/cpp/ggml-metal/ggml-metal-device.h +26 -1
- package/cpp/ggml-metal/ggml-metal-device.m +243 -28
- package/cpp/ggml-metal/ggml-metal-impl.h +177 -9
- package/cpp/ggml-metal/ggml-metal-ops.cpp +843 -157
- package/cpp/ggml-metal/ggml-metal-ops.h +8 -0
- package/cpp/ggml-metal/ggml-metal.cpp +8 -3
- package/cpp/ggml-metal/ggml-metal.metal +12436 -0
- package/cpp/ggml.c +317 -4
- package/cpp/ggml.h +139 -0
- package/cpp/jsi/RNWhisperJSI.cpp +7 -2
- package/cpp/rn-whisper.h +1 -0
- package/cpp/whisper.cpp +8 -2
- package/ios/RNWhisperContext.mm +3 -1
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +67 -2
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +139 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +67 -2
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +139 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +67 -2
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +139 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-metal.metal +12436 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +0 -3
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +2 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +67 -2
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +139 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +1 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-metal.metal +12436 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/version.json +1 -1
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/version.json +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +2 -0
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +2 -0
- package/src/version.json +1 -1
- package/whisper-rn.podspec +1 -1
- package/cpp/ggml-metal/ggml-whisper-sim.metallib +0 -0
- package/cpp/ggml-metal/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
- package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
package/cpp/ggml-cpu/vec.h
CHANGED

@@ -44,6 +44,7 @@ void wsp_ggml_vec_dot_bf16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_gg
 void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggml_fp16_t * WSP_GGML_RESTRICT x, size_t bx, wsp_ggml_fp16_t * WSP_GGML_RESTRICT y, size_t by, int nrc);
 
 void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x);
+wsp_ggml_float wsp_ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
 wsp_ggml_float wsp_ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
 wsp_ggml_float wsp_ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
 
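The new wsp_ggml_vec_cvar_f32 ships with only the one-line comment above, so here is a minimal scalar sketch of the semantics the signature and comment suggest: write the centered values into y and return their variance. This is an illustrative reading under stated assumptions, not the package's implementation, which is vectorized and may normalize or order the reduction differently:

typedef double wsp_ggml_float; // ggml typically accumulates float reductions in a wider type

// Illustrative only: centers y around mean and returns the variance.
// Assumption: the return value is sum((x[i] - mean)^2) / n; the real
// implementation may return a different normalization.
static wsp_ggml_float vec_cvar_f32_ref(const int n, float * y, const float * x, const float mean) {
    wsp_ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        y[i] = x[i] - mean;                    // center: y = x - mean
        sum += (wsp_ggml_float) (y[i] * y[i]); // accumulate squared deviations
    }
    return sum / n;
}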
@@ -143,14 +144,14 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
     for (int i = 0; i < np; i += wsp_ggml_f16_step) {
         ay1 = WSP_GGML_F16x_VEC_LOAD(y + i + 0 * wsp_ggml_f16_epr, 0); // 8 elements
 
-        ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8
+        ax1 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
         sum_00 = WSP_GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
         ax1 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 0*wsp_ggml_f16_epr, 0); // 8 elements
         sum_10 = WSP_GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
 
         ay2 = WSP_GGML_F16x_VEC_LOAD(y + i + 1 * wsp_ggml_f16_epr, 1); // next 8 elements
 
-        ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8
+        ax2 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 1*wsp_ggml_f16_epr, 1); // next 8 elements
         sum_01 = WSP_GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
         ax2 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 1*wsp_ggml_f16_epr, 1);
         sum_11 = WSP_GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
@@ -159,7 +160,7 @@ inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float
 
         ax3 = WSP_GGML_F16x_VEC_LOAD(x[0] + i + 2*wsp_ggml_f16_epr, 2);
         sum_02 = WSP_GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
-
+        ax3 = WSP_GGML_F16x_VEC_LOAD(x[1] + i + 2*wsp_ggml_f16_epr, 2);
         sum_12 = WSP_GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
 
         ay4 = WSP_GGML_F16x_VEC_LOAD(y + i + 3 * wsp_ggml_f16_epr, 3);
@@ -654,11 +655,11 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float
     }
     // leftovers
     // maximum number of leftover elements will be less that wsp_ggml_f32_epr. Apply predicated svmad on available elements only
-    if (np < n) {
-        svbool_t pg = svwhilelt_b32(np, n);
-        ay1 = svld1_f32(pg, y + np);
+    for (int i = np; i < n; i += wsp_ggml_f32_epr) {
+        svbool_t pg = svwhilelt_b32(i, n);
+        ay1 = svld1_f32(pg, y + i);
         ay1 = svmul_f32_m(pg, ay1, vx);
-        svst1_f32(pg, y + np, ay1);
+        svst1_f32(pg, y + i, ay1);
     }
 #elif defined(__riscv_v_intrinsic)
     for (int i = 0, avl; i < n; i += avl) {
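The change above is a correctness fix: the old code applied a single predicated tail operation, which covers at most one vector of leftovers, while the new loop keeps issuing predicated iterations until i reaches n (the main loop can step by several vector registers at a time, so the tail may span more than one vector). A self-contained sketch of the same pattern, assuming an SVE-capable toolchain (scale_f32_sve is a hypothetical name, not a whisper.rn symbol):

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

// Predicated tail loop: svwhilelt_b32(i, n) builds a mask that is
// all-true for full vectors and partially true on the final, shorter
// iteration, so no scalar cleanup loop is needed.
static void scale_f32_sve(const int n, float * y, const float v) {
    const svfloat32_t vx  = svdup_n_f32(v);
    const int         epr = (int) svcntw(); // 32-bit lanes per vector register
    for (int i = 0; i < n; i += epr) {
        const svbool_t pg = svwhilelt_b32(i, n);
        svfloat32_t ay = svld1_f32(pg, y + i);
        ay = svmul_f32_m(pg, ay, vx);
        svst1_f32(pg, y + i, ay);
    }
}
#endif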
@@ -819,7 +820,8 @@ inline static void wsp_ggml_vec_tanh_f16 (const int n, wsp_ggml_fp16_t * y, cons
 inline static void wsp_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void wsp_ggml_vec_elu_f16 (const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = WSP_GGML_CPU_FP32_TO_FP16(expm1f(WSP_GGML_CPU_FP16_TO_FP32(x[i])));
+        const float v = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = WSP_GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
     }
 }
 inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
@@ -1414,6 +1416,16 @@ inline static void wsp_ggml_vec_sum_f32(const int n, float * s, const float * x)
 #endif
 }
 
+inline static void wsp_ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        if (i == 0) {
+            y[i] = x[i];
+        } else {
+            y[i] = y[i - 1] + x[i];
+        }
+    }
+}
+
 inline static void wsp_ggml_vec_sum_f32_ggf(const int n, wsp_ggml_float * s, const float * x) {
     wsp_ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
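wsp_ggml_vec_cumsum_f32 above is a plain inclusive prefix sum, i.e. y[i] = x[0] + … + x[i]. For example:

#include <stdio.h>

int main(void) {
    const float x[5] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
    float y[5];

    // same logic as the wsp_ggml_vec_cumsum_f32 added above
    y[0] = x[0];
    for (int i = 1; i < 5; ++i) {
        y[i] = y[i - 1] + x[i];
    }

    for (int i = 0; i < 5; ++i) {
        printf("%g ", y[i]); // prints: 1 3 6 10 15
    }
    printf("\n");
    return 0;
}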
package/cpp/ggml-impl.h
CHANGED

@@ -102,6 +102,9 @@ static bool wsp_ggml_op_is_empty(enum wsp_ggml_op op) {
     }
 }
 
+static inline float wsp_ggml_compute_softplus_f32(float input) {
+    return (input > 20.0f) ? input : logf(1 + expf(input));
+}
 //
 // logging
 //
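The input > 20.0f shortcut in the new softplus helper is a standard numerical guard. For x > 20, exp(-x) < 2.1e-9, so log(1 + exp(x)) = x + log1p(exp(-x)) differs from x by far less than the spacing between adjacent floats near 20 (about 1.9e-6); returning x directly is therefore exact at float precision, and it also sidesteps expf overflowing to infinity for x ≳ 88.7. A quick check:

#include <math.h>
#include <stdio.h>

// Same formula as wsp_ggml_compute_softplus_f32 in the diff above.
static float softplus_f32(float x) {
    return (x > 20.0f) ? x : logf(1 + expf(x));
}

int main(void) {
    // near the cutoff the two branches agree to float precision
    printf("%.9g\n", (double) logf(1 + expf(19.9f))); // ~19.9
    printf("%.9g\n", (double) softplus_f32(20.1f));   // 20.1 (shortcut branch)
    // without the guard, expf(100) would overflow to +inf first
    printf("%.9g\n", (double) softplus_f32(100.0f));  // 100
    return 0;
}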
@@ -562,14 +565,23 @@ static inline wsp_ggml_bf16_t wsp_ggml_compute_fp32_to_bf16(float s) {
 #define WSP_GGML_FP32_TO_BF16(x) wsp_ggml_compute_fp32_to_bf16(x)
 #define WSP_GGML_BF16_TO_FP32(x) wsp_ggml_compute_bf16_to_fp32(x)
 
+static inline int32_t wsp_ggml_node_get_use_count(const struct wsp_ggml_cgraph * cgraph, int node_idx) {
+    const struct wsp_ggml_tensor * node = cgraph->nodes[node_idx];
+
+    size_t hash_pos = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
+    if (!wsp_ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)) {
+        return 0;
+    }
+    return cgraph->use_counts[hash_pos];
+}
+
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
 static inline bool wsp_ggml_node_has_n_uses(const struct wsp_ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
     const struct wsp_ggml_tensor * node = cgraph->nodes[node_idx];
 
     // check the use count against how many we're replacing
-    size_t hash_pos = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
-    if (!wsp_ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
+    if (wsp_ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
         return false;
     }
 
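The refactor above pulls the visited-hash lookup out of wsp_ggml_node_has_n_uses into a reusable wsp_ggml_node_get_use_count, which reports 0 for nodes that never made it into the hash set. A hypothetical caller (sketch; dump_use_counts is not a real symbol, and it assumes a graph built the usual ggml way):

// Sketch: report how many consumers each node in a built graph has,
// using the helper introduced above. Assumes "gf" came from
// wsp_ggml_build_forward_expand, as elsewhere in ggml-based code.
static void dump_use_counts(const struct wsp_ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; ++i) {
        const int32_t n_uses = wsp_ggml_node_get_use_count(gf, i);
        // wsp_ggml_node_has_n_uses(gf, i, 1) additionally applies the
        // fusion-safety checks beyond the raw count
        WSP_GGML_LOG_INFO("node %d (%s): %d use(s)\n", i, gf->nodes[i]->name, (int) n_uses);
    }
}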
@@ -635,11 +647,42 @@ static inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int
     return wsp_ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
 }
 
+WSP_GGML_API bool wsp_ggml_can_fuse_subgraph_ext(const struct wsp_ggml_cgraph * cgraph,
+                                                 const int * node_idxs,
+                                                 int count,
+                                                 const enum wsp_ggml_op * ops,
+                                                 const int * outputs,
+                                                 int num_outputs);
+
+// Returns true if the subgraph formed by {node_idxs} can be fused
+// checks whethers all nodes which are not part of outputs can be elided
+// by checking if their num_uses are confined to the subgraph
+static inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgraph,
+                                              int node_idx,
+                                              int count,
+                                              const enum wsp_ggml_op * ops,
+                                              const int * outputs,
+                                              int num_outputs) {
+    WSP_GGML_ASSERT(count < 32);
+    if (node_idx + count > cgraph->n_nodes) {
+        return false;
+    }
+
+    int idxs[32];
+
+    for (int i = 0; i < count; ++i) {
+        idxs[i] = node_idx + i;
+    }
+
+    return wsp_ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
+}
+
 #ifdef __cplusplus
 }
 #endif
 
 #ifdef __cplusplus
+#include <array>
 #include <initializer_list>
 #include <vector>
 
@@ -648,6 +691,28 @@ inline bool wsp_ggml_can_fuse(const struct wsp_ggml_cgraph * cgraph, int node_id
     return wsp_ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size());
 }
 
+inline bool wsp_ggml_can_fuse_subgraph(const struct wsp_ggml_cgraph * cgraph,
+                                       int start_idx,
+                                       std::initializer_list<enum wsp_ggml_op> ops,
+                                       std::initializer_list<int> outputs = {}) {
+    return wsp_ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
+}
+
+// Return true if the edges in the graph match expectations.
+inline bool wsp_ggml_check_edges(const struct wsp_ggml_cgraph * cgraph,
+                                 int start_idx,
+                                 std::initializer_list<std::array<int, 3>> edges) {
+    for (const auto & edge : edges) {
+        int dst_node = edge[0];
+        int src_idx  = edge[1];
+        int src_node = edge[2];
+        if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // expose GGUF internals for test code
 WSP_GGML_API size_t wsp_gguf_type_size(enum wsp_gguf_type type);
 WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file_impl(FILE * file, struct wsp_gguf_init_params params);
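Together these additions let a backend ask two questions before substituting a fused kernel: can this run of consecutive nodes be fused (with only the listed outputs escaping the subgraph), and are the nodes wired the way the kernel expects. A hypothetical check for a two-node MUL→ADD run starting at node i (sketch; the op pair and output choice are illustrative, not taken from whisper.rn):

// Can nodes [i, i + 1] be fused so that only the ADD result is
// visible outside the subgraph? outs[] holds offsets into the run.
static bool can_fuse_mul_add(const struct wsp_ggml_cgraph * gf, int i) {
    const enum wsp_ggml_op ops[2]  = { WSP_GGML_OP_MUL, WSP_GGML_OP_ADD };
    const int              outs[1] = { 1 }; // node i + 1 (the ADD) is the only output
    return wsp_ggml_can_fuse_subgraph(gf, i, 2, ops, outs, 1);
}

On the C++ side, the initializer-list overload shortens this to wsp_ggml_can_fuse_subgraph(gf, i, {WSP_GGML_OP_MUL, WSP_GGML_OP_ADD}, {1}), and wsp_ggml_check_edges(gf, i, {{1, 0, 0}}) would further assert that the ADD reads the MUL's result through src[0].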
package/cpp/ggml-metal/ggml-metal-common.cpp
CHANGED

@@ -112,7 +112,7 @@ static bool wsp_ggml_mem_ranges_add_dst(wsp_ggml_mem_ranges_t mrs, const wsp_ggm
 }
 
 bool wsp_ggml_mem_ranges_add(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
-    for (int i = 0; i <
+    for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             wsp_ggml_mem_ranges_add_src(mrs, tensor->src[i]);
         }
@@ -173,7 +173,7 @@ static bool wsp_ggml_mem_ranges_check_dst(wsp_ggml_mem_ranges_t mrs, const wsp_g
 }
 
 bool wsp_ggml_mem_ranges_check(wsp_ggml_mem_ranges_t mrs, const wsp_ggml_tensor * tensor) {
-    for (int i = 0; i <
+    for (int i = 0; i < WSP_GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             if (!wsp_ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
                 return false;
package/cpp/ggml-metal/ggml-metal-context.m
CHANGED

@@ -35,7 +35,6 @@ struct wsp_ggml_metal {
     // additional, inference-time compiled pipelines
     wsp_ggml_metal_pipelines_t pipelines_ext;
 
-    bool use_bfloat;
     bool use_fusion;
     bool use_concurrency;
     bool use_graph_optimize;
@@ -121,11 +120,10 @@ wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev) {
         }
     }
 
-    const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
+    //const struct wsp_ggml_metal_device_props * props_dev = wsp_ggml_metal_device_get_props(dev);
 
     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
-    res->use_bfloat = props_dev->has_bfloat;
     res->use_fusion = getenv("WSP_GGML_METAL_FUSION_DISABLE") == nil;
     res->use_concurrency = getenv("WSP_GGML_METAL_CONCURRENCY_DISABLE") == nil;
 
@@ -147,7 +145,6 @@ wsp_ggml_metal_t wsp_ggml_metal_init(wsp_ggml_metal_device_t dev) {
 
     memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
 
-    WSP_GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
     WSP_GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
     WSP_GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
     WSP_GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
@@ -292,7 +289,7 @@ void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tenso
 
     // queue the copy operation into the queue of the Metal context
     // this will be queued at the end, after any currently ongoing GPU operations
-    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
     id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
     [encoder copyFromBuffer:buf_src
@@ -303,6 +300,7 @@ void wsp_ggml_metal_set_tensor_async(wsp_ggml_metal_t ctx, struct wsp_ggml_tenso
 
     [encoder endEncoding];
     [cmd_buf commit];
+    [buf_src release];
 
     // do not wait here for completion
     //[cmd_buf waitUntilCompleted];
@@ -333,7 +331,7 @@ void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml
 
     // queue the copy operation into the queue of the Metal context
     // this will be queued at the end, after any currently ongoing GPU operations
-    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+    id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
     id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
     [encoder copyFromBuffer:bid_src.metal
@@ -344,6 +342,7 @@ void wsp_ggml_metal_get_tensor_async(wsp_ggml_metal_t ctx, const struct wsp_ggml
 
     [encoder endEncoding];
     [cmd_buf commit];
+    [buf_dst release];
 
     // do not wait here for completion
     //[cmd_buf waitUntilCompleted];