cactus-react-native 1.7.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
- package/cpp/HybridCactus.cpp +49 -1
- package/cpp/HybridCactus.hpp +5 -0
- package/cpp/cactus_ffi.h +14 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h +48 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +14 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +304 -66
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +32 -4
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +75 -11
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +123 -4
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +37 -3
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h +48 -0
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +14 -1
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +304 -66
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +32 -4
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +75 -11
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +123 -4
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +37 -3
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist +0 -0
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
- package/lib/module/classes/CactusSTT.js +15 -0
- package/lib/module/classes/CactusSTT.js.map +1 -1
- package/lib/module/native/Cactus.js +18 -0
- package/lib/module/native/Cactus.js.map +1 -1
- package/lib/typescript/src/classes/CactusSTT.d.ts +2 -1
- package/lib/typescript/src/classes/CactusSTT.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +1 -1
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/native/Cactus.d.ts +2 -1
- package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
- package/lib/typescript/src/specs/Cactus.nitro.d.ts +1 -0
- package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
- package/lib/typescript/src/types/CactusSTT.d.ts +11 -0
- package/lib/typescript/src/types/CactusSTT.d.ts.map +1 -1
- package/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +1 -0
- package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +1 -0
- package/package.json +1 -1
- package/src/classes/CactusSTT.ts +20 -0
- package/src/index.tsx +3 -0
- package/src/native/Cactus.ts +32 -0
- package/src/specs/Cactus.nitro.ts +5 -0
- package/src/types/CactusSTT.ts +14 -0
|
@@ -56,6 +56,12 @@ struct Config {
|
|
|
56
56
|
uint32_t num_shared_experts = 0;
|
|
57
57
|
uint32_t num_top_experts = 0;
|
|
58
58
|
uint32_t moe_every_n_layers = 0;
|
|
59
|
+
uint32_t moe_intermediate_dim = 0;
|
|
60
|
+
uint32_t num_dense_layers = 0;
|
|
61
|
+
uint32_t num_experts_per_tok = 0;
|
|
62
|
+
bool norm_topk_prob = false;
|
|
63
|
+
bool use_expert_bias = false;
|
|
64
|
+
float routed_scaling_factor = 1.0f;
|
|
59
65
|
bool tie_word_embeddings = true;
|
|
60
66
|
|
|
61
67
|
uint32_t vision_hidden_dim = 0;
|
|
@@ -93,8 +99,22 @@ struct Config {
|
|
|
93
99
|
uint32_t num_encoder_layers = 0;
|
|
94
100
|
uint32_t num_decoder_layers = 0;
|
|
95
101
|
float partial_rotary_factor = 0.0f;
|
|
96
|
-
|
|
97
|
-
|
|
102
|
+
uint32_t pad_token_id = 0;
|
|
103
|
+
uint32_t conv_kernel_size = 0;
|
|
104
|
+
uint32_t subsampling_conv_kernel_size = 0;
|
|
105
|
+
uint32_t subsampling_conv_stride = 0;
|
|
106
|
+
uint32_t subsampling_conv_channels = 0;
|
|
107
|
+
uint32_t subsampling_factor = 0;
|
|
108
|
+
uint32_t num_mel_bins = 80;
|
|
109
|
+
std::string encoder_hidden_act = "silu";
|
|
110
|
+
uint32_t predictor_hidden_dim = 0;
|
|
111
|
+
uint32_t predictor_num_layers = 0;
|
|
112
|
+
uint32_t tdt_joint_dim = 0;
|
|
113
|
+
uint32_t tdt_num_durations = 0;
|
|
114
|
+
uint32_t tdt_blank_id = 0;
|
|
115
|
+
std::vector<uint32_t> tdt_durations;
|
|
116
|
+
|
|
117
|
+
enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
|
|
98
118
|
ModelType model_type = ModelType::QWEN;
|
|
99
119
|
|
|
100
120
|
enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
|
|
@@ -168,7 +188,7 @@ public:
|
|
|
168
188
|
uint32_t get_global_img_token_id() const { return global_img_token_id_; }
|
|
169
189
|
|
|
170
190
|
protected:
|
|
171
|
-
enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER};
|
|
191
|
+
enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
|
|
172
192
|
ModelType model_type_ = ModelType::UNKNOWN;
|
|
173
193
|
enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
|
|
174
194
|
ModelVariant model_variant_ = ModelVariant::DEFAULT;
|
|
@@ -366,7 +386,6 @@ struct KVCache {
|
|
|
366
386
|
size_t num_tokens, size_t kv_heads, size_t head_dim);
|
|
367
387
|
|
|
368
388
|
bool is_empty() const { return current_seq_len == 0; }
|
|
369
|
-
bool is_int8() const { return precision == Precision::INT8; }
|
|
370
389
|
void* get_key_ptr(size_t layer);
|
|
371
390
|
void* get_value_ptr(size_t layer);
|
|
372
391
|
|
|
@@ -684,6 +703,8 @@ public:
|
|
|
684
703
|
float reference = 1.0f;
|
|
685
704
|
float min_value = 1e-10f;
|
|
686
705
|
bool remove_dc_offset = false;
|
|
706
|
+
float preemphasis = 0.0f;
|
|
707
|
+
bool hann_periodic = true;
|
|
687
708
|
};
|
|
688
709
|
|
|
689
710
|
AudioProcessor();
|
|
@@ -696,6 +717,11 @@ public:
|
|
|
696
717
|
const std::vector<float>& waveform,
|
|
697
718
|
const SpectrogramConfig& config);
|
|
698
719
|
|
|
720
|
+
static std::vector<float> compute_irfft(
|
|
721
|
+
const std::vector<float>& complex_input,
|
|
722
|
+
size_t n,
|
|
723
|
+
const char* norm = "backward");
|
|
724
|
+
|
|
699
725
|
const std::vector<float>& get_mel_filters() const { return mel_filters_; }
|
|
700
726
|
|
|
701
727
|
size_t get_num_mel_filters() const { return num_mel_filters_; }
|
|
@@ -721,6 +747,8 @@ namespace index {
|
|
|
721
747
|
struct QueryResult {
|
|
722
748
|
int doc_id;
|
|
723
749
|
float score;
|
|
750
|
+
|
|
751
|
+
QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {}
|
|
724
752
|
};
|
|
725
753
|
|
|
726
754
|
struct QueryOptions {
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include <unordered_map>
|
|
7
7
|
#include <unordered_set>
|
|
8
8
|
#include <functional>
|
|
9
|
+
#include <cassert>
|
|
9
10
|
#include <cstring>
|
|
10
11
|
#include <stdexcept>
|
|
11
12
|
#include <string>
|
|
@@ -109,23 +110,33 @@ enum class ComputeBackend {
|
|
|
109
110
|
NPU
|
|
110
111
|
};
|
|
111
112
|
|
|
113
|
+
enum class Activation {
|
|
114
|
+
SILU,
|
|
115
|
+
GELU,
|
|
116
|
+
GELU_ERF,
|
|
117
|
+
RELU,
|
|
118
|
+
SIGMOID,
|
|
119
|
+
TANH
|
|
120
|
+
};
|
|
121
|
+
|
|
112
122
|
enum class OpType {
|
|
113
123
|
INPUT, PRECISION_CAST,
|
|
114
124
|
ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
|
|
115
125
|
MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
|
|
116
126
|
BILINEAR_INTERPOLATION,
|
|
117
127
|
SUM, MEAN, VARIANCE, MIN, MAX,
|
|
118
|
-
RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D,
|
|
119
|
-
SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN,
|
|
128
|
+
RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
|
|
129
|
+
SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
|
|
120
130
|
RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
|
|
121
131
|
SAMPLE, CONCAT,
|
|
122
132
|
SCATTER_TOPK,
|
|
123
133
|
TOPK, LAYERNORM, GROUPNORM,
|
|
134
|
+
MOE_LAYER,
|
|
124
135
|
INDEX,
|
|
125
136
|
PERSISTENT,
|
|
126
137
|
QUANTIZE_ACTIVATIONS,
|
|
127
138
|
LSTM_CELL,
|
|
128
|
-
|
|
139
|
+
STFT
|
|
129
140
|
};
|
|
130
141
|
|
|
131
142
|
struct PrecisionTraits {
|
|
@@ -141,11 +152,20 @@ struct PrecisionTraits {
|
|
|
141
152
|
|
|
142
153
|
static constexpr size_t packed_size_of(Precision prec, size_t count) {
|
|
143
154
|
switch (prec) {
|
|
144
|
-
case Precision::INT4: return (count + 1) / 2;
|
|
155
|
+
case Precision::INT4: return (count + 1) / 2;
|
|
145
156
|
default: return count * size_of(prec);
|
|
146
157
|
}
|
|
147
158
|
}
|
|
148
159
|
|
|
160
|
+
static size_t byte_offset_of(Precision prec, size_t element_offset) {
|
|
161
|
+
switch (prec) {
|
|
162
|
+
case Precision::INT4:
|
|
163
|
+
assert(element_offset % 32 == 0 && "INT4 byte offset must be group-aligned (multiple of 32)");
|
|
164
|
+
return element_offset / 2;
|
|
165
|
+
default: return element_offset * size_of(prec);
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
149
169
|
static constexpr bool is_integer(Precision prec) {
|
|
150
170
|
switch (prec) {
|
|
151
171
|
case Precision::INT8: return true;
|
|
@@ -181,7 +201,6 @@ struct TensorConfig {
|
|
|
181
201
|
Precision compute_precision = Precision::INT8;
|
|
182
202
|
Precision output_precision = Precision::INT8;
|
|
183
203
|
bool auto_mixed_precision = false;
|
|
184
|
-
bool enable_int4_packing = true;
|
|
185
204
|
|
|
186
205
|
static TensorConfig& global();
|
|
187
206
|
};
|
|
@@ -243,6 +262,10 @@ struct BufferDesc {
|
|
|
243
262
|
return precision == Precision::INT8 && group_size > 0;
|
|
244
263
|
}
|
|
245
264
|
|
|
265
|
+
bool is_grouped_int4() const {
|
|
266
|
+
return precision == Precision::INT4 && group_size > 0;
|
|
267
|
+
}
|
|
268
|
+
|
|
246
269
|
void set_grouped_scales(size_t gs, size_t ng, void* scales_ptr) {
|
|
247
270
|
group_size = gs;
|
|
248
271
|
num_groups = ng;
|
|
@@ -291,6 +314,7 @@ struct OpParams {
|
|
|
291
314
|
size_t slice_length = 0;
|
|
292
315
|
size_t window_size = 0;
|
|
293
316
|
bool is_causal = true;
|
|
317
|
+
bool attention_mask_is_additive = false;
|
|
294
318
|
std::vector<size_t> new_shape;
|
|
295
319
|
std::vector<size_t> permutation;
|
|
296
320
|
Precision output_precision = Precision::INT8;
|
|
@@ -309,6 +333,11 @@ struct OpParams {
|
|
|
309
333
|
size_t num_groups = 0;
|
|
310
334
|
size_t dst_height = 0;
|
|
311
335
|
size_t dst_width = 0;
|
|
336
|
+
bool normalize_routing = false;
|
|
337
|
+
size_t num_experts = 0;
|
|
338
|
+
size_t num_experts_per_tok = 0;
|
|
339
|
+
bool moe_gated = true;
|
|
340
|
+
Activation activation = Activation::SILU;
|
|
312
341
|
|
|
313
342
|
std::vector<float> bias_values;
|
|
314
343
|
std::vector<uint32_t> bias_indices;
|
|
@@ -356,7 +385,6 @@ void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<Graph
|
|
|
356
385
|
void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
|
|
357
386
|
|
|
358
387
|
void shrink_thread_local_buffers();
|
|
359
|
-
|
|
360
388
|
class BufferPool {
|
|
361
389
|
public:
|
|
362
390
|
BufferPool() = default;
|
|
@@ -418,6 +446,7 @@ public:
|
|
|
418
446
|
size_t scalar_sqrt(size_t input);
|
|
419
447
|
size_t scalar_cos(size_t input);
|
|
420
448
|
size_t scalar_sin(size_t input);
|
|
449
|
+
size_t scalar_log(size_t input);
|
|
421
450
|
|
|
422
451
|
size_t relu(size_t input);
|
|
423
452
|
size_t silu(size_t input);
|
|
@@ -425,6 +454,7 @@ public:
|
|
|
425
454
|
size_t gelu_erf(size_t input);
|
|
426
455
|
size_t sigmoid(size_t input);
|
|
427
456
|
size_t tanh(size_t input);
|
|
457
|
+
size_t glu(size_t input, int axis = -1);
|
|
428
458
|
|
|
429
459
|
size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
|
|
430
460
|
size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
|
|
@@ -455,7 +485,30 @@ public:
|
|
|
455
485
|
size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f);
|
|
456
486
|
size_t layernorm(size_t input, size_t weight, float epsilon = 1e-5f); // No bias version
|
|
457
487
|
size_t groupnorm(size_t input, size_t weight, size_t bias, size_t num_groups = 32, float epsilon = 1e-5f);
|
|
488
|
+
size_t batchnorm(size_t input, size_t weight, size_t bias, size_t running_mean, size_t running_var, int axis = 1, float epsilon = 1e-5f);
|
|
458
489
|
size_t topk(size_t input, size_t k);
|
|
490
|
+
size_t moe_layer(size_t hidden,
|
|
491
|
+
size_t routing_probs,
|
|
492
|
+
size_t topk_indices,
|
|
493
|
+
const std::vector<size_t>& w1_weights,
|
|
494
|
+
const std::vector<size_t>& w3_weights,
|
|
495
|
+
const std::vector<size_t>& w2_weights,
|
|
496
|
+
size_t num_experts,
|
|
497
|
+
size_t num_experts_per_tok,
|
|
498
|
+
bool normalize_routing,
|
|
499
|
+
float epsilon,
|
|
500
|
+
float routed_scaling_factor);
|
|
501
|
+
size_t moe_layer(size_t hidden,
|
|
502
|
+
size_t routing_probs,
|
|
503
|
+
size_t topk_indices,
|
|
504
|
+
const std::vector<size_t>& w1_weights,
|
|
505
|
+
const std::vector<size_t>& w2_weights,
|
|
506
|
+
size_t num_experts,
|
|
507
|
+
size_t num_experts_per_tok,
|
|
508
|
+
bool normalize_routing,
|
|
509
|
+
float epsilon,
|
|
510
|
+
float routed_scaling_factor,
|
|
511
|
+
Activation activation);
|
|
459
512
|
size_t rms_norm(size_t input, size_t weight, float epsilon = 1e-5f);
|
|
460
513
|
size_t rope(size_t input, float theta, size_t position_offset = 0, ComputeBackend backend = ComputeBackend::CPU);
|
|
461
514
|
size_t rope_gptj(size_t input, float theta, size_t position_offset = 0, size_t rot_dim = 0, ComputeBackend backend = ComputeBackend::CPU);
|
|
@@ -463,6 +516,10 @@ public:
|
|
|
463
516
|
size_t attention(size_t query, size_t key, size_t value, float scale, bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU);
|
|
464
517
|
size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, ComputeBackend backend = ComputeBackend::CPU);
|
|
465
518
|
size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
|
|
519
|
+
size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
|
|
520
|
+
bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
|
|
521
|
+
bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
|
|
522
|
+
size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
|
|
466
523
|
|
|
467
524
|
size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
|
|
468
525
|
const int8_t* cached_keys, const int8_t* cached_values,
|
|
@@ -474,9 +531,19 @@ public:
|
|
|
474
531
|
size_t conv1d_k7s3(size_t input, size_t weight, size_t bias);
|
|
475
532
|
size_t conv1d(size_t input, size_t weight, size_t stride);
|
|
476
533
|
size_t conv1d(size_t input, size_t weight, size_t bias, size_t stride);
|
|
534
|
+
size_t conv1d_same_depthwise_k9(size_t input, size_t weight);
|
|
535
|
+
size_t conv1d_same_depthwise_k9(size_t input, size_t weight, size_t bias);
|
|
536
|
+
size_t conv1d_pointwise(size_t input, size_t weight);
|
|
537
|
+
size_t conv1d_pointwise(size_t input, size_t weight, size_t bias);
|
|
538
|
+
size_t conv2d_k3s2p1(size_t input, size_t weight);
|
|
539
|
+
size_t conv2d_k3s2p1(size_t input, size_t weight, size_t bias);
|
|
540
|
+
size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight);
|
|
541
|
+
size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight, size_t bias);
|
|
542
|
+
size_t conv2d_pointwise_1x1(size_t input, size_t weight);
|
|
543
|
+
size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
|
|
477
544
|
|
|
478
545
|
size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
|
|
479
|
-
size_t
|
|
546
|
+
size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
|
|
480
547
|
|
|
481
548
|
size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
|
|
482
549
|
const std::unordered_map<uint32_t, float>& logit_bias = {});
|
|
@@ -581,12 +648,9 @@ namespace GraphFile {
|
|
|
581
648
|
bool is_interleaved_ = false;
|
|
582
649
|
size_t original_N_ = 0;
|
|
583
650
|
|
|
584
|
-
std::unique_ptr<int8_t[]> unpacked_data_;
|
|
585
|
-
|
|
586
651
|
void parse_header();
|
|
587
652
|
void apply_madvise_hints();
|
|
588
|
-
void unpack_int4_data();
|
|
589
653
|
};
|
|
590
654
|
}
|
|
591
655
|
|
|
592
|
-
#endif
|
|
656
|
+
#endif
|
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
#include <cstddef>
|
|
5
5
|
#include <arm_neon.h>
|
|
6
6
|
|
|
7
|
+
enum class Precision;
|
|
8
|
+
|
|
7
9
|
enum class ScalarOpType {
|
|
8
10
|
ADD,
|
|
9
11
|
SUBTRACT,
|
|
@@ -12,7 +14,8 @@ enum class ScalarOpType {
|
|
|
12
14
|
EXP,
|
|
13
15
|
SQRT,
|
|
14
16
|
COS,
|
|
15
|
-
SIN
|
|
17
|
+
SIN,
|
|
18
|
+
LOG
|
|
16
19
|
};
|
|
17
20
|
|
|
18
21
|
constexpr size_t KV_QUANT_GROUP_SIZE = 32;
|
|
@@ -21,6 +24,7 @@ void cactus_add_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num
|
|
|
21
24
|
void cactus_add_f16_clipped(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
|
|
22
25
|
void cactus_subtract_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
|
|
23
26
|
void cactus_multiply_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
|
|
27
|
+
void cactus_add_scaled_f16(const __fp16* base, const __fp16* src, __fp16* output, size_t num_elements, float scale);
|
|
24
28
|
void cactus_divide_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
|
|
25
29
|
|
|
26
30
|
void cactus_add_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* output,
|
|
@@ -50,6 +54,23 @@ void cactus_matmul_int8(const int8_t* A, const float* A_scales,
|
|
|
50
54
|
const int8_t* B, const __fp16* B_scales,
|
|
51
55
|
__fp16* C, size_t M, size_t K, size_t N, size_t group_size);
|
|
52
56
|
|
|
57
|
+
void cactus_gemv_int4(const int8_t* A, float A_scale,
|
|
58
|
+
const int8_t* B_packed, const __fp16* B_scales,
|
|
59
|
+
__fp16* C, size_t K, size_t N, size_t group_size);
|
|
60
|
+
|
|
61
|
+
void cactus_gemm_int4(const int8_t* A, const float* A_scales,
|
|
62
|
+
const int8_t* B_packed, const __fp16* B_scales,
|
|
63
|
+
__fp16* C, size_t M, size_t K, size_t N, size_t group_size);
|
|
64
|
+
|
|
65
|
+
void cactus_matmul_int4(const int8_t* A, const float* A_scales,
|
|
66
|
+
const int8_t* B_packed, const __fp16* B_scales,
|
|
67
|
+
__fp16* C, size_t M, size_t K, size_t N, size_t group_size);
|
|
68
|
+
|
|
69
|
+
void cactus_matmul_integer(Precision precision,
|
|
70
|
+
const int8_t* A, const float* A_scales,
|
|
71
|
+
const int8_t* B, const __fp16* B_scales,
|
|
72
|
+
__fp16* C, size_t M, size_t K, size_t N, size_t group_size);
|
|
73
|
+
|
|
53
74
|
void cactus_matmul_f16(const __fp16* a, const __fp16* b_transposed, __fp16* c,
|
|
54
75
|
size_t M, size_t K, size_t N);
|
|
55
76
|
|
|
@@ -97,10 +118,52 @@ void cactus_sigmoid_f16(const __fp16* input, __fp16* output, size_t num_elements
|
|
|
97
118
|
|
|
98
119
|
void cactus_tanh_f16(const __fp16* input, __fp16* output, size_t num_elements);
|
|
99
120
|
|
|
121
|
+
void cactus_glu_f16(
|
|
122
|
+
const __fp16* input,
|
|
123
|
+
__fp16* output,
|
|
124
|
+
size_t outer_size,
|
|
125
|
+
size_t split_size,
|
|
126
|
+
size_t inner_size
|
|
127
|
+
);
|
|
128
|
+
|
|
129
|
+
void cactus_glu_f32(
|
|
130
|
+
const float* input,
|
|
131
|
+
float* output,
|
|
132
|
+
size_t outer_size,
|
|
133
|
+
size_t split_size,
|
|
134
|
+
size_t inner_size
|
|
135
|
+
);
|
|
136
|
+
|
|
137
|
+
void cactus_batchnorm_f16(
|
|
138
|
+
const __fp16* input,
|
|
139
|
+
const float* weight,
|
|
140
|
+
const float* bias,
|
|
141
|
+
const float* running_mean,
|
|
142
|
+
const float* running_var,
|
|
143
|
+
__fp16* output,
|
|
144
|
+
size_t outer_size,
|
|
145
|
+
size_t channels,
|
|
146
|
+
size_t inner_size,
|
|
147
|
+
float epsilon
|
|
148
|
+
);
|
|
149
|
+
|
|
150
|
+
void cactus_batchnorm_f32(
|
|
151
|
+
const float* input,
|
|
152
|
+
const float* weight,
|
|
153
|
+
const float* bias,
|
|
154
|
+
const float* running_mean,
|
|
155
|
+
const float* running_var,
|
|
156
|
+
float* output,
|
|
157
|
+
size_t outer_size,
|
|
158
|
+
size_t channels,
|
|
159
|
+
size_t inner_size,
|
|
160
|
+
float epsilon
|
|
161
|
+
);
|
|
162
|
+
|
|
100
163
|
void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
|
|
101
164
|
size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
|
|
102
165
|
size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
|
|
103
|
-
bool is_causal = true);
|
|
166
|
+
bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
|
|
104
167
|
|
|
105
168
|
void cactus_attention_hybrid_int8_fp16(
|
|
106
169
|
const __fp16* queries,
|
|
@@ -150,7 +213,7 @@ void cactus_conv1d_f16(
|
|
|
150
213
|
size_t stride
|
|
151
214
|
);
|
|
152
215
|
|
|
153
|
-
void
|
|
216
|
+
void cactus_stft_f16(
|
|
154
217
|
const __fp16* input,
|
|
155
218
|
const __fp16* weight,
|
|
156
219
|
__fp16* output,
|
|
@@ -171,6 +234,62 @@ void cactus_conv1d_f16_k7s3_oc8(
|
|
|
171
234
|
size_t C_out
|
|
172
235
|
);
|
|
173
236
|
|
|
237
|
+
void cactus_conv1d_same_depthwise_f16_k9(
|
|
238
|
+
const __fp16* input,
|
|
239
|
+
const __fp16* weight,
|
|
240
|
+
const __fp16* bias,
|
|
241
|
+
__fp16* output,
|
|
242
|
+
size_t N,
|
|
243
|
+
size_t L,
|
|
244
|
+
size_t C
|
|
245
|
+
);
|
|
246
|
+
|
|
247
|
+
void cactus_conv2d_f16_k3s2p1_nchw(
|
|
248
|
+
const __fp16* input,
|
|
249
|
+
const __fp16* weight,
|
|
250
|
+
const __fp16* bias,
|
|
251
|
+
__fp16* output,
|
|
252
|
+
size_t N,
|
|
253
|
+
size_t C_in,
|
|
254
|
+
size_t H,
|
|
255
|
+
size_t W,
|
|
256
|
+
size_t C_out
|
|
257
|
+
);
|
|
258
|
+
|
|
259
|
+
void cactus_conv2d_depthwise_f16_k3s2p1_nchw(
|
|
260
|
+
const __fp16* input,
|
|
261
|
+
const __fp16* weight,
|
|
262
|
+
const __fp16* bias,
|
|
263
|
+
__fp16* output,
|
|
264
|
+
size_t N,
|
|
265
|
+
size_t C,
|
|
266
|
+
size_t H,
|
|
267
|
+
size_t W
|
|
268
|
+
);
|
|
269
|
+
|
|
270
|
+
void cactus_conv2d_pointwise_f16_1x1_nchw_gemm(
|
|
271
|
+
const __fp16* input,
|
|
272
|
+
const __fp16* weight,
|
|
273
|
+
const __fp16* bias,
|
|
274
|
+
__fp16* output,
|
|
275
|
+
size_t N,
|
|
276
|
+
size_t C_in,
|
|
277
|
+
size_t H,
|
|
278
|
+
size_t W,
|
|
279
|
+
size_t C_out
|
|
280
|
+
);
|
|
281
|
+
|
|
282
|
+
void cactus_conv1d_pointwise_f16_gemm(
|
|
283
|
+
const __fp16* input,
|
|
284
|
+
const __fp16* weight,
|
|
285
|
+
const __fp16* bias,
|
|
286
|
+
__fp16* output,
|
|
287
|
+
size_t N,
|
|
288
|
+
size_t L,
|
|
289
|
+
size_t C_in,
|
|
290
|
+
size_t C_out
|
|
291
|
+
);
|
|
292
|
+
|
|
174
293
|
void cactus_bilinear_interpolation_f16(const __fp16* input, __fp16* output, size_t src_height, size_t src_width, size_t embed_dim,
|
|
175
294
|
size_t dst_height, size_t dst_width);
|
|
176
295
|
|
|
@@ -224,4 +343,4 @@ void cactus_lstm_cell_f16(
|
|
|
224
343
|
size_t hidden_size
|
|
225
344
|
);
|
|
226
345
|
|
|
227
|
-
#endif
|
|
346
|
+
#endif
|
|
@@ -44,6 +44,34 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
|
|
|
44
44
|
#endif
|
|
45
45
|
}
|
|
46
46
|
|
|
47
|
+
inline bool cpu_has_sme2() {
|
|
48
|
+
#if defined(__aarch64__)
|
|
49
|
+
static std::once_flag once;
|
|
50
|
+
static bool has = false;
|
|
51
|
+
|
|
52
|
+
std::call_once(once, []() {
|
|
53
|
+
|
|
54
|
+
#if defined(__APPLE__)
|
|
55
|
+
int ret = 0;
|
|
56
|
+
size_t size = sizeof(ret);
|
|
57
|
+
if (sysctlbyname("hw.optional.arm.FEAT_SME2", &ret, &size, nullptr, 0) == 0) {
|
|
58
|
+
has = ret == 1;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
#elif defined(__ANDROID__)
|
|
62
|
+
unsigned long hwcap2 = getauxval(AT_HWCAP2);
|
|
63
|
+
#ifdef HWCAP2_SME2
|
|
64
|
+
has = (hwcap2 & HWCAP2_SME2) != 0;
|
|
65
|
+
#endif
|
|
66
|
+
|
|
67
|
+
#endif
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
return has;
|
|
71
|
+
#else
|
|
72
|
+
return false;
|
|
73
|
+
#endif
|
|
74
|
+
}
|
|
47
75
|
|
|
48
76
|
inline float32x4_t fast_exp_f32x4(float32x4_t x) {
|
|
49
77
|
const float32x4_t log2e = vdupq_n_f32(1.4426950408889634f);
|
|
@@ -102,6 +130,12 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
|
|
|
102
130
|
return result;
|
|
103
131
|
}
|
|
104
132
|
|
|
133
|
+
inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {
|
|
134
|
+
int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr));
|
|
135
|
+
high_decoded = vshrq_n_s8(packed, 4);
|
|
136
|
+
low_decoded = vshrq_n_s8(vshlq_n_s8(packed, 4), 4);
|
|
137
|
+
}
|
|
138
|
+
|
|
105
139
|
namespace CactusThreading {
|
|
106
140
|
|
|
107
141
|
class ThreadPool {
|
|
@@ -297,7 +331,7 @@ namespace CactusThreading {
|
|
|
297
331
|
}
|
|
298
332
|
static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
|
|
299
333
|
if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
|
|
300
|
-
return std::min(pool_size, static_cast<size_t>(
|
|
334
|
+
return std::min(pool_size, static_cast<size_t>(3));
|
|
301
335
|
}
|
|
302
336
|
#else
|
|
303
337
|
static constexpr size_t GEMV_MIN_N_BLOCKS = 256;
|
|
@@ -308,7 +342,7 @@ namespace CactusThreading {
|
|
|
308
342
|
static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
|
|
309
343
|
if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
|
|
310
344
|
if (N_blocks < 512) return std::min(pool_size, static_cast<size_t>(2));
|
|
311
|
-
return std::min(pool_size, static_cast<size_t>(
|
|
345
|
+
return std::min(pool_size, static_cast<size_t>(5));
|
|
312
346
|
}
|
|
313
347
|
#endif
|
|
314
348
|
};
|
|
@@ -465,4 +499,4 @@ namespace CactusThreading {
|
|
|
465
499
|
}
|
|
466
500
|
|
|
467
501
|
|
|
468
|
-
#endif // KERNEL_UTILS_H
|
|
502
|
+
#endif // KERNEL_UTILS_H
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#ifndef CACTUS_CLOUD_H
|
|
2
|
+
#define CACTUS_CLOUD_H
|
|
3
|
+
|
|
4
|
+
#include "cactus_utils.h"
|
|
5
|
+
#include <string>
|
|
6
|
+
#include <vector>
|
|
7
|
+
|
|
8
|
+
namespace cactus {
|
|
9
|
+
namespace ffi {
|
|
10
|
+
|
|
11
|
+
struct CloudResponse {
|
|
12
|
+
std::string transcript;
|
|
13
|
+
std::string api_key_hash;
|
|
14
|
+
bool used_cloud = false;
|
|
15
|
+
std::string error;
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
struct CloudCompletionRequest {
|
|
19
|
+
std::vector<cactus::engine::ChatMessage> messages;
|
|
20
|
+
std::vector<ToolFunction> tools;
|
|
21
|
+
std::string local_output;
|
|
22
|
+
std::vector<std::string> local_function_calls;
|
|
23
|
+
bool has_images = false;
|
|
24
|
+
std::string cloud_key;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
struct CloudCompletionResult {
|
|
28
|
+
bool ok = false;
|
|
29
|
+
bool used_cloud = false;
|
|
30
|
+
std::string response;
|
|
31
|
+
std::vector<std::string> function_calls;
|
|
32
|
+
std::string error;
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
std::string cloud_base64_encode(const uint8_t* data, size_t len);
|
|
36
|
+
std::vector<uint8_t> cloud_build_wav(const uint8_t* pcm, size_t pcm_bytes);
|
|
37
|
+
std::string resolve_cloud_api_key(const char* cloud_key_param);
|
|
38
|
+
CloudResponse cloud_transcribe_request(const std::string& audio_b64,
|
|
39
|
+
const std::string& fallback_text,
|
|
40
|
+
long timeout_seconds = 15L,
|
|
41
|
+
const char* cloud_key = nullptr);
|
|
42
|
+
CloudCompletionResult cloud_complete_request(const CloudCompletionRequest& request,
|
|
43
|
+
long timeout_ms);
|
|
44
|
+
|
|
45
|
+
} // namespace ffi
|
|
46
|
+
} // namespace cactus
|
|
47
|
+
|
|
48
|
+
#endif // CACTUS_CLOUD_H
|
|
@@ -76,6 +76,16 @@ CACTUS_FFI_EXPORT int cactus_transcribe(
|
|
|
76
76
|
size_t pcm_buffer_size
|
|
77
77
|
);
|
|
78
78
|
|
|
79
|
+
CACTUS_FFI_EXPORT int cactus_detect_language(
|
|
80
|
+
cactus_model_t model,
|
|
81
|
+
const char* audio_file_path, // NULL if using pcm_buffer
|
|
82
|
+
char* response_buffer,
|
|
83
|
+
size_t buffer_size,
|
|
84
|
+
const char* options_json, // optional
|
|
85
|
+
const uint8_t* pcm_buffer, // NULL if using audio_file_path
|
|
86
|
+
size_t pcm_buffer_size
|
|
87
|
+
);
|
|
88
|
+
|
|
79
89
|
CACTUS_FFI_EXPORT cactus_stream_transcribe_t cactus_stream_transcribe_start(
|
|
80
90
|
cactus_model_t model,
|
|
81
91
|
const char* options_json // optional
|
|
@@ -189,7 +199,10 @@ CACTUS_FFI_EXPORT void cactus_index_destroy(cactus_index_t index);
|
|
|
189
199
|
|
|
190
200
|
CACTUS_FFI_EXPORT const char* cactus_get_last_error(void);
|
|
191
201
|
|
|
192
|
-
CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location);
|
|
202
|
+
CACTUS_FFI_EXPORT void cactus_set_telemetry_environment(const char* framework, const char* cache_location, const char* version);
|
|
203
|
+
CACTUS_FFI_EXPORT void cactus_set_app_id(const char* app_id);
|
|
204
|
+
CACTUS_FFI_EXPORT void cactus_telemetry_flush(void);
|
|
205
|
+
CACTUS_FFI_EXPORT void cactus_telemetry_shutdown(void);
|
|
193
206
|
|
|
194
207
|
#ifdef __cplusplus
|
|
195
208
|
}
|