cactus-react-native 1.7.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
  2. package/cpp/HybridCactus.cpp +49 -1
  3. package/cpp/HybridCactus.hpp +5 -0
  4. package/cpp/cactus_ffi.h +14 -1
  5. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h +48 -0
  6. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +14 -1
  7. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +304 -66
  8. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +32 -4
  9. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +75 -11
  10. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +123 -4
  11. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +37 -3
  12. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist +0 -0
  13. package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
  14. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h +48 -0
  15. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +14 -1
  16. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +304 -66
  17. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +32 -4
  18. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +75 -11
  19. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +123 -4
  20. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +37 -3
  21. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist +0 -0
  22. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
  23. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
  24. package/lib/module/classes/CactusSTT.js +15 -0
  25. package/lib/module/classes/CactusSTT.js.map +1 -1
  26. package/lib/module/native/Cactus.js +18 -0
  27. package/lib/module/native/Cactus.js.map +1 -1
  28. package/lib/typescript/src/classes/CactusSTT.d.ts +2 -1
  29. package/lib/typescript/src/classes/CactusSTT.d.ts.map +1 -1
  30. package/lib/typescript/src/index.d.ts +1 -1
  31. package/lib/typescript/src/index.d.ts.map +1 -1
  32. package/lib/typescript/src/native/Cactus.d.ts +2 -1
  33. package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
  34. package/lib/typescript/src/specs/Cactus.nitro.d.ts +1 -0
  35. package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
  36. package/lib/typescript/src/types/CactusSTT.d.ts +11 -0
  37. package/lib/typescript/src/types/CactusSTT.d.ts.map +1 -1
  38. package/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +1 -0
  39. package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +1 -0
  40. package/package.json +1 -1
  41. package/src/classes/CactusSTT.ts +20 -0
  42. package/src/index.tsx +3 -0
  43. package/src/native/Cactus.ts +32 -0
  44. package/src/specs/Cactus.nitro.ts +5 -0
  45. package/src/types/CactusSTT.ts +14 -0
@@ -2,6 +2,7 @@
2
2
  #define CACTUS_UTILS_H
3
3
 
4
4
  #include "../engine/engine.h"
5
+ #include "../models/model.h"
5
6
  #include <string>
6
7
  #include <vector>
7
8
  #include <unordered_map>
@@ -12,6 +13,9 @@
12
13
  #include <iostream>
13
14
  #include <filesystem>
14
15
  #include <cctype>
16
+ #include <algorithm>
17
+ #include <cmath>
18
+ #include <limits>
15
19
  #include <memory>
16
20
  #include <atomic>
17
21
  #include <mutex>
@@ -101,12 +105,92 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_whisper_spectrogram
101
105
  return cfg;
102
106
  }
103
107
 
108
+ inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogram_config() {
109
+ cactus::engine::AudioProcessor::SpectrogramConfig cfg{};
110
+ cfg.n_fft = 512;
111
+ cfg.frame_length = 400;
112
+ cfg.hop_length = 160;
113
+ cfg.power = 2.0f;
114
+ cfg.center = true;
115
+ cfg.pad_mode = "constant";
116
+ cfg.onesided = true;
117
+ cfg.dither = 0.0f;
118
+ cfg.mel_floor = 5.960464477539063e-08f; // 2^-24 guard value used by HF Parakeet.
119
+ cfg.log_mel = "log";
120
+ cfg.reference = 1.0f;
121
+ cfg.min_value = 1e-10f;
122
+ cfg.remove_dc_offset = false;
123
+ cfg.hann_periodic = false;
124
+ return cfg;
125
+ }
126
+
127
// Applies a first-order pre-emphasis filter in place:
//   y[i] = x[i] - coefficient * x[i - 1]   for i >= 1,   y[0] = x[0].
// Waveforms shorter than two samples, or a coefficient of exactly zero,
// leave the buffer untouched.
inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
    const bool nothing_to_do = waveform.size() < 2 || coefficient == 0.0f;
    if (nothing_to_do) {
        return;
    }

    // Walk forward while remembering the *unfiltered* previous sample, so each
    // output only depends on original input values.
    float previous = waveform.front();
    for (auto it = waveform.begin() + 1; it != waveform.end(); ++it) {
        const float current = *it;
        *it = current - coefficient * previous;
        previous = current;
    }
}
135
+
136
// Normalises each mel row to zero mean and unit (sample) standard deviation,
// in place. `mel` is indexed as mel[m * num_frames + t], i.e. one contiguous
// run of frames per mel bin. The variance uses an (N - 1) denominator, clamped
// to 1, and `epsilon` is added under the square root.
// Buffers that are empty or whose size is not a multiple of `num_mels` are
// left unchanged.
inline void normalize_parakeet_log_mel(std::vector<float>& mel, size_t num_mels, float epsilon = 1e-5f) {
    const bool malformed = mel.empty() || num_mels == 0 || (mel.size() % num_mels) != 0;
    if (malformed) {
        return;
    }

    // A non-empty buffer that divides evenly always has at least one frame per row.
    const size_t num_frames = mel.size() / num_mels;
    const float frame_count = static_cast<float>(num_frames);
    const float degrees_of_freedom = static_cast<float>(std::max<size_t>(1, num_frames - 1));

    float* row = mel.data();
    for (size_t m = 0; m < num_mels; ++m, row += num_frames) {
        float* const row_end = row + num_frames;

        float sum = 0.0f;
        for (const float* p = row; p != row_end; ++p) {
            sum += *p;
        }
        const float mean = sum / frame_count;

        float squared_error = 0.0f;
        for (const float* p = row; p != row_end; ++p) {
            const float centered = *p - mean;
            squared_error += centered * centered;
        }

        const float inv_std = 1.0f / std::sqrt((squared_error / degrees_of_freedom) + epsilon);
        for (float* p = row; p != row_end; ++p) {
            *p = (*p - mean) * inv_std;
        }
    }
}
165
+
166
// Keeps only the first `valid_frames` frames of every mel row. `mel` is laid
// out as one contiguous run of frames per mel bin (mel[m * total_frames + t]).
// The buffer is left unchanged when it is empty, when its size is not a
// multiple of `num_mels`, when `valid_frames` is zero, or when there is
// nothing to cut (valid_frames >= total frames).
inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t valid_frames) {
    if (mel.empty() || num_mels == 0) {
        return;
    }
    if ((mel.size() % num_mels) != 0) {
        return;
    }

    const size_t total_frames = mel.size() / num_mels;
    const bool needs_trim = valid_frames != 0 && valid_frames < total_frames;
    if (!needs_trim) {
        return;
    }

    std::vector<float> kept;
    kept.reserve(num_mels * valid_frames);
    for (size_t m = 0; m < num_mels; ++m) {
        const float* row = mel.data() + m * total_frames;
        kept.insert(kept.end(), row, row + valid_frames);
    }
    mel.swap(kept);
}
182
+
104
183
  } // namespace audio
105
184
  } // namespace cactus
106
185
 
107
186
  namespace cactus {
108
187
  namespace ffi {
109
188
 
189
// Reports whether the environment variable `key` is switched on.
// Unset, empty, and the exact string "0" count as off; any other value is on.
inline bool env_flag_enabled(const char* key) {
    const char* raw = std::getenv(key);
    if (raw == nullptr || *raw == '\0') {
        return false;  // unset or empty
    }
    const bool is_literal_zero = raw[0] == '0' && raw[1] == '\0';
    return !is_literal_zero;
}
193
+
110
194
  inline std::string generateUUID() {
111
195
  #ifdef __APPLE__
112
196
  uuid_t uuid;
@@ -114,6 +198,25 @@ inline std::string generateUUID() {
114
198
  char uuid_str[37];
115
199
  uuid_unparse_lower(uuid, uuid_str);
116
200
  return std::string(uuid_str);
201
+ #else
202
+ static std::random_device rd;
203
+ static std::mt19937 gen(rd());
204
+ static std::uniform_int_distribution<> dis(0, 15);
205
+ static std::uniform_int_distribution<> dis2(8, 11);
206
+
207
+ std::stringstream ss;
208
+ ss << std::hex;
209
+ for (int i = 0; i < 8; i++) ss << dis(gen);
210
+ ss << "-";
211
+ for (int i = 0; i < 4; i++) ss << dis(gen);
212
+ ss << "-4";
213
+ for (int i = 0; i < 3; i++) ss << dis(gen);
214
+ ss << "-";
215
+ ss << dis2(gen);
216
+ for (int i = 0; i < 3; i++) ss << dis(gen);
217
+ ss << "-";
218
+ for (int i = 0; i < 12; i++) ss << dis(gen);
219
+ return ss.str();
117
220
  #endif
118
221
  }
119
222
 
@@ -150,6 +253,130 @@ inline std::string escape_json_string(const std::string& s) {
150
253
  return o.str();
151
254
  }
152
255
 
256
+
257
// Returns a copy of `s` with leading and trailing whitespace (as classified
// by std::isspace) removed. An all-whitespace input yields an empty string.
inline std::string trim_string(const std::string& s) {
    const auto is_blank = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    const auto first = std::find_if_not(s.begin(), s.end(), is_blank);
    // Scan backwards, but never past `first`, so the range stays valid when
    // the whole string is whitespace.
    const auto last =
        std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(first), is_blank).base();
    return std::string(first, last);
}
264
+
265
// Returns the value of environment variable `key` when it is set and
// non-empty; otherwise returns `fallback`.
//
// Null-safe: a null `key` is treated as "not set", and a null `fallback`
// yields an empty string instead of constructing std::string from nullptr
// (which is undefined behaviour).
inline std::string env_or_default(const char* key, const char* fallback) {
    if (key != nullptr) {
        const char* v = std::getenv(key);
        if (v != nullptr && v[0] != '\0') {
            return std::string(v);
        }
    }
    return fallback != nullptr ? std::string(fallback) : std::string();
}
270
+
271
// Extracts and unescapes the string value of the first `"key":` occurrence in
// `json`. This is a lightweight textual scan, not a JSON parser: the key is
// matched anywhere in the text, not only at the top level.
//
// Returns an empty string when the key is absent, when the value is not a
// string, or when the string literal is unterminated.
//
// Escape handling: the standard single-character escapes are decoded, and
// \uXXXX escapes are decoded to UTF-8 (surrogate pairs are combined; a lone
// surrogate becomes U+FFFD). A malformed \u sequence and any unknown escape
// keep the lenient behaviour of emitting the escaped character itself.
inline std::string json_string_field(const std::string& json, const std::string& key) {
    const std::string pattern = "\"" + key + "\":";
    const size_t pos = json.find(pattern);
    if (pos == std::string::npos) return {};

    size_t i = pos + pattern.size();
    while (i < json.size() && std::isspace(static_cast<unsigned char>(json[i]))) i++;
    if (i >= json.size() || json[i] != '"') return {};
    ++i;

    // Reads four hex digits starting at `at`; returns false if they are not all there.
    const auto parse_hex4 = [&json](size_t at, unsigned& value) -> bool {
        if (at + 4 > json.size()) return false;
        unsigned v = 0;
        for (size_t k = 0; k < 4; ++k) {
            const char h = json[at + k];
            v <<= 4;
            if (h >= '0' && h <= '9') v |= static_cast<unsigned>(h - '0');
            else if (h >= 'a' && h <= 'f') v |= static_cast<unsigned>(h - 'a' + 10);
            else if (h >= 'A' && h <= 'F') v |= static_cast<unsigned>(h - 'A' + 10);
            else return false;
        }
        value = v;
        return true;
    };

    // Appends the UTF-8 encoding of code point `cp` (<= 0x10FFFF) to `dst`.
    const auto append_utf8 = [](std::string& dst, unsigned cp) {
        if (cp < 0x80) {
            dst.push_back(static_cast<char>(cp));
        } else if (cp < 0x800) {
            dst.push_back(static_cast<char>(0xC0 | (cp >> 6)));
            dst.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else if (cp < 0x10000) {
            dst.push_back(static_cast<char>(0xE0 | (cp >> 12)));
            dst.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            dst.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else {
            dst.push_back(static_cast<char>(0xF0 | (cp >> 18)));
            dst.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
            dst.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            dst.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
    };

    std::string out;
    out.reserve(128);
    while (i < json.size()) {
        const char c = json[i++];
        if (c == '"') return out;
        if (c == '\\' && i < json.size()) {
            const char e = json[i++];
            switch (e) {
                case '"':  out.push_back('"');  break;
                case '\\': out.push_back('\\'); break;
                case '/':  out.push_back('/');  break;
                case 'b':  out.push_back('\b'); break;
                case 'f':  out.push_back('\f'); break;
                case 'n':  out.push_back('\n'); break;
                case 'r':  out.push_back('\r'); break;
                case 't':  out.push_back('\t'); break;
                case 'u': {
                    unsigned cp = 0;
                    if (!parse_hex4(i, cp)) {
                        // Malformed escape: stay lenient and keep the 'u'.
                        out.push_back('u');
                        break;
                    }
                    i += 4;
                    if (cp >= 0xD800 && cp <= 0xDBFF) {
                        // High surrogate: needs a following \uDC00-\uDFFF to form a code point.
                        unsigned low = 0;
                        if (i + 1 < json.size() && json[i] == '\\' && json[i + 1] == 'u' &&
                            parse_hex4(i + 2, low) && low >= 0xDC00 && low <= 0xDFFF) {
                            cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                            i += 6;
                        } else {
                            cp = 0xFFFD;
                        }
                    } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
                        cp = 0xFFFD;  // low surrogate with no preceding high surrogate
                    }
                    append_utf8(out, cp);
                    break;
                }
                default: out.push_back(e); break;
            }
            continue;
        }
        out.push_back(c);
    }
    // Ran off the end without a closing quote.
    return {};
}
305
+
306
// Returns the raw JSON text of the array stored under the first `"key":`
// occurrence in `json`, including the enclosing brackets. Returns "[]" when
// the key is absent or its value is not an array.
//
// Bracket matching skips over string literals (honouring backslash escapes),
// so '[' and ']' characters inside string elements do not affect nesting.
// If the array is never closed, the text from the opening bracket to the end
// of `json` is returned as-is.
inline std::string json_array_field(const std::string& json, const std::string& key) {
    const std::string pattern = "\"" + key + "\":";
    const size_t pos = json.find(pattern);
    if (pos == std::string::npos) return "[]";

    size_t start = pos + pattern.size();
    while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start]))) ++start;
    if (start >= json.size() || json[start] != '[') return "[]";

    int depth = 1;
    bool in_str = false;
    bool esc = false;
    size_t end = start + 1;
    for (; end < json.size() && depth > 0; ++end) {
        const char c = json[end];
        if (in_str) {
            if (esc) esc = false;            // character after a backslash is literal
            else if (c == '\\') esc = true;
            else if (c == '"') in_str = false;
            continue;
        }
        if (c == '"') in_str = true;
        else if (c == '[') ++depth;
        else if (c == ']') --depth;
    }
    return json.substr(start, end - start);
}
323
+
324
// Splits the text of a JSON array of objects into the raw text of each
// top-level object. Input must start with '[' and end with ']', otherwise the
// result is empty. Scanning stops at the first element that is not an object,
// and an object that is never closed is dropped. Braces inside string literals
// are ignored.
inline std::vector<std::string> split_json_array(const std::string& array_json) {
    std::vector<std::string> objects;
    const size_t len = array_json.size();
    if (len < 2 || array_json.front() != '[' || array_json.back() != ']') {
        return objects;
    }

    size_t cursor = 1;
    while (cursor + 1 < len) {
        // Skip separators (whitespace and commas) between elements.
        for (; cursor + 1 < len; ++cursor) {
            const char ch = array_json[cursor];
            if (ch != ',' && !std::isspace(static_cast<unsigned char>(ch))) break;
        }
        if (cursor + 1 >= len || array_json[cursor] != '{') {
            break;
        }

        const size_t object_begin = cursor;
        int nesting = 0;
        bool inside_string = false;
        bool escaped = false;
        bool closed = false;
        while (cursor < len && !closed) {
            const char ch = array_json[cursor];
            if (inside_string) {
                if (escaped) {
                    escaped = false;
                } else if (ch == '\\') {
                    escaped = true;
                } else if (ch == '"') {
                    inside_string = false;
                }
            } else if (ch == '"') {
                inside_string = true;
            } else if (ch == '{') {
                ++nesting;
            } else if (ch == '}') {
                if (--nesting == 0) {
                    objects.push_back(array_json.substr(object_begin, cursor - object_begin + 1));
                    closed = true;
                }
            }
            ++cursor;  // also steps past the closing brace once an object is complete
        }
    }
    return objects;
}
360
+
361
+ inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools) {
362
+ if (tools.empty()) return "";
363
+ std::ostringstream oss;
364
+ oss << "[";
365
+ for (size_t i = 0; i < tools.size(); ++i) {
366
+ if (i > 0) oss << ",";
367
+ oss << "{\"type\":\"function\",\"function\":{";
368
+ oss << "\"name\":\"" << escape_json_string(tools[i].name) << "\",";
369
+ oss << "\"description\":\"" << escape_json_string(tools[i].description) << "\"";
370
+ auto it = tools[i].parameters.find("schema");
371
+ if (it != tools[i].parameters.end()) {
372
+ oss << ",\"parameters\":" << it->second;
373
+ }
374
+ oss << "}}";
375
+ }
376
+ oss << "]";
377
+ return oss.str();
378
+ }
379
+
153
380
  inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
154
381
  std::ostringstream json;
155
382
  json << "{";
@@ -324,7 +551,10 @@ inline void parse_options_json(const std::string& json,
324
551
  float& confidence_threshold,
325
552
  bool& include_stop_sequences,
326
553
  bool& use_vad,
327
- bool& telemetry_enabled) {
554
+ bool& telemetry_enabled,
555
+ bool* auto_handoff = nullptr,
556
+ size_t* cloud_timeout_ms = nullptr,
557
+ bool* handoff_with_images = nullptr) {
328
558
  temperature = 0.0f;
329
559
  top_p = 0.0f;
330
560
  top_k = 0;
@@ -335,6 +565,9 @@ inline void parse_options_json(const std::string& json,
335
565
  include_stop_sequences = false;
336
566
  use_vad = true;
337
567
  telemetry_enabled = true;
568
+ if (auto_handoff) *auto_handoff = true;
569
+ if (cloud_timeout_ms) *cloud_timeout_ms = 15000;
570
+ if (handoff_with_images) *handoff_with_images = true;
338
571
  stop_sequences.clear();
339
572
 
340
573
  if (json.empty()) return;
@@ -403,6 +636,32 @@ inline void parse_options_json(const std::string& json,
403
636
  telemetry_enabled = (json.substr(pos, 4) == "true");
404
637
  }
405
638
 
639
+ if (auto_handoff) {
640
+ pos = json.find("\"auto_handoff\"");
641
+ if (pos != std::string::npos) {
642
+ pos = json.find(':', pos) + 1;
643
+ while (pos < json.length() && std::isspace(json[pos])) pos++;
644
+ *auto_handoff = (json.substr(pos, 4) == "true");
645
+ }
646
+ }
647
+
648
+ if (cloud_timeout_ms) {
649
+ pos = json.find("\"cloud_timeout_ms\"");
650
+ if (pos != std::string::npos) {
651
+ pos = json.find(':', pos) + 1;
652
+ *cloud_timeout_ms = std::stoul(json.substr(pos));
653
+ }
654
+ }
655
+
656
+ if (handoff_with_images) {
657
+ pos = json.find("\"handoff_with_images\"");
658
+ if (pos != std::string::npos) {
659
+ pos = json.find(':', pos) + 1;
660
+ while (pos < json.length() && std::isspace(json[pos])) pos++;
661
+ *handoff_with_images = (json.substr(pos, 4) == "true");
662
+ }
663
+ }
664
+
406
665
  pos = json.find("\"stop_sequences\"");
407
666
  if (pos != std::string::npos) {
408
667
  pos = json.find('[', pos);
@@ -422,31 +681,8 @@ inline void parse_options_json(const std::string& json,
422
681
  }
423
682
  }
424
683
 
425
- inline std::string format_tools_for_prompt(const std::vector<ToolFunction>& tools) {
426
- if (tools.empty()) return "";
427
- std::string formatted_tools_json;
428
- for (size_t i = 0; i < tools.size(); i++) {
429
- if (i > 0) formatted_tools_json += "\n";
430
- formatted_tools_json += "{\"type\":\"function\",\"function\":{\"name\":\""
431
- + tools[i].name
432
- + "\",\"description\":\""
433
- + tools[i].description + "\"";
434
- if (tools[i].parameters.find("schema") != tools[i].parameters.end()) {
435
- formatted_tools_json += ",\"parameters\":" + tools[i].parameters.at("schema");
436
- }
437
- formatted_tools_json += "}}";
438
- }
439
- return formatted_tools_json;
440
- }
441
-
442
684
  static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
443
- while (begin < end && std::isspace(static_cast<unsigned char>(value[begin]))) {
444
- begin++;
445
- }
446
- while (end > begin && std::isspace(static_cast<unsigned char>(value[end - 1]))) {
447
- end--;
448
- }
449
- return value.substr(begin, end - begin);
685
+ return trim_string(value.substr(begin, end - begin));
450
686
  }
451
687
 
452
688
  static inline void append_lfm2_call(const std::string& entry,
@@ -577,23 +813,49 @@ inline void parse_function_calls_from_response(const std::string& response_text,
577
813
 
578
814
  if (!content.empty() && content.front() == '[' && content.back() == ']') {
579
815
  std::string inner = content.substr(1, content.size() - 2);
580
- size_t start = 0;
581
- int paren_depth = 0;
582
-
583
- for (size_t i = 0; i < inner.size(); ++i) {
584
- char c = inner[i];
585
- if (c == '(') {
586
- paren_depth++;
587
- } else if (c == ')' && paren_depth > 0) {
588
- paren_depth--;
589
- } else if (c == ',' && paren_depth == 0) {
590
- append_lfm2_call(inner.substr(start, i - start), function_calls);
591
- start = i + 1;
816
+
817
+ size_t inner_first = inner.find_first_not_of(" \t\n\r");
818
+ if (inner_first != std::string::npos && inner[inner_first] == '{') {
819
+ size_t pos = inner_first;
820
+ while (pos < inner.size()) {
821
+ if (inner[pos] == '{') {
822
+ int brace_depth = 1;
823
+ size_t obj_start = pos;
824
+ pos++;
825
+ while (pos < inner.size() && brace_depth > 0) {
826
+ if (inner[pos] == '{') brace_depth++;
827
+ else if (inner[pos] == '}') brace_depth--;
828
+ pos++;
829
+ }
830
+ if (brace_depth == 0) {
831
+ std::string json_obj = inner.substr(obj_start, pos - obj_start);
832
+ if (json_obj.find("\"name\"") != std::string::npos) {
833
+ function_calls.push_back(json_obj);
834
+ }
835
+ }
836
+ } else {
837
+ pos++;
838
+ }
839
+ }
840
+ } else {
841
+ size_t start = 0;
842
+ int paren_depth = 0;
843
+
844
+ for (size_t i = 0; i < inner.size(); ++i) {
845
+ char c = inner[i];
846
+ if (c == '(') {
847
+ paren_depth++;
848
+ } else if (c == ')' && paren_depth > 0) {
849
+ paren_depth--;
850
+ } else if (c == ',' && paren_depth == 0) {
851
+ append_lfm2_call(inner.substr(start, i - start), function_calls);
852
+ start = i + 1;
853
+ }
592
854
  }
593
- }
594
855
 
595
- if (start < inner.size()) {
596
- append_lfm2_call(inner.substr(start), function_calls);
856
+ if (start < inner.size()) {
857
+ append_lfm2_call(inner.substr(start), function_calls);
858
+ }
597
859
  }
598
860
  } else if (!content.empty()) {
599
861
  append_lfm2_call(content, function_calls);
@@ -648,7 +910,7 @@ inline std::string construct_response_json(const std::string& regular_response,
648
910
  bool cloud_handoff = false) {
649
911
  std::ostringstream json;
650
912
  json << "{";
651
- json << "\"success\":" << (cloud_handoff ? "false" : "true") << ",";
913
+ json << "\"success\":true,";
652
914
  json << "\"error\":null,";
653
915
  json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
654
916
  json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
@@ -671,30 +933,6 @@ inline std::string construct_response_json(const std::string& regular_response,
671
933
  return json.str();
672
934
  }
673
935
 
674
- inline std::string construct_cloud_handoff_json(float confidence,
675
- double time_to_first_token,
676
- double prefill_tps,
677
- size_t prompt_tokens) {
678
- std::ostringstream json;
679
- json << "{";
680
- json << "\"success\":false,";
681
- json << "\"error\":null,";
682
- json << "\"cloud_handoff\":true,";
683
- json << "\"response\":null,";
684
- json << "\"function_calls\":[],";
685
- json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
686
- json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
687
- json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
688
- json << "\"prefill_tps\":" << std::fixed << std::setprecision(2) << prefill_tps << ",";
689
- json << "\"decode_tps\":0.0,";
690
- json << "\"ram_usage_mb\":" << std::fixed << std::setprecision(2) << get_ram_usage_mb() << ",";
691
- json << "\"prefill_tokens\":" << prompt_tokens << ",";
692
- json << "\"decode_tokens\":0,";
693
- json << "\"total_tokens\":" << prompt_tokens;
694
- json << "}";
695
- return json.str();
696
- }
697
-
698
936
  inline std::string serialize_function_calls(const std::vector<std::string>& calls) {
699
937
  if (calls.empty()) return "[]";
700
938
  std::ostringstream oss;
@@ -720,4 +958,4 @@ const char* cactus_get_last_error();
720
958
  }
721
959
  #endif
722
960
 
723
- #endif // CACTUS_UTILS_H
961
+ #endif // CACTUS_UTILS_H
@@ -56,6 +56,12 @@ struct Config {
56
56
  uint32_t num_shared_experts = 0;
57
57
  uint32_t num_top_experts = 0;
58
58
  uint32_t moe_every_n_layers = 0;
59
+ uint32_t moe_intermediate_dim = 0;
60
+ uint32_t num_dense_layers = 0;
61
+ uint32_t num_experts_per_tok = 0;
62
+ bool norm_topk_prob = false;
63
+ bool use_expert_bias = false;
64
+ float routed_scaling_factor = 1.0f;
59
65
  bool tie_word_embeddings = true;
60
66
 
61
67
  uint32_t vision_hidden_dim = 0;
@@ -93,8 +99,22 @@ struct Config {
93
99
  uint32_t num_encoder_layers = 0;
94
100
  uint32_t num_decoder_layers = 0;
95
101
  float partial_rotary_factor = 0.0f;
96
-
97
- enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9};
102
+ uint32_t pad_token_id = 0;
103
+ uint32_t conv_kernel_size = 0;
104
+ uint32_t subsampling_conv_kernel_size = 0;
105
+ uint32_t subsampling_conv_stride = 0;
106
+ uint32_t subsampling_conv_channels = 0;
107
+ uint32_t subsampling_factor = 0;
108
+ uint32_t num_mel_bins = 80;
109
+ std::string encoder_hidden_act = "silu";
110
+ uint32_t predictor_hidden_dim = 0;
111
+ uint32_t predictor_num_layers = 0;
112
+ uint32_t tdt_joint_dim = 0;
113
+ uint32_t tdt_num_durations = 0;
114
+ uint32_t tdt_blank_id = 0;
115
+ std::vector<uint32_t> tdt_durations;
116
+
117
+ enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
98
118
  ModelType model_type = ModelType::QWEN;
99
119
 
100
120
  enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -168,7 +188,7 @@ public:
168
188
  uint32_t get_global_img_token_id() const { return global_img_token_id_; }
169
189
 
170
190
  protected:
171
- enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER};
191
+ enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
172
192
  ModelType model_type_ = ModelType::UNKNOWN;
173
193
  enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
174
194
  ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -366,7 +386,6 @@ struct KVCache {
366
386
  size_t num_tokens, size_t kv_heads, size_t head_dim);
367
387
 
368
388
  bool is_empty() const { return current_seq_len == 0; }
369
- bool is_int8() const { return precision == Precision::INT8; }
370
389
  void* get_key_ptr(size_t layer);
371
390
  void* get_value_ptr(size_t layer);
372
391
 
@@ -684,6 +703,8 @@ public:
684
703
  float reference = 1.0f;
685
704
  float min_value = 1e-10f;
686
705
  bool remove_dc_offset = false;
706
+ float preemphasis = 0.0f;
707
+ bool hann_periodic = true;
687
708
  };
688
709
 
689
710
  AudioProcessor();
@@ -696,6 +717,11 @@ public:
696
717
  const std::vector<float>& waveform,
697
718
  const SpectrogramConfig& config);
698
719
 
720
+ static std::vector<float> compute_irfft(
721
+ const std::vector<float>& complex_input,
722
+ size_t n,
723
+ const char* norm = "backward");
724
+
699
725
  const std::vector<float>& get_mel_filters() const { return mel_filters_; }
700
726
 
701
727
  size_t get_num_mel_filters() const { return num_mel_filters_; }
@@ -721,6 +747,8 @@ namespace index {
721
747
  struct QueryResult {
722
748
  int doc_id;
723
749
  float score;
750
+
751
+ QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {}
724
752
  };
725
753
 
726
754
  struct QueryOptions {