cactus-react-native 1.10.4 → 1.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. package/README.md +199 -40
  2. package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
  3. package/cpp/HybridCactus.cpp +131 -2
  4. package/cpp/HybridCactus.hpp +15 -0
  5. package/cpp/cactus_ffi.h +240 -2
  6. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +240 -2
  7. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +940 -109
  8. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +175 -25
  9. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h +48 -21
  10. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +79 -7
  11. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +122 -9
  12. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +191 -2
  13. package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
  14. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +240 -2
  15. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +940 -109
  16. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +175 -25
  17. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h +48 -21
  18. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +79 -7
  19. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +122 -9
  20. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +191 -2
  21. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
  22. package/lib/module/classes/{CactusVAD.js → CactusAudio.js} +19 -6
  23. package/lib/module/classes/CactusAudio.js.map +1 -0
  24. package/lib/module/classes/CactusLM.js +25 -0
  25. package/lib/module/classes/CactusLM.js.map +1 -1
  26. package/lib/module/hooks/{useCactusVAD.js → useCactusAudio.js} +50 -20
  27. package/lib/module/hooks/useCactusAudio.js.map +1 -0
  28. package/lib/module/index.js +2 -2
  29. package/lib/module/index.js.map +1 -1
  30. package/lib/module/modelRegistry.js +1 -1
  31. package/lib/module/native/Cactus.js +81 -2
  32. package/lib/module/native/Cactus.js.map +1 -1
  33. package/lib/module/types/CactusAudio.js +4 -0
  34. package/lib/module/types/{CactusVAD.js.map → CactusAudio.js.map} +1 -1
  35. package/lib/typescript/src/classes/CactusAudio.d.ts +22 -0
  36. package/lib/typescript/src/classes/CactusAudio.d.ts.map +1 -0
  37. package/lib/typescript/src/classes/CactusLM.d.ts +2 -1
  38. package/lib/typescript/src/classes/CactusLM.d.ts.map +1 -1
  39. package/lib/typescript/src/hooks/useCactusAudio.d.ts +17 -0
  40. package/lib/typescript/src/hooks/useCactusAudio.d.ts.map +1 -0
  41. package/lib/typescript/src/index.d.ts +4 -4
  42. package/lib/typescript/src/index.d.ts.map +1 -1
  43. package/lib/typescript/src/native/Cactus.d.ts +9 -3
  44. package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
  45. package/lib/typescript/src/specs/Cactus.nitro.d.ts +3 -0
  46. package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
  47. package/lib/typescript/src/types/CactusAudio.d.ts +63 -0
  48. package/lib/typescript/src/types/CactusAudio.d.ts.map +1 -0
  49. package/lib/typescript/src/types/CactusLM.d.ts +15 -0
  50. package/lib/typescript/src/types/CactusLM.d.ts.map +1 -1
  51. package/lib/typescript/src/types/CactusSTT.d.ts +1 -0
  52. package/lib/typescript/src/types/CactusSTT.d.ts.map +1 -1
  53. package/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +3 -0
  54. package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +3 -0
  55. package/package.json +1 -1
  56. package/src/classes/{CactusVAD.ts → CactusAudio.ts} +32 -13
  57. package/src/classes/CactusLM.ts +36 -0
  58. package/src/hooks/{useCactusVAD.ts → useCactusAudio.ts} +65 -28
  59. package/src/index.tsx +16 -9
  60. package/src/modelRegistry.ts +1 -1
  61. package/src/native/Cactus.ts +118 -3
  62. package/src/specs/Cactus.nitro.ts +16 -0
  63. package/src/types/CactusAudio.ts +73 -0
  64. package/src/types/CactusLM.ts +17 -0
  65. package/src/types/CactusSTT.ts +1 -0
  66. package/lib/module/classes/CactusVAD.js.map +0 -1
  67. package/lib/module/hooks/useCactusVAD.js.map +0 -1
  68. package/lib/module/types/CactusVAD.js +0 -4
  69. package/lib/typescript/src/classes/CactusVAD.d.ts +0 -20
  70. package/lib/typescript/src/classes/CactusVAD.d.ts.map +0 -1
  71. package/lib/typescript/src/hooks/useCactusVAD.d.ts +0 -15
  72. package/lib/typescript/src/hooks/useCactusVAD.d.ts.map +0 -1
  73. package/lib/typescript/src/types/CactusVAD.d.ts +0 -34
  74. package/lib/typescript/src/types/CactusVAD.d.ts.map +0 -1
  75. package/src/types/CactusVAD.ts +0 -39
@@ -6,6 +6,7 @@
6
6
  #include <string>
7
7
  #include <vector>
8
8
  #include <unordered_map>
9
+ #include <map>
9
10
  #include <stdexcept>
10
11
  #include <sstream>
11
12
  #include <iomanip>
@@ -63,6 +64,16 @@ struct CactusModelHandle {
63
64
  std::unique_ptr<cactus::engine::Model> vad_model;
64
65
  std::atomic<bool> should_stop;
65
66
  std::vector<uint32_t> processed_tokens;
67
// Identifies one image by its path together with a last-modified timestamp.
// NOTE(review): presumably used to detect that a previously processed image
// is unchanged on disk — confirm against the code that fills processed_images.
struct ProcessedImage {
    std::string path;
    long long last_modified_timestamp = 0;

    // Equal only when both the timestamp and the path match.
    bool operator==(const ProcessedImage& other) const {
        if (last_modified_timestamp != other.last_modified_timestamp) {
            return false;
        }
        return path == other.path;
    }
};
75
+
76
+ std::vector<std::vector<ProcessedImage>> processed_images;
66
77
  std::mutex model_mutex;
67
78
  std::string model_name;
68
79
  std::unique_ptr<cactus::engine::index::Index> corpus_index;
@@ -124,6 +135,66 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogra
124
135
  return cfg;
125
136
  }
126
137
 
138
+ inline cactus::engine::AudioProcessor::SpectrogramConfig get_htk_spectrogram_config() {
139
+ cactus::engine::AudioProcessor::SpectrogramConfig cfg{};
140
+ cfg.n_fft = 321;
141
+ cfg.frame_length = 320;
142
+ cfg.fft_override = 1024;
143
+ cfg.hop_length = 160;
144
+ cfg.power = 1.0f;
145
+ cfg.center = false;
146
+ cfg.pad_mode = "constant";
147
+ cfg.onesided = true;
148
+ cfg.dither = 0.0f;
149
+ cfg.mel_floor = 0.001f;
150
+ cfg.log_mel = "log";
151
+ cfg.reference = 1.0f;
152
+ cfg.min_value = 0.001f;
153
+ cfg.remove_dc_offset = false;
154
+ cfg.hann_periodic = true;
155
+ return cfg;
156
+ }
157
+
158
+ inline cactus::engine::AudioProcessor::SpectrogramConfig get_gemma4_audio_spectrogram_config(
159
+ const cactus::engine::Config& model_config) {
160
+ auto cfg = get_htk_spectrogram_config();
161
+ cfg.fft_override = model_config.audio_fft_length;
162
+ cfg.mel_floor_additive = true;
163
+ return cfg;
164
+ }
165
+
166
+ inline cactus::engine::AudioProcessor::SpectrogramConfig get_wespeaker_spectrogram_config() {
167
+ cactus::engine::AudioProcessor::SpectrogramConfig cfg{};
168
+ cfg.n_fft = 512;
169
+ cfg.frame_length = 400;
170
+ cfg.hop_length = 160;
171
+ cfg.power = 2.0f;
172
+ cfg.center = false;
173
+ cfg.pad_mode = "constant";
174
+ cfg.onesided = true;
175
+ cfg.dither = 0.0f;
176
+ cfg.mel_floor = 1.1754944e-38f;
177
+ cfg.log_mel = "log";
178
+ cfg.reference = 1.0f;
179
+ cfg.min_value = 1.1754944e-38f;
180
+ cfg.remove_dc_offset = true;
181
+ cfg.preemphasis = 0.97f;
182
+ cfg.hann_periodic = false;
183
+ cfg.window_a0 = 0.54f;
184
+ return cfg;
185
+ }
186
+
187
// Transposes a mel-major matrix (num_mels rows of num_frames values) into
// frame-major order (num_frames rows of num_mels values).
//
// @param mel        Flat buffer holding at least num_mels * num_frames values.
// @param num_mels   Number of rows in the input.
// @param num_frames Number of columns in the input.
// @return Flat buffer of num_frames * num_mels values.
// @throws std::invalid_argument if the requested shape overflows size_t or
//         `mel` is too short for it (previously an out-of-bounds read).
inline std::vector<float> transpose_mel_to_frame_major(const std::vector<float>& mel,
                                                       size_t num_mels, size_t num_frames) {
    const size_t total = num_mels * num_frames;
    if (num_mels != 0 && total / num_mels != num_frames) {
        throw std::invalid_argument("transpose_mel_to_frame_major: shape overflows size_t");
    }
    if (mel.size() < total) {
        throw std::invalid_argument("transpose_mel_to_frame_major: mel buffer smaller than num_mels * num_frames");
    }

    std::vector<float> transposed(total);
    for (size_t m = 0; m < num_mels; ++m) {
        const size_t row_offset = m * num_frames;
        for (size_t t = 0; t < num_frames; ++t) {
            transposed[t * num_mels + m] = mel[row_offset + t];
        }
    }
    return transposed;
}
197
+
127
198
  inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
128
199
  if (waveform.size() < 2 || coefficient == 0.0f) {
129
200
  return;
@@ -180,6 +251,56 @@ inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t val
180
251
  mel.swap(trimmed);
181
252
  }
182
253
 
254
// Output of preprocess_audio_for_gemma4().
struct AudioPreprocessResult {
    // Mel features after transposition to frame-major order.
    std::vector<float> features{};
    // Number of spectrogram frames contained in `features`.
    size_t num_frames{0};
    // Frame count after two successive ceil-divisions by two.
    size_t num_soft_tokens{0};
};
259
+
260
+ inline AudioPreprocessResult preprocess_audio_for_gemma4(
261
+ std::vector<float> audio_samples,
262
+ const cactus::engine::Config& model_config
263
+ ) {
264
+ AudioPreprocessResult result;
265
+ if (audio_samples.empty()) return result;
266
+
267
+ size_t pad_amt = 320 - (audio_samples.size() % 320);
268
+ if (pad_amt < 320)
269
+ audio_samples.resize(audio_samples.size() + pad_amt, 0.0f);
270
+
271
+ size_t mel_bins = model_config.audio_input_feat_size;
272
+ auto cfg = get_gemma4_audio_spectrogram_config(model_config);
273
+
274
+ size_t semicausal_pad = cfg.frame_length / 2;
275
+ audio_samples.insert(audio_samples.begin(), semicausal_pad, 0.0f);
276
+
277
+ cactus::engine::AudioProcessor ap;
278
+ size_t fft_for_mel = cfg.fft_override > 0 ? cfg.fft_override : cfg.n_fft;
279
+ ap.init_mel_filters(fft_for_mel / 2 + 1, mel_bins, 0.0f, 8000.0f, 16000,
280
+ nullptr, "htk");
281
+ std::vector<float> mel = ap.compute_spectrogram(audio_samples, cfg);
282
+
283
+ result.num_frames = mel.size() / mel_bins;
284
+ result.features = transpose_mel_to_frame_major(mel, mel_bins, result.num_frames);
285
+
286
+ size_t after_stage1 = (result.num_frames + 1) / 2;
287
+ result.num_soft_tokens = (after_stage1 + 1) / 2;
288
+
289
+ return result;
290
+ }
291
+
292
// Converts a buffer of 16-bit signed little-endian PCM into floats in
// [-1.0, 1.0) by scaling each sample by 1/32768.
//
// Samples are assembled byte by byte instead of reinterpreting the buffer as
// int16_t*, so the buffer may start at any address (no alignment or aliasing
// requirement). The result matches the previous cast on little-endian hosts.
//
// @param pcm_buffer      Raw PCM bytes; may be null, which yields an empty result.
// @param pcm_buffer_size Size in bytes; a trailing odd byte is ignored.
// @return One float per complete 16-bit sample.
inline std::vector<float> pcm_buffer_to_float_samples(
    const uint8_t* pcm_buffer, size_t pcm_buffer_size
) {
    if (pcm_buffer == nullptr) return {};

    const size_t num_samples = pcm_buffer_size / 2;
    std::vector<float> waveform_fp32(num_samples);
    constexpr float inv_32768 = 1.0f / 32768.0f;
    for (size_t i = 0; i < num_samples; i++) {
        const int32_t low_byte = pcm_buffer[2 * i];
        const int32_t high_byte = pcm_buffer[2 * i + 1];
        int32_t sample = low_byte | (high_byte << 8);
        if (sample >= 32768) sample -= 65536;  // two's-complement sign extension
        waveform_fp32[i] = static_cast<float>(sample) * inv_32768;
    }
    return waveform_fp32;
}
303
+
183
304
  } // namespace audio
184
305
  } // namespace cactus
185
306
 
@@ -226,6 +347,24 @@ struct ToolFunction {
226
347
  std::unordered_map<std::string, std::string> parameters;
227
348
  };
228
349
 
350
// Bundle of inference options together with their default values.
// Member order is kept as-is because it determines aggregate layout.
struct InferenceOptions {
    // Sampling and confidence values.
    float temperature{0.0f};
    float top_p{0.0f};
    float confidence_threshold{0.7f};

    // Counts and limits.
    size_t top_k{0};
    size_t max_tokens{100};
    size_t tool_rag_top_k{2};
    size_t cloud_timeout_ms{15000};

    // Stop sequences; empty by default.
    std::vector<std::string> stop_sequences{};

    // Feature switches.
    bool force_tools{false};
    bool include_stop_sequences{false};
    bool use_vad{true};
    bool telemetry_enabled{true};
    bool auto_handoff{true};
    bool handoff_with_images{true};
    bool enable_thinking_if_supported{true};
};
367
+
229
368
  } // namespace ffi
230
369
  } // namespace cactus
231
370
 
@@ -262,6 +401,24 @@ inline std::string trim_string(const std::string& s) {
262
401
  return s.substr(start, end - start);
263
402
  }
264
403
 
404
// Finds the end of a bracketed span.
//
// @param s     Text to scan.
// @param pos   Index of the opening delimiter; scanning starts at pos + 1.
// @param open  Opening delimiter character (increases nesting depth).
// @param close Closing delimiter character (decreases nesting depth).
// @return Index one past the matching closing delimiter, or s.length() when
//         the span is never closed. Delimiters inside double-quoted strings
//         (with backslash escapes) are ignored.
//
// The result is clamped to s.length(); previously an unterminated string or
// unbalanced input could yield an index up to two past the end.
inline size_t find_matching_delimiter(const std::string& s, size_t pos, char open, char close) {
    const size_t len = s.length();
    int depth = 1;
    size_t cur = pos + 1;
    while (cur < len && depth > 0) {
        const char c = s[cur];
        if (c == open) {
            ++depth;
        } else if (c == close) {
            --depth;
        } else if (c == '"') {
            // Skip the whole string literal, honoring backslash escapes.
            ++cur;
            while (cur < len && s[cur] != '"') {
                if (s[cur] == '\\') ++cur;
                ++cur;
            }
        }
        ++cur;
    }
    return cur < len ? cur : len;
}
421
+
265
422
  inline std::string env_or_default(const char* key, const char* fallback) {
266
423
  const char* v = std::getenv(key);
267
424
  if (v && v[0] != '\0') return std::string(v);
@@ -377,6 +534,119 @@ inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools)
377
534
  return oss.str();
378
535
  }
379
536
 
537
// Minimal JSON re-serializer: re-emits a JSON value with object keys in
// lexicographic order, ", " between elements and ": " after keys. It is
// lenient by design — malformed input is passed through as best it can be
// rather than rejected.
namespace json_sorted {

// Advances p past any whitespace.
inline void skip_ws(const std::string& s, size_t& p) {
    while (p < s.size() && std::isspace(static_cast<unsigned char>(s[p]))) p++;
}

// Consumes the string literal starting at p (s[p] is treated as the opening
// quote) and returns it including its quotes, with escapes left untouched.
// An unterminated literal is returned without a closing quote.
inline std::string parse_string(const std::string& s, size_t& p) {
    std::string r = "\"";
    p++;
    while (p < s.size()) {
        if (s[p] == '\\') {
            r += s[p++];
            if (p < s.size()) r += s[p++];
        } else if (s[p] == '"') {
            r += '"';
            p++;
            return r;
        } else {
            r += s[p++];
        }
    }
    return r;
}

inline std::string parse_value(const std::string& s, size_t& p);

// Consumes the object starting at p and returns it with keys sorted.
// A repeated key keeps the last value seen.
inline std::string parse_object(const std::string& s, size_t& p) {
    p++;
    std::map<std::string, std::string> entries;
    skip_ws(s, p);
    while (p < s.size() && s[p] != '}') {
        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
        std::string key = parse_string(s, p);  // always consumes at least one character
        skip_ws(s, p);
        if (p < s.size() && s[p] == ':') p++;
        skip_ws(s, p);
        std::string val = parse_value(s, p);
        entries[key] = val;
        skip_ws(s, p);
    }
    if (p < s.size()) p++;
    std::string r = "{";
    bool first = true;
    for (const auto& kv : entries) {
        if (!first) r += ", ";
        r += kv.first + ": " + kv.second;
        first = false;
    }
    r += "}";
    return r;
}

// Consumes the array starting at p and returns it with normalized separators.
inline std::string parse_array(const std::string& s, size_t& p) {
    p++;
    std::vector<std::string> items;
    skip_ws(s, p);
    while (p < s.size() && s[p] != ']') {
        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
        const size_t before = p;
        std::string item = parse_value(s, p);
        if (p == before) {
            // parse_value consumed nothing (e.g. a stray '}' as in "[}").
            // Skip the offending character; otherwise this loop would never
            // terminate and `items` would grow without bound.
            p++;
            skip_ws(s, p);
            continue;
        }
        items.push_back(item);
        skip_ws(s, p);
    }
    if (p < s.size()) p++;
    std::string r = "[";
    for (size_t i = 0; i < items.size(); i++) {
        if (i > 0) r += ", ";
        r += items[i];
    }
    r += "]";
    return r;
}

// Consumes one JSON value at p. Strings, objects and arrays are dispatched to
// their parsers; anything else is copied verbatim up to the next ',', '}',
// ']' or whitespace. Returns "" at end of input.
inline std::string parse_value(const std::string& s, size_t& p) {
    skip_ws(s, p);
    if (p >= s.size()) return "";
    if (s[p] == '"') return parse_string(s, p);
    if (s[p] == '{') return parse_object(s, p);
    if (s[p] == '[') return parse_array(s, p);
    size_t start = p;
    while (p < s.size() && s[p] != ',' && s[p] != '}' && s[p] != ']' && !std::isspace(static_cast<unsigned char>(s[p]))) p++;
    return s.substr(start, p - start);
}

// Re-serializes the first JSON value found in `json`.
inline std::string reformat(const std::string& json) {
    size_t p = 0;
    return parse_value(json, p);
}

} // namespace json_sorted
625
+
626
+ inline std::string serialize_tools_for_template(const std::vector<ToolFunction>& tools) {
627
+ if (tools.empty()) return "";
628
+ std::string result;
629
+ for (const auto& tool : tools) {
630
+ std::map<std::string, std::string> func_fields;
631
+ func_fields["\"description\""] = "\"" + escape_json_string(tool.description) + "\"";
632
+ func_fields["\"name\""] = "\"" + escape_json_string(tool.name) + "\"";
633
+ auto it = tool.parameters.find("schema");
634
+ if (it != tool.parameters.end()) {
635
+ func_fields["\"parameters\""] = json_sorted::reformat(it->second);
636
+ }
637
+ std::string func_json = "{";
638
+ bool first = true;
639
+ for (const auto& kv : func_fields) {
640
+ if (!first) func_json += ", ";
641
+ func_json += kv.first + ": " + kv.second;
642
+ first = false;
643
+ }
644
+ func_json += "}";
645
+ result += "\n{\"function\": " + func_json + ", \"type\": \"function\"}";
646
+ }
647
+ return result;
648
+ }
649
+
380
650
  inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
381
651
  std::ostringstream json;
382
652
  json << "{";
@@ -401,10 +671,12 @@ inline void handle_error_response(const std::string& error_message, char* respon
401
671
  }
402
672
  }
403
673
 
404
- inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
405
- std::vector<std::string>& out_image_paths) {
674
+ inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
675
+ std::vector<std::string>& out_image_paths,
676
+ std::vector<std::string>* out_audio_paths = nullptr) {
406
677
  std::vector<cactus::engine::ChatMessage> messages;
407
678
  out_image_paths.clear();
679
+ if (out_audio_paths) out_audio_paths->clear();
408
680
 
409
681
  size_t pos = json.find('[');
410
682
  if (pos == std::string::npos) {
@@ -457,39 +729,111 @@ inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::s
457
729
  }
458
730
  }
459
731
 
460
- size_t images_pos = json.find("\"images\"", pos);
461
- if (images_pos != std::string::npos && images_pos < obj_end) {
462
- size_t array_start = json.find('[', images_pos);
463
- if (array_start != std::string::npos && array_start < obj_end) {
464
- size_t array_end = json.find(']', array_start);
465
- if (array_end != std::string::npos && array_end < obj_end) {
466
- size_t img_pos = array_start;
467
- while (true) {
468
- img_pos = json.find('"', img_pos + 1);
469
- if (img_pos == std::string::npos || img_pos >= array_end) break;
470
-
471
- size_t img_start = img_pos + 1;
472
- size_t img_end = json.find('"', img_start);
473
- if (img_end == std::string::npos || img_end > array_end) break;
474
-
475
- std::string img_path = json.substr(img_start, img_end - img_start);
476
-
477
- std::filesystem::path p(img_path);
478
- img_path = std::filesystem::absolute(p).string();
479
-
480
- msg.images.push_back(img_path);
481
- out_image_paths.push_back(img_path);
482
- img_pos = img_end;
732
+ auto parse_path_array = [&](const char* key, std::vector<std::string>& dest,
733
+ std::vector<std::string>* out_paths) {
734
+ size_t key_pos = json.find(key, pos);
735
+ if (key_pos == std::string::npos || key_pos >= obj_end) return;
736
+ size_t array_start = json.find('[', key_pos);
737
+ if (array_start == std::string::npos || array_start >= obj_end) return;
738
+ size_t array_end = json.find(']', array_start);
739
+ if (array_end == std::string::npos || array_end >= obj_end) return;
740
+ size_t cur = array_start;
741
+ while (true) {
742
+ cur = json.find('"', cur + 1);
743
+ if (cur == std::string::npos || cur >= array_end) break;
744
+ size_t str_start = cur + 1;
745
+ size_t str_end = json.find('"', str_start);
746
+ if (str_end == std::string::npos || str_end > array_end) break;
747
+ std::string path = std::filesystem::absolute(
748
+ std::filesystem::path(json.substr(str_start, str_end - str_start))).string();
749
+ dest.push_back(path);
750
+ if (out_paths) out_paths->push_back(path);
751
+ cur = str_end;
752
+ }
753
+ };
754
+
755
+ parse_path_array("\"images\"", msg.images, &out_image_paths);
756
+ parse_path_array("\"audio\"", msg.audio, out_audio_paths);
757
+
758
+ if (msg.role == "tool") {
759
+ size_t name_pos = json.find("\"name\"", obj_start);
760
+ if (name_pos != std::string::npos && name_pos < obj_end) {
761
+ size_t name_quote = json.find('"', name_pos + 6);
762
+ if (name_quote != std::string::npos && name_quote < obj_end) {
763
+ size_t name_start = name_quote + 1;
764
+ size_t name_end = json.find('"', name_start);
765
+ if (name_end != std::string::npos && name_end < obj_end) {
766
+ msg.name = json.substr(name_start, name_end - name_start);
483
767
  }
484
768
  }
485
769
  }
486
770
  }
487
-
771
+
772
+ size_t tool_calls_pos = json.find("\"tool_calls\"", obj_start);
773
+ if (tool_calls_pos != std::string::npos && tool_calls_pos < obj_end) {
774
+ size_t tool_calls_arr_start = json.find('[', tool_calls_pos);
775
+ if (tool_calls_arr_start != std::string::npos && tool_calls_arr_start < obj_end) {
776
+ size_t tool_calls_arr_end = find_matching_delimiter(json, tool_calls_arr_start, '[', ']');
777
+
778
+ size_t search_pos = tool_calls_arr_start;
779
+ while (true) {
780
+ size_t func_pos = json.find("\"function\"", search_pos);
781
+ if (func_pos == std::string::npos || func_pos >= tool_calls_arr_end) break;
782
+
783
+ size_t func_obj_start = json.find('{', func_pos + 10);
784
+ if (func_obj_start == std::string::npos || func_obj_start >= tool_calls_arr_end) break;
785
+
786
+ size_t func_obj_end = find_matching_delimiter(json, func_obj_start, '{', '}');
787
+
788
+ cactus::engine::ToolCallInfo tool_call;
789
+
790
+ size_t fn_name_pos = json.find("\"name\"", func_obj_start);
791
+ if (fn_name_pos != std::string::npos && fn_name_pos < func_obj_end) {
792
+ size_t fn_name_quote = json.find('"', fn_name_pos + 6);
793
+ if (fn_name_quote != std::string::npos && fn_name_quote < func_obj_end) {
794
+ size_t fn_name_start = fn_name_quote + 1;
795
+ size_t fn_name_end = json.find('"', fn_name_start);
796
+ if (fn_name_end != std::string::npos && fn_name_end < func_obj_end) {
797
+ tool_call.name = json.substr(fn_name_start, fn_name_end - fn_name_start);
798
+ }
799
+ }
800
+ }
801
+
802
+ size_t args_pos = json.find("\"arguments\"", func_obj_start);
803
+ if (args_pos != std::string::npos && args_pos < func_obj_end) {
804
+ size_t colon_pos = json.find(':', args_pos + 11);
805
+ if (colon_pos != std::string::npos && colon_pos < func_obj_end) {
806
+ size_t args_start = colon_pos + 1;
807
+ while (args_start < json.length() && std::isspace(static_cast<unsigned char>(json[args_start]))) args_start++;
808
+
809
+ if (args_start < func_obj_end && json[args_start] == '{') {
810
+ size_t args_end = find_matching_delimiter(json, args_start, '{', '}');
811
+ tool_call.arguments = json.substr(args_start, args_end - args_start);
812
+ } else if (args_start < func_obj_end && json[args_start] == '"') {
813
+ size_t str_start = args_start + 1;
814
+ size_t str_end = str_start;
815
+ while (str_end < json.length() && json[str_end] != '"') {
816
+ if (json[str_end] == '\\') str_end++;
817
+ str_end++;
818
+ }
819
+ tool_call.arguments = json.substr(str_start, str_end - str_start);
820
+ }
821
+ }
822
+ }
823
+
824
+ if (!tool_call.name.empty()) {
825
+ msg.tool_calls.push_back(tool_call);
826
+ }
827
+ search_pos = func_obj_end;
828
+ }
829
+ }
830
+ }
831
+
488
832
  messages.push_back(msg);
489
-
833
+
490
834
  pos = json.find('{', obj_end);
491
835
  }
492
-
836
+
493
837
  return messages;
494
838
  }
495
839
 
@@ -538,128 +882,433 @@ inline std::vector<ToolFunction> parse_tools_json(const std::string& json) {
538
882
 
539
883
  pos = json.find("\"function\"", name_pos);
540
884
  }
541
-
885
+
542
886
  return tools;
543
887
  }
544
888
 
545
- inline void parse_options_json(const std::string& json,
546
- float& temperature, float& top_p,
547
- size_t& top_k, size_t& max_tokens,
548
- std::vector<std::string>& stop_sequences,
549
- bool& force_tools,
550
- size_t& tool_rag_top_k,
551
- float& confidence_threshold,
552
- bool& include_stop_sequences,
553
- bool& use_vad,
554
- bool& telemetry_enabled,
555
- bool* auto_handoff = nullptr,
556
- size_t* cloud_timeout_ms = nullptr,
557
- bool* handoff_with_images = nullptr) {
558
- temperature = 0.0f;
559
- top_p = 0.0f;
560
- top_k = 0;
561
- max_tokens = 100;
562
- force_tools = false;
563
- tool_rag_top_k = 2;
564
- confidence_threshold = 0.7f;
565
- include_stop_sequences = false;
566
- use_vad = true;
567
- telemetry_enabled = true;
568
- if (auto_handoff) *auto_handoff = true;
569
- if (cloud_timeout_ms) *cloud_timeout_ms = 15000;
570
- if (handoff_with_images) *handoff_with_images = true;
571
- stop_sequences.clear();
889
// Looks up a top-level-style `"key": <number>` pair by text search and parses
// the number with std::stof.
//
// @param json      JSON text to search (not fully parsed).
// @param key       Field name, without quotes.
// @param out_value Receives the parsed value; left untouched on failure.
// @return true when the key was found and its value parsed as a float.
//
// Whitespace is accepted on both sides of the colon (previously only
// `"key":` with no space before the colon matched). Occurrences of the quoted
// key that are not followed by a colon, such as a string value, are skipped.
inline bool try_parse_json_float(const std::string& json, const std::string& key, float& out_value) {
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };
    const auto ends_number = [](char c) {
        switch (c) {
            case ',': case '}': case ']': case ' ': case '\t': case '\n': case '\r':
                return true;
            default:
                return false;
        }
    };

    const std::string quoted_key = "\"" + key + "\"";
    size_t search_from = 0;
    while (true) {
        const size_t hit = json.find(quoted_key, search_from);
        if (hit == std::string::npos) return false;

        size_t start = hit + quoted_key.size();
        while (start < json.size() && is_space(json[start])) ++start;
        if (start >= json.size() || json[start] != ':') {
            search_from = hit + 1;  // not a key here; keep looking
            continue;
        }

        ++start;
        while (start < json.size() && is_space(json[start])) ++start;

        size_t end = start;
        while (end < json.size() && !ends_number(json[end])) ++end;

        try {
            out_value = std::stof(json.substr(start, end - start));
            return true;
        } catch (const std::exception&) {
            // std::stof reports non-numeric or out-of-range text by throwing.
            return false;
        }
    }
}
907
+
908
// Extracts `"key": ["a", "b", ...]` from JSON text by scanning, without a full
// JSON parse.
//
// @param json JSON text to search.
// @param key  Field name, without quotes.
// @return The decoded string elements. Empty when the key is missing, its
//         value is not an array, or the array is never closed. Parsing stops
//         at the first element that is not a string, keeping what was read.
//
// Changes from the previous version:
//  - whitespace is accepted before the colon, and occurrences of the quoted
//    key not followed by a colon are skipped;
//  - \uXXXX escapes (including surrogate pairs) are decoded to UTF-8 instead
//    of being emitted as the literal text "uXXXX". A lone surrogate becomes
//    U+FFFD; a malformed \u sequence keeps the old literal behavior.
inline std::vector<std::string> parse_json_string_array_field(const std::string& json, const std::string& key) {
    std::vector<std::string> out;
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    // Locate the value that follows `"key" :`.
    const std::string quoted_key = "\"" + key + "\"";
    size_t start = std::string::npos;
    size_t search_from = 0;
    while (start == std::string::npos) {
        const size_t hit = json.find(quoted_key, search_from);
        if (hit == std::string::npos) return out;
        size_t cursor = hit + quoted_key.size();
        while (cursor < json.size() && is_space(json[cursor])) ++cursor;
        if (cursor < json.size() && json[cursor] == ':') {
            ++cursor;
            while (cursor < json.size() && is_space(json[cursor])) ++cursor;
            start = cursor;
        } else {
            search_from = hit + 1;
        }
    }
    if (start >= json.size() || json[start] != '[') return out;

    // Pass 1: find the matching ']' while ignoring brackets inside strings.
    int depth = 1;
    bool in_string = false;
    bool escape_pending = false;
    size_t end = start + 1;
    while (end < json.size() && depth > 0) {
        const char c = json[end];
        if (in_string) {
            if (escape_pending) escape_pending = false;
            else if (c == '\\') escape_pending = true;
            else if (c == '"') in_string = false;
        } else {
            if (c == '"') in_string = true;
            else if (c == '[') depth++;
            else if (c == ']') depth--;
        }
        ++end;
    }
    if (depth != 0) return out;
    const size_t close = end - 1;  // index of the matching ']'

    // Reads four hex digits at `at`; they must all lie before the closing bracket.
    const auto read_hex4 = [&json, close](size_t at, uint32_t& code) -> bool {
        if (at + 4 > close) return false;
        uint32_t value = 0;
        for (size_t k = 0; k < 4; ++k) {
            const char h = json[at + k];
            value <<= 4;
            if (h >= '0' && h <= '9') value |= static_cast<uint32_t>(h - '0');
            else if (h >= 'a' && h <= 'f') value |= static_cast<uint32_t>(h - 'a' + 10);
            else if (h >= 'A' && h <= 'F') value |= static_cast<uint32_t>(h - 'A' + 10);
            else return false;
        }
        code = value;
        return true;
    };

    const auto append_utf8 = [](std::string& dst, uint32_t cp) {
        if (cp < 0x80) {
            dst.push_back(static_cast<char>(cp));
        } else if (cp < 0x800) {
            dst.push_back(static_cast<char>(0xC0 | (cp >> 6)));
            dst.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else if (cp < 0x10000) {
            dst.push_back(static_cast<char>(0xE0 | (cp >> 12)));
            dst.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            dst.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else {
            dst.push_back(static_cast<char>(0xF0 | (cp >> 18)));
            dst.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
            dst.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            dst.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        }
    };

    // Pass 2: decode the string elements between '[' and the matching ']'.
    size_t i = start + 1;
    while (i < close) {
        while (i < close && (is_space(json[i]) || json[i] == ',')) ++i;
        if (i >= close) break;
        if (json[i] != '"') break;  // non-string element: stop, keep what we have

        ++i;
        std::string value;
        bool escaped = false;
        // Pass 1 guarantees every string closes before `close`.
        while (i <= close) {
            const char c = json[i++];
            if (escaped) {
                switch (c) {
                    case '"': value.push_back('"'); break;
                    case '\\': value.push_back('\\'); break;
                    case '/': value.push_back('/'); break;
                    case 'b': value.push_back('\b'); break;
                    case 'f': value.push_back('\f'); break;
                    case 'n': value.push_back('\n'); break;
                    case 'r': value.push_back('\r'); break;
                    case 't': value.push_back('\t'); break;
                    case 'u': {
                        uint32_t cp = 0;
                        if (!read_hex4(i, cp)) {
                            value.push_back('u');  // malformed escape: keep it literal
                            break;
                        }
                        i += 4;
                        if (cp >= 0xD800 && cp <= 0xDBFF) {
                            uint32_t low = 0;
                            if (i + 1 < close && json[i] == '\\' && json[i + 1] == 'u' &&
                                read_hex4(i + 2, low) && low >= 0xDC00 && low <= 0xDFFF) {
                                cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                                i += 6;
                            } else {
                                cp = 0xFFFD;  // unpaired high surrogate
                            }
                        } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
                            cp = 0xFFFD;  // unpaired low surrogate
                        }
                        append_utf8(value, cp);
                        break;
                    }
                    default: value.push_back(c); break;
                }
                escaped = false;
                continue;
            }
            if (c == '\\') {
                escaped = true;
                continue;
            }
            if (c == '"') {
                out.push_back(value);
                break;
            }
            value.push_back(c);
        }
    }

    return out;
}
984
+
985
+ inline void parse_custom_vocabulary_options(const std::string& json,
986
+ std::vector<std::string>& custom_vocabulary,
987
+ float& vocabulary_boost) {
988
+ custom_vocabulary.clear();
989
+ vocabulary_boost = 5.0f;
573
990
  if (json.empty()) return;
574
991
 
992
+ float parsed_boost = vocabulary_boost;
993
+ if (try_parse_json_float(json, "vocabulary_boost", parsed_boost)) {
994
+ vocabulary_boost = std::clamp(parsed_boost, 0.0f, 20.0f);
995
+ }
996
+
997
+ custom_vocabulary = parse_json_string_array_field(json, "custom_vocabulary");
998
+ }
999
+
1000
// Builds a token-id -> bias map in which every token appearing in any entry
// receives the boost, clamped to [0, 20]. A clamped boost of zero yields an
// empty map.
inline std::unordered_map<uint32_t, float> build_token_bias_map(const std::vector<std::vector<uint32_t>>& tokenized_entries,
                                                                float vocabulary_boost) {
    const float boost = std::clamp(vocabulary_boost, 0.0f, 20.0f);
    std::unordered_map<uint32_t, float> vocab_bias;

    if (boost != 0.0f) {
        for (const std::vector<uint32_t>& entry_tokens : tokenized_entries) {
            for (const uint32_t token_id : entry_tokens) {
                // New tokens start at 0 and are raised to the boost.
                const auto slot = vocab_bias.try_emplace(token_id, 0.0f).first;
                if (slot->second < boost) {
                    slot->second = boost;
                }
            }
        }
    }

    return vocab_bias;
}
1017
+
1018
+ inline std::unordered_map<uint32_t, float> build_custom_vocabulary_bias(cactus::engine::Tokenizer* tokenizer,
1019
+ const std::vector<std::string>& custom_vocabulary,
1020
+ float vocabulary_boost) {
1021
+ if (!tokenizer || custom_vocabulary.empty()) return {};
1022
+ std::vector<std::vector<uint32_t>> tokenized_entries;
1023
+ tokenized_entries.reserve(custom_vocabulary.size());
1024
+
1025
+ for (const auto& word : custom_vocabulary) {
1026
+ if (word.empty()) continue;
1027
+ tokenized_entries.push_back(tokenizer->encode(word));
1028
+ }
1029
+
1030
+ return build_token_bias_map(tokenized_entries, vocabulary_boost);
1031
+ }
1032
+
1033
+ inline void apply_custom_vocabulary_options(cactus::engine::Model* model, const std::string& json) {
1034
+ if (!model) return;
1035
+
1036
+ std::vector<std::string> custom_vocabulary;
1037
+ float vocabulary_boost = 5.0f;
1038
+ parse_custom_vocabulary_options(json, custom_vocabulary, vocabulary_boost);
1039
+ model->set_vocab_bias(build_custom_vocabulary_bias(model->get_tokenizer(), custom_vocabulary, vocabulary_boost));
1040
+ }
1041
+
1042
// Case-insensitive Levenshtein edit distance between `a` and `b`
// (insertions, deletions and substitutions each cost 1). Characters are
// compared after std::tolower. Only two rows of the table are kept.
inline size_t levenshtein_ci(const std::string& a, const std::string& b) {
    const auto lower = [](char c) { return std::tolower(static_cast<unsigned char>(c)); };

    const size_t rows = a.size();
    const size_t cols = b.size();
    std::vector<size_t> above(cols + 1);
    std::vector<size_t> row(cols + 1);

    // Distance from the empty prefix of `a` to each prefix of `b`.
    for (size_t j = 0; j <= cols; ++j) {
        above[j] = j;
    }

    for (size_t i = 0; i < rows; ++i) {
        row[0] = i + 1;
        const int a_char = lower(a[i]);
        for (size_t j = 0; j < cols; ++j) {
            const size_t deletion = above[j + 1] + 1;
            const size_t insertion = row[j] + 1;
            const size_t substitution = above[j] + (a_char == lower(b[j]) ? 0 : 1);

            size_t best = deletion < insertion ? deletion : insertion;
            if (substitution < best) {
                best = substitution;
            }
            row[j + 1] = best;
        }
        above.swap(row);
    }

    return above[cols];
}
1057
+
1058
// Returns a copy of `s` with every space character (' ') removed.
// Other whitespace such as tabs and newlines is kept.
inline std::string collapse_spaces(const std::string& s) {
    std::string compact;
    compact.reserve(s.size());
    for (size_t i = 0; i < s.size(); ++i) {
        if (s[i] == ' ') {
            continue;
        }
        compact.push_back(s[i]);
    }
    return compact;
}
1066
+
1067
+ inline void apply_vocabulary_spelling_correction(
1068
+ std::string& text,
1069
+ const std::vector<std::string>& custom_vocabulary)
1070
+ {
1071
+ if (custom_vocabulary.empty() || text.empty()) return;
1072
+
1073
+ struct VocabEntry {
1074
+ const std::string* original;
1075
+ std::string collapsed;
1076
+ };
1077
+ std::vector<VocabEntry> vocab_entries;
1078
+ vocab_entries.reserve(custom_vocabulary.size());
1079
+ for (const auto& v : custom_vocabulary) {
1080
+ vocab_entries.push_back({&v, collapse_spaces(v)});
1081
+ }
1082
+
1083
+ struct Token { std::string text; bool is_word; };
1084
+ std::vector<Token> tokens;
1085
+ size_t pos = 0;
1086
+ while (pos < text.size()) {
1087
+ if (std::isalnum(static_cast<unsigned char>(text[pos])) ||
1088
+ text[pos] == '\'' || text[pos] == '-') {
1089
+ size_t start = pos;
1090
+ while (pos < text.size() && (std::isalnum(static_cast<unsigned char>(text[pos])) ||
1091
+ text[pos] == '\'' || text[pos] == '-')) {
1092
+ ++pos;
1093
+ }
1094
+ tokens.push_back({text.substr(start, pos - start), true});
1095
+ } else {
1096
+ size_t start = pos;
1097
+ while (pos < text.size() && !std::isalnum(static_cast<unsigned char>(text[pos])) &&
1098
+ text[pos] != '\'' && text[pos] != '-') {
1099
+ ++pos;
1100
+ }
1101
+ tokens.push_back({text.substr(start, pos - start), false});
1102
+ }
1103
+ }
1104
+
1105
+ std::vector<size_t> word_indices;
1106
+ for (size_t i = 0; i < tokens.size(); ++i) {
1107
+ if (tokens[i].is_word) word_indices.push_back(i);
1108
+ }
1109
+
1110
+ std::vector<bool> consumed(tokens.size(), false);
1111
+
1112
+ auto strip_suffix = [](const std::string& word) -> std::pair<std::string, std::string> {
1113
+ if (word.size() >= 3 && word.substr(word.size() - 2) == "'s") {
1114
+ return {word.substr(0, word.size() - 2), "'s"};
1115
+ }
1116
+ if (word.size() >= 3 && word.substr(word.size() - 2) == "'t") {
1117
+ return {word.substr(0, word.size() - 2), "'t"};
1118
+ }
1119
+ if (word.size() >= 4 && word.back() == 's' &&
1120
+ word[word.size() - 2] != 's' && // avoid stripping from "boss", "class"
1121
+ std::isalpha(static_cast<unsigned char>(word[word.size() - 2]))) {
1122
+ return {word.substr(0, word.size() - 1), "s"};
1123
+ }
1124
+ return {word, ""};
1125
+ };
1126
+
1127
+ size_t wi = 0;
1128
+ while (wi < word_indices.size()) {
1129
+ size_t best_dist = std::numeric_limits<size_t>::max();
1130
+ const std::string* best_match = nullptr;
1131
+ size_t best_window = 0;
1132
+ size_t best_first_token = 0;
1133
+ size_t best_last_token = 0;
1134
+ std::string best_suffix;
1135
+
1136
+ for (size_t window = std::min<size_t>(3, word_indices.size() - wi); window >= 1; --window) {
1137
+ std::string window_collapsed;
1138
+ const size_t first_tok = word_indices[wi];
1139
+ const size_t last_tok = word_indices[wi + window - 1];
1140
+ for (size_t w = 0; w < window; ++w) {
1141
+ window_collapsed += tokens[word_indices[wi + w]].text;
1142
+ }
1143
+
1144
+ if (window == 1 && window_collapsed.size() < 3) break;
1145
+
1146
+ auto [stem, suffix] = strip_suffix(window_collapsed);
1147
+ const std::string* candidates[] = {&window_collapsed, &stem};
1148
+ const std::string suffixes[] = {"", suffix};
1149
+ const size_t num_candidates = suffix.empty() ? 1 : 2;
1150
+
1151
+ for (size_t ci = 0; ci < num_candidates; ++ci) {
1152
+ const std::string& candidate = *candidates[ci];
1153
+ if (candidate.empty()) continue;
1154
+
1155
+ for (const auto& entry : vocab_entries) {
1156
+ const size_t wlen = candidate.size();
1157
+ const size_t vlen = entry.collapsed.size();
1158
+
1159
+ const size_t len_diff = wlen > vlen ? wlen - vlen : vlen - wlen;
1160
+ const size_t max_dist = std::max<size_t>(1, std::min(wlen, vlen) / 3);
1161
+ if (len_diff > max_dist) continue;
1162
+
1163
+ const size_t dist = levenshtein_ci(candidate, entry.collapsed);
1164
+
1165
+ // For single-edit corrections, require first char match to prevent
1166
+ // false positives like "vortex" → "Cortex".
1167
+ if (dist == 1 && window == 1) {
1168
+ const bool first_char_match =
1169
+ std::tolower(static_cast<unsigned char>(candidate[0])) ==
1170
+ std::tolower(static_cast<unsigned char>(entry.collapsed[0]));
1171
+ if (!first_char_match) continue;
1172
+ }
1173
+
1174
+ if (dist <= max_dist && dist < best_dist) {
1175
+ best_dist = dist;
1176
+ best_match = entry.original;
1177
+ best_window = window;
1178
+ best_first_token = first_tok;
1179
+ best_last_token = last_tok;
1180
+ best_suffix = suffixes[ci];
1181
+ }
1182
+ }
1183
+ }
1184
+
1185
+ if (best_dist == 0) break;
1186
+ }
1187
+
1188
+ // Allow dist==0 for multi-word merges where word boundaries changed.
1189
+ const bool should_replace = best_match &&
1190
+ best_dist != std::numeric_limits<size_t>::max() &&
1191
+ (best_dist > 0 || best_window > 1);
1192
+
1193
+ if (should_replace) {
1194
+ tokens[best_first_token].text = *best_match + best_suffix;
1195
+ for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {
1196
+ consumed[t] = true;
1197
+ }
1198
+ for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {
1199
+ if (t > 0) consumed[t - 1] = consumed[t - 1] || !tokens[t - 1].is_word;
1200
+ }
1201
+ wi += best_window;
1202
+ } else {
1203
+ ++wi;
1204
+ }
1205
+ }
1206
+
1207
+ std::string result;
1208
+ result.reserve(text.size());
1209
+ for (size_t i = 0; i < tokens.size(); ++i) {
1210
+ if (!consumed[i]) {
1211
+ result += tokens[i].text;
1212
+ }
1213
+ }
1214
+
1215
+ text = std::move(result);
1216
+ }
1217
+
1218
+ inline InferenceOptions parse_inference_options_json(const std::string& json) {
1219
+ InferenceOptions options;
1220
+
1221
+ if (json.empty()) return options;
1222
+
575
1223
  size_t pos = json.find("\"temperature\"");
576
1224
  if (pos != std::string::npos) {
577
1225
  pos = json.find(':', pos) + 1;
578
- temperature = std::stof(json.substr(pos));
1226
+ options.temperature = std::stof(json.substr(pos));
579
1227
  }
580
1228
 
581
1229
  pos = json.find("\"top_p\"");
582
1230
  if (pos != std::string::npos) {
583
1231
  pos = json.find(':', pos) + 1;
584
- top_p = std::stof(json.substr(pos));
1232
+ options.top_p = std::stof(json.substr(pos));
585
1233
  }
586
1234
 
587
1235
  pos = json.find("\"top_k\"");
588
1236
  if (pos != std::string::npos) {
589
1237
  pos = json.find(':', pos) + 1;
590
- top_k = std::stoul(json.substr(pos));
1238
+ options.top_k = std::stoul(json.substr(pos));
591
1239
  }
592
1240
 
593
1241
  pos = json.find("\"max_tokens\"");
594
1242
  if (pos != std::string::npos) {
595
1243
  pos = json.find(':', pos) + 1;
596
- max_tokens = std::stoul(json.substr(pos));
1244
+ options.max_tokens = std::stoul(json.substr(pos));
597
1245
  }
598
1246
 
599
1247
  pos = json.find("\"force_tools\"");
600
1248
  if (pos != std::string::npos) {
601
1249
  pos = json.find(':', pos) + 1;
602
- while (pos < json.length() && std::isspace(json[pos])) pos++;
603
- force_tools = (json.substr(pos, 4) == "true");
1250
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1251
+ options.force_tools = (json.substr(pos, 4) == "true");
604
1252
  }
605
1253
 
606
1254
  pos = json.find("\"tool_rag_top_k\"");
607
1255
  if (pos != std::string::npos) {
608
1256
  pos = json.find(':', pos) + 1;
609
- tool_rag_top_k = std::stoul(json.substr(pos));
1257
+ options.tool_rag_top_k = std::stoul(json.substr(pos));
610
1258
  }
611
1259
 
612
1260
  pos = json.find("\"confidence_threshold\"");
613
1261
  if (pos != std::string::npos) {
614
1262
  pos = json.find(':', pos) + 1;
615
- confidence_threshold = std::stof(json.substr(pos));
1263
+ options.confidence_threshold = std::stof(json.substr(pos));
616
1264
  }
617
1265
 
618
1266
  pos = json.find("\"include_stop_sequences\"");
619
1267
  if (pos != std::string::npos) {
620
1268
  pos = json.find(':', pos) + 1;
621
- while (pos < json.length() && std::isspace(json[pos])) pos++;
622
- include_stop_sequences = (json.substr(pos, 4) == "true");
1269
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1270
+ options.include_stop_sequences = (json.substr(pos, 4) == "true");
623
1271
  }
624
1272
 
625
1273
  pos = json.find("\"use_vad\"");
626
1274
  if (pos != std::string::npos) {
627
1275
  pos = json.find(':', pos) + 1;
628
- while (pos < json.length() && std::isspace(json[pos])) pos++;
629
- use_vad = (json.substr(pos, 4) == "true");
1276
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1277
+ options.use_vad = (json.substr(pos, 4) == "true");
630
1278
  }
631
1279
 
632
1280
  pos = json.find("\"telemetry_enabled\"");
633
1281
  if (pos != std::string::npos) {
634
1282
  pos = json.find(':', pos) + 1;
635
- while (pos < json.length() && std::isspace(json[pos])) pos++;
636
- telemetry_enabled = (json.substr(pos, 4) == "true");
1283
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1284
+ options.telemetry_enabled = (json.substr(pos, 4) == "true");
637
1285
  }
638
1286
 
639
- if (auto_handoff) {
640
- pos = json.find("\"auto_handoff\"");
641
- if (pos != std::string::npos) {
642
- pos = json.find(':', pos) + 1;
643
- while (pos < json.length() && std::isspace(json[pos])) pos++;
644
- *auto_handoff = (json.substr(pos, 4) == "true");
645
- }
1287
+ pos = json.find("\"auto_handoff\"");
1288
+ if (pos != std::string::npos) {
1289
+ pos = json.find(':', pos) + 1;
1290
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1291
+ options.auto_handoff = (json.substr(pos, 4) == "true");
646
1292
  }
647
1293
 
648
- if (cloud_timeout_ms) {
649
- pos = json.find("\"cloud_timeout_ms\"");
650
- if (pos != std::string::npos) {
651
- pos = json.find(':', pos) + 1;
652
- *cloud_timeout_ms = std::stoul(json.substr(pos));
653
- }
1294
+ pos = json.find("\"cloud_timeout_ms\"");
1295
+ if (pos != std::string::npos) {
1296
+ pos = json.find(':', pos) + 1;
1297
+ options.cloud_timeout_ms = std::stoul(json.substr(pos));
654
1298
  }
655
1299
 
656
- if (handoff_with_images) {
657
- pos = json.find("\"handoff_with_images\"");
658
- if (pos != std::string::npos) {
659
- pos = json.find(':', pos) + 1;
660
- while (pos < json.length() && std::isspace(json[pos])) pos++;
661
- *handoff_with_images = (json.substr(pos, 4) == "true");
662
- }
1300
+ pos = json.find("\"handoff_with_images\"");
1301
+ if (pos != std::string::npos) {
1302
+ pos = json.find(':', pos) + 1;
1303
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1304
+ options.handoff_with_images = (json.substr(pos, 4) == "true");
1305
+ }
1306
+
1307
+ pos = json.find("\"enable_thinking_if_supported\"");
1308
+ if (pos != std::string::npos) {
1309
+ pos = json.find(':', pos) + 1;
1310
+ while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
1311
+ options.enable_thinking_if_supported = (json.substr(pos, 4) == "true");
663
1312
  }
664
1313
 
665
1314
  pos = json.find("\"stop_sequences\"");
@@ -673,12 +1322,14 @@ inline void parse_options_json(const std::string& json,
673
1322
  size_t seq_start = seq_pos + 1;
674
1323
  size_t seq_end = json.find('"', seq_start);
675
1324
  if (seq_end != std::string::npos) {
676
- stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
1325
+ options.stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
677
1326
  }
678
1327
  seq_pos = json.find('"', seq_end + 1);
679
1328
  }
680
1329
  }
681
1330
  }
1331
+
1332
+ return options;
682
1333
  }
683
1334
 
684
1335
  static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
@@ -755,7 +1406,6 @@ inline void parse_function_calls_from_response(const std::string& response_text,
755
1406
 
756
1407
  gemma::parse_function_calls(regular_response, function_calls);
757
1408
 
758
- // Parse Qwen-style function calls: <tool_call>{"name": "...", "arguments": {...}}</tool_call>
759
1409
  const std::string QWEN_TOOL_START = "<tool_call>";
760
1410
  const std::string QWEN_TOOL_END = "</tool_call>";
761
1411
  size_t qwen_start_pos = 0;
@@ -764,27 +1414,62 @@ inline void parse_function_calls_from_response(const std::string& response_text,
764
1414
  size_t content_start = qwen_start_pos + QWEN_TOOL_START.length();
765
1415
  size_t qwen_end_pos = regular_response.find(QWEN_TOOL_END, content_start);
766
1416
 
1417
+ size_t erase_end;
1418
+ std::string json_content;
1419
+
767
1420
  if (qwen_end_pos != std::string::npos) {
768
- std::string json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
1421
+ json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
1422
+ erase_end = qwen_end_pos + QWEN_TOOL_END.length();
1423
+ } else {
1424
+ json_content = regular_response.substr(content_start);
1425
+ erase_end = regular_response.length();
1426
+ }
769
1427
 
770
- size_t first = json_content.find_first_not_of(" \t\n\r");
771
- size_t last = json_content.find_last_not_of(" \t\n\r");
772
- if (first != std::string::npos && last != std::string::npos) {
773
- json_content = json_content.substr(first, last - first + 1);
774
- }
1428
+ size_t first = json_content.find_first_not_of(" \t\n\r");
1429
+ size_t last = json_content.find_last_not_of(" \t\n\r");
1430
+ if (first != std::string::npos && last != std::string::npos) {
1431
+ json_content = json_content.substr(first, last - first + 1);
1432
+ }
775
1433
 
776
- if (json_content.size() > 2 && json_content[0] == '{' &&
777
- json_content.find("\"name\"") != std::string::npos) {
778
- function_calls.push_back(json_content);
1434
+ if (json_content.size() > 2 && json_content[0] == '{' &&
1435
+ json_content.find("\"name\"") != std::string::npos) {
1436
+ size_t depth = 0;
1437
+ bool in_string = false;
1438
+ bool escaped = false;
1439
+ size_t end_pos = 0;
1440
+ for (size_t c = 0; c < json_content.size(); c++) {
1441
+ char ch = json_content[c];
1442
+ if (escaped) {
1443
+ escaped = false;
1444
+ continue;
1445
+ }
1446
+ if (ch == '\\' && in_string) {
1447
+ escaped = true;
1448
+ continue;
1449
+ }
1450
+ if (ch == '"') {
1451
+ in_string = !in_string;
1452
+ continue;
1453
+ }
1454
+ if (!in_string) {
1455
+ if (ch == '{') depth++;
1456
+ else if (ch == '}') {
1457
+ depth--;
1458
+ if (depth == 0) {
1459
+ end_pos = c + 1;
1460
+ break;
1461
+ }
1462
+ }
1463
+ }
1464
+ }
1465
+ if (end_pos > 0) {
1466
+ function_calls.push_back(json_content.substr(0, end_pos));
779
1467
  }
780
-
781
- regular_response.erase(qwen_start_pos, qwen_end_pos + QWEN_TOOL_END.length() - qwen_start_pos);
782
- } else {
783
- break;
784
1468
  }
1469
+
1470
+ regular_response.erase(qwen_start_pos, erase_end - qwen_start_pos);
785
1471
  }
786
-
787
- // Parse LFM2-style function calls: <|tool_call_start|>[name(args)]<|tool_call_end|>
1472
+
788
1473
  const std::string TOOL_CALL_START = "<|tool_call_start|>";
789
1474
  const std::string TOOL_CALL_END = "<|tool_call_end|>";
790
1475
  size_t tool_start_pos = 0;
@@ -898,6 +1583,95 @@ inline void parse_function_calls_from_response(const std::string& response_text,
898
1583
  }
899
1584
  }
900
1585
 
1586
/**
 * Locates every channel block in a token sequence.
 *
 * A block begins at a token equal to `channel_open_id` and runs through the next
 * token equal to `channel_close_id` (inclusive). A block with no closing token
 * extends to the end of `tokens`.
 *
 * @param tokens           Token ids to scan.
 * @param offset           Value added to each block's start index in the result.
 * @param channel_open_id  Token id that opens a block.
 * @param channel_close_id Token id that closes a block.
 * @return One {offset + start_index, token_count} pair per block, in scan order.
 */
inline std::vector<std::pair<size_t, size_t>> find_channel_token_ranges(
    const std::vector<uint32_t>& tokens, size_t offset,
    uint32_t channel_open_id, uint32_t channel_close_id) {
    std::vector<std::pair<size_t, size_t>> spans;
    const size_t count = tokens.size();

    for (size_t cursor = 0; cursor < count;) {
        if (tokens[cursor] != channel_open_id) {
            ++cursor;
            continue;
        }

        const size_t opened_at = cursor++;
        while (cursor < count && tokens[cursor] != channel_close_id) {
            ++cursor;
        }
        // Step past the closing token when one was found so it is part of the span.
        if (cursor < count) {
            ++cursor;
        }
        spans.push_back({offset + opened_at, cursor - opened_at});
    }

    return spans;
}
1609
+
1610
/**
 * Removes every `open_tag ... close_tag` block from `text` and appends the block
 * contents to `extracted`.
 *
 * - If a `close_tag` occurs before any `open_tag`, everything ahead of that first
 *   `close_tag` is treated as block content (an opening tag is assumed to be missing).
 * - An `open_tag` with no matching `close_tag` consumes the remainder of `text`.
 * - Each block found by the main scan is separated from existing `extracted`
 *   content by a single '\n'.
 *
 * @param text      In/out: input text; on return holds the text outside all blocks.
 * @param extracted In/out: receives the concatenated block contents (appended to).
 * @param open_tag  Opening delimiter. Must be non-empty for anything to be stripped.
 * @param close_tag Closing delimiter. Must be non-empty for anything to be stripped.
 */
inline void strip_tag_blocks(std::string& text, std::string& extracted,
                             const std::string& open_tag, const std::string& close_tag) {
    // An empty delimiter cannot delimit a block. With both tags empty the scan
    // position would never advance and the loop below would not terminate.
    if (open_tag.empty() || close_tag.empty()) return;

    std::string result;
    result.reserve(text.size());
    size_t pos = 0;

    // Leading orphaned close tag: the text before it belongs to a block whose
    // opening tag is not present.
    const size_t first_close = text.find(close_tag);
    const size_t first_open = text.find(open_tag);
    if (first_close != std::string::npos &&
        (first_open == std::string::npos || first_close < first_open)) {
        extracted += text.substr(0, first_close);
        pos = first_close + close_tag.size();
    }

    while (pos < text.size()) {
        const size_t open_pos = text.find(open_tag, pos);
        if (open_pos == std::string::npos) {
            result += text.substr(pos);
            break;
        }
        result += text.substr(pos, open_pos - pos);

        const size_t content_start = open_pos + open_tag.size();
        const size_t close_pos = text.find(close_tag, content_start);
        if (!extracted.empty()) extracted += "\n";
        if (close_pos == std::string::npos) {
            // Unterminated block: everything after the open tag is block content.
            extracted += text.substr(content_start);
            break;
        }
        extracted += text.substr(content_start, close_pos - content_start);
        pos = close_pos + close_tag.size();
    }

    text = std::move(result);
}
1643
+
1644
+ inline void strip_thinking_block(const std::string& input, std::string& thinking, std::string& content) {
1645
+ thinking.clear();
1646
+ content = input;
1647
+
1648
+ auto trim = [](std::string& s) {
1649
+ size_t first = s.find_first_not_of(" \t\n\r");
1650
+ size_t last = s.find_last_not_of(" \t\n\r");
1651
+ if (first != std::string::npos && last != std::string::npos)
1652
+ s = s.substr(first, last - first + 1);
1653
+ else
1654
+ s.clear();
1655
+ };
1656
+
1657
+ if (content.find("<|channel>") != std::string::npos || content.find("<channel|>") != std::string::npos) {
1658
+ strip_tag_blocks(content, thinking, "<|channel>", "<channel|>");
1659
+ } else if (content.find("<think>") != std::string::npos || content.find("</think>") != std::string::npos) {
1660
+ strip_tag_blocks(content, thinking, "<think>", "</think>");
1661
+ } else {
1662
+ return;
1663
+ }
1664
+
1665
+ trim(thinking);
1666
+ trim(content);
1667
+ }
1668
+
1669
/**
 * One segment of a transcript: a piece of text with its start and end positions.
 *
 * The numeric members are value-initialized so that a default-constructed segment
 * never carries indeterminate values. The struct remains an aggregate, so
 * `TranscriptSegment{start, end, text}` continues to work.
 */
struct TranscriptSegment {
    float start = 0.0f;  // segment start (serialized with 3 decimals; presumably seconds - confirm with producer)
    float end = 0.0f;    // segment end, same unit as `start`
    std::string text;    // text belonging to this segment
};
1674
+
901
1675
  inline std::string construct_response_json(const std::string& regular_response,
902
1676
  const std::vector<std::string>& function_calls,
903
1677
  double time_to_first_token,
@@ -907,19 +1681,32 @@ inline std::string construct_response_json(const std::string& regular_response,
907
1681
  size_t prompt_tokens,
908
1682
  size_t completion_tokens,
909
1683
  float confidence = 0.0f,
910
- bool cloud_handoff = false) {
1684
+ bool cloud_handoff = false,
1685
+ const std::string& thinking = "",
1686
+ const std::vector<TranscriptSegment>& segments = {}) {
911
1687
  std::ostringstream json;
912
1688
  json << "{";
913
1689
  json << "\"success\":true,";
914
1690
  json << "\"error\":null,";
915
1691
  json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
916
1692
  json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
1693
+ if (!thinking.empty()) {
1694
+ json << "\"thinking\":\"" << escape_json_string(thinking) << "\",";
1695
+ }
917
1696
  json << "\"function_calls\":[";
918
1697
  for (size_t i = 0; i < function_calls.size(); ++i) {
919
1698
  if (i > 0) json << ",";
920
1699
  json << function_calls[i];
921
1700
  }
922
1701
  json << "],";
1702
+ json << "\"segments\":[";
1703
+ for (size_t i = 0; i < segments.size(); ++i) {
1704
+ if (i > 0) json << ",";
1705
+ json << "{\"start\":" << std::fixed << std::setprecision(3) << segments[i].start
1706
+ << ",\"end\":" << std::fixed << std::setprecision(3) << segments[i].end
1707
+ << ",\"text\":\"" << escape_json_string(segments[i].text) << "\"}";
1708
+ }
1709
+ json << "],";
923
1710
  json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
924
1711
  json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
925
1712
  json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << total_time_ms << ",";
@@ -945,6 +1732,50 @@ inline std::string serialize_function_calls(const std::vector<std::string>& call
945
1732
  return oss.str();
946
1733
  }
947
1734
 
1735
+ inline int validate_audio_params(
1736
+ const char* component,
1737
+ void* model,
1738
+ char* response_buffer, size_t buffer_size,
1739
+ const char* audio_file_path,
1740
+ const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
1741
+ if (!model) {
1742
+ std::string err = last_error_message.empty() ? "Model not initialized." : last_error_message;
1743
+ CACTUS_LOG_ERROR(component, err);
1744
+ handle_error_response(err, response_buffer, buffer_size);
1745
+ return -1;
1746
+ }
1747
+ if (!response_buffer || buffer_size == 0) {
1748
+ CACTUS_LOG_ERROR(component, "Invalid parameters: response_buffer or buffer_size");
1749
+ handle_error_response("Invalid parameters", response_buffer, buffer_size);
1750
+ return -1;
1751
+ }
1752
+ if (!audio_file_path && (!pcm_buffer || pcm_buffer_size == 0)) {
1753
+ CACTUS_LOG_ERROR(component, "No audio input provided");
1754
+ handle_error_response("Either audio_file_path or pcm_buffer must be provided", response_buffer, buffer_size);
1755
+ return -1;
1756
+ }
1757
+ if (audio_file_path && pcm_buffer && pcm_buffer_size > 0) {
1758
+ CACTUS_LOG_ERROR(component, "Both audio_file_path and pcm_buffer provided");
1759
+ handle_error_response("Cannot provide both audio_file_path and pcm_buffer", response_buffer, buffer_size);
1760
+ return -1;
1761
+ }
1762
+ if (pcm_buffer && pcm_buffer_size > 0 && (pcm_buffer_size < 2 || pcm_buffer_size % 2 != 0)) {
1763
+ CACTUS_LOG_ERROR(component, "Invalid pcm_buffer_size");
1764
+ handle_error_response("pcm_buffer_size must be even and at least 2 bytes", response_buffer, buffer_size);
1765
+ return -1;
1766
+ }
1767
+ return 0;
1768
+ }
1769
+
1770
/**
 * Converts raw 16-bit signed PCM bytes into floats by dividing each sample by 32768.
 *
 * Samples are read in the host's native byte order. Each sample is assembled by
 * copying its two bytes into a local int16_t, so `pcm_buffer` may start at any
 * address (no 2-byte alignment requirement) and no int16_t object is accessed
 * through a pointer of a different type.
 *
 * @param pcm_buffer      Pointer to the PCM bytes; a null pointer yields an empty result.
 * @param pcm_buffer_size Number of bytes available; a trailing odd byte is ignored.
 * @return One float per complete 2-byte sample, in the range [-1.0, 1.0).
 */
inline std::vector<float> pcm_to_float(const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
    if (pcm_buffer == nullptr) return {};

    const size_t n = pcm_buffer_size / 2;
    std::vector<float> out(n);
    for (size_t i = 0; i < n; ++i) {
        int16_t sample;
        // Byte-wise copy into the object representation: well-defined for any
        // source alignment, unlike dereferencing a casted int16_t pointer.
        unsigned char* raw = reinterpret_cast<unsigned char*>(&sample);
        raw[0] = pcm_buffer[2 * i];
        raw[1] = pcm_buffer[2 * i + 1];
        out[i] = static_cast<float>(sample) / 32768.0f;
    }
    return out;
}
1778
+
948
1779
  } // namespace ffi
949
1780
  } // namespace cactus
950
1781
 
@@ -958,4 +1789,4 @@ const char* cactus_get_last_error();
958
1789
  }
959
1790
  #endif
960
1791
 
961
- #endif // CACTUS_UTILS_H
1792
+ #endif // CACTUS_UTILS_H