cactus-react-native 1.10.4 → 1.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +199 -40
- package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
- package/cpp/HybridCactus.cpp +131 -2
- package/cpp/HybridCactus.hpp +15 -0
- package/cpp/cactus_ffi.h +240 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +240 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +940 -109
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +175 -25
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/gemma_tools.h +48 -21
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +79 -7
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +122 -9
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +191 -2
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +240 -2
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +940 -109
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +175 -25
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/gemma_tools.h +48 -21
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +79 -7
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +122 -9
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +191 -2
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
- package/lib/module/classes/{CactusVAD.js → CactusAudio.js} +19 -6
- package/lib/module/classes/CactusAudio.js.map +1 -0
- package/lib/module/classes/CactusLM.js +25 -0
- package/lib/module/classes/CactusLM.js.map +1 -1
- package/lib/module/hooks/{useCactusVAD.js → useCactusAudio.js} +50 -20
- package/lib/module/hooks/useCactusAudio.js.map +1 -0
- package/lib/module/index.js +2 -2
- package/lib/module/index.js.map +1 -1
- package/lib/module/modelRegistry.js +1 -1
- package/lib/module/native/Cactus.js +81 -2
- package/lib/module/native/Cactus.js.map +1 -1
- package/lib/module/types/CactusAudio.js +4 -0
- package/lib/module/types/{CactusVAD.js.map → CactusAudio.js.map} +1 -1
- package/lib/typescript/src/classes/CactusAudio.d.ts +22 -0
- package/lib/typescript/src/classes/CactusAudio.d.ts.map +1 -0
- package/lib/typescript/src/classes/CactusLM.d.ts +2 -1
- package/lib/typescript/src/classes/CactusLM.d.ts.map +1 -1
- package/lib/typescript/src/hooks/useCactusAudio.d.ts +17 -0
- package/lib/typescript/src/hooks/useCactusAudio.d.ts.map +1 -0
- package/lib/typescript/src/index.d.ts +4 -4
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/native/Cactus.d.ts +9 -3
- package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
- package/lib/typescript/src/specs/Cactus.nitro.d.ts +3 -0
- package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
- package/lib/typescript/src/types/CactusAudio.d.ts +63 -0
- package/lib/typescript/src/types/CactusAudio.d.ts.map +1 -0
- package/lib/typescript/src/types/CactusLM.d.ts +15 -0
- package/lib/typescript/src/types/CactusLM.d.ts.map +1 -1
- package/lib/typescript/src/types/CactusSTT.d.ts +1 -0
- package/lib/typescript/src/types/CactusSTT.d.ts.map +1 -1
- package/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +3 -0
- package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +3 -0
- package/package.json +1 -1
- package/src/classes/{CactusVAD.ts → CactusAudio.ts} +32 -13
- package/src/classes/CactusLM.ts +36 -0
- package/src/hooks/{useCactusVAD.ts → useCactusAudio.ts} +65 -28
- package/src/index.tsx +16 -9
- package/src/modelRegistry.ts +1 -1
- package/src/native/Cactus.ts +118 -3
- package/src/specs/Cactus.nitro.ts +16 -0
- package/src/types/CactusAudio.ts +73 -0
- package/src/types/CactusLM.ts +17 -0
- package/src/types/CactusSTT.ts +1 -0
- package/lib/module/classes/CactusVAD.js.map +0 -1
- package/lib/module/hooks/useCactusVAD.js.map +0 -1
- package/lib/module/types/CactusVAD.js +0 -4
- package/lib/typescript/src/classes/CactusVAD.d.ts +0 -20
- package/lib/typescript/src/classes/CactusVAD.d.ts.map +0 -1
- package/lib/typescript/src/hooks/useCactusVAD.d.ts +0 -15
- package/lib/typescript/src/hooks/useCactusVAD.d.ts.map +0 -1
- package/lib/typescript/src/types/CactusVAD.d.ts +0 -34
- package/lib/typescript/src/types/CactusVAD.d.ts.map +0 -1
- package/src/types/CactusVAD.ts +0 -39
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include <string>
|
|
7
7
|
#include <vector>
|
|
8
8
|
#include <unordered_map>
|
|
9
|
+
#include <map>
|
|
9
10
|
#include <stdexcept>
|
|
10
11
|
#include <sstream>
|
|
11
12
|
#include <iomanip>
|
|
@@ -63,6 +64,16 @@ struct CactusModelHandle {
|
|
|
63
64
|
std::unique_ptr<cactus::engine::Model> vad_model;
|
|
64
65
|
std::atomic<bool> should_stop;
|
|
65
66
|
std::vector<uint32_t> processed_tokens;
|
|
67
|
+
// Describes an image by its path together with a last-modified timestamp.
struct ProcessedImage {
    std::string path;
    long long last_modified_timestamp = 0;

    // Entries compare equal only when both the timestamp and the path match.
    bool operator==(const ProcessedImage& other) const {
        if (last_modified_timestamp != other.last_modified_timestamp) {
            return false;
        }
        return path == other.path;
    }
};
|
|
75
|
+
|
|
76
|
+
std::vector<std::vector<ProcessedImage>> processed_images;
|
|
66
77
|
std::mutex model_mutex;
|
|
67
78
|
std::string model_name;
|
|
68
79
|
std::unique_ptr<cactus::engine::index::Index> corpus_index;
|
|
@@ -124,6 +135,66 @@ inline cactus::engine::AudioProcessor::SpectrogramConfig get_parakeet_spectrogra
|
|
|
124
135
|
return cfg;
|
|
125
136
|
}
|
|
126
137
|
|
|
138
|
+
// Builds the base spectrogram settings that get_gemma4_audio_spectrogram_config()
// starts from. Every field not listed here keeps its value-initialized default.
inline cactus::engine::AudioProcessor::SpectrogramConfig get_htk_spectrogram_config() {
    using SpectrogramConfig = cactus::engine::AudioProcessor::SpectrogramConfig;

    SpectrogramConfig config{};

    // Framing and FFT sizes.
    config.n_fft = 321;
    config.frame_length = 320;
    config.fft_override = 1024;
    config.hop_length = 160;

    // Spectrum shape and padding.
    config.power = 1.0f;
    config.center = false;
    config.pad_mode = "constant";
    config.onesided = true;
    config.dither = 0.0f;

    // Mel / log scaling.
    config.mel_floor = 0.001f;
    config.log_mel = "log";
    config.reference = 1.0f;
    config.min_value = 0.001f;

    // Waveform conditioning and window.
    config.remove_dc_offset = false;
    config.hann_periodic = true;

    return config;
}
|
|
157
|
+
|
|
158
|
+
// Derives the Gemma 4 audio spectrogram settings from the HTK base config:
// the FFT length comes from the model config and the additive mel floor is
// switched on.
inline cactus::engine::AudioProcessor::SpectrogramConfig get_gemma4_audio_spectrogram_config(
    const cactus::engine::Config& model_config) {
    cactus::engine::AudioProcessor::SpectrogramConfig config = get_htk_spectrogram_config();
    config.fft_override = model_config.audio_fft_length;
    config.mel_floor_additive = true;
    return config;
}
|
|
165
|
+
|
|
166
|
+
// Builds the spectrogram settings used for the WeSpeaker front end. Every field
// not listed here keeps its value-initialized default.
inline cactus::engine::AudioProcessor::SpectrogramConfig get_wespeaker_spectrogram_config() {
    using SpectrogramConfig = cactus::engine::AudioProcessor::SpectrogramConfig;

    // Smallest positive normal float; used for both floors below.
    constexpr float kTinyFloor = 1.1754944e-38f;

    SpectrogramConfig config{};

    // Framing and FFT sizes.
    config.n_fft = 512;
    config.frame_length = 400;
    config.hop_length = 160;

    // Spectrum shape and padding.
    config.power = 2.0f;
    config.center = false;
    config.pad_mode = "constant";
    config.onesided = true;
    config.dither = 0.0f;

    // Mel / log scaling.
    config.mel_floor = kTinyFloor;
    config.log_mel = "log";
    config.reference = 1.0f;
    config.min_value = kTinyFloor;

    // Waveform conditioning and window.
    config.remove_dc_offset = true;
    config.preemphasis = 0.97f;
    config.hann_periodic = false;
    config.window_a0 = 0.54f;

    return config;
}
|
|
186
|
+
|
|
187
|
+
// Transposes a mel matrix stored as [num_mels][num_frames] (row-major) into
// [num_frames][num_mels]. `mel` must hold at least num_mels * num_frames values.
inline std::vector<float> transpose_mel_to_frame_major(const std::vector<float>& mel,
                                                       size_t num_mels, size_t num_frames) {
    std::vector<float> frame_major(num_frames * num_mels);
    for (size_t frame = 0; frame < num_frames; ++frame) {
        float* row = frame_major.data() + frame * num_mels;
        for (size_t bin = 0; bin < num_mels; ++bin) {
            row[bin] = mel[bin * num_frames + frame];
        }
    }
    return frame_major;
}
|
|
197
|
+
|
|
127
198
|
inline void apply_preemphasis(std::vector<float>& waveform, float coefficient = 0.97f) {
|
|
128
199
|
if (waveform.size() < 2 || coefficient == 0.0f) {
|
|
129
200
|
return;
|
|
@@ -180,6 +251,56 @@ inline void trim_mel_frames(std::vector<float>& mel, size_t num_mels, size_t val
|
|
|
180
251
|
mel.swap(trimmed);
|
|
181
252
|
}
|
|
182
253
|
|
|
254
|
+
// Output of preprocess_audio_for_gemma4(): frame-major mel features plus the
// frame count and the soft-token count derived from it.
struct AudioPreprocessResult {
    std::vector<float> features;
    size_t num_frames{0};
    size_t num_soft_tokens{0};
};
|
|
259
|
+
|
|
260
|
+
// Converts raw audio samples into frame-major mel features for the Gemma 4
// audio path.
//
// audio_samples: waveform, taken by value because it is padded in place.
// model_config:  supplies the mel bin count (audio_input_feat_size) and, via
//                get_gemma4_audio_spectrogram_config(), the FFT length.
//
// Returns an empty result (no features, zero counts) when there is no audio or
// when the model config reports zero mel bins.
inline AudioPreprocessResult preprocess_audio_for_gemma4(
    std::vector<float> audio_samples,
    const cactus::engine::Config& model_config
) {
    AudioPreprocessResult result;

    // mel_bins is used as a divisor further down; bail out instead of
    // dividing by zero on a config without audio features.
    const size_t mel_bins = model_config.audio_input_feat_size;
    if (audio_samples.empty() || mel_bins == 0) return result;

    // Zero-pad the tail so the sample count is a multiple of 320.
    constexpr size_t kPadMultiple = 320;
    const size_t remainder = audio_samples.size() % kPadMultiple;
    if (remainder != 0)
        audio_samples.resize(audio_samples.size() + (kPadMultiple - remainder), 0.0f);

    auto cfg = get_gemma4_audio_spectrogram_config(model_config);

    // Prepend half a frame of silence before computing the spectrogram.
    const size_t semicausal_pad = cfg.frame_length / 2;
    audio_samples.insert(audio_samples.begin(), semicausal_pad, 0.0f);

    cactus::engine::AudioProcessor ap;
    const size_t fft_for_mel = cfg.fft_override > 0 ? cfg.fft_override : cfg.n_fft;
    ap.init_mel_filters(fft_for_mel / 2 + 1, mel_bins, 0.0f, 8000.0f, 16000,
                        nullptr, "htk");
    std::vector<float> mel = ap.compute_spectrogram(audio_samples, cfg);

    result.num_frames = mel.size() / mel_bins;
    result.features = transpose_mel_to_frame_major(mel, mel_bins, result.num_frames);

    // Two successive ceil-halvings of the frame count give the soft-token count.
    const size_t after_stage1 = (result.num_frames + 1) / 2;
    result.num_soft_tokens = (after_stage1 + 1) / 2;

    return result;
}
|
|
291
|
+
|
|
292
|
+
// Converts a buffer of 16-bit signed PCM samples (host byte order) into floats
// scaled by 1/32768.
//
// pcm_buffer:      raw bytes; may start at any address (no alignment required).
// pcm_buffer_size: size in bytes; a trailing odd byte is ignored.
//
// Returns one float per complete sample, or an empty vector when pcm_buffer is
// null.
inline std::vector<float> pcm_buffer_to_float_samples(
    const uint8_t* pcm_buffer, size_t pcm_buffer_size
) {
    constexpr float inv_32768 = 1.0f / 32768.0f;
    const size_t num_samples = pcm_buffer ? pcm_buffer_size / sizeof(int16_t) : 0;
    std::vector<float> waveform_fp32(num_samples);
    for (size_t i = 0; i < num_samples; ++i) {
        // Assemble each sample byte-by-byte instead of casting the buffer to
        // int16_t*: the buffer is not guaranteed to be 2-byte aligned.
        int16_t sample = 0;
        unsigned char* sample_bytes = reinterpret_cast<unsigned char*>(&sample);
        sample_bytes[0] = pcm_buffer[2 * i];
        sample_bytes[1] = pcm_buffer[2 * i + 1];
        waveform_fp32[i] = static_cast<float>(sample) * inv_32768;
    }
    return waveform_fp32;
}
|
|
303
|
+
|
|
183
304
|
} // namespace audio
|
|
184
305
|
} // namespace cactus
|
|
185
306
|
|
|
@@ -226,6 +347,24 @@ struct ToolFunction {
|
|
|
226
347
|
std::unordered_map<std::string, std::string> parameters;
|
|
227
348
|
};
|
|
228
349
|
|
|
350
|
+
// Bundle of inference settings with their default values. Member order is part
// of the aggregate's layout and is kept as declared.
struct InferenceOptions {
    // Sampling and scoring.
    float temperature{0.0f};
    float top_p{0.0f};
    float confidence_threshold{0.7f};

    // Counts and limits.
    size_t top_k{0};
    size_t max_tokens{100};
    size_t tool_rag_top_k{2};
    size_t cloud_timeout_ms{15000};

    std::vector<std::string> stop_sequences;

    // Feature switches.
    bool force_tools{false};
    bool include_stop_sequences{false};
    bool use_vad{true};
    bool telemetry_enabled{true};
    bool auto_handoff{true};
    bool handoff_with_images{true};
    bool enable_thinking_if_supported{true};
};
|
|
367
|
+
|
|
229
368
|
} // namespace ffi
|
|
230
369
|
} // namespace cactus
|
|
231
370
|
|
|
@@ -262,6 +401,24 @@ inline std::string trim_string(const std::string& s) {
|
|
|
262
401
|
return s.substr(start, end - start);
|
|
263
402
|
}
|
|
264
403
|
|
|
404
|
+
// Finds the end of a bracketed region in `s`.
//
// pos:        index of the opening delimiter (the scan starts at pos + 1).
// open/close: delimiter pair to balance, e.g. '{' '}' or '[' ']'.
//
// Delimiters inside double-quoted strings are ignored, and a backslash inside
// a string skips the character that follows it.
//
// Returns the index one past the matching `close`. If the region is never
// closed, returns s.length(); the result is never greater than s.length().
inline size_t find_matching_delimiter(const std::string& s, size_t pos, char open, char close) {
    const size_t len = s.length();
    int depth = 1;
    pos++;
    while (pos < len && depth > 0) {
        const char c = s[pos];
        if (c == open) {
            depth++;
        } else if (c == close) {
            depth--;
        } else if (c == '"') {
            // Skip the string body; stop on its closing quote (consumed below).
            pos++;
            while (pos < len && s[pos] != '"') {
                if (s[pos] == '\\') pos++;
                pos++;
            }
        }
        pos++;
    }
    // An unterminated string or trailing backslash can push pos past the end;
    // clamp so callers always get a valid end offset.
    return pos < len ? pos : len;
}
|
|
421
|
+
|
|
265
422
|
inline std::string env_or_default(const char* key, const char* fallback) {
|
|
266
423
|
const char* v = std::getenv(key);
|
|
267
424
|
if (v && v[0] != '\0') return std::string(v);
|
|
@@ -377,6 +534,119 @@ inline std::string serialize_tools_json(const std::vector<ToolFunction>& tools)
|
|
|
377
534
|
return oss.str();
|
|
378
535
|
}
|
|
379
536
|
|
|
537
|
+
// Minimal JSON re-serializer: re-emits a JSON text with object keys in sorted
// order, ", " between elements and ": " between key and value. It is lenient:
// malformed input is not rejected, it is passed through as far as possible.
namespace json_sorted {

// Advances p past any whitespace.
inline void skip_ws(const std::string& s, size_t& p) {
    while (p < s.size() && std::isspace(static_cast<unsigned char>(s[p]))) p++;
}

// Reads the string starting at s[p] (expected to be the opening quote) and
// returns it verbatim, quotes and escape sequences included. An unterminated
// string is returned without a closing quote.
inline std::string parse_string(const std::string& s, size_t& p) {
    std::string r = "\"";
    p++;
    while (p < s.size()) {
        const char c = s[p++];
        r += c;
        if (c == '\\') {
            // Keep the escaped character as-is so an escaped quote does not end the string.
            if (p < s.size()) r += s[p++];
        } else if (c == '"') {
            return r;
        }
    }
    return r;
}

inline std::string parse_value(const std::string& s, size_t& p);

// Parses the object starting at s[p] == '{' and re-emits it with keys sorted.
// When a key appears more than once, the last value wins.
inline std::string parse_object(const std::string& s, size_t& p) {
    p++;
    std::map<std::string, std::string> entries;
    skip_ws(s, p);
    while (p < s.size() && s[p] != '}') {
        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
        std::string key = parse_string(s, p);
        skip_ws(s, p);
        if (p < s.size() && s[p] == ':') p++;
        skip_ws(s, p);
        std::string val = parse_value(s, p);
        entries[key] = val;
        skip_ws(s, p);
    }
    if (p < s.size()) p++;
    std::string r = "{";
    bool first = true;
    for (const auto& kv : entries) {
        if (!first) r += ", ";
        r += kv.first + ": " + kv.second;
        first = false;
    }
    r += "}";
    return r;
}

// Parses the array starting at s[p] == '[' and re-emits it with normalized
// separators. Element order is preserved.
inline std::string parse_array(const std::string& s, size_t& p) {
    p++;
    std::vector<std::string> items;
    skip_ws(s, p);
    while (p < s.size() && s[p] != ']') {
        if (s[p] == ',') { p++; skip_ws(s, p); continue; }
        const size_t before = p;
        std::string item = parse_value(s, p);
        if (p == before) {
            // parse_value consumed nothing (a stray '}' inside the array).
            // Skip it, otherwise this loop would never terminate.
            p++;
            skip_ws(s, p);
            continue;
        }
        items.push_back(item);
        skip_ws(s, p);
    }
    if (p < s.size()) p++;
    std::string r = "[";
    for (size_t i = 0; i < items.size(); i++) {
        if (i > 0) r += ", ";
        r += items[i];
    }
    r += "]";
    return r;
}

// Parses one JSON value at s[p]. Strings, objects and arrays are delegated;
// anything else (numbers, true/false/null) is copied up to the next ',', '}',
// ']' or whitespace. Returns "" at end of input.
inline std::string parse_value(const std::string& s, size_t& p) {
    skip_ws(s, p);
    if (p >= s.size()) return "";
    if (s[p] == '"') return parse_string(s, p);
    if (s[p] == '{') return parse_object(s, p);
    if (s[p] == '[') return parse_array(s, p);
    size_t start = p;
    while (p < s.size() && s[p] != ',' && s[p] != '}' && s[p] != ']' && !std::isspace(static_cast<unsigned char>(s[p]))) p++;
    return s.substr(start, p - start);
}

// Entry point: returns the first JSON value in `json`, re-serialized.
inline std::string reformat(const std::string& json) {
    size_t p = 0;
    return parse_value(json, p);
}

} // namespace json_sorted
|
|
625
|
+
|
|
626
|
+
inline std::string serialize_tools_for_template(const std::vector<ToolFunction>& tools) {
|
|
627
|
+
if (tools.empty()) return "";
|
|
628
|
+
std::string result;
|
|
629
|
+
for (const auto& tool : tools) {
|
|
630
|
+
std::map<std::string, std::string> func_fields;
|
|
631
|
+
func_fields["\"description\""] = "\"" + escape_json_string(tool.description) + "\"";
|
|
632
|
+
func_fields["\"name\""] = "\"" + escape_json_string(tool.name) + "\"";
|
|
633
|
+
auto it = tool.parameters.find("schema");
|
|
634
|
+
if (it != tool.parameters.end()) {
|
|
635
|
+
func_fields["\"parameters\""] = json_sorted::reformat(it->second);
|
|
636
|
+
}
|
|
637
|
+
std::string func_json = "{";
|
|
638
|
+
bool first = true;
|
|
639
|
+
for (const auto& kv : func_fields) {
|
|
640
|
+
if (!first) func_json += ", ";
|
|
641
|
+
func_json += kv.first + ": " + kv.second;
|
|
642
|
+
first = false;
|
|
643
|
+
}
|
|
644
|
+
func_json += "}";
|
|
645
|
+
result += "\n{\"function\": " + func_json + ", \"type\": \"function\"}";
|
|
646
|
+
}
|
|
647
|
+
return result;
|
|
648
|
+
}
|
|
649
|
+
|
|
380
650
|
inline void handle_error_response(const std::string& error_message, char* response_buffer, size_t buffer_size) {
|
|
381
651
|
std::ostringstream json;
|
|
382
652
|
json << "{";
|
|
@@ -401,10 +671,12 @@ inline void handle_error_response(const std::string& error_message, char* respon
|
|
|
401
671
|
}
|
|
402
672
|
}
|
|
403
673
|
|
|
404
|
-
inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
|
|
405
|
-
std::vector<std::string>& out_image_paths
|
|
674
|
+
inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
|
|
675
|
+
std::vector<std::string>& out_image_paths,
|
|
676
|
+
std::vector<std::string>* out_audio_paths = nullptr) {
|
|
406
677
|
std::vector<cactus::engine::ChatMessage> messages;
|
|
407
678
|
out_image_paths.clear();
|
|
679
|
+
if (out_audio_paths) out_audio_paths->clear();
|
|
408
680
|
|
|
409
681
|
size_t pos = json.find('[');
|
|
410
682
|
if (pos == std::string::npos) {
|
|
@@ -457,39 +729,111 @@ inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::s
|
|
|
457
729
|
}
|
|
458
730
|
}
|
|
459
731
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
size_t
|
|
463
|
-
if (
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
732
|
+
auto parse_path_array = [&](const char* key, std::vector<std::string>& dest,
|
|
733
|
+
std::vector<std::string>* out_paths) {
|
|
734
|
+
size_t key_pos = json.find(key, pos);
|
|
735
|
+
if (key_pos == std::string::npos || key_pos >= obj_end) return;
|
|
736
|
+
size_t array_start = json.find('[', key_pos);
|
|
737
|
+
if (array_start == std::string::npos || array_start >= obj_end) return;
|
|
738
|
+
size_t array_end = json.find(']', array_start);
|
|
739
|
+
if (array_end == std::string::npos || array_end >= obj_end) return;
|
|
740
|
+
size_t cur = array_start;
|
|
741
|
+
while (true) {
|
|
742
|
+
cur = json.find('"', cur + 1);
|
|
743
|
+
if (cur == std::string::npos || cur >= array_end) break;
|
|
744
|
+
size_t str_start = cur + 1;
|
|
745
|
+
size_t str_end = json.find('"', str_start);
|
|
746
|
+
if (str_end == std::string::npos || str_end > array_end) break;
|
|
747
|
+
std::string path = std::filesystem::absolute(
|
|
748
|
+
std::filesystem::path(json.substr(str_start, str_end - str_start))).string();
|
|
749
|
+
dest.push_back(path);
|
|
750
|
+
if (out_paths) out_paths->push_back(path);
|
|
751
|
+
cur = str_end;
|
|
752
|
+
}
|
|
753
|
+
};
|
|
754
|
+
|
|
755
|
+
parse_path_array("\"images\"", msg.images, &out_image_paths);
|
|
756
|
+
parse_path_array("\"audio\"", msg.audio, out_audio_paths);
|
|
757
|
+
|
|
758
|
+
if (msg.role == "tool") {
|
|
759
|
+
size_t name_pos = json.find("\"name\"", obj_start);
|
|
760
|
+
if (name_pos != std::string::npos && name_pos < obj_end) {
|
|
761
|
+
size_t name_quote = json.find('"', name_pos + 6);
|
|
762
|
+
if (name_quote != std::string::npos && name_quote < obj_end) {
|
|
763
|
+
size_t name_start = name_quote + 1;
|
|
764
|
+
size_t name_end = json.find('"', name_start);
|
|
765
|
+
if (name_end != std::string::npos && name_end < obj_end) {
|
|
766
|
+
msg.name = json.substr(name_start, name_end - name_start);
|
|
483
767
|
}
|
|
484
768
|
}
|
|
485
769
|
}
|
|
486
770
|
}
|
|
487
|
-
|
|
771
|
+
|
|
772
|
+
size_t tool_calls_pos = json.find("\"tool_calls\"", obj_start);
|
|
773
|
+
if (tool_calls_pos != std::string::npos && tool_calls_pos < obj_end) {
|
|
774
|
+
size_t tool_calls_arr_start = json.find('[', tool_calls_pos);
|
|
775
|
+
if (tool_calls_arr_start != std::string::npos && tool_calls_arr_start < obj_end) {
|
|
776
|
+
size_t tool_calls_arr_end = find_matching_delimiter(json, tool_calls_arr_start, '[', ']');
|
|
777
|
+
|
|
778
|
+
size_t search_pos = tool_calls_arr_start;
|
|
779
|
+
while (true) {
|
|
780
|
+
size_t func_pos = json.find("\"function\"", search_pos);
|
|
781
|
+
if (func_pos == std::string::npos || func_pos >= tool_calls_arr_end) break;
|
|
782
|
+
|
|
783
|
+
size_t func_obj_start = json.find('{', func_pos + 10);
|
|
784
|
+
if (func_obj_start == std::string::npos || func_obj_start >= tool_calls_arr_end) break;
|
|
785
|
+
|
|
786
|
+
size_t func_obj_end = find_matching_delimiter(json, func_obj_start, '{', '}');
|
|
787
|
+
|
|
788
|
+
cactus::engine::ToolCallInfo tool_call;
|
|
789
|
+
|
|
790
|
+
size_t fn_name_pos = json.find("\"name\"", func_obj_start);
|
|
791
|
+
if (fn_name_pos != std::string::npos && fn_name_pos < func_obj_end) {
|
|
792
|
+
size_t fn_name_quote = json.find('"', fn_name_pos + 6);
|
|
793
|
+
if (fn_name_quote != std::string::npos && fn_name_quote < func_obj_end) {
|
|
794
|
+
size_t fn_name_start = fn_name_quote + 1;
|
|
795
|
+
size_t fn_name_end = json.find('"', fn_name_start);
|
|
796
|
+
if (fn_name_end != std::string::npos && fn_name_end < func_obj_end) {
|
|
797
|
+
tool_call.name = json.substr(fn_name_start, fn_name_end - fn_name_start);
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
size_t args_pos = json.find("\"arguments\"", func_obj_start);
|
|
803
|
+
if (args_pos != std::string::npos && args_pos < func_obj_end) {
|
|
804
|
+
size_t colon_pos = json.find(':', args_pos + 11);
|
|
805
|
+
if (colon_pos != std::string::npos && colon_pos < func_obj_end) {
|
|
806
|
+
size_t args_start = colon_pos + 1;
|
|
807
|
+
while (args_start < json.length() && std::isspace(static_cast<unsigned char>(json[args_start]))) args_start++;
|
|
808
|
+
|
|
809
|
+
if (args_start < func_obj_end && json[args_start] == '{') {
|
|
810
|
+
size_t args_end = find_matching_delimiter(json, args_start, '{', '}');
|
|
811
|
+
tool_call.arguments = json.substr(args_start, args_end - args_start);
|
|
812
|
+
} else if (args_start < func_obj_end && json[args_start] == '"') {
|
|
813
|
+
size_t str_start = args_start + 1;
|
|
814
|
+
size_t str_end = str_start;
|
|
815
|
+
while (str_end < json.length() && json[str_end] != '"') {
|
|
816
|
+
if (json[str_end] == '\\') str_end++;
|
|
817
|
+
str_end++;
|
|
818
|
+
}
|
|
819
|
+
tool_call.arguments = json.substr(str_start, str_end - str_start);
|
|
820
|
+
}
|
|
821
|
+
}
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
if (!tool_call.name.empty()) {
|
|
825
|
+
msg.tool_calls.push_back(tool_call);
|
|
826
|
+
}
|
|
827
|
+
search_pos = func_obj_end;
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
|
|
488
832
|
messages.push_back(msg);
|
|
489
|
-
|
|
833
|
+
|
|
490
834
|
pos = json.find('{', obj_end);
|
|
491
835
|
}
|
|
492
|
-
|
|
836
|
+
|
|
493
837
|
return messages;
|
|
494
838
|
}
|
|
495
839
|
|
|
@@ -538,128 +882,433 @@ inline std::vector<ToolFunction> parse_tools_json(const std::string& json) {
|
|
|
538
882
|
|
|
539
883
|
pos = json.find("\"function\"", name_pos);
|
|
540
884
|
}
|
|
541
|
-
|
|
885
|
+
|
|
542
886
|
return tools;
|
|
543
887
|
}
|
|
544
888
|
|
|
545
|
-
inline
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
if (
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
889
|
+
// Looks up `"key":` in `json` and parses the number that follows.
//
// json:      text to search; the first occurrence of the key is used.
// key:       field name without quotes. The colon must directly follow the
//            closing quote of the key.
// out_value: receives the parsed value; left untouched on failure.
//
// Returns true when a float was parsed, false when the key is absent or the
// value does not start with a number.
inline bool try_parse_json_float(const std::string& json, const std::string& key, float& out_value) {
    const std::string pattern = "\"" + key + "\":";
    const size_t pos = json.find(pattern);
    if (pos == std::string::npos) return false;

    size_t start = pos + pattern.size();
    while (start < json.size() && std::isspace(static_cast<unsigned char>(json[start]))) ++start;

    // The value runs up to the next JSON delimiter or whitespace character.
    size_t end = json.find_first_of(",}] \t\n\r", start);
    if (end == std::string::npos) end = json.size();

    try {
        out_value = std::stof(json.substr(start, end - start));
        return true;
    } catch (...) {
        // std::stof throws on non-numeric or out-of-range text; report as "not parsed".
        return false;
    }
}
|
|
907
|
+
|
|
908
|
+
// Extracts the string elements of the array stored under `"key":` in `json`.
//
// Returns the decoded strings in order. Returns an empty vector when the key is
// missing, the value is not an array, or the array is never closed. Extraction
// stops at the first element that is not a string. Simple escapes (\" \\ \/ \b
// \f \n \r \t) are decoded; any other escaped character is kept as the
// character itself.
inline std::vector<std::string> parse_json_string_array_field(const std::string& json, const std::string& key) {
    std::vector<std::string> values;

    const std::string needle = "\"" + key + "\":";
    const size_t key_at = json.find(needle);
    if (key_at == std::string::npos) return values;

    size_t open_at = key_at + needle.size();
    while (open_at < json.size() && std::isspace(static_cast<unsigned char>(json[open_at]))) ++open_at;
    if (open_at >= json.size() || json[open_at] != '[') return values;

    // Locate the bracket that closes the array, ignoring brackets inside strings.
    size_t past_close = open_at + 1;
    int nesting = 1;
    bool inside_string = false;
    bool after_backslash = false;
    for (; past_close < json.size() && nesting > 0; ++past_close) {
        const char c = json[past_close];
        if (!inside_string) {
            if (c == '"') inside_string = true;
            else if (c == '[') ++nesting;
            else if (c == ']') --nesting;
        } else if (after_backslash) {
            after_backslash = false;
        } else if (c == '\\') {
            after_backslash = true;
        } else if (c == '"') {
            inside_string = false;
        }
    }
    if (nesting != 0) return values;

    const std::string body = json.substr(open_at, past_close - open_at);
    if (body.size() < 2 || body.front() != '[' || body.back() != ']') return values;

    const auto is_separator = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) || c == ',';
    };
    const auto unescape = [](char c) -> char {
        switch (c) {
            case 'b': return '\b';
            case 'f': return '\f';
            case 'n': return '\n';
            case 'r': return '\r';
            case 't': return '\t';
            default:  return c;  // covers '"', '\\', '/' and unknown escapes
        }
    };

    // Walk the elements between the outer brackets.
    size_t cursor = 1;
    while (cursor + 1 < body.size()) {
        while (cursor + 1 < body.size() && is_separator(body[cursor])) ++cursor;
        if (cursor + 1 >= body.size() || body[cursor] == ']') break;
        if (body[cursor] != '"') break;
        ++cursor;

        std::string item;
        bool pending_escape = false;
        while (cursor < body.size()) {
            const char c = body[cursor++];
            if (pending_escape) {
                item.push_back(unescape(c));
                pending_escape = false;
            } else if (c == '\\') {
                pending_escape = true;
            } else if (c == '"') {
                values.push_back(item);
                break;
            } else {
                item.push_back(c);
            }
        }
    }

    return values;
}
|
|
984
|
+
|
|
985
|
+
inline void parse_custom_vocabulary_options(const std::string& json,
|
|
986
|
+
std::vector<std::string>& custom_vocabulary,
|
|
987
|
+
float& vocabulary_boost) {
|
|
988
|
+
custom_vocabulary.clear();
|
|
989
|
+
vocabulary_boost = 5.0f;
|
|
573
990
|
if (json.empty()) return;
|
|
574
991
|
|
|
992
|
+
float parsed_boost = vocabulary_boost;
|
|
993
|
+
if (try_parse_json_float(json, "vocabulary_boost", parsed_boost)) {
|
|
994
|
+
vocabulary_boost = std::clamp(parsed_boost, 0.0f, 20.0f);
|
|
995
|
+
}
|
|
996
|
+
|
|
997
|
+
custom_vocabulary = parse_json_string_array_field(json, "custom_vocabulary");
|
|
998
|
+
}
|
|
999
|
+
|
|
1000
|
+
// Builds a token-id -> bias map from already-tokenized entries.
//
// vocabulary_boost is clamped to [0, 20]. A clamped boost of exactly 0 yields
// an empty map; otherwise every token id that appears in any entry is mapped to
// the clamped boost.
inline std::unordered_map<uint32_t, float> build_token_bias_map(const std::vector<std::vector<uint32_t>>& tokenized_entries,
                                                                float vocabulary_boost) {
    const float boost = std::clamp(vocabulary_boost, 0.0f, 20.0f);

    std::unordered_map<uint32_t, float> bias_by_token;
    if (boost != 0.0f) {
        for (const std::vector<uint32_t>& entry_tokens : tokenized_entries) {
            for (const uint32_t id : entry_tokens) {
                // New ids start at 0 and are raised to the boost.
                auto slot = bias_by_token.try_emplace(id, 0.0f).first;
                if (slot->second < boost) {
                    slot->second = boost;
                }
            }
        }
    }
    return bias_by_token;
}
|
|
1017
|
+
|
|
1018
|
+
// Tokenizes each non-empty custom vocabulary entry with `tokenizer` and turns
// the resulting token ids into a bias map via build_token_bias_map().
// Returns an empty map when `tokenizer` is null or the vocabulary is empty.
inline std::unordered_map<uint32_t, float> build_custom_vocabulary_bias(cactus::engine::Tokenizer* tokenizer,
                                                                        const std::vector<std::string>& custom_vocabulary,
                                                                        float vocabulary_boost) {
    if (tokenizer == nullptr || custom_vocabulary.empty()) {
        return {};
    }

    std::vector<std::vector<uint32_t>> encoded_terms;
    encoded_terms.reserve(custom_vocabulary.size());
    for (const std::string& term : custom_vocabulary) {
        if (!term.empty()) {
            encoded_terms.push_back(tokenizer->encode(term));
        }
    }

    return build_token_bias_map(encoded_terms, vocabulary_boost);
}
|
|
1032
|
+
|
|
1033
|
+
// Parses the custom-vocabulary options from `json` and installs the resulting
// token bias on `model` through set_vocab_bias(). Does nothing for a null model.
inline void apply_custom_vocabulary_options(cactus::engine::Model* model, const std::string& json) {
    if (model == nullptr) {
        return;
    }

    float boost = 5.0f;
    std::vector<std::string> vocabulary;
    parse_custom_vocabulary_options(json, vocabulary, boost);

    model->set_vocab_bias(
        build_custom_vocabulary_bias(model->get_tokenizer(), vocabulary, boost));
}
|
|
1041
|
+
|
|
1042
|
+
// Case-insensitive Levenshtein edit distance between `a` and `b`, comparing
// bytes after std::tolower. Uses two rolling rows of b.size() + 1 cells.
inline size_t levenshtein_ci(const std::string& a, const std::string& b) {
    const auto fold = [](char c) { return std::tolower(static_cast<unsigned char>(c)); };

    const size_t rows = a.size();
    const size_t cols = b.size();

    std::vector<size_t> above(cols + 1);
    std::vector<size_t> current(cols + 1);
    for (size_t col = 0; col <= cols; ++col) {
        above[col] = col;
    }

    for (size_t row = 1; row <= rows; ++row) {
        current[0] = row;
        const int lhs = fold(a[row - 1]);
        for (size_t col = 1; col <= cols; ++col) {
            const size_t deletion = above[col] + 1;
            const size_t insertion = current[col - 1] + 1;
            const size_t substitution = above[col - 1] + (lhs == fold(b[col - 1]) ? 0 : 1);
            current[col] = std::min(deletion, std::min(insertion, substitution));
        }
        above.swap(current);
    }
    return above[cols];
}
|
|
1057
|
+
|
|
1058
|
+
// Returns `s` with every ' ' character removed. Other whitespace (tabs,
// newlines) is kept.
inline std::string collapse_spaces(const std::string& s) {
    std::string compact;
    compact.reserve(s.size());
    for (size_t i = 0; i < s.size(); ++i) {
        if (s[i] == ' ') continue;
        compact.push_back(s[i]);
    }
    return compact;
}
|
|
1066
|
+
|
|
1067
|
+
inline void apply_vocabulary_spelling_correction(
|
|
1068
|
+
std::string& text,
|
|
1069
|
+
const std::vector<std::string>& custom_vocabulary)
|
|
1070
|
+
{
|
|
1071
|
+
if (custom_vocabulary.empty() || text.empty()) return;
|
|
1072
|
+
|
|
1073
|
+
struct VocabEntry {
|
|
1074
|
+
const std::string* original;
|
|
1075
|
+
std::string collapsed;
|
|
1076
|
+
};
|
|
1077
|
+
std::vector<VocabEntry> vocab_entries;
|
|
1078
|
+
vocab_entries.reserve(custom_vocabulary.size());
|
|
1079
|
+
for (const auto& v : custom_vocabulary) {
|
|
1080
|
+
vocab_entries.push_back({&v, collapse_spaces(v)});
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
struct Token { std::string text; bool is_word; };
|
|
1084
|
+
std::vector<Token> tokens;
|
|
1085
|
+
size_t pos = 0;
|
|
1086
|
+
while (pos < text.size()) {
|
|
1087
|
+
if (std::isalnum(static_cast<unsigned char>(text[pos])) ||
|
|
1088
|
+
text[pos] == '\'' || text[pos] == '-') {
|
|
1089
|
+
size_t start = pos;
|
|
1090
|
+
while (pos < text.size() && (std::isalnum(static_cast<unsigned char>(text[pos])) ||
|
|
1091
|
+
text[pos] == '\'' || text[pos] == '-')) {
|
|
1092
|
+
++pos;
|
|
1093
|
+
}
|
|
1094
|
+
tokens.push_back({text.substr(start, pos - start), true});
|
|
1095
|
+
} else {
|
|
1096
|
+
size_t start = pos;
|
|
1097
|
+
while (pos < text.size() && !std::isalnum(static_cast<unsigned char>(text[pos])) &&
|
|
1098
|
+
text[pos] != '\'' && text[pos] != '-') {
|
|
1099
|
+
++pos;
|
|
1100
|
+
}
|
|
1101
|
+
tokens.push_back({text.substr(start, pos - start), false});
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
std::vector<size_t> word_indices;
|
|
1106
|
+
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
1107
|
+
if (tokens[i].is_word) word_indices.push_back(i);
|
|
1108
|
+
}
|
|
1109
|
+
|
|
1110
|
+
std::vector<bool> consumed(tokens.size(), false);
|
|
1111
|
+
|
|
1112
|
+
auto strip_suffix = [](const std::string& word) -> std::pair<std::string, std::string> {
|
|
1113
|
+
if (word.size() >= 3 && word.substr(word.size() - 2) == "'s") {
|
|
1114
|
+
return {word.substr(0, word.size() - 2), "'s"};
|
|
1115
|
+
}
|
|
1116
|
+
if (word.size() >= 3 && word.substr(word.size() - 2) == "'t") {
|
|
1117
|
+
return {word.substr(0, word.size() - 2), "'t"};
|
|
1118
|
+
}
|
|
1119
|
+
if (word.size() >= 4 && word.back() == 's' &&
|
|
1120
|
+
word[word.size() - 2] != 's' && // avoid stripping from "boss", "class"
|
|
1121
|
+
std::isalpha(static_cast<unsigned char>(word[word.size() - 2]))) {
|
|
1122
|
+
return {word.substr(0, word.size() - 1), "s"};
|
|
1123
|
+
}
|
|
1124
|
+
return {word, ""};
|
|
1125
|
+
};
|
|
1126
|
+
|
|
1127
|
+
size_t wi = 0;
|
|
1128
|
+
while (wi < word_indices.size()) {
|
|
1129
|
+
size_t best_dist = std::numeric_limits<size_t>::max();
|
|
1130
|
+
const std::string* best_match = nullptr;
|
|
1131
|
+
size_t best_window = 0;
|
|
1132
|
+
size_t best_first_token = 0;
|
|
1133
|
+
size_t best_last_token = 0;
|
|
1134
|
+
std::string best_suffix;
|
|
1135
|
+
|
|
1136
|
+
for (size_t window = std::min<size_t>(3, word_indices.size() - wi); window >= 1; --window) {
|
|
1137
|
+
std::string window_collapsed;
|
|
1138
|
+
const size_t first_tok = word_indices[wi];
|
|
1139
|
+
const size_t last_tok = word_indices[wi + window - 1];
|
|
1140
|
+
for (size_t w = 0; w < window; ++w) {
|
|
1141
|
+
window_collapsed += tokens[word_indices[wi + w]].text;
|
|
1142
|
+
}
|
|
1143
|
+
|
|
1144
|
+
if (window == 1 && window_collapsed.size() < 3) break;
|
|
1145
|
+
|
|
1146
|
+
auto [stem, suffix] = strip_suffix(window_collapsed);
|
|
1147
|
+
const std::string* candidates[] = {&window_collapsed, &stem};
|
|
1148
|
+
const std::string suffixes[] = {"", suffix};
|
|
1149
|
+
const size_t num_candidates = suffix.empty() ? 1 : 2;
|
|
1150
|
+
|
|
1151
|
+
for (size_t ci = 0; ci < num_candidates; ++ci) {
|
|
1152
|
+
const std::string& candidate = *candidates[ci];
|
|
1153
|
+
if (candidate.empty()) continue;
|
|
1154
|
+
|
|
1155
|
+
for (const auto& entry : vocab_entries) {
|
|
1156
|
+
const size_t wlen = candidate.size();
|
|
1157
|
+
const size_t vlen = entry.collapsed.size();
|
|
1158
|
+
|
|
1159
|
+
const size_t len_diff = wlen > vlen ? wlen - vlen : vlen - wlen;
|
|
1160
|
+
const size_t max_dist = std::max<size_t>(1, std::min(wlen, vlen) / 3);
|
|
1161
|
+
if (len_diff > max_dist) continue;
|
|
1162
|
+
|
|
1163
|
+
const size_t dist = levenshtein_ci(candidate, entry.collapsed);
|
|
1164
|
+
|
|
1165
|
+
// For single-edit corrections, require first char match to prevent
|
|
1166
|
+
// false positives like "vortex" → "Cortex".
|
|
1167
|
+
if (dist == 1 && window == 1) {
|
|
1168
|
+
const bool first_char_match =
|
|
1169
|
+
std::tolower(static_cast<unsigned char>(candidate[0])) ==
|
|
1170
|
+
std::tolower(static_cast<unsigned char>(entry.collapsed[0]));
|
|
1171
|
+
if (!first_char_match) continue;
|
|
1172
|
+
}
|
|
1173
|
+
|
|
1174
|
+
if (dist <= max_dist && dist < best_dist) {
|
|
1175
|
+
best_dist = dist;
|
|
1176
|
+
best_match = entry.original;
|
|
1177
|
+
best_window = window;
|
|
1178
|
+
best_first_token = first_tok;
|
|
1179
|
+
best_last_token = last_tok;
|
|
1180
|
+
best_suffix = suffixes[ci];
|
|
1181
|
+
}
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
if (best_dist == 0) break;
|
|
1186
|
+
}
|
|
1187
|
+
|
|
1188
|
+
// Allow dist==0 for multi-word merges where word boundaries changed.
|
|
1189
|
+
const bool should_replace = best_match &&
|
|
1190
|
+
best_dist != std::numeric_limits<size_t>::max() &&
|
|
1191
|
+
(best_dist > 0 || best_window > 1);
|
|
1192
|
+
|
|
1193
|
+
if (should_replace) {
|
|
1194
|
+
tokens[best_first_token].text = *best_match + best_suffix;
|
|
1195
|
+
for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {
|
|
1196
|
+
consumed[t] = true;
|
|
1197
|
+
}
|
|
1198
|
+
for (size_t t = best_first_token + 1; t <= best_last_token; ++t) {
|
|
1199
|
+
if (t > 0) consumed[t - 1] = consumed[t - 1] || !tokens[t - 1].is_word;
|
|
1200
|
+
}
|
|
1201
|
+
wi += best_window;
|
|
1202
|
+
} else {
|
|
1203
|
+
++wi;
|
|
1204
|
+
}
|
|
1205
|
+
}
|
|
1206
|
+
|
|
1207
|
+
std::string result;
|
|
1208
|
+
result.reserve(text.size());
|
|
1209
|
+
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
1210
|
+
if (!consumed[i]) {
|
|
1211
|
+
result += tokens[i].text;
|
|
1212
|
+
}
|
|
1213
|
+
}
|
|
1214
|
+
|
|
1215
|
+
text = std::move(result);
|
|
1216
|
+
}
|
|
1217
|
+
|
|
1218
|
+
inline InferenceOptions parse_inference_options_json(const std::string& json) {
|
|
1219
|
+
InferenceOptions options;
|
|
1220
|
+
|
|
1221
|
+
if (json.empty()) return options;
|
|
1222
|
+
|
|
575
1223
|
size_t pos = json.find("\"temperature\"");
|
|
576
1224
|
if (pos != std::string::npos) {
|
|
577
1225
|
pos = json.find(':', pos) + 1;
|
|
578
|
-
temperature = std::stof(json.substr(pos));
|
|
1226
|
+
options.temperature = std::stof(json.substr(pos));
|
|
579
1227
|
}
|
|
580
1228
|
|
|
581
1229
|
pos = json.find("\"top_p\"");
|
|
582
1230
|
if (pos != std::string::npos) {
|
|
583
1231
|
pos = json.find(':', pos) + 1;
|
|
584
|
-
top_p = std::stof(json.substr(pos));
|
|
1232
|
+
options.top_p = std::stof(json.substr(pos));
|
|
585
1233
|
}
|
|
586
1234
|
|
|
587
1235
|
pos = json.find("\"top_k\"");
|
|
588
1236
|
if (pos != std::string::npos) {
|
|
589
1237
|
pos = json.find(':', pos) + 1;
|
|
590
|
-
top_k = std::stoul(json.substr(pos));
|
|
1238
|
+
options.top_k = std::stoul(json.substr(pos));
|
|
591
1239
|
}
|
|
592
1240
|
|
|
593
1241
|
pos = json.find("\"max_tokens\"");
|
|
594
1242
|
if (pos != std::string::npos) {
|
|
595
1243
|
pos = json.find(':', pos) + 1;
|
|
596
|
-
max_tokens = std::stoul(json.substr(pos));
|
|
1244
|
+
options.max_tokens = std::stoul(json.substr(pos));
|
|
597
1245
|
}
|
|
598
1246
|
|
|
599
1247
|
pos = json.find("\"force_tools\"");
|
|
600
1248
|
if (pos != std::string::npos) {
|
|
601
1249
|
pos = json.find(':', pos) + 1;
|
|
602
|
-
while (pos < json.length() && std::isspace(json[pos])) pos++;
|
|
603
|
-
force_tools = (json.substr(pos, 4) == "true");
|
|
1250
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1251
|
+
options.force_tools = (json.substr(pos, 4) == "true");
|
|
604
1252
|
}
|
|
605
1253
|
|
|
606
1254
|
pos = json.find("\"tool_rag_top_k\"");
|
|
607
1255
|
if (pos != std::string::npos) {
|
|
608
1256
|
pos = json.find(':', pos) + 1;
|
|
609
|
-
tool_rag_top_k = std::stoul(json.substr(pos));
|
|
1257
|
+
options.tool_rag_top_k = std::stoul(json.substr(pos));
|
|
610
1258
|
}
|
|
611
1259
|
|
|
612
1260
|
pos = json.find("\"confidence_threshold\"");
|
|
613
1261
|
if (pos != std::string::npos) {
|
|
614
1262
|
pos = json.find(':', pos) + 1;
|
|
615
|
-
confidence_threshold = std::stof(json.substr(pos));
|
|
1263
|
+
options.confidence_threshold = std::stof(json.substr(pos));
|
|
616
1264
|
}
|
|
617
1265
|
|
|
618
1266
|
pos = json.find("\"include_stop_sequences\"");
|
|
619
1267
|
if (pos != std::string::npos) {
|
|
620
1268
|
pos = json.find(':', pos) + 1;
|
|
621
|
-
while (pos < json.length() && std::isspace(json[pos])) pos++;
|
|
622
|
-
include_stop_sequences = (json.substr(pos, 4) == "true");
|
|
1269
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1270
|
+
options.include_stop_sequences = (json.substr(pos, 4) == "true");
|
|
623
1271
|
}
|
|
624
1272
|
|
|
625
1273
|
pos = json.find("\"use_vad\"");
|
|
626
1274
|
if (pos != std::string::npos) {
|
|
627
1275
|
pos = json.find(':', pos) + 1;
|
|
628
|
-
while (pos < json.length() && std::isspace(json[pos])) pos++;
|
|
629
|
-
use_vad = (json.substr(pos, 4) == "true");
|
|
1276
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1277
|
+
options.use_vad = (json.substr(pos, 4) == "true");
|
|
630
1278
|
}
|
|
631
1279
|
|
|
632
1280
|
pos = json.find("\"telemetry_enabled\"");
|
|
633
1281
|
if (pos != std::string::npos) {
|
|
634
1282
|
pos = json.find(':', pos) + 1;
|
|
635
|
-
while (pos < json.length() && std::isspace(json[pos])) pos++;
|
|
636
|
-
telemetry_enabled = (json.substr(pos, 4) == "true");
|
|
1283
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1284
|
+
options.telemetry_enabled = (json.substr(pos, 4) == "true");
|
|
637
1285
|
}
|
|
638
1286
|
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
*auto_handoff = (json.substr(pos, 4) == "true");
|
|
645
|
-
}
|
|
1287
|
+
pos = json.find("\"auto_handoff\"");
|
|
1288
|
+
if (pos != std::string::npos) {
|
|
1289
|
+
pos = json.find(':', pos) + 1;
|
|
1290
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1291
|
+
options.auto_handoff = (json.substr(pos, 4) == "true");
|
|
646
1292
|
}
|
|
647
1293
|
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
*cloud_timeout_ms = std::stoul(json.substr(pos));
|
|
653
|
-
}
|
|
1294
|
+
pos = json.find("\"cloud_timeout_ms\"");
|
|
1295
|
+
if (pos != std::string::npos) {
|
|
1296
|
+
pos = json.find(':', pos) + 1;
|
|
1297
|
+
options.cloud_timeout_ms = std::stoul(json.substr(pos));
|
|
654
1298
|
}
|
|
655
1299
|
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
1300
|
+
pos = json.find("\"handoff_with_images\"");
|
|
1301
|
+
if (pos != std::string::npos) {
|
|
1302
|
+
pos = json.find(':', pos) + 1;
|
|
1303
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1304
|
+
options.handoff_with_images = (json.substr(pos, 4) == "true");
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
pos = json.find("\"enable_thinking_if_supported\"");
|
|
1308
|
+
if (pos != std::string::npos) {
|
|
1309
|
+
pos = json.find(':', pos) + 1;
|
|
1310
|
+
while (pos < json.length() && std::isspace(static_cast<unsigned char>(json[pos]))) pos++;
|
|
1311
|
+
options.enable_thinking_if_supported = (json.substr(pos, 4) == "true");
|
|
663
1312
|
}
|
|
664
1313
|
|
|
665
1314
|
pos = json.find("\"stop_sequences\"");
|
|
@@ -673,12 +1322,14 @@ inline void parse_options_json(const std::string& json,
|
|
|
673
1322
|
size_t seq_start = seq_pos + 1;
|
|
674
1323
|
size_t seq_end = json.find('"', seq_start);
|
|
675
1324
|
if (seq_end != std::string::npos) {
|
|
676
|
-
stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
|
|
1325
|
+
options.stop_sequences.push_back(json.substr(seq_start, seq_end - seq_start));
|
|
677
1326
|
}
|
|
678
1327
|
seq_pos = json.find('"', seq_end + 1);
|
|
679
1328
|
}
|
|
680
1329
|
}
|
|
681
1330
|
}
|
|
1331
|
+
|
|
1332
|
+
return options;
|
|
682
1333
|
}
|
|
683
1334
|
|
|
684
1335
|
static inline std::string trim_lfm2_slice(const std::string& value, size_t begin, size_t end) {
|
|
@@ -755,7 +1406,6 @@ inline void parse_function_calls_from_response(const std::string& response_text,
|
|
|
755
1406
|
|
|
756
1407
|
gemma::parse_function_calls(regular_response, function_calls);
|
|
757
1408
|
|
|
758
|
-
// Parse Qwen-style function calls: <tool_call>{"name": "...", "arguments": {...}}</tool_call>
|
|
759
1409
|
const std::string QWEN_TOOL_START = "<tool_call>";
|
|
760
1410
|
const std::string QWEN_TOOL_END = "</tool_call>";
|
|
761
1411
|
size_t qwen_start_pos = 0;
|
|
@@ -764,27 +1414,62 @@ inline void parse_function_calls_from_response(const std::string& response_text,
|
|
|
764
1414
|
size_t content_start = qwen_start_pos + QWEN_TOOL_START.length();
|
|
765
1415
|
size_t qwen_end_pos = regular_response.find(QWEN_TOOL_END, content_start);
|
|
766
1416
|
|
|
1417
|
+
size_t erase_end;
|
|
1418
|
+
std::string json_content;
|
|
1419
|
+
|
|
767
1420
|
if (qwen_end_pos != std::string::npos) {
|
|
768
|
-
|
|
1421
|
+
json_content = regular_response.substr(content_start, qwen_end_pos - content_start);
|
|
1422
|
+
erase_end = qwen_end_pos + QWEN_TOOL_END.length();
|
|
1423
|
+
} else {
|
|
1424
|
+
json_content = regular_response.substr(content_start);
|
|
1425
|
+
erase_end = regular_response.length();
|
|
1426
|
+
}
|
|
769
1427
|
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
1428
|
+
size_t first = json_content.find_first_not_of(" \t\n\r");
|
|
1429
|
+
size_t last = json_content.find_last_not_of(" \t\n\r");
|
|
1430
|
+
if (first != std::string::npos && last != std::string::npos) {
|
|
1431
|
+
json_content = json_content.substr(first, last - first + 1);
|
|
1432
|
+
}
|
|
775
1433
|
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
1434
|
+
if (json_content.size() > 2 && json_content[0] == '{' &&
|
|
1435
|
+
json_content.find("\"name\"") != std::string::npos) {
|
|
1436
|
+
size_t depth = 0;
|
|
1437
|
+
bool in_string = false;
|
|
1438
|
+
bool escaped = false;
|
|
1439
|
+
size_t end_pos = 0;
|
|
1440
|
+
for (size_t c = 0; c < json_content.size(); c++) {
|
|
1441
|
+
char ch = json_content[c];
|
|
1442
|
+
if (escaped) {
|
|
1443
|
+
escaped = false;
|
|
1444
|
+
continue;
|
|
1445
|
+
}
|
|
1446
|
+
if (ch == '\\' && in_string) {
|
|
1447
|
+
escaped = true;
|
|
1448
|
+
continue;
|
|
1449
|
+
}
|
|
1450
|
+
if (ch == '"') {
|
|
1451
|
+
in_string = !in_string;
|
|
1452
|
+
continue;
|
|
1453
|
+
}
|
|
1454
|
+
if (!in_string) {
|
|
1455
|
+
if (ch == '{') depth++;
|
|
1456
|
+
else if (ch == '}') {
|
|
1457
|
+
depth--;
|
|
1458
|
+
if (depth == 0) {
|
|
1459
|
+
end_pos = c + 1;
|
|
1460
|
+
break;
|
|
1461
|
+
}
|
|
1462
|
+
}
|
|
1463
|
+
}
|
|
1464
|
+
}
|
|
1465
|
+
if (end_pos > 0) {
|
|
1466
|
+
function_calls.push_back(json_content.substr(0, end_pos));
|
|
779
1467
|
}
|
|
780
|
-
|
|
781
|
-
regular_response.erase(qwen_start_pos, qwen_end_pos + QWEN_TOOL_END.length() - qwen_start_pos);
|
|
782
|
-
} else {
|
|
783
|
-
break;
|
|
784
1468
|
}
|
|
1469
|
+
|
|
1470
|
+
regular_response.erase(qwen_start_pos, erase_end - qwen_start_pos);
|
|
785
1471
|
}
|
|
786
|
-
|
|
787
|
-
// Parse LFM2-style function calls: <|tool_call_start|>[name(args)]<|tool_call_end|>
|
|
1472
|
+
|
|
788
1473
|
const std::string TOOL_CALL_START = "<|tool_call_start|>";
|
|
789
1474
|
const std::string TOOL_CALL_END = "<|tool_call_end|>";
|
|
790
1475
|
size_t tool_start_pos = 0;
|
|
@@ -898,6 +1583,95 @@ inline void parse_function_calls_from_response(const std::string& response_text,
|
|
|
898
1583
|
}
|
|
899
1584
|
}
|
|
900
1585
|
|
|
1586
|
+
/// Locates every channel block in `tokens`.
///
/// A block starts at a `channel_open_id` token and runs through the next
/// `channel_close_id` token (inclusive), or to the end of `tokens` when no
/// close token follows.
///
/// @return One {start, length} pair per block, where start is the block's
///         index in `tokens` plus `offset` and length counts its tokens.
inline std::vector<std::pair<size_t, size_t>> find_channel_token_ranges(
    const std::vector<uint32_t>& tokens, size_t offset,
    uint32_t channel_open_id, uint32_t channel_close_id) {
    std::vector<std::pair<size_t, size_t>> spans;
    const auto first = tokens.begin();
    const auto last = tokens.end();

    auto opener = std::find(first, last, channel_open_id);
    while (opener != last) {
        // The close token is searched for strictly after the opener.
        auto block_end = std::find(opener + 1, last, channel_close_id);
        if (block_end != last) {
            ++block_end;  // include the close token in the block
        }

        const size_t start_index = static_cast<size_t>(opener - first);
        const size_t length = static_cast<size_t>(block_end - opener);
        spans.emplace_back(offset + start_index, length);

        opener = std::find(block_end, last, channel_open_id);
    }
    return spans;
}
|
|
1609
|
+
|
|
1610
|
+
/// Removes every `open_tag ... close_tag` block from `text` and appends the
/// block contents to `extracted`.
///
/// - If a `close_tag` occurs before any `open_tag`, everything before it is
///   treated as block content (appended to `extracted` without a separator).
/// - Each subsequent block's content is appended to `extracted`, preceded by
///   "\n" when `extracted` is already non-empty.
/// - An `open_tag` with no matching `close_tag` consumes the rest of `text`.
///
/// On return `text` holds only what lay outside the blocks.
inline void strip_tag_blocks(std::string& text, std::string& extracted,
                             const std::string& open_tag, const std::string& close_tag) {
    const size_t npos = std::string::npos;
    const auto start_new_piece = [&extracted]() {
        if (!extracted.empty()) extracted += "\n";
    };

    std::string kept;
    size_t cursor = 0;

    const size_t leading_close = text.find(close_tag);
    const size_t leading_open = text.find(open_tag);
    const bool begins_inside_block =
        leading_close != npos && (leading_open == npos || leading_close < leading_open);
    if (begins_inside_block) {
        extracted += text.substr(0, leading_close);
        cursor = leading_close + close_tag.size();
    }

    while (cursor < text.size()) {
        const size_t open_at = text.find(open_tag, cursor);
        if (open_at == npos) {
            kept += text.substr(cursor);
            break;
        }
        kept += text.substr(cursor, open_at - cursor);

        const size_t body_at = open_at + open_tag.size();
        const size_t close_at = text.find(close_tag, body_at);
        if (close_at == npos) {
            // Unterminated block: the remainder is all block content.
            start_new_piece();
            extracted += text.substr(body_at);
            break;
        }

        start_new_piece();
        extracted += text.substr(body_at, close_at - body_at);
        cursor = close_at + close_tag.size();
    }

    text.swap(kept);
}
|
|
1643
|
+
|
|
1644
|
+
inline void strip_thinking_block(const std::string& input, std::string& thinking, std::string& content) {
|
|
1645
|
+
thinking.clear();
|
|
1646
|
+
content = input;
|
|
1647
|
+
|
|
1648
|
+
auto trim = [](std::string& s) {
|
|
1649
|
+
size_t first = s.find_first_not_of(" \t\n\r");
|
|
1650
|
+
size_t last = s.find_last_not_of(" \t\n\r");
|
|
1651
|
+
if (first != std::string::npos && last != std::string::npos)
|
|
1652
|
+
s = s.substr(first, last - first + 1);
|
|
1653
|
+
else
|
|
1654
|
+
s.clear();
|
|
1655
|
+
};
|
|
1656
|
+
|
|
1657
|
+
if (content.find("<|channel>") != std::string::npos || content.find("<channel|>") != std::string::npos) {
|
|
1658
|
+
strip_tag_blocks(content, thinking, "<|channel>", "<channel|>");
|
|
1659
|
+
} else if (content.find("<think>") != std::string::npos || content.find("</think>") != std::string::npos) {
|
|
1660
|
+
strip_tag_blocks(content, thinking, "<think>", "</think>");
|
|
1661
|
+
} else {
|
|
1662
|
+
return;
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
trim(thinking);
|
|
1666
|
+
trim(content);
|
|
1667
|
+
}
|
|
1668
|
+
|
|
1669
|
+
/// One piece of a transcript: a text span with its start and end positions.
/// The values are emitted as the "start", "end" and "text" fields of the
/// response JSON's "segments" array (units presumably seconds -- confirm
/// against the producer).
///
/// The numeric members carry default initializers so a default-constructed
/// segment never holds indeterminate values; the struct remains an aggregate,
/// so `TranscriptSegment{start, end, text}` keeps working.
struct TranscriptSegment {
    float start = 0.0f;
    float end = 0.0f;
    std::string text;
};
|
|
1674
|
+
|
|
901
1675
|
inline std::string construct_response_json(const std::string& regular_response,
|
|
902
1676
|
const std::vector<std::string>& function_calls,
|
|
903
1677
|
double time_to_first_token,
|
|
@@ -907,19 +1681,32 @@ inline std::string construct_response_json(const std::string& regular_response,
|
|
|
907
1681
|
size_t prompt_tokens,
|
|
908
1682
|
size_t completion_tokens,
|
|
909
1683
|
float confidence = 0.0f,
|
|
910
|
-
bool cloud_handoff = false
|
|
1684
|
+
bool cloud_handoff = false,
|
|
1685
|
+
const std::string& thinking = "",
|
|
1686
|
+
const std::vector<TranscriptSegment>& segments = {}) {
|
|
911
1687
|
std::ostringstream json;
|
|
912
1688
|
json << "{";
|
|
913
1689
|
json << "\"success\":true,";
|
|
914
1690
|
json << "\"error\":null,";
|
|
915
1691
|
json << "\"cloud_handoff\":" << (cloud_handoff ? "true" : "false") << ",";
|
|
916
1692
|
json << "\"response\":\"" << escape_json_string(regular_response) << "\",";
|
|
1693
|
+
if (!thinking.empty()) {
|
|
1694
|
+
json << "\"thinking\":\"" << escape_json_string(thinking) << "\",";
|
|
1695
|
+
}
|
|
917
1696
|
json << "\"function_calls\":[";
|
|
918
1697
|
for (size_t i = 0; i < function_calls.size(); ++i) {
|
|
919
1698
|
if (i > 0) json << ",";
|
|
920
1699
|
json << function_calls[i];
|
|
921
1700
|
}
|
|
922
1701
|
json << "],";
|
|
1702
|
+
json << "\"segments\":[";
|
|
1703
|
+
for (size_t i = 0; i < segments.size(); ++i) {
|
|
1704
|
+
if (i > 0) json << ",";
|
|
1705
|
+
json << "{\"start\":" << std::fixed << std::setprecision(3) << segments[i].start
|
|
1706
|
+
<< ",\"end\":" << std::fixed << std::setprecision(3) << segments[i].end
|
|
1707
|
+
<< ",\"text\":\"" << escape_json_string(segments[i].text) << "\"}";
|
|
1708
|
+
}
|
|
1709
|
+
json << "],";
|
|
923
1710
|
json << "\"confidence\":" << std::fixed << std::setprecision(4) << confidence << ",";
|
|
924
1711
|
json << "\"time_to_first_token_ms\":" << std::fixed << std::setprecision(2) << time_to_first_token << ",";
|
|
925
1712
|
json << "\"total_time_ms\":" << std::fixed << std::setprecision(2) << total_time_ms << ",";
|
|
@@ -945,6 +1732,50 @@ inline std::string serialize_function_calls(const std::vector<std::string>& call
|
|
|
945
1732
|
return oss.str();
|
|
946
1733
|
}
|
|
947
1734
|
|
|
1735
|
+
inline int validate_audio_params(
|
|
1736
|
+
const char* component,
|
|
1737
|
+
void* model,
|
|
1738
|
+
char* response_buffer, size_t buffer_size,
|
|
1739
|
+
const char* audio_file_path,
|
|
1740
|
+
const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
|
|
1741
|
+
if (!model) {
|
|
1742
|
+
std::string err = last_error_message.empty() ? "Model not initialized." : last_error_message;
|
|
1743
|
+
CACTUS_LOG_ERROR(component, err);
|
|
1744
|
+
handle_error_response(err, response_buffer, buffer_size);
|
|
1745
|
+
return -1;
|
|
1746
|
+
}
|
|
1747
|
+
if (!response_buffer || buffer_size == 0) {
|
|
1748
|
+
CACTUS_LOG_ERROR(component, "Invalid parameters: response_buffer or buffer_size");
|
|
1749
|
+
handle_error_response("Invalid parameters", response_buffer, buffer_size);
|
|
1750
|
+
return -1;
|
|
1751
|
+
}
|
|
1752
|
+
if (!audio_file_path && (!pcm_buffer || pcm_buffer_size == 0)) {
|
|
1753
|
+
CACTUS_LOG_ERROR(component, "No audio input provided");
|
|
1754
|
+
handle_error_response("Either audio_file_path or pcm_buffer must be provided", response_buffer, buffer_size);
|
|
1755
|
+
return -1;
|
|
1756
|
+
}
|
|
1757
|
+
if (audio_file_path && pcm_buffer && pcm_buffer_size > 0) {
|
|
1758
|
+
CACTUS_LOG_ERROR(component, "Both audio_file_path and pcm_buffer provided");
|
|
1759
|
+
handle_error_response("Cannot provide both audio_file_path and pcm_buffer", response_buffer, buffer_size);
|
|
1760
|
+
return -1;
|
|
1761
|
+
}
|
|
1762
|
+
if (pcm_buffer && pcm_buffer_size > 0 && (pcm_buffer_size < 2 || pcm_buffer_size % 2 != 0)) {
|
|
1763
|
+
CACTUS_LOG_ERROR(component, "Invalid pcm_buffer_size");
|
|
1764
|
+
handle_error_response("pcm_buffer_size must be even and at least 2 bytes", response_buffer, buffer_size);
|
|
1765
|
+
return -1;
|
|
1766
|
+
}
|
|
1767
|
+
return 0;
|
|
1768
|
+
}
|
|
1769
|
+
|
|
1770
|
+
/// Converts a buffer of 16-bit signed PCM samples (native byte order) to
/// floats by dividing each sample by 32768.
///
/// Samples are assembled byte-by-byte into an int16_t instead of reading the
/// buffer through an `int16_t*`: the byte buffer carries no alignment
/// guarantee, and accessing it as int16_t violates the aliasing rules.
///
/// @param pcm_buffer       Raw sample bytes; may be null, yielding an empty result.
/// @param pcm_buffer_size  Size of `pcm_buffer` in bytes; a trailing odd byte is ignored.
/// @return One float per complete 16-bit sample.
inline std::vector<float> pcm_to_float(const uint8_t* pcm_buffer, size_t pcm_buffer_size) {
    if (pcm_buffer == nullptr) return {};

    const size_t n = pcm_buffer_size / sizeof(int16_t);
    std::vector<float> out(n);
    for (size_t i = 0; i < n; ++i) {
        int16_t sample = 0;
        // Writing an object's bytes through unsigned char* is always permitted
        // and keeps the original native-endian interpretation.
        unsigned char* dst = reinterpret_cast<unsigned char*>(&sample);
        const uint8_t* src = pcm_buffer + i * sizeof(int16_t);
        dst[0] = src[0];
        dst[1] = src[1];
        out[i] = static_cast<float>(sample) / 32768.0f;
    }
    return out;
}
|
|
1778
|
+
|
|
948
1779
|
} // namespace ffi
|
|
949
1780
|
} // namespace cactus
|
|
950
1781
|
|
|
@@ -958,4 +1789,4 @@ const char* cactus_get_last_error();
|
|
|
958
1789
|
}
|
|
959
1790
|
#endif
|
|
960
1791
|
|
|
961
|
-
#endif // CACTUS_UTILS_H
|
|
1792
|
+
#endif // CACTUS_UTILS_H
|