cactus-react-native 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +241 -45
- package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
- package/cpp/HybridCactus.cpp +63 -51
- package/cpp/HybridCactus.hpp +21 -14
- package/cpp/HybridCactusUtil.cpp +13 -11
- package/cpp/HybridCactusUtil.hpp +9 -9
- package/cpp/cactus_ffi.h +1 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +1 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +204 -6
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/ffi_utils.h +150 -36
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +20 -1
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +21 -0
- package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +1 -1
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +204 -6
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/ffi_utils.h +150 -36
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +20 -1
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +21 -0
- package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
- package/lib/module/classes/CactusLM.js +4 -2
- package/lib/module/classes/CactusLM.js.map +1 -1
- package/lib/module/constants/packageVersion.js +1 -1
- package/lib/module/hooks/useCactusLM.js +11 -6
- package/lib/module/hooks/useCactusLM.js.map +1 -1
- package/lib/module/native/Cactus.js +2 -2
- package/lib/module/native/Cactus.js.map +1 -1
- package/lib/typescript/src/classes/CactusLM.d.ts +2 -1
- package/lib/typescript/src/classes/CactusLM.d.ts.map +1 -1
- package/lib/typescript/src/constants/packageVersion.d.ts +1 -1
- package/lib/typescript/src/hooks/useCactusLM.d.ts +1 -1
- package/lib/typescript/src/hooks/useCactusLM.d.ts.map +1 -1
- package/lib/typescript/src/native/Cactus.d.ts +1 -1
- package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
- package/lib/typescript/src/specs/Cactus.nitro.d.ts +1 -1
- package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
- package/lib/typescript/src/types/CactusLM.d.ts +3 -1
- package/lib/typescript/src/types/CactusLM.d.ts.map +1 -1
- package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +1 -1
- package/package.json +1 -1
- package/src/classes/CactusLM.ts +4 -2
- package/src/constants/packageVersion.ts +1 -1
- package/src/hooks/useCactusLM.ts +8 -5
- package/src/native/Cactus.ts +6 -2
- package/src/specs/Cactus.nitro.ts +5 -1
- package/src/types/CactusLM.ts +3 -1
|
@@ -28,8 +28,9 @@ enum class OpType {
|
|
|
28
28
|
INPUT, PRECISION_CAST,
|
|
29
29
|
ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
|
|
30
30
|
MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
|
|
31
|
+
BILINEAR_INTERPOLATION,
|
|
31
32
|
SUM, MEAN, VARIANCE, MIN, MAX,
|
|
32
|
-
RMS_NORM, ROPE, SOFTMAX, ATTENTION, CONV1D_CAUSAL,
|
|
33
|
+
RMS_NORM, ROPE, SOFTMAX, ATTENTION, CONV1D_CAUSAL, CONV1D_K3,
|
|
33
34
|
SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN,
|
|
34
35
|
SILU, GELU,
|
|
35
36
|
SAMPLE, CONCAT,
|
|
@@ -139,6 +140,7 @@ struct OpParams {
|
|
|
139
140
|
ComputeBackend backend = ComputeBackend::CPU;
|
|
140
141
|
|
|
141
142
|
size_t dilation = 1;
|
|
143
|
+
size_t stride = 1;
|
|
142
144
|
float temperature = 1.0f;
|
|
143
145
|
float top_p = 1.0f;
|
|
144
146
|
size_t top_k = 0;
|
|
@@ -146,6 +148,8 @@ struct OpParams {
|
|
|
146
148
|
|
|
147
149
|
size_t index_value = 0; // For INDEX operation
|
|
148
150
|
size_t num_classes = 0; // For scatter operations
|
|
151
|
+
size_t dst_height = 0;
|
|
152
|
+
size_t dst_width = 0;
|
|
149
153
|
};
|
|
150
154
|
|
|
151
155
|
struct GraphNode {
|
|
@@ -187,6 +191,12 @@ namespace ValidationUtils {
|
|
|
187
191
|
class CactusGraph {
|
|
188
192
|
public:
|
|
189
193
|
CactusGraph();
|
|
194
|
+
|
|
195
|
+
struct DebugNodeEntry {
|
|
196
|
+
uint32_t layer_idx;
|
|
197
|
+
std::string name;
|
|
198
|
+
size_t node_id;
|
|
199
|
+
};
|
|
190
200
|
|
|
191
201
|
size_t input(const std::vector<size_t>& shape, Precision precision = Precision::INT8);
|
|
192
202
|
size_t precision_cast(size_t input, Precision target_precision);
|
|
@@ -212,6 +222,7 @@ public:
|
|
|
212
222
|
|
|
213
223
|
size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
|
|
214
224
|
size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
|
|
225
|
+
size_t transposeN(size_t input, const std::vector<size_t>& permutation, ComputeBackend backend = ComputeBackend::CPU);
|
|
215
226
|
size_t reshape(size_t input, const std::vector<size_t>& new_shape);
|
|
216
227
|
size_t slice(size_t input, int axis, size_t start, size_t length);
|
|
217
228
|
size_t index(size_t input, size_t index_value, int dim);
|
|
@@ -228,6 +239,7 @@ public:
|
|
|
228
239
|
void set_quantization_scale(size_t node_id, float scale);
|
|
229
240
|
size_t embedding(const std::string& filename, size_t indices);
|
|
230
241
|
size_t embedding(size_t embedding_tensor, size_t indices);
|
|
242
|
+
size_t bilinear_interpolation(size_t pos_embeds, size_t dst_height, size_t dst_width);
|
|
231
243
|
|
|
232
244
|
size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f);
|
|
233
245
|
size_t topk(size_t input, size_t k);
|
|
@@ -239,6 +251,7 @@ public:
|
|
|
239
251
|
size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
|
|
240
252
|
|
|
241
253
|
size_t conv1d_causal(size_t input, size_t weight, size_t kernel_size, size_t dilation = 1);
|
|
254
|
+
size_t conv1d_k3(size_t input, size_t weight, size_t stride);
|
|
242
255
|
|
|
243
256
|
size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20);
|
|
244
257
|
|
|
@@ -252,6 +265,11 @@ public:
|
|
|
252
265
|
void execute(const std::string& profile_file = "");
|
|
253
266
|
void hard_reset();
|
|
254
267
|
void soft_reset();
|
|
268
|
+
|
|
269
|
+
void register_debug_node(uint32_t layer_idx, const std::string& name, size_t node_id);
|
|
270
|
+
void capture_debug_node(uint32_t layer_idx, const std::string& name, size_t node_id);
|
|
271
|
+
const std::vector<DebugNodeEntry>& get_debug_nodes() const;
|
|
272
|
+
void clear_debug_nodes();
|
|
255
273
|
|
|
256
274
|
size_t add_node(OpType op_type, const std::vector<size_t>& inputs, const std::vector<size_t>& output_shape, const OpParams& params = {});
|
|
257
275
|
const BufferDesc& get_output_buffer(size_t node_id) const;
|
|
@@ -265,6 +283,7 @@ private:
|
|
|
265
283
|
size_t next_node_id_;
|
|
266
284
|
std::vector<std::unique_ptr<GraphFile::MappedFile>> mapped_files_;
|
|
267
285
|
std::unordered_map<std::string, size_t> weight_cache_;
|
|
286
|
+
std::vector<DebugNodeEntry> debug_nodes_;
|
|
268
287
|
};
|
|
269
288
|
|
|
270
289
|
|
|
@@ -225,6 +225,27 @@ void cactus_conv1d_causal_depthwise_int8(
|
|
|
225
225
|
float weight_scale,
|
|
226
226
|
float output_scale);
|
|
227
227
|
|
|
228
|
+
void cactus_conv1d_f32_k3(
|
|
229
|
+
const float* input,
|
|
230
|
+
const float* weight,
|
|
231
|
+
float* output,
|
|
232
|
+
size_t N, size_t L,
|
|
233
|
+
size_t C_in, size_t C_out,
|
|
234
|
+
size_t stride
|
|
235
|
+
);
|
|
236
|
+
|
|
237
|
+
void cactus_conv1d_f16_k3(
|
|
238
|
+
const __fp16* input,
|
|
239
|
+
const __fp16* weight,
|
|
240
|
+
__fp16* output,
|
|
241
|
+
size_t N, size_t L,
|
|
242
|
+
size_t C_in, size_t C_out,
|
|
243
|
+
size_t stride
|
|
244
|
+
);
|
|
245
|
+
|
|
246
|
+
void cactus_bilinear_interpolation_fp32(const float* input, float* output, size_t src_height, size_t src_width, size_t embed_dim,
|
|
247
|
+
size_t dst_height, size_t dst_width);
|
|
248
|
+
|
|
228
249
|
void cactus_sample_f32(const float* logits, uint32_t* output, size_t vocab_size,
|
|
229
250
|
float temperature, float top_p, size_t top_k, size_t random_seed);
|
|
230
251
|
void cactus_sample_f16(const __fp16* logits, uint32_t* output, size_t vocab_size,
|
|
Binary file
|
|
@@ -20,7 +20,7 @@ typedef void* cactus_model_t;
|
|
|
20
20
|
|
|
21
21
|
typedef void (*cactus_token_callback)(const char* token, uint32_t token_id, void* user_data);
|
|
22
22
|
|
|
23
|
-
CACTUS_FFI_EXPORT cactus_model_t cactus_init(const char* model_path, size_t context_size);
|
|
23
|
+
CACTUS_FFI_EXPORT cactus_model_t cactus_init(const char* model_path, size_t context_size, const char* corpus_dir);
|
|
24
24
|
|
|
25
25
|
CACTUS_FFI_EXPORT int cactus_complete(
|
|
26
26
|
cactus_model_t model,
|
|
@@ -7,12 +7,18 @@
|
|
|
7
7
|
#include <cstdint>
|
|
8
8
|
|
|
9
9
|
#include "../graph/graph.h"
|
|
10
|
+
extern "C" {
|
|
11
|
+
#include "../../libs/stb/stb_image.h"
|
|
12
|
+
#include "../../libs/stb/stb_image_resize2.h"
|
|
13
|
+
}
|
|
10
14
|
|
|
11
15
|
class CactusGraph;
|
|
12
16
|
|
|
13
17
|
namespace cactus {
|
|
14
18
|
namespace engine {
|
|
15
19
|
|
|
20
|
+
class Siglip2Preprocessor;
|
|
21
|
+
|
|
16
22
|
struct Config {
|
|
17
23
|
uint32_t vocab_size = 151936;
|
|
18
24
|
uint32_t bos_token_id = 151643;
|
|
@@ -31,9 +37,43 @@ struct Config {
|
|
|
31
37
|
uint32_t moe_every_n_layers = 0;
|
|
32
38
|
bool tie_word_embeddings = true;
|
|
33
39
|
|
|
34
|
-
|
|
40
|
+
uint32_t vision_hidden_dim = 0;
|
|
41
|
+
uint32_t vision_num_layers = 0;
|
|
42
|
+
uint32_t vision_attention_heads = 0;
|
|
43
|
+
uint32_t vision_image_size = 0;
|
|
44
|
+
uint32_t vision_patch_size = 0;
|
|
45
|
+
uint32_t vision_num_channels = 3;
|
|
46
|
+
uint32_t vision_embed_dim = 0;
|
|
47
|
+
uint32_t visual_tokens_per_img = 0;
|
|
48
|
+
bool use_pixel_shuffle = false;
|
|
49
|
+
uint32_t pixel_shuffle_factor = 1;
|
|
50
|
+
bool use_image_tokens = false;
|
|
51
|
+
bool use_layout_tags = false;
|
|
52
|
+
uint32_t image_seq_len = 64;
|
|
53
|
+
|
|
54
|
+
uint32_t global_image_size = 2048;
|
|
55
|
+
uint32_t max_tile_size = 512;
|
|
56
|
+
float rescale_factor = 0.00392156862745098f;
|
|
57
|
+
float image_mean = 0.5f;
|
|
58
|
+
float image_std = 0.5f;
|
|
59
|
+
|
|
60
|
+
uint32_t downsample_factor = 2;
|
|
61
|
+
uint32_t min_tiles = 2;
|
|
62
|
+
uint32_t max_tiles = 10;
|
|
63
|
+
bool use_thumbnail = true;
|
|
64
|
+
uint32_t min_image_tokens = 64;
|
|
65
|
+
uint32_t max_image_tokens = 256;
|
|
66
|
+
uint32_t max_num_patches = 1024;
|
|
67
|
+
uint32_t tile_size = 512;
|
|
68
|
+
float max_pixels_tolerance = 2.0f;
|
|
69
|
+
bool do_image_splitting = true;
|
|
70
|
+
|
|
71
|
+
enum class ModelType {QWEN = 0, GEMMA = 1, SMOL = 2, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6};
|
|
35
72
|
ModelType model_type = ModelType::QWEN;
|
|
36
73
|
|
|
74
|
+
enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
|
|
75
|
+
ModelVariant model_variant = ModelVariant::DEFAULT;
|
|
76
|
+
|
|
37
77
|
enum class Activation {GELU = 0, SILU = 1};
|
|
38
78
|
Activation activation = Activation::SILU;
|
|
39
79
|
|
|
@@ -70,6 +110,7 @@ struct MergeRule {
|
|
|
70
110
|
struct ChatMessage {
|
|
71
111
|
std::string role;
|
|
72
112
|
std::string content;
|
|
113
|
+
std::vector<std::string> images;
|
|
73
114
|
};
|
|
74
115
|
|
|
75
116
|
class Tokenizer {
|
|
@@ -89,18 +130,32 @@ public:
|
|
|
89
130
|
virtual bool has_chat_template() const { return has_chat_template_; }
|
|
90
131
|
|
|
91
132
|
virtual bool load_vocabulary_with_config(const std::string& vocab_file, const std::string& merges_file, const std::string& config_file) = 0;
|
|
133
|
+
|
|
134
|
+
uint32_t get_image_token_id() const { return image_token_id_; }
|
|
135
|
+
uint32_t get_fake_token_id() const { return fake_token_id_; }
|
|
136
|
+
uint32_t get_global_img_token_id() const { return global_img_token_id_; }
|
|
92
137
|
|
|
93
|
-
protected:
|
|
94
138
|
|
|
95
|
-
|
|
139
|
+
void set_corpus_dir(const std::string& dir) { corpus_dir_ = dir; }
|
|
140
|
+
|
|
141
|
+
protected:
|
|
142
|
+
enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, SMOL, BERT };
|
|
96
143
|
ModelType model_type_ = ModelType::UNKNOWN;
|
|
144
|
+
enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
|
|
145
|
+
ModelVariant model_variant_ = ModelVariant::DEFAULT;
|
|
97
146
|
bool has_chat_template_ = false;
|
|
98
147
|
std::string chat_template_;
|
|
148
|
+
|
|
149
|
+
uint32_t image_token_id_ = 396;
|
|
150
|
+
uint32_t fake_token_id_ = 49189;
|
|
151
|
+
uint32_t global_img_token_id_ = 49152;
|
|
152
|
+
std::string corpus_dir_;
|
|
99
153
|
|
|
100
154
|
void detect_model_type(const std::string& config_path);
|
|
101
155
|
std::string format_qwen_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
|
|
102
156
|
std::string format_gemma_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
|
|
103
157
|
std::string format_lfm2_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
|
|
158
|
+
std::string format_lfm2_vl_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
|
|
104
159
|
std::string format_smol_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
|
|
105
160
|
};
|
|
106
161
|
|
|
@@ -295,22 +350,37 @@ struct KVCache {
|
|
|
295
350
|
|
|
296
351
|
class Model {
|
|
297
352
|
public:
|
|
353
|
+
struct DebugNode {
|
|
354
|
+
uint32_t layer_idx;
|
|
355
|
+
std::string name;
|
|
356
|
+
size_t node_id;
|
|
357
|
+
};
|
|
358
|
+
|
|
298
359
|
Model();
|
|
299
360
|
explicit Model(const Config& config);
|
|
300
361
|
virtual ~Model();
|
|
301
362
|
|
|
302
363
|
const Config& get_config() const { return config_; }
|
|
303
364
|
Tokenizer* get_tokenizer() const { return tokenizer_.get(); }
|
|
365
|
+
const std::vector<DebugNode>& get_debug_nodes() const;
|
|
304
366
|
|
|
305
|
-
bool init(const std::string& model_folder, size_t context_size, const std::string& system_prompt = "");
|
|
306
|
-
|
|
367
|
+
virtual bool init(const std::string& model_folder, size_t context_size, const std::string& system_prompt = "", bool do_warmup = true);
|
|
368
|
+
virtual bool init(CactusGraph* external_graph, const std::string& model_folder, size_t context_size,
|
|
369
|
+
const std::string& system_prompt = "", bool do_warmup = true);
|
|
370
|
+
virtual uint32_t generate(const std::vector<uint32_t>& tokens, float temperature = -1.0f, float top_p = -1.0f,
|
|
307
371
|
size_t top_k = 0, const std::string& profile_file = "");
|
|
308
372
|
|
|
373
|
+
virtual uint32_t generate_with_images(const std::vector<uint32_t>& tokens, const std::vector<std::string>& image_paths,
|
|
374
|
+
float temperature = -1.0f, float top_p = -1.0f,
|
|
375
|
+
size_t top_k = 0, const std::string& profile_file = "");
|
|
376
|
+
|
|
309
377
|
std::vector<float> get_embeddings(const std::vector<uint32_t>& tokens, bool pooled = true, const std::string& profile_file = "");
|
|
310
378
|
|
|
311
379
|
virtual void reset_cache() { kv_cache_.reset(); }
|
|
312
380
|
void set_cache_window(size_t window_size, size_t sink_size = 4) { kv_cache_.set_window_size(window_size, sink_size); }
|
|
313
381
|
|
|
382
|
+
void* graph_handle_;
|
|
383
|
+
|
|
314
384
|
protected:
|
|
315
385
|
virtual size_t forward(const std::vector<uint32_t>& tokens, bool use_cache = false) = 0;
|
|
316
386
|
virtual void load_weights_to_graph(CactusGraph* gb) = 0;
|
|
@@ -326,7 +396,6 @@ protected:
|
|
|
326
396
|
Config config_;
|
|
327
397
|
std::unique_ptr<Tokenizer> tokenizer_;
|
|
328
398
|
|
|
329
|
-
void* graph_handle_;
|
|
330
399
|
bool initialized_;
|
|
331
400
|
float attention_scale_;
|
|
332
401
|
|
|
@@ -339,9 +408,138 @@ protected:
|
|
|
339
408
|
size_t embedding_node_id_;
|
|
340
409
|
std::string model_folder_path_;
|
|
341
410
|
size_t output_weight_node_id_;
|
|
411
|
+
|
|
412
|
+
mutable std::vector<DebugNode> debug_nodes_;
|
|
413
|
+
|
|
414
|
+
void capture_debug_node(uint32_t layer_idx, const std::string& name, size_t node_id) const;
|
|
415
|
+
void clear_debug_nodes();
|
|
416
|
+
|
|
417
|
+
bool init_internal(CactusGraph* gb, const std::string& model_folder, size_t context_size,
|
|
418
|
+
const std::string& system_prompt, bool do_warmup);
|
|
419
|
+
bool owns_graph_;
|
|
342
420
|
};
|
|
343
421
|
|
|
344
422
|
std::unique_ptr<Model> create_model(const std::string& model_folder);
|
|
345
423
|
|
|
424
|
+
class Siglip2Preprocessor {
|
|
425
|
+
public:
|
|
426
|
+
struct Config {
|
|
427
|
+
int patch_size = 16;
|
|
428
|
+
int downsample_factor = 2;
|
|
429
|
+
int min_tiles = 2;
|
|
430
|
+
int max_tiles = 10;
|
|
431
|
+
bool use_thumbnail = true;
|
|
432
|
+
int min_image_tokens = 64;
|
|
433
|
+
int max_image_tokens = 256;
|
|
434
|
+
int max_num_patches = 1024;
|
|
435
|
+
int tile_size = 512;
|
|
436
|
+
float max_pixels_tolerance = 2.0f;
|
|
437
|
+
bool do_resize = true;
|
|
438
|
+
bool do_rescale = true;
|
|
439
|
+
bool do_normalize = true;
|
|
440
|
+
bool do_convert_rgb = true;
|
|
441
|
+
bool do_image_splitting = true;
|
|
442
|
+
float rescale_factor = 1.0f / 255.0f;
|
|
443
|
+
float image_mean[3] = {0.5f, 0.5f, 0.5f};
|
|
444
|
+
float image_std[3] = {0.5f, 0.5f, 0.5f};
|
|
445
|
+
};
|
|
446
|
+
|
|
447
|
+
struct PreprocessedImage {
|
|
448
|
+
std::vector<float> pixel_values;
|
|
449
|
+
std::vector<int> pixel_attention_mask;
|
|
450
|
+
std::vector<std::pair<int,int>> spatial_shapes;
|
|
451
|
+
std::vector<size_t> pixel_values_shape;
|
|
452
|
+
std::vector<size_t> pixel_attention_mask_shape;
|
|
453
|
+
std::vector<size_t> spatial_shapes_shape;
|
|
454
|
+
int num_patches_height;
|
|
455
|
+
int num_patches_width;
|
|
456
|
+
int actual_num_patches;
|
|
457
|
+
int num_tiles;
|
|
458
|
+
int patch_dim;
|
|
459
|
+
int max_patches_per_tile;
|
|
460
|
+
|
|
461
|
+
int image_rows;
|
|
462
|
+
int image_cols;
|
|
463
|
+
int image_height;
|
|
464
|
+
int image_width;
|
|
465
|
+
int tokens_per_tile;
|
|
466
|
+
int thumbnail_tokens;
|
|
467
|
+
|
|
468
|
+
~PreprocessedImage();
|
|
469
|
+
};
|
|
470
|
+
|
|
471
|
+
struct SpatialShapeResult {
|
|
472
|
+
std::vector<std::pair<int, int>> shapes;
|
|
473
|
+
int grid_rows;
|
|
474
|
+
int grid_cols;
|
|
475
|
+
};
|
|
476
|
+
|
|
477
|
+
explicit Siglip2Preprocessor(const Config& config);
|
|
478
|
+
Siglip2Preprocessor();
|
|
479
|
+
~Siglip2Preprocessor();
|
|
480
|
+
|
|
481
|
+
PreprocessedImage preprocess_from_file(const std::string& image_path);
|
|
482
|
+
PreprocessedImage preprocess_from_memory(const unsigned char* img_data, int width, int height, int channels);
|
|
483
|
+
SpatialShapeResult compute_spatial_shapes(int height, int width);
|
|
484
|
+
|
|
485
|
+
private:
|
|
486
|
+
Config config_;
|
|
487
|
+
|
|
488
|
+
std::vector<unsigned char> convert_to_rgb(const unsigned char* img_data, int width, int height, int channels);
|
|
489
|
+
std::pair<int, int> smart_resize(int height, int width);
|
|
490
|
+
bool is_image_too_large(int height, int width);
|
|
491
|
+
std::pair<int, int> get_grid_layout(int height, int width);
|
|
492
|
+
std::pair<int, int> find_closest_aspect_ratio(float aspect_ratio, int width, int height);
|
|
493
|
+
std::vector<float> resize_image(const unsigned char* img_data, int src_width, int src_height,
|
|
494
|
+
int dst_width, int dst_height, int channels);
|
|
495
|
+
std::vector<float> normalize_image(const float* img_data, int width, int height, int channels);
|
|
496
|
+
std::vector<std::vector<float>> convert_image_to_patches(
|
|
497
|
+
const std::vector<float>& image, int width, int height, int channels, int patch_size);
|
|
498
|
+
PreprocessedImage pad_patches(const std::vector<std::vector<float>>& tile_patches,
|
|
499
|
+
const std::vector<std::pair<int,int>>& spatial_shapes,
|
|
500
|
+
int patch_dim,
|
|
501
|
+
int max_patches_per_tile);
|
|
502
|
+
int round_by_factor(int number, int factor);
|
|
503
|
+
};
|
|
504
|
+
|
|
505
|
+
class AudioProcessor {
|
|
506
|
+
public:
|
|
507
|
+
struct SpectrogramConfig {
|
|
508
|
+
size_t n_fft = 400;
|
|
509
|
+
size_t hop_length = 160;
|
|
510
|
+
size_t frame_length = 400;
|
|
511
|
+
float power = 2.0f;
|
|
512
|
+
bool center = true;
|
|
513
|
+
const char* pad_mode = "reflect";
|
|
514
|
+
bool onesided = true;
|
|
515
|
+
float dither = 0.0f;
|
|
516
|
+
float mel_floor = 1e-10f;
|
|
517
|
+
const char* log_mel = nullptr;
|
|
518
|
+
float reference = 1.0f;
|
|
519
|
+
float min_value = 1e-10f;
|
|
520
|
+
bool remove_dc_offset = false;
|
|
521
|
+
};
|
|
522
|
+
|
|
523
|
+
AudioProcessor();
|
|
524
|
+
~AudioProcessor();
|
|
525
|
+
|
|
526
|
+
void init_mel_filters(size_t num_frequency_bins, size_t num_mel_filters,
|
|
527
|
+
float min_freq, float max_freq, size_t sampling_rate);
|
|
528
|
+
|
|
529
|
+
std::vector<float> compute_spectrogram(
|
|
530
|
+
const std::vector<float>& waveform,
|
|
531
|
+
const SpectrogramConfig& config);
|
|
532
|
+
|
|
533
|
+
const std::vector<float>& get_mel_filters() const { return mel_filters_; }
|
|
534
|
+
|
|
535
|
+
size_t get_num_mel_filters() const { return num_mel_filters_; }
|
|
536
|
+
size_t get_num_frequency_bins() const { return num_frequency_bins_; }
|
|
537
|
+
|
|
538
|
+
private:
|
|
539
|
+
std::vector<float> mel_filters_;
|
|
540
|
+
size_t num_frequency_bins_;
|
|
541
|
+
size_t num_mel_filters_;
|
|
542
|
+
};
|
|
543
|
+
|
|
346
544
|
}
|
|
347
545
|
}
|
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
#include <stdexcept>
|
|
9
9
|
#include <sstream>
|
|
10
10
|
#include <iomanip>
|
|
11
|
+
#include <filesystem>
|
|
12
|
+
#include <cctype>
|
|
11
13
|
|
|
12
14
|
namespace cactus {
|
|
13
15
|
namespace ffi {
|
|
@@ -30,8 +32,10 @@ inline void handle_error_response(const std::string& error_message, char* respon
|
|
|
30
32
|
}
|
|
31
33
|
}
|
|
32
34
|
|
|
33
|
-
inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json
|
|
35
|
+
inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::string& json,
|
|
36
|
+
std::vector<std::string>& out_image_paths) {
|
|
34
37
|
std::vector<cactus::engine::ChatMessage> messages;
|
|
38
|
+
out_image_paths.clear();
|
|
35
39
|
|
|
36
40
|
size_t pos = json.find('[');
|
|
37
41
|
if (pos == std::string::npos) {
|
|
@@ -42,42 +46,79 @@ inline std::vector<cactus::engine::ChatMessage> parse_messages_json(const std::s
|
|
|
42
46
|
while (pos != std::string::npos) {
|
|
43
47
|
cactus::engine::ChatMessage msg;
|
|
44
48
|
|
|
49
|
+
size_t obj_start = pos;
|
|
50
|
+
int brace_count = 1;
|
|
51
|
+
size_t obj_end = obj_start + 1;
|
|
52
|
+
while (obj_end < json.length() && brace_count > 0) {
|
|
53
|
+
if (json[obj_end] == '{') brace_count++;
|
|
54
|
+
else if (json[obj_end] == '}') brace_count--;
|
|
55
|
+
obj_end++;
|
|
56
|
+
}
|
|
57
|
+
|
|
45
58
|
size_t role_pos = json.find("\"role\"", pos);
|
|
46
|
-
if (role_pos == std::string::npos) break;
|
|
59
|
+
if (role_pos == std::string::npos || role_pos >= obj_end) break;
|
|
47
60
|
|
|
48
61
|
size_t role_start = json.find('"', role_pos + 6) + 1;
|
|
49
62
|
size_t role_end = json.find('"', role_start);
|
|
50
63
|
msg.role = json.substr(role_start, role_end - role_start);
|
|
51
64
|
|
|
52
65
|
size_t content_pos = json.find("\"content\"", role_end);
|
|
53
|
-
if (content_pos
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
66
|
+
if (content_pos != std::string::npos && content_pos < obj_end) {
|
|
67
|
+
size_t content_start = json.find('"', content_pos + 9) + 1;
|
|
68
|
+
size_t content_end = content_start;
|
|
69
|
+
|
|
70
|
+
while (content_end < json.length()) {
|
|
71
|
+
content_end = json.find('"', content_end);
|
|
72
|
+
if (content_end == std::string::npos) break;
|
|
73
|
+
if (json[content_end - 1] != '\\') break;
|
|
74
|
+
content_end++;
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
msg.content = json.substr(content_start, content_end - content_start);
|
|
78
|
+
|
|
79
|
+
size_t escape_pos = 0;
|
|
80
|
+
while ((escape_pos = msg.content.find("\\n", escape_pos)) != std::string::npos) {
|
|
81
|
+
msg.content.replace(escape_pos, 2, "\n");
|
|
82
|
+
escape_pos += 1;
|
|
83
|
+
}
|
|
84
|
+
escape_pos = 0;
|
|
85
|
+
while ((escape_pos = msg.content.find("\\\"", escape_pos)) != std::string::npos) {
|
|
86
|
+
msg.content.replace(escape_pos, 2, "\"");
|
|
87
|
+
escape_pos += 1;
|
|
88
|
+
}
|
|
63
89
|
}
|
|
64
90
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
91
|
+
size_t images_pos = json.find("\"images\"", pos);
|
|
92
|
+
if (images_pos != std::string::npos && images_pos < obj_end) {
|
|
93
|
+
size_t array_start = json.find('[', images_pos);
|
|
94
|
+
if (array_start != std::string::npos && array_start < obj_end) {
|
|
95
|
+
size_t array_end = json.find(']', array_start);
|
|
96
|
+
if (array_end != std::string::npos && array_end < obj_end) {
|
|
97
|
+
size_t img_pos = array_start;
|
|
98
|
+
while (true) {
|
|
99
|
+
img_pos = json.find('"', img_pos + 1);
|
|
100
|
+
if (img_pos == std::string::npos || img_pos >= array_end) break;
|
|
101
|
+
|
|
102
|
+
size_t img_start = img_pos + 1;
|
|
103
|
+
size_t img_end = json.find('"', img_start);
|
|
104
|
+
if (img_end == std::string::npos || img_end > array_end) break;
|
|
105
|
+
|
|
106
|
+
std::string img_path = json.substr(img_start, img_end - img_start);
|
|
107
|
+
|
|
108
|
+
std::filesystem::path p(img_path);
|
|
109
|
+
img_path = std::filesystem::absolute(p).string();
|
|
110
|
+
|
|
111
|
+
msg.images.push_back(img_path);
|
|
112
|
+
out_image_paths.push_back(img_path);
|
|
113
|
+
img_pos = img_end;
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
76
117
|
}
|
|
77
118
|
|
|
78
119
|
messages.push_back(msg);
|
|
79
120
|
|
|
80
|
-
pos = json.find('{',
|
|
121
|
+
pos = json.find('{', obj_end);
|
|
81
122
|
}
|
|
82
123
|
|
|
83
124
|
return messages;
|
|
@@ -136,10 +177,10 @@ inline void parse_options_json(const std::string& json,
|
|
|
136
177
|
float& temperature, float& top_p,
|
|
137
178
|
size_t& top_k, size_t& max_tokens,
|
|
138
179
|
std::vector<std::string>& stop_sequences) {
|
|
139
|
-
temperature = -1.0f;
|
|
140
|
-
top_p = -1.0f;
|
|
141
|
-
top_k = 0;
|
|
142
|
-
max_tokens = 100;
|
|
180
|
+
temperature = -1.0f;
|
|
181
|
+
top_p = -1.0f;
|
|
182
|
+
top_k = 0;
|
|
183
|
+
max_tokens = 100;
|
|
143
184
|
stop_sequences.clear();
|
|
144
185
|
|
|
145
186
|
if (json.empty()) return;
|
|
@@ -205,34 +246,107 @@ inline std::string format_tools_for_prompt(const std::vector<ToolFunction>& tool
|
|
|
205
246
|
return formatted_tools_json;
|
|
206
247
|
}
|
|
207
248
|
|
|
208
|
-
inline void parse_function_calls_from_response(const std::string& response_text,
|
|
209
|
-
std::string& regular_response,
|
|
249
|
+
inline void parse_function_calls_from_response(const std::string& response_text,
|
|
250
|
+
std::string& regular_response,
|
|
210
251
|
std::vector<std::string>& function_calls) {
|
|
211
252
|
regular_response = response_text;
|
|
212
253
|
function_calls.clear();
|
|
213
254
|
|
|
255
|
+
const std::string TOOL_CALL_START = "<|tool_call_start|>";
|
|
256
|
+
const std::string TOOL_CALL_END = "<|tool_call_end|>";
|
|
257
|
+
size_t tool_start_pos = 0;
|
|
258
|
+
|
|
259
|
+
while ((tool_start_pos = response_text.find(TOOL_CALL_START, tool_start_pos)) != std::string::npos) {
|
|
260
|
+
size_t content_start = tool_start_pos + TOOL_CALL_START.length();
|
|
261
|
+
size_t tool_end_pos = response_text.find(TOOL_CALL_END, content_start);
|
|
262
|
+
|
|
263
|
+
if (tool_end_pos != std::string::npos) {
|
|
264
|
+
std::string tool_content = response_text.substr(content_start, tool_end_pos - content_start);
|
|
265
|
+
|
|
266
|
+
if (tool_content.size() > 2 && tool_content[0] == '[' && tool_content[tool_content.size()-1] == ']') {
|
|
267
|
+
tool_content = tool_content.substr(1, tool_content.size() - 2);
|
|
268
|
+
|
|
269
|
+
size_t paren_pos = tool_content.find('(');
|
|
270
|
+
if (paren_pos != std::string::npos) {
|
|
271
|
+
std::string func_name = tool_content.substr(0, paren_pos);
|
|
272
|
+
std::string args_str = tool_content.substr(paren_pos + 1);
|
|
273
|
+
|
|
274
|
+
if (!args_str.empty() && args_str.back() == ')') {
|
|
275
|
+
args_str.pop_back();
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
std::string json_call = "{\"name\":\"" + func_name + "\",\"arguments\":{";
|
|
279
|
+
|
|
280
|
+
size_t arg_pos = 0;
|
|
281
|
+
bool first_arg = true;
|
|
282
|
+
while (arg_pos < args_str.length()) {
|
|
283
|
+
while (arg_pos < args_str.length() && std::isspace(args_str[arg_pos])) arg_pos++;
|
|
284
|
+
|
|
285
|
+
size_t eq_pos = args_str.find('=', arg_pos);
|
|
286
|
+
if (eq_pos == std::string::npos) break;
|
|
287
|
+
|
|
288
|
+
std::string arg_name = args_str.substr(arg_pos, eq_pos - arg_pos);
|
|
289
|
+
|
|
290
|
+
size_t val_start = eq_pos + 1;
|
|
291
|
+
size_t val_end = val_start;
|
|
292
|
+
|
|
293
|
+
if (val_start < args_str.length() && args_str[val_start] == '"') {
|
|
294
|
+
val_start++;
|
|
295
|
+
val_end = args_str.find('"', val_start);
|
|
296
|
+
if (val_end == std::string::npos) break;
|
|
297
|
+
} else {
|
|
298
|
+
val_end = args_str.find(',', val_start);
|
|
299
|
+
if (val_end == std::string::npos) val_end = args_str.length();
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
std::string arg_value = args_str.substr(val_start, val_end - val_start);
|
|
303
|
+
|
|
304
|
+
if (!first_arg) json_call += ",";
|
|
305
|
+
json_call += "\"" + arg_name + "\":\"" + arg_value + "\"";
|
|
306
|
+
first_arg = false;
|
|
307
|
+
|
|
308
|
+
arg_pos = args_str.find(',', val_end);
|
|
309
|
+
if (arg_pos != std::string::npos) {
|
|
310
|
+
arg_pos++;
|
|
311
|
+
} else {
|
|
312
|
+
break;
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
json_call += "}}";
|
|
317
|
+
function_calls.push_back(json_call);
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
regular_response.erase(tool_start_pos, tool_end_pos + TOOL_CALL_END.length() - tool_start_pos);
|
|
322
|
+
tool_start_pos = tool_end_pos + TOOL_CALL_END.length();
|
|
323
|
+
} else {
|
|
324
|
+
break;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
214
328
|
const char* FUNCTION_CALL_MARKER = "\"function_call\"";
|
|
215
329
|
size_t search_pos = 0;
|
|
216
|
-
const size_t text_len =
|
|
330
|
+
const size_t text_len = regular_response.length();
|
|
217
331
|
|
|
218
332
|
while (search_pos < text_len) {
|
|
219
|
-
size_t marker_pos =
|
|
333
|
+
size_t marker_pos = regular_response.find(FUNCTION_CALL_MARKER, search_pos);
|
|
220
334
|
if (marker_pos == std::string::npos) break;
|
|
221
335
|
|
|
222
|
-
size_t json_start =
|
|
336
|
+
size_t json_start = regular_response.find('{', marker_pos);
|
|
223
337
|
if (json_start == std::string::npos) break;
|
|
224
338
|
|
|
225
339
|
int brace_count = 1;
|
|
226
340
|
size_t json_end = json_start + 1;
|
|
227
341
|
while (json_end < text_len && brace_count > 0) {
|
|
228
|
-
char c =
|
|
342
|
+
char c = regular_response[json_end];
|
|
229
343
|
brace_count += (c == '{') - (c == '}');
|
|
230
344
|
json_end++;
|
|
231
345
|
}
|
|
232
346
|
|
|
233
347
|
if (brace_count == 0) {
|
|
234
|
-
function_calls.push_back(
|
|
235
|
-
regular_response =
|
|
348
|
+
function_calls.push_back(regular_response.substr(json_start, json_end - json_start));
|
|
349
|
+
regular_response = regular_response.substr(0, marker_pos);
|
|
236
350
|
size_t last_bracket = regular_response.rfind('{');
|
|
237
351
|
if(last_bracket != std::string::npos) {
|
|
238
352
|
regular_response = regular_response.substr(0, last_bracket);
|