npm - cactus-react-native - Versions diffs - 1.5.0 → 1.10.0 - Mend

cactus-react-native 1.5.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (221) hide show

package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h CHANGED Viewed

@@ -56,6 +56,12 @@ struct Config {
     uint32_t num_shared_experts = 0;
     uint32_t num_top_experts = 0;
     uint32_t moe_every_n_layers = 0;
+    uint32_t moe_intermediate_dim = 0;
+    uint32_t num_dense_layers = 0;
+    uint32_t num_experts_per_tok = 0;
+    bool norm_topk_prob = false;
+    bool use_expert_bias = false;
+    float routed_scaling_factor = 1.0f;
     bool tie_word_embeddings = true;
     uint32_t vision_hidden_dim = 0;
@@ -84,12 +90,31 @@ struct Config {
     bool use_thumbnail = true;
     uint32_t min_image_tokens = 64;
     uint32_t max_image_tokens = 256;
-        uint32_t max_num_patches = 1024;
+    uint32_t max_num_patches = 1024;
     uint32_t tile_size = 512;
     float max_pixels_tolerance = 2.0f;
     bool do_image_splitting = true;
-    enum class ModelType {QWEN = 0, GEMMA = 1, SMOL = 2, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7};
+    bool encoder_act_gelu = false;
+    bool decoder_act_gelu = false;
+    uint32_t num_encoder_layers = 0;
+    uint32_t num_decoder_layers = 0;
+    float partial_rotary_factor = 0.0f;
+    uint32_t pad_token_id = 0;
+    uint32_t conv_kernel_size = 0;
+    uint32_t subsampling_conv_kernel_size = 0;
+    uint32_t subsampling_conv_stride = 0;
+    uint32_t subsampling_conv_channels = 0;
+    uint32_t subsampling_factor = 0;
+    uint32_t num_mel_bins = 80;
+    std::string encoder_hidden_act = "silu";
+    uint32_t predictor_hidden_dim = 0;
+    uint32_t predictor_num_layers = 0;
+    uint32_t tdt_joint_dim = 0;
+    uint32_t tdt_num_durations = 0;
+    uint32_t tdt_blank_id = 0;
+    std::vector<uint32_t> tdt_durations;
+    enum class ModelType {QWEN = 0, GEMMA = 1, NOMIC = 3, LFM2 = 5, SIGLIP2 = 6, WHISPER = 7, MOONSHINE = 8, SILERO_VAD = 9, PARAKEET = 10, PARAKEET_TDT = 11};
     ModelType model_type = ModelType::QWEN;
     enum class ModelVariant {DEFAULT = 0, VLM = 1, EXTRACT = 2, RAG = 3};
@@ -107,6 +132,8 @@ struct Config {
     float default_temperature = 0.6f;
     float default_top_p = 0.95f;
     size_t default_top_k = 20;
+    float default_max_tps = -1.0f;
+    float default_cloud_handoff_threshold = 0.0f;
     std::vector<std::string> layer_types;
     size_t conv_L_cache = 0;
@@ -152,6 +179,7 @@ public:
     virtual uint32_t get_bos_token() const = 0;
     virtual uint32_t get_eos_token() const = 0;
     virtual bool has_chat_template() const { return has_chat_template_; }
+    std::string get_default_stop_sequence() const;
     virtual bool load_vocabulary_with_config(const std::string& vocab_file, const std::string& merges_file, const std::string& config_file) = 0;
@@ -159,11 +187,8 @@ public:
     uint32_t get_fake_token_id() const { return fake_token_id_; }
     uint32_t get_global_img_token_id() const { return global_img_token_id_; }
-    void set_corpus_dir(const std::string& dir) { corpus_dir_ = dir; }
 protected:
-    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, SMOL, BERT, WHISPER};
+    enum class ModelType { UNKNOWN, QWEN, GEMMA, LFM2, BERT, WHISPER, PARAKEET};
     ModelType model_type_ = ModelType::UNKNOWN;
     enum class ModelVariant { DEFAULT, VLM, EXTRACT, RAG};
     ModelVariant model_variant_ = ModelVariant::DEFAULT;
@@ -173,14 +198,12 @@ protected:
     uint32_t image_token_id_ = 396;
     uint32_t fake_token_id_ = 49189;
     uint32_t global_img_token_id_ = 49152;
-    std::string corpus_dir_;
     void detect_model_type(const std::string& config_path);
     std::string format_qwen_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
     std::string format_gemma_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
     std::string format_lfm2_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
     std::string format_lfm2_vl_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
-    std::string format_smol_style(const std::vector<ChatMessage>& messages, bool add_generation_prompt, const std::string& tools_json) const;
 };
 class BPETokenizer : public Tokenizer {
@@ -363,7 +386,6 @@ struct KVCache {
                          size_t num_tokens, size_t kv_heads, size_t head_dim);
     bool is_empty() const { return current_seq_len == 0; }
-    bool is_int8() const { return precision == Precision::INT8; }
     void* get_key_ptr(size_t layer);
     void* get_value_ptr(size_t layer);
@@ -471,6 +493,8 @@ private:
     void compute_bias();
     void tokenize_grammar_elements();
     void add_tokens_for_string(const std::string& str, std::unordered_set<uint32_t>& token_set);
+    void tokenize_function_names(bool quote_names);
+    void init_common_tokens();
 };
 class Model {
@@ -495,22 +519,22 @@ public:
               const std::string& system_prompt = "", bool do_warmup = true);
     virtual uint32_t decode(const std::vector<uint32_t>& tokens, float temperature = -1.0f, float top_p = -1.0f,
-                      size_t top_k = 0, const std::string& profile_file = "");
+                      size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
     virtual void prefill(const std::vector<uint32_t>& tokens, size_t chunk_size = 256, const std::string& profile_file = "");
     virtual uint32_t decode_with_images(const std::vector<uint32_t>& tokens, const std::vector<std::string>& image_paths,
                                           float temperature = -1.0f, float top_p = -1.0f,
-                                          size_t top_k = 0, const std::string& profile_file = "");
+                                          size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
-    virtual uint32_t decode_with_audio(const std::vector<uint32_t>& tokens, const std::vector<float>& mel_bins, float temperature = 0.0f, float top_p = 0.0f,
-                      size_t top_k = 0, const std::string& profile_file = "");
+    virtual uint32_t decode_with_audio(const std::vector<uint32_t>& tokens, const std::vector<float>& audio_features, float temperature = 0.0f, float top_p = 0.0f,
+                      size_t top_k = 0, const std::string& profile_file = "", float* out_entropy = nullptr);
     std::vector<float> get_embeddings(const std::vector<uint32_t>& tokens, bool pooled = true, bool normalize = false, const std::string& profile_file = "");
     virtual std::vector<float> get_image_embeddings(const std::string& image_path);
-    virtual std::vector<float> get_audio_embeddings(const std::vector<float>& mel_bins);
+    virtual std::vector<float> get_audio_embeddings(const std::vector<float>& audio_features);
     virtual void reset_cache() { kv_cache_.reset(); }
@@ -533,7 +557,7 @@ public:
 protected:
     virtual size_t forward(const std::vector<uint32_t>& tokens, bool use_cache = false) = 0;
-    virtual size_t forward(const std::vector<float>& mel_bins, const std::vector<uint32_t>& tokens, bool use_cache = false);
+    virtual size_t forward(const std::vector<float>& audio_features, const std::vector<uint32_t>& tokens, bool use_cache = false);
     virtual void load_weights_to_graph(CactusGraph* gb) = 0;
@@ -645,6 +669,7 @@ public:
 private:
     Config config_;
+    std::pair<int64_t, int64_t> compute_pixel_limits() const;
     std::vector<unsigned char> convert_to_rgb(const unsigned char* img_data, int width, int height, int channels);
     std::pair<int, int> smart_resize(int height, int width);
     bool is_image_too_large(int height, int width);
@@ -678,6 +703,8 @@ public:
         float reference = 1.0f;
         float min_value = 1e-10f;
         bool remove_dc_offset = false;
+        float preemphasis = 0.0f;
+        bool hann_periodic = true;
     };
     AudioProcessor();
@@ -690,6 +717,11 @@ public:
         const std::vector<float>& waveform,
         const SpectrogramConfig& config);
+    static std::vector<float> compute_irfft(
+        const std::vector<float>& complex_input,
+        size_t n,
+        const char* norm = "backward");
     const std::vector<float>& get_mel_filters() const { return mel_filters_; }
     size_t get_num_mel_filters() const { return num_mel_filters_; }
@@ -701,5 +733,104 @@ private:
     size_t num_mel_filters_;
 };
+namespace index { // Document/embedding index API; presumably file-backed (see the paths, fds and mapped pointers in Index) - TODO confirm against the .cpp
+    constexpr uint32_t MAGIC = 0x43414354; // bytes 0x43 0x41 0x43 0x54 are "CACT" in ASCII
+    constexpr uint32_t VERSION = 1; // presumably compared against the `version` header fields below - TODO confirm
+    struct Document { // one record handed to / returned from Index
+        int id;
+        std::vector<float> embedding;
+        std::string content;
+        std::string metadata;
+    };
+    struct QueryResult { // (doc_id, score) pair produced by Index::query
+        int doc_id;
+        float score;
+        QueryResult(int doc_id, float score) : doc_id(doc_id), score(score) {}
+    };
+    struct QueryOptions { // defaults: top_k = 10, score_threshold = -1.0f
+        size_t top_k = 10;
+        float score_threshold = -1.0f; // NOTE(review): -1 looks like a "no threshold" value for cosine-style scores - confirm
+    };
+    class Index { // owns two file descriptors and two mapped regions; non-copyable and non-movable (see deleted members)
+        public:
+            Index(const std::string& index_path, const std::string& data_path, size_t embedding_dim);
+            ~Index();
+            Index(const Index&) = delete; // copying is disabled
+            Index& operator=(const Index&) = delete;
+            Index(Index&&) = delete; // moving is disabled as well
+            Index& operator=(Index&&) = delete;
+            void add_documents(const std::vector<Document>& documents);
+            void delete_documents(const std::vector<int>& doc_ids);
+            std::vector<Document> get_documents(const std::vector<int>& doc_ids);
+            std::vector<std::vector<QueryResult>> query(const std::vector<std::vector<float>>& embeddings, const QueryOptions& options); // outer vector of results presumably matches the outer vector of query embeddings - TODO confirm
+            void compact(); // NOTE(review): presumably drops tombstoned entries (see IndexEntry::flags) - implementation not visible here
+        private:
+            struct IndexHeader { // four uint32_t fields
+                uint32_t magic;
+                uint32_t version;
+                uint32_t embedding_dim;
+                uint32_t num_documents;
+            };
+            struct IndexEntry { // fixed fields followed in memory by the embedding as __fp16 values
+                int32_t doc_id;
+                uint64_t data_offset; // presumably an offset into the data file - TODO confirm
+                uint8_t flags; // bit 0: tombstone
+                const __fp16* embedding() const { // pointer to the bytes immediately after this struct
+                    return reinterpret_cast<const __fp16*>(this + 1); // `this + 1` skips sizeof(IndexEntry), including any trailing padding
+                }
+                static size_t size(size_t embedding_dim) { // bytes for one entry: struct plus embedding_dim half-precision values
+                    return sizeof(IndexEntry) + embedding_dim * sizeof(__fp16);
+                }
+            };
+            struct DataHeader { // two uint32_t fields
+                uint32_t magic;
+                uint32_t version;
+            };
+            struct DataEntry { // two uint16_t lengths followed in memory by content bytes, then metadata bytes
+                uint16_t content_len; // uint16_t caps the length at 65535
+                uint16_t metadata_len; // uint16_t caps the length at 65535
+                const char* content() const { // pointer to the bytes immediately after this struct
+                    return reinterpret_cast<const char*>(this + 1);
+                }
+                const char* metadata() const { // metadata starts content_len bytes after content()
+                    return content() + content_len;
+                }
+            };
+            void parse_index_header();
+            void parse_data_header();
+            void build_doc_id_map();
+            void validate_documents(const std::vector<Document>& documents);
+            void validate_doc_ids(const std::vector<int>& doc_ids);
+            ssize_t write_full(int fd, const void* buf, size_t count); // presumably retries until `count` bytes are written - implementation not visible here
+            std::unordered_map<int, uint32_t> doc_id_map_; // doc id -> uint32_t; presumably the entry position in the index - TODO confirm
+            std::string index_path_, data_path_;
+            size_t embedding_dim_;
+            size_t index_entry_size_; // presumably IndexEntry::size(embedding_dim_) - TODO confirm
+            uint32_t num_documents_;
+            int index_fd_, data_fd_;
+            void *mapped_index_, *mapped_data_;
+            size_t index_file_size_, data_file_size_;
+    };
+} // namespace index
+}
 }
-}

package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h CHANGED Viewed

@@ -4,7 +4,9 @@
 #include <vector>
 #include <memory>
 #include <unordered_map>
+#include <unordered_set>
 #include <functional>
+#include <cassert>
 #include <cstring>
 #include <stdexcept>
 #include <string>
@@ -108,23 +110,36 @@ enum class ComputeBackend {
     NPU
 };
+enum class Activation { // activation selector; in this header it is the type of OpParams::activation and of the last parameter of the two-weight moe_layer overload
+    SILU, // the default value of OpParams::activation
+    GELU,
+    GELU_ERF,
+    RELU,
+    SIGMOID,
+    TANH
+};
 enum class OpType {
     INPUT, PRECISION_CAST,
     ADD, ADD_CLIPPED, SUBTRACT, MULTIPLY, DIVIDE,
     MATMUL, TRANSPOSE, RESHAPE, SLICE, GATHER, EMBEDDING,
     BILINEAR_INTERPOLATION,
     SUM, MEAN, VARIANCE, MIN, MAX,
-    RMS_NORM, ROPE, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, CONV1D_CAUSAL, CONV1D_K3,
-    SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN,
-    SILU, GELU, GELU_ERF,
+    RMS_NORM, ROPE, ROPE_GPTJ, SOFTMAX, ATTENTION, ATTENTION_INT8_HYBRID, REL_POS_BIAS, CONV1D_CAUSAL, CONV1D_K3, CONV1D_K7S3, CONV1D, CONV1D_SAME_DEPTHWISE_K9, CONV1D_POINTWISE, CONV2D_K3S2P1, CONV2D_DEPTHWISE_K3S2P1, CONV2D_POINTWISE_1X1, GLU, BATCHNORM,
+    SCALAR_ADD, SCALAR_SUBTRACT, SCALAR_MULTIPLY, SCALAR_DIVIDE, SCALAR_EXP, SCALAR_SQRT, SCALAR_COS, SCALAR_SIN, SCALAR_LOG,
+    RELU, SILU, GELU, GELU_ERF, SIGMOID, TANH,
     SAMPLE, CONCAT,
     SCATTER_TOPK,
-    TOPK, LAYERNORM,
+    TOPK, LAYERNORM, GROUPNORM,
+    MOE_LAYER,
     INDEX,
+    PERSISTENT,
+    QUANTIZE_ACTIVATIONS,
+    LSTM_CELL,
+    STFT
 };
 struct PrecisionTraits {
-    // Returns in-memory element size (INT4 unpacks to INT8, so returns 1)
     static constexpr size_t size_of(Precision prec) {
         switch (prec) {
             case Precision::INT8: return 1;
@@ -137,11 +152,20 @@ struct PrecisionTraits {
     static constexpr size_t packed_size_of(Precision prec, size_t count) {
         switch (prec) {
-            case Precision::INT4: return (count + 1) / 2;
+            case Precision::INT4: return (count + 1) / 2;
             default: return count * size_of(prec);
         }
     }
+    static size_t byte_offset_of(Precision prec, size_t element_offset) { // converts an element offset to a byte offset for precision `prec`; not constexpr, unlike packed_size_of above
+        switch (prec) {
+            case Precision::INT4:
+                assert(element_offset % 32 == 0 && "INT4 byte offset must be group-aligned (multiple of 32)"); // debug-only check: assert is compiled out when NDEBUG is defined
+                return element_offset / 2; // two 4-bit elements per byte, matching packed_size_of's (count + 1) / 2
+            default: return element_offset * size_of(prec); // all other precisions: whole bytes per element
+        }
+    }
     static constexpr bool is_integer(Precision prec) {
         switch (prec) {
             case Precision::INT8: return true;
@@ -177,7 +201,6 @@ struct TensorConfig {
     Precision compute_precision = Precision::INT8;
     Precision output_precision = Precision::INT8;
     bool auto_mixed_precision = false;
-    bool enable_int4_packing = true;
     static TensorConfig& global();
 };
@@ -205,8 +228,12 @@ struct BufferDesc {
     void* scales_data = nullptr;
     std::unique_ptr<char[]> owned_scales;
-    const void* packed_int4_data = nullptr;
-    size_t packed_int4_size = 0;
+    bool is_interleaved = false;
+    size_t original_N = 0;
+    void* activation_scales_data = nullptr;
+    std::unique_ptr<char[]> owned_activation_scales;
+    size_t num_rows_for_activation_scales = 0;
     BufferDesc();
     BufferDesc(const std::vector<size_t>& s, Precision prec = Precision::INT8);
@@ -230,23 +257,43 @@ struct BufferDesc {
     const __fp16* scales_as_fp16() const {
         return reinterpret_cast<const __fp16*>(scales_data);
     }
     bool is_grouped_int8() const {
         return precision == Precision::INT8 && group_size > 0;
     }
-    bool is_packed_int4() const {
-        return packed_int4_data != nullptr && packed_int4_size > 0;
-    }
-    const uint8_t* packed_int4_as_uint8() const {
-        return reinterpret_cast<const uint8_t*>(packed_int4_data);
+    bool is_grouped_int4() const { // true when the buffer is INT4 and has a non-zero group_size; INT4 counterpart of is_grouped_int8 above
+        return precision == Precision::INT4 && group_size > 0;
     }
     void set_grouped_scales(size_t gs, size_t ng, void* scales_ptr) {
         group_size = gs;
         num_groups = ng;
         scales_data = scales_ptr;
     }
-    void set_packed_int4(const void* packed_data, size_t packed_size) {
-        packed_int4_data = packed_data;
-        packed_int4_size = packed_size;
+    void set_interleaved(bool interleaved, size_t orig_n) { // records the two layout fields; no data is touched
+        is_interleaved = interleaved;
+        original_N = orig_n; // NOTE(review): presumably the N dimension before interleaving - confirm against the writer of this flag
+    }
+    bool has_activation_scales() const { // true only when the scales pointer is non-null and the recorded row count is non-zero
+        return activation_scales_data != nullptr && num_rows_for_activation_scales > 0;
+    }
+    const float* activation_scales_as_float() const { // read-only view of activation_scales_data as float; returns nullptr if no scales are set
+        return reinterpret_cast<const float*>(activation_scales_data);
+    }
+    float* activation_scales_as_float() { // mutable view of activation_scales_data as float; returns nullptr if no scales are set
+        return reinterpret_cast<float*>(activation_scales_data);
+    }
+    void allocate_activation_scales(size_t num_rows) { // allocates an owned buffer of num_rows floats and points activation_scales_data at it
+        num_rows_for_activation_scales = num_rows;
+        owned_activation_scales = std::make_unique<char[]>(num_rows * sizeof(float)); // array make_unique value-initializes, so the bytes start at zero; any previously owned buffer is released
+        activation_scales_data = owned_activation_scales.get();
+    }
+    void set_activation_scales(void* scales_ptr, size_t num_rows) { // points at caller-provided scales without taking ownership; owned_activation_scales is left untouched
+        activation_scales_data = scales_ptr;
+        num_rows_for_activation_scales = num_rows;
     }
     void allocate();
@@ -267,6 +314,7 @@ struct OpParams {
     size_t slice_length = 0;
     size_t window_size = 0;
     bool is_causal = true;
+    bool attention_mask_is_additive = false;
     std::vector<size_t> new_shape;
     std::vector<size_t> permutation;
     Precision output_precision = Precision::INT8;
@@ -282,8 +330,14 @@ struct OpParams {
     size_t index_value = 0;
     size_t num_classes = 0;
+    size_t num_groups = 0;
     size_t dst_height = 0;
     size_t dst_width = 0;
+    bool normalize_routing = false;
+    size_t num_experts = 0;
+    size_t num_experts_per_tok = 0;
+    bool moe_gated = true;
+    Activation activation = Activation::SILU;
     std::vector<float> bias_values;
     std::vector<uint32_t> bias_indices;
@@ -295,6 +349,7 @@ struct OpParams {
     size_t cache_seq_len = 0;
     size_t num_kv_heads = 0;
     size_t head_dim = 0;
+    size_t num_fft_bins = 0;
 };
 struct GraphNode {
@@ -324,10 +379,12 @@ void compute_sample_node(GraphNode& node, const std::vector<std::unique_ptr<Grap
 void compute_scatter_topk_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_topk_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_layernorm_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_groupnorm_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_persistent_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void compute_index_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
+void compute_lstm_cell_node(GraphNode& node, const std::vector<std::unique_ptr<GraphNode>>& nodes, const std::unordered_map<size_t, size_t>& node_index_map);
 void shrink_thread_local_buffers();
 class BufferPool {
 public:
     BufferPool() = default;
@@ -372,6 +429,7 @@ public:
     size_t input(const std::vector<size_t>& shape, Precision precision = Precision::INT8);
     size_t precision_cast(size_t input, Precision target_precision);
+    size_t quantize_activations(size_t input);
     size_t add(size_t input1, size_t input2);
     size_t add_clipped(size_t input1, size_t input2);
@@ -388,10 +446,15 @@ public:
     size_t scalar_sqrt(size_t input);
     size_t scalar_cos(size_t input);
     size_t scalar_sin(size_t input);
+    size_t scalar_log(size_t input);
+    size_t relu(size_t input);
     size_t silu(size_t input);
     size_t gelu(size_t input);
     size_t gelu_erf(size_t input);
+    size_t sigmoid(size_t input);
+    size_t tanh(size_t input);
+    size_t glu(size_t input, int axis = -1);
     size_t matmul(size_t input1, size_t input2, bool pretransposed_rhs = false, ComputeBackend backend = ComputeBackend::CPU);
     size_t transpose(size_t input, ComputeBackend backend = ComputeBackend::CPU);
@@ -409,8 +472,8 @@ public:
     size_t gather(size_t embeddings, size_t indices);
     size_t mmap_embeddings(const std::string& filename);
     size_t mmap_weights(const std::string& filename);
-    size_t load_weights(const std::string& filename);
     void set_grouped_scales(size_t node_id, size_t group_size, size_t num_groups, void* scales_ptr);
+    void set_interleaved(size_t node_id, bool interleaved, size_t original_N);
     void release_weight_pages(size_t node_id);
     void prefetch_weight_pages(size_t node_id);
@@ -420,22 +483,68 @@ public:
     size_t bilinear_interpolation(size_t pos_embeds, size_t dst_height, size_t dst_width);
     size_t layernorm(size_t input, size_t weight, size_t bias, float epsilon = 1e-5f);
+    size_t layernorm(size_t input, size_t weight, float epsilon = 1e-5f);  // No bias version
+    size_t groupnorm(size_t input, size_t weight, size_t bias, size_t num_groups = 32, float epsilon = 1e-5f);
+    size_t batchnorm(size_t input, size_t weight, size_t bias, size_t running_mean, size_t running_var, int axis = 1, float epsilon = 1e-5f);
     size_t topk(size_t input, size_t k);
+    size_t moe_layer(size_t hidden,
+                     size_t routing_probs,
+                     size_t topk_indices,
+                     const std::vector<size_t>& w1_weights,
+                     const std::vector<size_t>& w3_weights,
+                     const std::vector<size_t>& w2_weights,
+                     size_t num_experts,
+                     size_t num_experts_per_tok,
+                     bool normalize_routing,
+                     float epsilon,
+                     float routed_scaling_factor);
+    size_t moe_layer(size_t hidden,
+                     size_t routing_probs,
+                     size_t topk_indices,
+                     const std::vector<size_t>& w1_weights,
+                     const std::vector<size_t>& w2_weights,
+                     size_t num_experts,
+                     size_t num_experts_per_tok,
+                     bool normalize_routing,
+                     float epsilon,
+                     float routed_scaling_factor,
+                     Activation activation);
     size_t rms_norm(size_t input, size_t weight, float epsilon = 1e-5f);
     size_t rope(size_t input, float theta, size_t position_offset = 0, ComputeBackend backend = ComputeBackend::CPU);
+    size_t rope_gptj(size_t input, float theta, size_t position_offset = 0, size_t rot_dim = 0, ComputeBackend backend = ComputeBackend::CPU);
     size_t softmax(size_t input, int axis = -1);
     size_t attention(size_t query, size_t key, size_t value, float scale, bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, ComputeBackend backend = ComputeBackend::CPU);
     size_t attention(size_t query, size_t key, size_t value, float scale, size_t position_offset, size_t window_size, ComputeBackend backend = ComputeBackend::CPU);
+    size_t attention_masked(size_t query, size_t key, size_t value, size_t mask, float scale,
+                            bool is_causal = true, ComputeBackend backend = ComputeBackend::CPU,
+                            bool additive_mask = false, size_t position_offset = 0, size_t window_size = 0);
+    size_t rel_pos_bias(size_t query, size_t relative_key, float scale);
     size_t attention_int8_hybrid(size_t query, size_t key_new, size_t value_new, float scale, size_t position_offset,
                                  const int8_t* cached_keys, const int8_t* cached_values,
                                  const float* k_scales, const float* v_scales,
-                                 size_t cache_len, size_t num_kv_heads, size_t head_dim);
+                                 size_t cache_len, size_t num_kv_heads, size_t head_dim, size_t window_size = 0);
     size_t conv1d_causal(size_t input, size_t weight, size_t kernel_size, size_t dilation = 1);
     size_t conv1d_k3(size_t input, size_t weight, size_t stride);
+    size_t conv1d_k7s3(size_t input, size_t weight, size_t bias);
+    size_t conv1d(size_t input, size_t weight, size_t stride);
+    size_t conv1d(size_t input, size_t weight, size_t bias, size_t stride);
+    size_t conv1d_same_depthwise_k9(size_t input, size_t weight);
+    size_t conv1d_same_depthwise_k9(size_t input, size_t weight, size_t bias);
+    size_t conv1d_pointwise(size_t input, size_t weight);
+    size_t conv1d_pointwise(size_t input, size_t weight, size_t bias);
+    size_t conv2d_k3s2p1(size_t input, size_t weight);
+    size_t conv2d_k3s2p1(size_t input, size_t weight, size_t bias);
+    size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight);
+    size_t conv2d_depthwise_k3s2p1(size_t input, size_t weight, size_t bias);
+    size_t conv2d_pointwise_1x1(size_t input, size_t weight);
+    size_t conv2d_pointwise_1x1(size_t input, size_t weight, size_t bias);
+    size_t lstm_cell(size_t input, size_t h_prev, size_t c_prev, size_t weight_ih, size_t weight_hh, size_t bias_ih, size_t bias_hh);
+    size_t stft(size_t input, size_t weight, size_t stride, size_t num_fft_bins);
     size_t sample(size_t logits, float temperature = 0.6f, float top_p = 0.95f, size_t top_k = 20,
                   const std::unordered_map<uint32_t, float>& logit_bias = {});
@@ -462,6 +571,10 @@ public:
     void allocate_buffers();
     size_t get_node_count() const;
+    size_t persistent(size_t source_node);
+    bool is_populated(size_t persistent_node_id) const;
+    void invalidate_persistent(size_t persistent_node_id);
     std::vector<std::unique_ptr<GraphNode>> nodes_;
     std::unordered_map<size_t, size_t> node_index_map_;
@@ -473,6 +586,9 @@ private:
     std::vector<DebugNodeEntry> debug_nodes_;
     BufferPool buffer_pool_;
     bool prefill_mode_ = false;
+    std::unordered_set<size_t> persistent_node_ids_;
+    std::unordered_set<size_t> populated_node_ids_;
 };
@@ -485,7 +601,6 @@ namespace GraphFile {
     };
     void save_node(CactusGraph& graph, size_t node_id, const std::string& filename);
-    LoadedNode load_into_graph(CactusGraph& graph, const std::string& filename);
     class MappedFile {
     public:
@@ -499,16 +614,14 @@ namespace GraphFile {
         const std::vector<size_t>& shape() const;
         Precision precision() const;
-        Precision effective_precision() const {
-            return is_int4_ ? Precision::INT8 : precision_;
-        }
         size_t byte_size() const;
         size_t group_size() const { return group_size_; }
         size_t num_groups() const { return num_groups_; }
         const void* scales_data() const;
-        const void* raw_packed_data() const;  // Get raw mmap'd data without unpacking (for INT4)
-        bool is_int4() const { return is_int4_; }
+        bool is_interleaved() const { return is_interleaved_; }
+        size_t original_N() const { return original_N_; }
         void* data();
         const void* data() const;
@@ -516,8 +629,6 @@ namespace GraphFile {
         template<typename T>
         const T* typed_data() const;
-        LoadedNode load_into_graph(CactusGraph& graph) const;
         void release_pages();
         void prefetch_pages();
@@ -532,16 +643,14 @@ namespace GraphFile {
         size_t num_groups_ = 0;
         size_t scales_offset_ = 0;
         size_t scales_bytes_ = 0;
-        uint32_t version_ = 1;
         uint32_t alignment_ = 32;
-        bool is_int4_ = false;
-        mutable std::unique_ptr<int8_t[]> unpacked_int4_data_;
+        bool is_interleaved_ = false;
+        size_t original_N_ = 0;
         void parse_header();
         void apply_madvise_hints();
-        void unpack_int4_if_needed() const;
     };
-    MappedFile mmap_load(const std::string& filename);
 }
-#endif
+#endif