fine 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +38 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +167 -0
- data/LICENSE +21 -0
- data/README.md +212 -0
- data/Rakefile +6 -0
- data/docs/installation.md +151 -0
- data/docs/tutorials/llm-fine-tuning.md +246 -0
- data/docs/tutorials/model-export.md +200 -0
- data/docs/tutorials/siglip2-image-classification.md +130 -0
- data/docs/tutorials/siglip2-object-recognition.md +203 -0
- data/docs/tutorials/siglip2-similarity-search.md +152 -0
- data/docs/tutorials/text-classification.md +233 -0
- data/docs/tutorials/text-embeddings.md +211 -0
- data/examples/basic_classification.rb +70 -0
- data/examples/data/tool_calls.jsonl +30 -0
- data/examples/demo_training.rb +78 -0
- data/examples/finetune_gemma3_tools.rb +135 -0
- data/examples/real_llm_test.rb +128 -0
- data/examples/real_text_classification_test.rb +90 -0
- data/examples/real_text_embedder_test.rb +110 -0
- data/examples/real_training_test.rb +88 -0
- data/examples/test_export.rb +28 -0
- data/examples/test_image_classifier.rb +79 -0
- data/examples/test_llm.rb +100 -0
- data/examples/test_text_classifier.rb +59 -0
- data/lib/fine/callbacks/base.rb +140 -0
- data/lib/fine/callbacks/progress_bar.rb +66 -0
- data/lib/fine/configuration.rb +106 -0
- data/lib/fine/datasets/data_loader.rb +63 -0
- data/lib/fine/datasets/image_dataset.rb +203 -0
- data/lib/fine/datasets/instruction_dataset.rb +226 -0
- data/lib/fine/datasets/text_data_loader.rb +88 -0
- data/lib/fine/datasets/text_dataset.rb +266 -0
- data/lib/fine/error.rb +49 -0
- data/lib/fine/export/gguf_exporter.rb +424 -0
- data/lib/fine/export/onnx_exporter.rb +249 -0
- data/lib/fine/export.rb +53 -0
- data/lib/fine/hub/config_loader.rb +145 -0
- data/lib/fine/hub/model_downloader.rb +136 -0
- data/lib/fine/hub/safetensors_loader.rb +108 -0
- data/lib/fine/image_classifier.rb +256 -0
- data/lib/fine/llm.rb +336 -0
- data/lib/fine/models/base.rb +48 -0
- data/lib/fine/models/bert_encoder.rb +202 -0
- data/lib/fine/models/bert_for_sequence_classification.rb +226 -0
- data/lib/fine/models/causal_lm.rb +279 -0
- data/lib/fine/models/classification_head.rb +24 -0
- data/lib/fine/models/gemma3_decoder.rb +244 -0
- data/lib/fine/models/llama_decoder.rb +297 -0
- data/lib/fine/models/sentence_transformer.rb +202 -0
- data/lib/fine/models/siglip2_for_image_classification.rb +155 -0
- data/lib/fine/models/siglip2_vision_encoder.rb +190 -0
- data/lib/fine/text_classifier.rb +250 -0
- data/lib/fine/text_embedder.rb +221 -0
- data/lib/fine/tokenizers/auto_tokenizer.rb +208 -0
- data/lib/fine/training/llm_trainer.rb +212 -0
- data/lib/fine/training/text_trainer.rb +275 -0
- data/lib/fine/training/trainer.rb +194 -0
- data/lib/fine/transforms/compose.rb +28 -0
- data/lib/fine/transforms/normalize.rb +33 -0
- data/lib/fine/transforms/resize.rb +35 -0
- data/lib/fine/transforms/to_tensor.rb +53 -0
- data/lib/fine/version.rb +3 -0
- data/lib/fine.rb +112 -0
- data/mise.toml +2 -0
- metadata +240 -0
data/lib/fine/export/gguf_exporter.rb
@@ -0,0 +1,424 @@
+# frozen_string_literal: true
+
+module Fine
+  module Export
+    # Export LLMs to GGUF format for llama.cpp, ollama, etc.
+    #
+    # @example Basic export
+    #   llm = Fine::LLM.load("my_llama")
+    #   Fine::Export::GGUFExporter.export(llm, "model.gguf")
+    #
+    # @example With quantization
+    #   Fine::Export::GGUFExporter.export(
+    #     llm,
+    #     "model-q4.gguf",
+    #     quantization: :q4_0
+    #   )
+    class GGUFExporter
+      # GGUF magic number and version
+      GGUF_MAGIC = 0x46554747 # "GGUF" in little-endian
+      GGUF_VERSION = 3
+
+      # GGUF value types
+      GGUF_TYPE_UINT8 = 0
+      GGUF_TYPE_INT8 = 1
+      GGUF_TYPE_UINT16 = 2
+      GGUF_TYPE_INT16 = 3
+      GGUF_TYPE_UINT32 = 4
+      GGUF_TYPE_INT32 = 5
+      GGUF_TYPE_FLOAT32 = 6
+      GGUF_TYPE_BOOL = 7
+      GGUF_TYPE_STRING = 8
+      GGUF_TYPE_ARRAY = 9
+      GGUF_TYPE_UINT64 = 10
+      GGUF_TYPE_INT64 = 11
+      GGUF_TYPE_FLOAT64 = 12
+
+      # GGML tensor types
+      GGML_TYPE_F32 = 0
+      GGML_TYPE_F16 = 1
+      GGML_TYPE_Q4_0 = 2
+      GGML_TYPE_Q4_1 = 3
+      GGML_TYPE_Q5_0 = 6
+      GGML_TYPE_Q5_1 = 7
+      GGML_TYPE_Q8_0 = 8
+      GGML_TYPE_Q8_1 = 9
+      GGML_TYPE_Q2_K = 10
+      GGML_TYPE_Q3_K = 11
+      GGML_TYPE_Q4_K = 12
+      GGML_TYPE_Q5_K = 13
+      GGML_TYPE_Q6_K = 14
+      GGML_TYPE_Q8_K = 15
+
+      QUANTIZATION_TYPES = {
+        f32: GGML_TYPE_F32,
+        f16: GGML_TYPE_F16,
+        q4_0: GGML_TYPE_Q4_0,
+        q4_1: GGML_TYPE_Q4_1,
+        q5_0: GGML_TYPE_Q5_0,
+        q5_1: GGML_TYPE_Q5_1,
+        q8_0: GGML_TYPE_Q8_0,
+        q4_k: GGML_TYPE_Q4_K,
+        q5_k: GGML_TYPE_Q5_K,
+        q6_k: GGML_TYPE_Q6_K
+      }.freeze
+
+      class << self
+        # Export a Fine::LLM to GGUF format
+        #
+        # @param llm [Fine::LLM] The LLM to export
+        # @param output_path [String] Path for the output GGUF file
+        # @param quantization [Symbol] Quantization type (:f16, :q4_0, :q4_k, :q8_0, etc.)
+        # @param metadata [Hash] Additional metadata to include
+        def export(llm, output_path, quantization: :f16, metadata: {})
+          unless llm.is_a?(Fine::LLM)
+            raise ExportError, "GGUF export only supports Fine::LLM models"
+          end
+
+          unless llm.model
+            raise ExportError, "Model not loaded or trained"
+          end
+
+          exporter = new(llm, output_path, quantization, metadata)
+          exporter.export
+        end
+      end
+
+      def initialize(llm, output_path, quantization, metadata)
+        @llm = llm
+        @output_path = output_path
+        @quantization = quantization
+        @metadata = metadata
+        @model = llm.model
+        @config = extract_config
+      end
+
+      def export
+        File.open(@output_path, "wb") do |file|
+          @file = file
+
+          write_header
+          write_metadata
+          write_tensors
+        end
+
+        @output_path
+      end
+
+      private
+
+      def extract_config
+        model_config = @model.config
+
+        {
+          vocab_size: model_config.vocab_size,
+          hidden_size: model_config.hidden_size,
+          intermediate_size: model_config.intermediate_size,
+          num_hidden_layers: model_config.num_hidden_layers,
+          num_attention_heads: model_config.num_attention_heads,
+          num_key_value_heads: model_config.num_key_value_heads || model_config.num_attention_heads,
+          max_position_embeddings: model_config.max_position_embeddings || 2048,
+          rms_norm_eps: model_config.rms_norm_eps || 1e-6,
+          rope_theta: model_config.rope_theta || 10000.0
+        }
+      end
+
+      def write_header
+        # Magic number
+        @file.write([GGUF_MAGIC].pack("V"))
+
+        # Version
+        @file.write([GGUF_VERSION].pack("V"))
+
+        # Tensor count (will be updated later)
+        @tensor_count_pos = @file.pos
+        @file.write([0].pack("Q<"))
+
+        # Metadata KV count (will be updated later)
+        @kv_count_pos = @file.pos
+        @file.write([0].pack("Q<"))
+      end
+
+      def write_metadata
+        kv_count = 0
+
+        # Architecture
+        write_string_kv("general.architecture", "llama")
+        kv_count += 1
+
+        write_string_kv("general.name", @llm.model_id || "fine-tuned-model")
+        kv_count += 1
+
+        # Model parameters
+        write_uint32_kv("llama.context_length", @config[:max_position_embeddings])
+        kv_count += 1
+
+        write_uint32_kv("llama.embedding_length", @config[:hidden_size])
+        kv_count += 1
+
+        write_uint32_kv("llama.block_count", @config[:num_hidden_layers])
+        kv_count += 1
+
+        write_uint32_kv("llama.feed_forward_length", @config[:intermediate_size])
+        kv_count += 1
+
+        write_uint32_kv("llama.attention.head_count", @config[:num_attention_heads])
+        kv_count += 1
+
+        write_uint32_kv("llama.attention.head_count_kv", @config[:num_key_value_heads])
+        kv_count += 1
+
+        write_float32_kv("llama.rope.freq_base", @config[:rope_theta])
+        kv_count += 1
+
+        write_float32_kv("llama.attention.layer_norm_rms_epsilon", @config[:rms_norm_eps])
+        kv_count += 1
+
+        # Tokenizer info (if available)
+        if @llm.tokenizer
+          write_string_kv("tokenizer.ggml.model", "llama")
+          kv_count += 1
+
+          if @llm.tokenizer.respond_to?(:vocab_size)
+            write_uint32_kv("llama.vocab_size", @llm.tokenizer.vocab_size)
+            kv_count += 1
+          end
+        end
+
+        # Custom metadata
+        @metadata.each do |key, value|
+          case value
+          when String
+            write_string_kv("general.#{key}", value)
+          when Integer
+            write_uint32_kv("general.#{key}", value)
+          when Float
+            write_float32_kv("general.#{key}", value)
+          end
+          kv_count += 1
+        end
+
+        # Update KV count
+        current_pos = @file.pos
+        @file.seek(@kv_count_pos)
+        @file.write([kv_count].pack("Q<"))
+        @file.seek(current_pos)
+      end
+
+      def write_tensors
+        tensor_count = 0
+        tensor_infos = []
+        tensor_data = []
+
+        state_dict = @model.state_dict
+
+        state_dict.each do |name, tensor|
+          gguf_name = convert_tensor_name(name)
+          next unless gguf_name
+
+          # Quantize tensor
+          quantized, dtype = quantize_tensor(tensor, name)
+
+          tensor_infos << {
+            name: gguf_name,
+            dims: tensor.shape.reverse, # GGUF uses reversed dimensions
+            dtype: dtype
+          }
+
+          tensor_data << quantized
+          tensor_count += 1
+        end
+
+        # Write tensor infos
+        tensor_infos.each do |info|
+          write_tensor_info(info)
+        end
+
+        # Alignment padding
+        align_to(32)
+
+        # Write tensor data
+        tensor_data.each_with_index do |data, idx|
+          align_to(32)
+          @file.write(data)
+        end
+
+        # Update tensor count
+        current_pos = @file.pos
+        @file.seek(@tensor_count_pos)
+        @file.write([tensor_count].pack("Q<"))
+        @file.seek(current_pos)
+      end
+
+      def convert_tensor_name(torch_name)
+        # Map torch.rb/HuggingFace names to GGUF names
+        name = torch_name.dup
+
+        mappings = {
+          "decoder.embed_tokens.weight" => "token_embd.weight",
+          "decoder.norm.weight" => "output_norm.weight",
+          "lm_head.weight" => "output.weight"
+        }
+
+        return mappings[name] if mappings.key?(name)
+
+        # Layer mappings
+        if name =~ /decoder\.layers\.(\d+)\./
+          layer_num = $1
+
+          layer_mappings = {
+            "input_layernorm.weight" => "blk.#{layer_num}.attn_norm.weight",
+            "post_attention_layernorm.weight" => "blk.#{layer_num}.ffn_norm.weight",
+            "self_attn.q_proj.weight" => "blk.#{layer_num}.attn_q.weight",
+            "self_attn.k_proj.weight" => "blk.#{layer_num}.attn_k.weight",
+            "self_attn.v_proj.weight" => "blk.#{layer_num}.attn_v.weight",
+            "self_attn.o_proj.weight" => "blk.#{layer_num}.attn_output.weight",
+            "mlp.gate_proj.weight" => "blk.#{layer_num}.ffn_gate.weight",
+            "mlp.up_proj.weight" => "blk.#{layer_num}.ffn_up.weight",
+            "mlp.down_proj.weight" => "blk.#{layer_num}.ffn_down.weight"
+          }
+
+          suffix = name.sub(/decoder\.layers\.\d+\./, "")
+          return layer_mappings[suffix]
+        end
+
+        nil # Skip unknown tensors
+      end
+
+      def quantize_tensor(tensor, name)
+        tensor = tensor.cpu.contiguous
+
+        # Always keep embeddings and norms in higher precision
+        if name.include?("embed") || name.include?("norm") || name.include?("lm_head")
+          return [tensor_to_f16(tensor), GGML_TYPE_F16]
+        end
+
+        case @quantization
+        when :f32
+          [tensor_to_f32(tensor), GGML_TYPE_F32]
+        when :f16
+          [tensor_to_f16(tensor), GGML_TYPE_F16]
+        when :q8_0
+          quantize_q8_0(tensor)
+        when :q4_0
+          quantize_q4_0(tensor)
+        when :q4_k, :q5_k, :q6_k
+          # K-quants are more complex, fall back to Q8 for now
+          quantize_q8_0(tensor)
+        else
+          [tensor_to_f16(tensor), GGML_TYPE_F16]
+        end
+      end
+
+      def tensor_to_f32(tensor)
+        tensor.to(:float32).data_ptr_bytes
+      end
+
+      def tensor_to_f16(tensor)
+        tensor.to(:float16).data_ptr_bytes
+      end
+
+      def quantize_q8_0(tensor)
+        # Q8_0: 8-bit quantization with block size 32
+        block_size = 32
+        data = tensor.to(:float32).flatten.to_a
+
+        quantized = []
+
+        data.each_slice(block_size) do |block|
+          block = block + [0.0] * (block_size - block.size) if block.size < block_size
+
+          # Find scale (max absolute value)
+          max_abs = block.map(&:abs).max
+          scale = max_abs / 127.0
+          scale = 1.0 if scale == 0
+
+          # Quantize
+          quantized << [scale].pack("e") # float16 scale
+          block.each do |val|
+            q = (val / scale).round.clamp(-128, 127)
+            quantized << [q].pack("c")
+          end
+        end
+
+        [quantized.join, GGML_TYPE_Q8_0]
+      end
+
+      def quantize_q4_0(tensor)
+        # Q4_0: 4-bit quantization with block size 32
+        block_size = 32
+        data = tensor.to(:float32).flatten.to_a
+
+        quantized = []
+
+        data.each_slice(block_size) do |block|
+          block = block + [0.0] * (block_size - block.size) if block.size < block_size
+
+          # Find scale
+          max_abs = block.map(&:abs).max
+          scale = max_abs / 7.0
+          scale = 1.0 if scale == 0
+
+          # Quantize to 4-bit
+          quantized << [scale].pack("e") # float16 scale
+
+          block.each_slice(2) do |pair|
+            q0 = ((pair[0] / scale).round.clamp(-8, 7) + 8) & 0x0F
+            q1 = ((pair[1] / scale).round.clamp(-8, 7) + 8) & 0x0F
+            quantized << [(q0 | (q1 << 4))].pack("C")
+          end
+        end
+
+        [quantized.join, GGML_TYPE_Q4_0]
+      end
+
+      def write_tensor_info(info)
+        # Name
+        write_string(info[:name])
+
+        # Number of dimensions
+        @file.write([info[:dims].size].pack("V"))
+
+        # Dimensions
+        info[:dims].each do |dim|
+          @file.write([dim].pack("Q<"))
+        end
+
+        # Type
+        @file.write([info[:dtype]].pack("V"))
+
+        # Offset (will be calculated later, write 0 for now)
+        @file.write([0].pack("Q<"))
+      end
+
+      def write_string_kv(key, value)
+        write_string(key)
+        @file.write([GGUF_TYPE_STRING].pack("V"))
+        write_string(value)
+      end
+
+      def write_uint32_kv(key, value)
+        write_string(key)
+        @file.write([GGUF_TYPE_UINT32].pack("V"))
+        @file.write([value].pack("V"))
+      end
+
+      def write_float32_kv(key, value)
+        write_string(key)
+        @file.write([GGUF_TYPE_FLOAT32].pack("V"))
+        @file.write([value].pack("e"))
+      end
+
+      def write_string(str)
+        @file.write([str.bytesize].pack("Q<"))
+        @file.write(str)
+      end
+
+      def align_to(alignment)
+        current = @file.pos
+        padding = (alignment - (current % alignment)) % alignment
+        @file.write("\x00" * padding) if padding > 0
+      end
+    end
+  end
+end
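For reference, write_header above lays the file out as a little-endian u32 magic, u32 version, and two u64 placeholder counts that write_metadata and write_tensors later backfill via seek. A minimal read-back sketch of that layout, usable as a post-export sanity check; the gguf_header helper and the sample path are illustrative, not part of the gem:

# Read back the fields GGUFExporter#write_header emits, in the same
# little-endian layout: u32 magic ("V"), u32 version ("V"),
# u64 tensor count ("Q<"), u64 metadata KV count ("Q<").
def gguf_header(path)
  File.open(path, "rb") do |f|
    magic, version = f.read(8).unpack("VV")
    tensor_count, kv_count = f.read(16).unpack("Q<Q<")

    raise "not a GGUF file" unless magic == 0x46554747 # "GGUF"

    { version: version, tensors: tensor_count, metadata_kvs: kv_count }
  end
end

# gguf_header("model.gguf") #=> { version: 3, tensors: ..., metadata_kvs: ... }

Because the counts are patched in place after the fact, reading them back like this is a quick way to confirm an export actually recorded the tensors and metadata entries it intended to.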
data/lib/fine/export/onnx_exporter.rb
@@ -0,0 +1,249 @@
+# frozen_string_literal: true
+
+module Fine
+  module Export
+    # Export models to ONNX format
+    #
+    # @example Export a text classifier
+    #   classifier = Fine::TextClassifier.load("my_model")
+    #   Fine::Export::ONNXExporter.export(classifier, "model.onnx")
+    #
+    # @example Export with options
+    #   Fine::Export::ONNXExporter.export(
+    #     model,
+    #     "model.onnx",
+    #     opset_version: 14,
+    #     dynamic_axes: true
+    #   )
+    class ONNXExporter
+      SUPPORTED_TYPES = [
+        Fine::TextClassifier,
+        Fine::TextEmbedder,
+        Fine::ImageClassifier,
+        Fine::LLM
+      ].freeze
+
+      class << self
+        # Export a Fine model to ONNX format
+        #
+        # @param fine_model [TextClassifier, TextEmbedder, ImageClassifier, LLM] The model to export
+        # @param output_path [String] Path for the output ONNX file
+        # @param opset_version [Integer] ONNX opset version (default: 14)
+        # @param dynamic_axes [Boolean] Use dynamic axes for variable batch/sequence (default: true)
+        # @param quantize [Symbol, nil] Quantization type (:int8, :uint8, nil)
+        def export(fine_model, output_path, opset_version: 14, dynamic_axes: true, quantize: nil)
+          validate_model(fine_model)
+
+          model = fine_model.model
+          model.eval
+
+          # Get example inputs based on model type
+          example_inputs, input_names, output_names, dynamic_axes_config =
+            prepare_export_config(fine_model, dynamic_axes)
+
+          # Export to ONNX
+          Torch::ONNX.export(
+            model,
+            example_inputs,
+            output_path,
+            input_names: input_names,
+            output_names: output_names,
+            dynamic_axes: dynamic_axes_config,
+            opset_version: opset_version,
+            do_constant_folding: true
+          )
+
+          # Optional quantization
+          if quantize
+            quantize_model(output_path, quantize)
+          end
+
+          output_path
+        end
+
+        # Export only the encoder/backbone (useful for embeddings)
+        #
+        # @param fine_model [TextEmbedder, ImageClassifier] Model with encoder
+        # @param output_path [String] Output path
+        def export_encoder(fine_model, output_path, **options)
+          unless fine_model.respond_to?(:model) && fine_model.model.respond_to?(:encoder)
+            raise ExportError, "Model does not have an encoder"
+          end
+
+          encoder = fine_model.model.encoder
+          encoder.eval
+
+          example_inputs, input_names, output_names, dynamic_axes_config =
+            prepare_encoder_config(fine_model)
+
+          Torch::ONNX.export(
+            encoder,
+            example_inputs,
+            output_path,
+            input_names: input_names,
+            output_names: output_names,
+            dynamic_axes: dynamic_axes_config,
+            opset_version: options[:opset_version] || 14
+          )
+
+          output_path
+        end
+
+        private
+
+        def validate_model(model)
+          unless SUPPORTED_TYPES.any? { |t| model.is_a?(t) }
+            raise ExportError, "Unsupported model type: #{model.class}"
+          end
+
+          unless model.model
+            raise ExportError, "Model not loaded or trained"
+          end
+        end
+
+        def prepare_export_config(fine_model, dynamic_axes)
+          case fine_model
+          when Fine::TextClassifier, Fine::TextEmbedder
+            prepare_text_config(fine_model, dynamic_axes)
+          when Fine::ImageClassifier
+            prepare_image_config(fine_model, dynamic_axes)
+          when Fine::LLM
+            prepare_llm_config(fine_model, dynamic_axes)
+          end
+        end
+
+        def prepare_text_config(fine_model, dynamic_axes)
+          batch_size = 1
+          seq_length = fine_model.config.max_length
+
+          example_inputs = [
+            Torch.zeros([batch_size, seq_length], dtype: :int64), # input_ids
+            Torch.ones([batch_size, seq_length], dtype: :int64) # attention_mask
+          ]
+
+          input_names = %w[input_ids attention_mask]
+
+          output_names = if fine_model.is_a?(Fine::TextEmbedder)
+            %w[embeddings]
+          else
+            %w[logits]
+          end
+
+          dynamic_axes_config = if dynamic_axes
+            {
+              "input_ids" => { 0 => "batch_size", 1 => "sequence" },
+              "attention_mask" => { 0 => "batch_size", 1 => "sequence" },
+              output_names.first => { 0 => "batch_size" }
+            }
+          end
+
+          [example_inputs, input_names, output_names, dynamic_axes_config]
+        end
+
+        def prepare_image_config(fine_model, dynamic_axes)
+          # Get image size from config
+          image_size = fine_model.config.image_size || 224
+          batch_size = 1
+
+          example_inputs = [
+            Torch.zeros([batch_size, 3, image_size, image_size], dtype: :float32)
+          ]
+
+          input_names = %w[pixel_values]
+          output_names = %w[logits]
+
+          dynamic_axes_config = if dynamic_axes
+            {
+              "pixel_values" => { 0 => "batch_size" },
+              "logits" => { 0 => "batch_size" }
+            }
+          end
+
+          [example_inputs, input_names, output_names, dynamic_axes_config]
+        end
+
+        def prepare_llm_config(fine_model, dynamic_axes)
+          batch_size = 1
+          seq_length = 128 # Smaller default for export
+
+          example_inputs = [
+            Torch.zeros([batch_size, seq_length], dtype: :int64) # input_ids
+          ]
+
+          input_names = %w[input_ids]
+          output_names = %w[logits]
+
+          dynamic_axes_config = if dynamic_axes
+            {
+              "input_ids" => { 0 => "batch_size", 1 => "sequence" },
+              "logits" => { 0 => "batch_size", 1 => "sequence" }
+            }
+          end
+
+          [example_inputs, input_names, output_names, dynamic_axes_config]
+        end
+
+        def prepare_encoder_config(fine_model)
+          case fine_model
+          when Fine::TextEmbedder
+            batch_size = 1
+            seq_length = fine_model.config.max_length
+
+            example_inputs = [
+              Torch.zeros([batch_size, seq_length], dtype: :int64),
+              Torch.ones([batch_size, seq_length], dtype: :int64)
+            ]
+
+            input_names = %w[input_ids attention_mask]
+            output_names = %w[last_hidden_state]
+
+            dynamic_axes_config = {
+              "input_ids" => { 0 => "batch_size", 1 => "sequence" },
+              "attention_mask" => { 0 => "batch_size", 1 => "sequence" },
+              "last_hidden_state" => { 0 => "batch_size", 1 => "sequence" }
+            }
+
+            [example_inputs, input_names, output_names, dynamic_axes_config]
+          when Fine::ImageClassifier
+            image_size = fine_model.config.image_size || 224
+
+            example_inputs = [
+              Torch.zeros([1, 3, image_size, image_size], dtype: :float32)
+            ]
+
+            [example_inputs, %w[pixel_values], %w[features], { "pixel_values" => { 0 => "batch_size" } }]
+          end
+        end
+
+        def quantize_model(model_path, quantize_type)
+          # Note: Full ONNX quantization requires onnxruntime
+          # This is a placeholder for the quantization logic
+          require "onnxruntime"
+
+          quantized_path = model_path.sub(".onnx", "_quantized.onnx")
+
+          case quantize_type
+          when :int8
+            # Dynamic INT8 quantization
+            OnnxRuntime::Quantization.quantize_dynamic(
+              model_path,
+              quantized_path,
+              weight_type: :int8
+            )
+          when :uint8
+            OnnxRuntime::Quantization.quantize_dynamic(
+              model_path,
+              quantized_path,
+              weight_type: :uint8
+            )
+          end
+
+          # Replace original with quantized
+          FileUtils.mv(quantized_path, model_path)
+        rescue LoadError
+          warn "onnxruntime gem not installed, skipping quantization"
+        end
+      end
+    end
+  end
+end
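Following the docstrings above, a typical flow is to export with the default dynamic axes and then smoke-test the artifact with the onnxruntime gem (which ONNXExporter itself only requires for quantization). A sketch under those assumptions; the model name, sequence length, and zero-valued inputs are placeholders:

require "fine"
require "onnxruntime" # used here only to smoke-test the exported file

# Export a trained classifier; dynamic_axes: true is the default, so the
# ONNX graph accepts any batch size and sequence length.
classifier = Fine::TextClassifier.load("my_model") # placeholder name
Fine::Export::ONNXExporter.export(classifier, "model.onnx", opset_version: 14)

# Smoke test: run one zero-padded sequence through the exported graph.
session = OnnxRuntime::Model.new("model.onnx")
seq_len = 16
outputs = session.predict(
  "input_ids" => [Array.new(seq_len, 0)],
  "attention_mask" => [Array.new(seq_len, 1)]
)
puts outputs["logits"].first.size # one logit per class

Because the dynamic-axes config marks batch and sequence as symbolic dimensions, the same exported graph accepts any seq_len here, not just the max_length used for the example inputs at export time.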