fine 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -10
- data/docs/examples/image-classification-shapes.md +83 -0
- data/docs/examples/text-embeddings-faq.md +98 -0
- data/docs/quickstart.md +209 -0
- data/docs/tutorials/lora-tool-calling.md +306 -0
- data/examples/data/generate_tool_data.rb +261 -0
- data/examples/data/ollama_tool_calls.jsonl +40 -0
- data/examples/data/sentiment_reviews.jsonl +30 -0
- data/examples/data/shapes/circle/circle_1.jpg +0 -0
- data/examples/data/shapes/circle/circle_10.jpg +0 -0
- data/examples/data/shapes/circle/circle_2.jpg +0 -0
- data/examples/data/shapes/circle/circle_3.jpg +0 -0
- data/examples/data/shapes/circle/circle_4.jpg +0 -0
- data/examples/data/shapes/circle/circle_5.jpg +0 -0
- data/examples/data/shapes/circle/circle_6.jpg +0 -0
- data/examples/data/shapes/circle/circle_7.jpg +0 -0
- data/examples/data/shapes/circle/circle_8.jpg +0 -0
- data/examples/data/shapes/circle/circle_9.jpg +0 -0
- data/examples/data/shapes/square/square_1.jpg +0 -0
- data/examples/data/shapes/square/square_10.jpg +0 -0
- data/examples/data/shapes/square/square_2.jpg +0 -0
- data/examples/data/shapes/square/square_3.jpg +0 -0
- data/examples/data/shapes/square/square_4.jpg +0 -0
- data/examples/data/shapes/square/square_5.jpg +0 -0
- data/examples/data/shapes/square/square_6.jpg +0 -0
- data/examples/data/shapes/square/square_7.jpg +0 -0
- data/examples/data/shapes/square/square_8.jpg +0 -0
- data/examples/data/shapes/square/square_9.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_1.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_10.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_2.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_3.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_4.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_5.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_6.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_7.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_8.jpg +0 -0
- data/examples/data/shapes/triangle/triangle_9.jpg +0 -0
- data/examples/data/support_faq_pairs.jsonl +30 -0
- data/examples/generate_shape_images.rb +94 -0
- data/examples/sentiment_classification.rb +87 -0
- data/examples/shape_classification.rb +87 -0
- data/examples/support_faq_embeddings.rb +105 -0
- data/examples/train_lora_tools.rb +218 -0
- data/lib/fine/configuration.rb +173 -15
- data/lib/fine/datasets/image_dataset.rb +14 -2
- data/lib/fine/datasets/instruction_dataset.rb +17 -2
- data/lib/fine/datasets/text_dataset.rb +15 -5
- data/lib/fine/hub/config_loader.rb +4 -4
- data/lib/fine/hub/safetensors_loader.rb +3 -2
- data/lib/fine/llm.rb +39 -10
- data/lib/fine/lora.rb +214 -0
- data/lib/fine/models/bert_encoder.rb +15 -6
- data/lib/fine/models/bert_for_sequence_classification.rb +35 -4
- data/lib/fine/models/causal_lm.rb +46 -5
- data/lib/fine/models/gemma3_decoder.rb +25 -6
- data/lib/fine/models/llama_decoder.rb +9 -8
- data/lib/fine/models/sentence_transformer.rb +1 -1
- data/lib/fine/tokenizers/auto_tokenizer.rb +15 -0
- data/lib/fine/training/text_trainer.rb +3 -1
- data/lib/fine/validators.rb +304 -0
- data/lib/fine/version.rb +1 -1
- data/lib/fine.rb +4 -0
- metadata +47 -2
data/lib/fine/lora.rb
ADDED

@@ -0,0 +1,214 @@
+# frozen_string_literal: true
+
+module Fine
+  # Low-Rank Adaptation (LoRA) for parameter-efficient fine-tuning
+  #
+  # LoRA freezes the pretrained model weights and injects trainable
+  # rank decomposition matrices into each layer, dramatically reducing
+  # the number of trainable parameters.
+  #
+  # @example
+  #   model = Fine::Models::CausalLM.from_pretrained("google/gemma-3-4b-it")
+  #   lora_model = Fine::LoRA.apply(model, rank: 8, alpha: 16, target_modules: ["q_proj", "v_proj"])
+  #   # Only LoRA parameters are trainable now
+  #
+  module LoRA
+    # LoRA Linear layer that wraps an existing Linear layer
+    class LoRALinear < Torch::NN::Module
+      attr_reader :in_features, :out_features, :rank, :alpha, :scaling
+
+      def initialize(original_layer, rank: 8, alpha: 16, dropout: 0.0)
+        super()
+
+        @in_features = original_layer.weight.shape[1]
+        @out_features = original_layer.weight.shape[0]
+        @rank = rank
+        @alpha = alpha
+        @scaling = alpha.to_f / rank
+
+        # Match dtype of original layer
+        @dtype = original_layer.weight.dtype
+        @device = original_layer.weight.device
+
+        # Store original layer (frozen)
+        @original = original_layer
+        @original.weight.requires_grad = false
+        @original.bias&.requires_grad = false if @original.respond_to?(:bias) && @original.bias
+
+        # LoRA matrices A and B - match dtype of original layer
+        # W' = W + (B @ A) * scaling
+        # A: (rank, in_features) - initialized with Kaiming uniform
+        # B: (out_features, rank) - initialized with zeros
+        @lora_a = Torch::NN::Parameter.new(
+          Torch.empty(@rank, @in_features, dtype: @dtype, device: @device)
+        )
+        @lora_b = Torch::NN::Parameter.new(
+          Torch.zeros(@out_features, @rank, dtype: @dtype, device: @device)
+        )
+
+        # Initialize A with Kaiming uniform (in float32, then convert)
+        temp_a = Torch.empty(@rank, @in_features)
+        Torch::NN::Init.kaiming_uniform!(temp_a, a: Math.sqrt(5))
+        @lora_a.data.copy!(temp_a.to(@dtype))
+
+        # Optional dropout
+        @dropout = dropout > 0 ? Torch::NN::Dropout.new(p: dropout) : nil
+      end
+
+      def forward(x)
+        # Original forward pass (frozen)
+        original_out = @original.call(x)
+
+        # LoRA forward: x @ A.T @ B.T * scaling
+        lora_out = x
+        lora_out = @dropout.call(lora_out) if @dropout
+        lora_out = lora_out.matmul(@lora_a.t)
+        lora_out = lora_out.matmul(@lora_b.t)
+        lora_out = lora_out * @scaling
+
+        original_out + lora_out
+      end
+
+      # Number of trainable parameters
+      def trainable_params
+        @rank * @in_features + @out_features * @rank
+      end
+
+      # Merge LoRA weights into original layer (for inference)
+      def merge!
+        Torch.no_grad do
+          delta_w = @lora_b.matmul(@lora_a) * @scaling
+          @original.weight.add!(delta_w)
+        end
+      end
+    end
+
+    class << self
+      # Apply LoRA to a model
+      #
+      # @param model [Torch::NN::Module] Model to apply LoRA to
+      # @param rank [Integer] LoRA rank (lower = fewer params, higher = more capacity)
+      # @param alpha [Integer] LoRA alpha (scaling factor)
+      # @param dropout [Float] Dropout probability for LoRA layers
+      # @param target_modules [Array<String>] Module names to apply LoRA to
+      # @return [Torch::NN::Module] Model with LoRA applied
+      def apply(model, rank: 8, alpha: 16, dropout: 0.0, target_modules: nil)
+        target_modules ||= default_target_modules
+
+        # First freeze all parameters
+        model.parameters.each { |p| p.requires_grad = false }
+
+        # Track replacements
+        replacements = []
+        total_lora_params = 0
+
+        # Find and replace target modules
+        find_modules(model, target_modules) do |parent, name, layer|
+          next unless layer.is_a?(Torch::NN::Linear)
+
+          lora_layer = LoRALinear.new(layer, rank: rank, alpha: alpha, dropout: dropout)
+          replacements << [parent, name, lora_layer]
+          total_lora_params += lora_layer.trainable_params
+        end
+
+        # Apply replacements
+        replacements.each do |parent, name, lora_layer|
+          parent.instance_variable_set("@#{name}", lora_layer)
+        end
+
+        # Calculate stats
+        total_params = count_params(model)
+        trainable = count_trainable_params(model)
+
+        puts " LoRA applied to #{replacements.size} layers"
+        puts " Total params: #{format_params(total_params)}"
+        puts " Trainable params: #{format_params(trainable)} (#{(trainable.to_f / total_params * 100).round(2)}%)"
+
+        model
+      end
+
+      # Merge LoRA weights into base model (for efficient inference)
+      def merge!(model)
+        find_lora_layers(model) do |lora_layer|
+          lora_layer.merge!
+        end
+        model
+      end
+
+      # Get only trainable (LoRA) parameters
+      def trainable_parameters(model)
+        params = []
+        find_lora_layers(model) do |lora_layer|
+          params << lora_layer.lora_a
+          params << lora_layer.lora_b
+        end
+        params
+      end
+
+      # Default modules to apply LoRA to (attention projections)
+      def default_target_modules
+        %w[q_proj k_proj v_proj o_proj]
+      end
+
+      private
+
+      def find_modules(model, target_names, parent = nil, prefix = "", &block)
+        model.instance_variables.each do |ivar|
+          name = ivar.to_s.delete_prefix("@")
+          child = model.instance_variable_get(ivar)
+
+          if child.is_a?(Torch::NN::Module)
+            full_name = prefix.empty? ? name : "#{prefix}.#{name}"
+
+            if target_names.any? { |t| name == t || name.end_with?(t) }
+              yield(model, name, child)
+            end
+
+            # Recurse into ModuleList
+            if child.is_a?(Torch::NN::ModuleList)
+              child.each_with_index do |layer, idx|
+                find_modules(layer, target_names, child, "#{full_name}[#{idx}]", &block)
+              end
+            else
+              find_modules(child, target_names, model, full_name, &block)
+            end
+          end
+        end
+      end
+
+      def find_lora_layers(model, &block)
+        model.instance_variables.each do |ivar|
+          child = model.instance_variable_get(ivar)
+
+          if child.is_a?(LoRALinear)
+            yield(child)
+          elsif child.is_a?(Torch::NN::ModuleList)
+            child.each { |layer| find_lora_layers(layer, &block) }
+          elsif child.is_a?(Torch::NN::Module)
+            find_lora_layers(child, &block)
+          end
+        end
+      end
+
+      def count_params(model)
+        model.parameters.sum { |p| p.numel }
+      end
+
+      def count_trainable_params(model)
+        model.parameters.select { |p| p.requires_grad }.sum { |p| p.numel }
+      end
+
+      def format_params(n)
+        if n >= 1_000_000_000
+          "#{(n / 1_000_000_000.0).round(2)}B"
+        elsif n >= 1_000_000
+          "#{(n / 1_000_000.0).round(2)}M"
+        elsif n >= 1_000
+          "#{(n / 1_000.0).round(2)}K"
+        else
+          n.to_s
+        end
+      end
+    end
+  end
+end
data/lib/fine/models/bert_encoder.rb
CHANGED

@@ -9,7 +9,7 @@ module Fine
     class BertEncoder < Base
       attr_reader :embeddings, :encoder, :pooler

-      def initialize(config)
+      def initialize(config, use_pooler: true)
         super(config)

         @hidden_size = config.hidden_size
@@ -21,6 +21,7 @@ module Fine
         @type_vocab_size = config.type_vocab_size || 2
         @layer_norm_eps = config.layer_norm_eps
         @hidden_dropout_prob = config.hidden_dropout_prob || 0.1
+        @use_pooler = use_pooler

         # Embeddings
         @word_embeddings = Torch::NN::Embedding.new(@vocab_size, @hidden_size)
@@ -42,9 +43,11 @@ module Fine
           end
         )

-        # Pooler (for [CLS] token representation)
-
-
+        # Pooler (for [CLS] token representation) - optional for models like DistilBERT
+        if @use_pooler
+          @pooler_dense = Torch::NN::Linear.new(@hidden_size, @hidden_size)
+          @pooler_activation = Torch::NN::Tanh.new
+        end
       end

       def forward(input_ids, attention_mask: nil, token_type_ids: nil)
@@ -83,8 +86,14 @@ module Fine

         # Pool the [CLS] token (first token)
         cls_output = hidden_states[0.., 0, 0..]
-
-
+
+        # Apply pooler if available, otherwise use CLS directly
+        pooled_output = if @use_pooler && @pooler_dense
+          temp = @pooler_dense.call(cls_output)
+          @pooler_activation.call(temp)
+        else
+          cls_output
+        end

         {
           last_hidden_state: hidden_states,
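The effect of the new use_pooler flag can be sketched with plain torch-rb tensors (the shapes below are made up for illustration): with the pooler enabled the [CLS] vector goes through a Linear + Tanh head, without it the raw [CLS] vector is returned as the pooled output.

    require "torch"

    hidden_states = Torch.randn(2, 6, 8)        # (batch, seq_len, hidden_size)
    cls_output    = hidden_states[0.., 0, 0..]  # first ([CLS]) token of each sequence

    pooler_dense      = Torch::NN::Linear.new(8, 8)
    pooler_activation = Torch::NN::Tanh.new

    # use_pooler: true  -> BERT-style pooled representation
    pooled_output = pooler_activation.call(pooler_dense.call(cls_output))
    # use_pooler: false -> DistilBERT-style: cls_output is used directly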
data/lib/fine/models/bert_for_sequence_classification.rb
CHANGED

@@ -30,8 +30,12 @@ module Fine
         load_result = load_pretrained_weights(model, weights_path)

         if load_result[:missing_keys].any?
-          # Only warn about unexpected missing keys
-
+          # Only warn about unexpected missing keys
+          # Expected missing: classifier (new), token_type_embeddings (DistilBERT), pooler (DistilBERT)
+          expected_missing = %w[classifier token_type_embeddings pooler_dense]
+          encoder_missing = load_result[:missing_keys].reject do |k|
+            expected_missing.any? { |exp| k.include?(exp) }
+          end
           if encoder_missing.any?
             warn "Missing encoder keys: #{encoder_missing.first(5).join(', ')}..."
           end
@@ -56,7 +60,7 @@ module Fine
         model = new(config, num_labels: num_labels)

         weights_path = File.join(path, "model.safetensors")
-        Hub::SafetensorsLoader.load_into_model(model, weights_path, strict: false)
+        Hub::SafetensorsLoader.load_into_model(model, weights_path, strict: false, skip_mapping: true)

         model
       end
@@ -66,8 +70,11 @@ module Fine

         @num_labels = num_labels

+        # Detect if this is DistilBERT (no pooler layer in pretrained weights)
+        use_pooler = config.model_type != "distilbert"
+
         # Encoder
-        @encoder = BertEncoder.new(config)
+        @encoder = BertEncoder.new(config, use_pooler: use_pooler)

         # Classification head
         @dropout = Torch::NN::Dropout.new(p: dropout)
@@ -83,6 +90,8 @@ module Fine
         )

         # Use pooled output for classification
+        # For DistilBERT (no pooler), this is the raw CLS token
+        # which works better than mean pooling for classification
         pooled_output = encoder_output[:pooler_output]
         pooled_output = @dropout.call(pooled_output)

@@ -186,6 +195,28 @@ module Fine
       def self.map_bert_weight_name(hf_name)
         name = hf_name.dup

+        # DistilBERT mappings (must come first as they're more specific)
+        if name.start_with?("distilbert.")
+          name = name.sub("distilbert.embeddings.word_embeddings", "encoder.word_embeddings")
+          name = name.sub("distilbert.embeddings.position_embeddings", "encoder.position_embeddings")
+          name = name.sub("distilbert.embeddings.LayerNorm", "encoder.embeddings_layer_norm")
+          name = name.gsub("distilbert.transformer.layer", "encoder.layers")
+
+          # DistilBERT attention naming
+          name = name.gsub(".attention.q_lin", ".attention.query")
+          name = name.gsub(".attention.k_lin", ".attention.key")
+          name = name.gsub(".attention.v_lin", ".attention.value")
+          name = name.gsub(".attention.out_lin", ".attention.out")
+          name = name.gsub(".sa_layer_norm", ".attention_layer_norm")
+
+          # DistilBERT FFN naming
+          name = name.gsub(".ffn.lin1", ".intermediate")
+          name = name.gsub(".ffn.lin2", ".output")
+
+          return name
+        end
+
+        # Standard BERT mappings
         # Embeddings
         name = name.sub("bert.embeddings.word_embeddings", "encoder.word_embeddings")
         name = name.sub("bert.embeddings.position_embeddings", "encoder.position_embeddings")
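To see what the new DistilBERT branch of map_bert_weight_name does, the substitution chain can be replayed on a single checkpoint key; the key below is only an illustrative example.

    # One DistilBERT checkpoint key walked through the new substitution chain
    name = "distilbert.transformer.layer.0.attention.q_lin.weight"
    name = name.gsub("distilbert.transformer.layer", "encoder.layers")
    name = name.gsub(".attention.q_lin", ".attention.query")
    name  # => "encoder.layers.0.attention.query.weight"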
data/lib/fine/models/causal_lm.rb
CHANGED

@@ -62,8 +62,8 @@
       def initialize(config)
         super(config)

-        # Use appropriate decoder based on model type
-        @decoder = if
+        # Use appropriate decoder based on model type or architectures
+        @decoder = if gemma3_architecture?(config)
           Gemma3Decoder.new(config)
         else
           LlamaDecoder.new(config)
@@ -134,7 +134,10 @@
         if do_sample
           # Top-k filtering
           if top_k > 0
-
+            # Torch.topk returns [values, indices] array in torch-rb
+            topk_values, _topk_indices = Torch.topk(next_token_logits, top_k)
+            threshold = topk_values[0.., -1, nil]
+            indices_to_remove = Torch.lt(next_token_logits, threshold)
             next_token_logits = next_token_logits.masked_fill(indices_to_remove, -Float::INFINITY)
           end

@@ -144,7 +147,7 @@
           cumulative_probs = Torch.cumsum(Torch::NN::Functional.softmax(sorted_logits, dim: -1), dim: -1)

           # Remove tokens with cumulative probability above threshold
-          sorted_indices_to_remove = cumulative_probs
+          sorted_indices_to_remove = Torch.gt(cumulative_probs, top_p)
           sorted_indices_to_remove[0.., 1..] = sorted_indices_to_remove[0.., 0...-1].clone
           sorted_indices_to_remove[0.., 0] = false

@@ -182,7 +185,12 @@
         weights_path = File.join(path, "model.safetensors")
         Safetensors::Torch.save_file(state_dict, weights_path)

-
+        # Preserve architecture info for proper decoder selection on load
+        save_config = @config.to_h.dup
+        # Don't overwrite model_type if it contains architecture info
+        save_config["model_type"] ||= "causal_lm"
+        # Mark which decoder type this model uses
+        save_config["_decoder_type"] = @decoder.class.name.split("::").last

         config_path = File.join(path, "config.json")
         File.write(config_path, JSON.pretty_generate(save_config))
@@ -190,10 +198,26 @@

       private

+      # Detect if this is a Gemma 3 model based on config
+      def gemma3_architecture?(config)
+        # Check model_type first
+        return true if config.model_type&.include?("gemma3")
+
+        # Check architectures array (HuggingFace format)
+        architectures = config.config["architectures"] || []
+        return true if architectures.any? { |a| a.downcase.include?("gemma3") }
+
+        # Check saved decoder type
+        return true if config.config["_decoder_type"] == "Gemma3Decoder"
+
+        false
+      end
+
       def self.load_pretrained_weights(model, weights_path)
         # Load and copy weights one at a time to minimize memory usage
         model_state = model.state_dict
         model_keys = model_state.keys
+        loaded_lm_head = false

         Torch.no_grad do
           Safetensors::Torch.load_file(weights_path).each do |name, tensor|
@@ -203,8 +227,16 @@
             # Convert dtype if needed
             tensor = tensor.to(target.dtype) if tensor.dtype != target.dtype
             target.copy!(tensor)
+            loaded_lm_head = true if mapped_name == "lm_head.weight"
           end
         end
+
+        # If lm_head wasn't in the weights file, tie it to embeddings
+        unless loaded_lm_head
+          embed_weight = model_state["decoder.embed_tokens.weight"]
+          lm_head_weight = model_state["lm_head.weight"]
+          lm_head_weight.copy!(embed_weight)
+        end
       end

       # Force garbage collection to free loaded tensors
@@ -219,6 +251,7 @@

         model_state = model.state_dict
         model_keys = model_state.keys
+        loaded_lm_head = false

         # Load each shard and copy weights immediately to minimize memory
         Torch.no_grad do
@@ -229,11 +262,19 @@
             target = model_state[mapped_name]
             tensor = tensor.to(target.dtype) if tensor.dtype != target.dtype
             target.copy!(tensor)
+            loaded_lm_head = true if mapped_name == "lm_head.weight"
           end
         end
         # GC after each shard to free memory
         GC.start
       end
+
+      # If lm_head wasn't in the weights file, tie it to embeddings
+      unless loaded_lm_head
+        embed_weight = model_state["decoder.embed_tokens.weight"]
+        lm_head_weight = model_state["lm_head.weight"]
+        lm_head_weight.copy!(embed_weight)
+      end
     end
   end

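The top-k filtering change above can be sanity-checked in isolation on a toy logits tensor; the values are arbitrary, and the snippet leans only on the torch-rb behaviour noted in the diff (Torch.topk returning a [values, indices] pair).

    require "torch"

    logits = Torch.tensor([[2.0, 0.5, 1.5, -1.0]])
    top_k  = 2

    values, _indices = Torch.topk(logits, top_k)
    threshold = values[0.., -1, nil]  # smallest kept logit per row, shape (batch, 1)
    filtered  = logits.masked_fill(Torch.lt(logits, threshold), -Float::INFINITY)
    # => only the two largest logits survive; the rest become -Infinity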
data/lib/fine/models/gemma3_decoder.rb
CHANGED

@@ -60,12 +60,12 @@
         position_ids ||= Torch.arange(seq_length, device: input_ids.device)
         position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)

-        # Create causal mask
-        causal_mask = create_causal_mask(seq_length, hidden_states.device)
+        # Create causal mask (must match dtype of hidden_states)
+        causal_mask = create_causal_mask(seq_length, hidden_states.device, hidden_states.dtype)

         # Combine with attention mask if provided
         if attention_mask
-          expanded_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+          expanded_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(hidden_states.dtype)
           expanded_mask = expanded_mask.expand(-1, -1, seq_length, -1)
           causal_mask = causal_mask + (1.0 - expanded_mask) * -1e9
         end
@@ -87,9 +87,9 @@

       private

-      def create_causal_mask(seq_length, device)
+      def create_causal_mask(seq_length, device, dtype)
         mask = Torch.triu(
-          Torch.ones(seq_length, seq_length, device: device) * -1e9,
+          Torch.ones(seq_length, seq_length, device: device, dtype: dtype) * -1e9,
           diagonal: 1
         )
         mask.unsqueeze(0).unsqueeze(0)
@@ -112,7 +112,7 @@
           rms_norm_eps: rms_norm_eps
         )

-        @mlp =
+        @mlp = Gemma3MLP.new(
           hidden_size: hidden_size,
           intermediate_size: intermediate_size
         )
@@ -240,5 +240,24 @@
         x.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)
       end
     end
+
+    # Gemma 3 MLP with GELU activation (not SiLU like Llama)
+    class Gemma3MLP < Torch::NN::Module
+      def initialize(hidden_size:, intermediate_size:)
+        super()
+
+        @gate_proj = Torch::NN::Linear.new(hidden_size, intermediate_size, bias: false)
+        @up_proj = Torch::NN::Linear.new(hidden_size, intermediate_size, bias: false)
+        @down_proj = Torch::NN::Linear.new(intermediate_size, hidden_size, bias: false)
+      end
+
+      def forward(x)
+        # GeGLU: gelu(gate) * up
+        # Using GELU with tanh approximation as per Gemma config
+        gate = Torch::NN::Functional.gelu(@gate_proj.call(x), approximate: "tanh")
+        up = @up_proj.call(x)
+        @down_proj.call(gate * up)
+      end
+    end
   end
 end
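The new Gemma3MLP implements the GeGLU pattern; here is a standalone sketch of the same computation with throwaway sizes (8 and 16 are arbitrary, chosen only for illustration).

    require "torch"

    hidden_size, intermediate_size = 8, 16
    gate_proj = Torch::NN::Linear.new(hidden_size, intermediate_size, bias: false)
    up_proj   = Torch::NN::Linear.new(hidden_size, intermediate_size, bias: false)
    down_proj = Torch::NN::Linear.new(intermediate_size, hidden_size, bias: false)

    x    = Torch.randn(1, 4, hidden_size)
    gate = Torch::NN::Functional.gelu(gate_proj.call(x), approximate: "tanh")
    out  = down_proj.call(gate * up_proj.call(x))  # gelu(gate) * up, projected back down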
data/lib/fine/models/llama_decoder.rb
CHANGED

@@ -52,13 +52,13 @@
         position_ids ||= Torch.arange(seq_length, device: input_ids.device)
         position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)

-        # Create causal mask
-        causal_mask = create_causal_mask(seq_length, hidden_states.device)
+        # Create causal mask (must match dtype of hidden_states)
+        causal_mask = create_causal_mask(seq_length, hidden_states.device, hidden_states.dtype)

         # Combine with attention mask if provided
         if attention_mask
           # Expand attention mask: (batch, seq) -> (batch, 1, seq, seq)
-          expanded_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+          expanded_mask = attention_mask.unsqueeze(1).unsqueeze(2).to(hidden_states.dtype)
           expanded_mask = expanded_mask.expand(-1, -1, seq_length, -1)
           causal_mask = causal_mask + (1.0 - expanded_mask) * -1e9
         end
@@ -80,10 +80,10 @@

       private

-      def create_causal_mask(seq_length, device)
+      def create_causal_mask(seq_length, device, dtype)
         # Lower triangular mask for causal attention
         mask = Torch.triu(
-          Torch.ones(seq_length, seq_length, device: device) * -1e9,
+          Torch.ones(seq_length, seq_length, device: device, dtype: dtype) * -1e9,
           diagonal: 1
         )
         mask.unsqueeze(0).unsqueeze(0)
@@ -235,10 +235,11 @@
         seq_len = position_ids.max.item + 1
         build_cache(seq_len) if seq_len > @cos_cached.size(0)

-        # Move cached tensors to position_ids device
+        # Move cached tensors to position_ids device and match dtype of input
         device = position_ids.device
-
-
+        dtype = x.dtype
+        cos_cached = @cos_cached.to(device).to(dtype)
+        sin_cached = @sin_cached.to(device).to(dtype)

         cos = cos_cached[position_ids].unsqueeze(1)
         sin = sin_cached[position_ids].unsqueeze(1)
data/lib/fine/models/sentence_transformer.rb
CHANGED

@@ -43,7 +43,7 @@
         model = new(config, pooling_mode: pooling_mode)

         weights_path = File.join(path, "model.safetensors")
-        Hub::SafetensorsLoader.load_into_model(model, weights_path, strict: false)
+        Hub::SafetensorsLoader.load_into_model(model, weights_path, strict: false, skip_mapping: true)

         model
       end
data/lib/fine/tokenizers/auto_tokenizer.rb
CHANGED

@@ -135,6 +135,21 @@
       @tokenizer.decode(token_ids, skip_special_tokens: skip_special_tokens)
     end

+    # Encode without padding (for generation)
+    # Returns only the actual tokens, no padding
+    #
+    # @param text [String] Text to tokenize
+    # @return [Array<Integer>] Token IDs
+    def encode_for_generation(text)
+      # Temporarily disable padding
+      @tokenizer.no_padding
+      encoding = @tokenizer.encode(text)
+      ids = encoding.ids
+      # Re-enable padding
+      @tokenizer.enable_padding(length: @max_length)
+      ids
+    end
+
     # Get vocabulary size
     def vocab_size
       @tokenizer.vocab_size
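encode_for_generation toggles padding on the wrapped tokenizers object. The same pattern can be reproduced directly against the tokenizers gem; the checkpoint name and padding length below are placeholders, and the loader call is an assumption about that gem's API.

    require "tokenizers"

    tok = Tokenizers.from_pretrained("bert-base-uncased")  # placeholder checkpoint

    tok.no_padding                          # drop padding so generation sees only real tokens
    ids = tok.encode("Hello from Fine").ids
    tok.enable_padding(length: 128)         # restore fixed-length padding afterwards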
data/lib/fine/training/text_trainer.rb
CHANGED

@@ -228,10 +228,12 @@
         optimizer.zero_grad

         # For pair datasets, we get anchor and positive texts
-
+        # Use forward() directly during training (not encode() which uses no_grad)
+        output = @model.forward(
           batch[:input_ids],
           attention_mask: batch[:attention_mask]
         )
+        embeddings = output[:embeddings]

         # Multiple Negatives Ranking Loss
         # Treat other samples in batch as negatives