RubyGems - chunker-ruby - Versions diffs - 0.1.0 - Mend

chunker-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +7 -0
data/LICENSE +21 -0
data/lib/chunker_ruby/base_splitter.rb +78 -0
data/lib/chunker_ruby/character.rb +30 -0
data/lib/chunker_ruby/chunk.rb +36 -0
data/lib/chunker_ruby/code.rb +43 -0
data/lib/chunker_ruby/html.rb +83 -0
data/lib/chunker_ruby/json_splitter.rb +62 -0
data/lib/chunker_ruby/markdown.rb +114 -0
data/lib/chunker_ruby/rails/chunkable.rb +70 -0
data/lib/chunker_ruby/recursive_character.rb +68 -0
data/lib/chunker_ruby/semantic.rb +106 -0
data/lib/chunker_ruby/sentence.rb +65 -0
data/lib/chunker_ruby/separator.rb +60 -0
data/lib/chunker_ruby/sliding_window.rb +36 -0
data/lib/chunker_ruby/token.rb +82 -0
data/lib/chunker_ruby/version.rb +5 -0
data/lib/chunker_ruby.rb +22 -0
metadata +61 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: fb1949806664ba1e447e440f5dff4a2e1c072e2a0ed44248d235772ab23d121d
+  data.tar.gz: 2b4eb4b750714e39ef11f6fa40fc82640797b52ebca069f5993be18fa9a705f9
+SHA512:
+  metadata.gz: 7b2fc37c66650dfe14a035e36bc4dea38895a98f165520be4c077ca3f85ffb8a45f0b89e30b1f484184cf0e963e2e58227f2af26633896de09af2c4d13297f68
+  data.tar.gz: ae966e3dfa33ec187899018192fa80dbe78473e988bbeb91ccd165bc1e4e747d7cc97c12d6b22e37b9e298b5c35de2a8e581bf60f65516e93f6dbe7c656e70de

data/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Johannes Dwi Cahyo
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/lib/chunker_ruby/base_splitter.rb ADDED Viewed

@@ -0,0 +1,78 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Abstract base class shared by every splitter.
+  #
+  # Validates and stores the size settings and provides #build_chunks, which
+  # packs already-split pieces into chunks that target +chunk_size+ characters
+  # and carry up to +chunk_overlap+ characters of trailing pieces into the
+  # next chunk.
+  class BaseSplitter
+    attr_reader :chunk_size, :chunk_overlap
+    # @param chunk_size [Integer] target maximum chunk length; must be positive
+    # @param chunk_overlap [Integer] length carried over between chunks; must be
+    #   non-negative and smaller than +chunk_size+
+    # @param options [Hash] extra keywords are accepted and ignored here
+    # @raise [ArgumentError] when the size settings are inconsistent
+    def initialize(chunk_size: 1000, chunk_overlap: 200, **options)
+      raise ArgumentError, "chunk_size must be positive" unless chunk_size > 0
+      raise ArgumentError, "chunk_overlap must be non-negative" unless chunk_overlap >= 0
+      raise ArgumentError, "chunk_overlap must be less than chunk_size" unless chunk_overlap < chunk_size
+      @chunk_size = chunk_size
+      @chunk_overlap = chunk_overlap
+    end
+    # Splits +text+ into chunks. Subclasses must override this.
+    #
+    # @raise [NotImplementedError] always, in the base class
+    def split(text, metadata: {})
+      raise NotImplementedError, "#{self.class}#split must be implemented"
+    end
+    # Splits every text and returns one flat list of chunks; each chunk's
+    # metadata records the position of its source text as +:doc_index+.
+    def split_many(texts)
+      texts.flat_map.with_index { |t, i| split(t, metadata: { doc_index: i }) }
+    end
+    private
+    # Greedily packs +pieces+ into chunks.
+    #
+    # A chunk is closed as soon as adding the next piece would push it past
+    # +chunk_size+; a single piece longer than +chunk_size+ is emitted as-is.
+    # Offsets are looked up in +original_text+ starting from where the chunk
+    # is expected to begin, so repeated text no longer resolves to its first
+    # (or last) occurrence.
+    #
+    # @param pieces [Array<String>] ordered fragments of +original_text+
+    # @param original_text [String] text the offsets refer to
+    # @param metadata [Hash] copied (shallow +dup+) into every chunk
+    # @return [Array<Chunk>]
+    def build_chunks(pieces, original_text, metadata: {})
+      chunks = []
+      current_parts = []
+      current_length = 0
+      search_from = 0
+      pieces.each do |piece|
+        piece_len = piece.length
+        if current_length + piece_len > @chunk_size && !current_parts.empty?
+          chunk_text = current_parts.join
+          offset = locate_chunk(original_text, chunk_text, search_from)
+          chunks << Chunk.new(
+            text: chunk_text,
+            index: chunks.size,
+            offset: offset,
+            metadata: metadata.dup
+          )
+          current_parts = trailing_overlap(current_parts)
+          current_length = current_parts.sum(&:length)
+          # The next chunk starts where the carried-over overlap begins. When
+          # separators were dropped this is only a lower bound, which is still
+          # safe for a forward search.
+          search_from = offset + chunk_text.length - current_length
+        end
+        current_parts << piece
+        current_length += piece_len
+      end
+      unless current_parts.empty?
+        chunk_text = current_parts.join
+        chunks << Chunk.new(
+          text: chunk_text,
+          index: chunks.size,
+          offset: locate_chunk(original_text, chunk_text, search_from),
+          metadata: metadata.dup
+        )
+      end
+      chunks
+    end
+    # Finds +chunk_text+ at or after +search_from+; falls back to the first
+    # occurrence anywhere, then to 0 when the text cannot be found at all
+    # (e.g. the pieces were joined without their separators).
+    def locate_chunk(original_text, chunk_text, search_from)
+      original_text.index(chunk_text, search_from) || original_text.index(chunk_text) || 0
+    end
+    # Returns the longest run of trailing parts whose combined length fits
+    # within +chunk_overlap+, in their original order.
+    def trailing_overlap(parts)
+      kept = []
+      kept_length = 0
+      parts.reverse_each do |part|
+        break if kept_length + part.length > @chunk_overlap
+        kept.unshift(part)
+        kept_length += part.length
+      end
+      kept
+    end
+  end
+end

data/lib/chunker_ruby/character.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Fixed-width splitter: cuts the text into windows of +chunk_size+
+  # characters, each window starting +chunk_size - chunk_overlap+ characters
+  # after the previous one.
+  class Character < BaseSplitter
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] copied (shallow +dup+) into every chunk
+    # @return [Array<Chunk>] windows in order, with exact character offsets
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      total = text.length
+      step = @chunk_size - @chunk_overlap
+      windows = []
+      cursor = 0
+      until cursor >= total
+        stop = [cursor + @chunk_size, total].min
+        windows << Chunk.new(
+          text: text[cursor...stop],
+          index: windows.size,
+          offset: cursor,
+          metadata: metadata.dup
+        )
+        # The window that reaches the end of the text is the last one.
+        break if stop >= total
+        cursor += step
+      end
+      windows
+    end
+  end
+end

data/lib/chunker_ruby/chunk.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # A piece of text produced by a splitter, together with its position in the
+  # sequence of chunks, its offset in the source text and arbitrary metadata.
+  class Chunk
+    attr_reader :text, :index, :offset, :length, :metadata
+    # @param text [String] chunk content
+    # @param index [Integer] position of the chunk in the splitter's output
+    # @param offset [Integer] offset of the chunk in the source text
+    # @param metadata [Hash] caller-supplied data stored as given
+    def initialize(text:, index:, offset:, metadata: {})
+      @text = text
+      @index = index
+      @offset = offset
+      @length = text.length
+      @metadata = metadata
+    end
+    # Number of tokens in the chunk.
+    #
+    # @param tokenizer [#encode, nil] when given, its +encode+ result is counted
+    # @return [Integer] exact count with a tokenizer, otherwise an estimate
+    def token_count(tokenizer = nil)
+      if tokenizer
+        tokenizer.encode(text).length
+      else
+        # Rough estimation: ~4 characters per token for English
+        (text.length / 4.0).ceil
+      end
+    end
+    # @return [String] the chunk text
+    def to_s
+      @text
+    end
+    # @return [Hash] all attributes keyed by symbol
+    def to_h
+      { text: @text, index: @index, offset: @offset, length: @length, metadata: @metadata }
+    end
+    # Two chunks are equal when text, index and offset match; metadata is
+    # deliberately not compared.
+    def ==(other)
+      other.is_a?(Chunk) && text == other.text && index == other.index && offset == other.offset
+    end
+    # Keep eql?/hash in step with == so equal chunks collapse in Hash keys,
+    # Set members and Array#uniq.
+    alias eql? ==
+    # @return [Integer] hash over the same fields that == compares
+    def hash
+      [Chunk, text, index, offset].hash
+    end
+  end
+end

data/lib/chunker_ruby/code.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Source-code splitter: delegates to RecursiveCharacter with a separator
+  # list chosen per programming language.
+  class Code < BaseSplitter
+    # Separators tried in order for each supported language.
+    LANGUAGE_SEPARATORS = {
+      ruby: [
+        "\nclass ", "\nmodule ", "\ndef ", "\n\n", "\n", " ", ""
+      ],
+      python: [
+        "\nclass ", "\ndef ", "\n\n", "\n", " ", ""
+      ],
+      javascript: [
+        "\nfunction ", "\nclass ", "\nconst ", "\nlet ", "\nvar ",
+        "\nexport ", "\n\n", "\n", " ", ""
+      ],
+      typescript: [
+        "\ninterface ", "\ntype ", "\nfunction ", "\nclass ",
+        "\nconst ", "\nlet ", "\nexport ", "\n\n", "\n", " ", ""
+      ]
+    }.freeze
+    # @param language [Symbol, String] one of the LANGUAGE_SEPARATORS keys
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    # @raise [ArgumentError] when the language is not supported
+    def initialize(language: :ruby, **kwargs)
+      super(**kwargs)
+      @language = language.to_sym
+      unless LANGUAGE_SEPARATORS.key?(@language)
+        raise ArgumentError, "Unsupported language: #{language}. Supported: #{LANGUAGE_SEPARATORS.keys.join(", ")}"
+      end
+      @separators = LANGUAGE_SEPARATORS[@language]
+    end
+    # @param text [String, nil] source code; nil or empty yields []
+    # @param metadata [Hash] merged with +language:+ before being attached
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      tagged = metadata.merge(language: @language)
+      RecursiveCharacter
+        .new(chunk_size: @chunk_size, chunk_overlap: @chunk_overlap, separators: @separators, keep_separator: true)
+        .split(text, metadata: tagged)
+    end
+  end
+end

data/lib/chunker_ruby/html.rb ADDED Viewed

@@ -0,0 +1,83 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # HTML splitter: starts a new section at every block-level tag and falls
+  # back to RecursiveCharacter for sections longer than +chunk_size+.
+  class HTML < BaseSplitter
+    BLOCK_TAGS = %w[
+      div p section article aside main header footer nav
+      h1 h2 h3 h4 h5 h6 blockquote pre ul ol li table tr
+      form fieldset details summary figure figcaption
+    ].freeze
+    # Opening or closing block-level tag; built once instead of on every call.
+    TAG_PATTERN = /<\/?(?:#{BLOCK_TAGS.join("|")})\b[^>]*>/i
+    # Same pattern wrapped in a capture group so String#split keeps the tags.
+    TAG_SPLIT_PATTERN = /(#{TAG_PATTERN})/i
+    # @param strip_tags [Boolean] remove all markup from the chunk text
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    def initialize(strip_tags: false, **kwargs)
+      super(**kwargs)
+      @strip_tags = strip_tags
+    end
+    # @param text [String, nil] HTML source; nil or empty yields []
+    # @param metadata [Hash] copied into every chunk
+    # @return [Array<Chunk>] offsets point at the section start in the raw
+    #   HTML (plus the sub-chunk offset for oversized sections)
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      chunks = []
+      split_by_tags(text).each do |section|
+        content = @strip_tags ? strip_html_tags(section[:text]) : section[:text]
+        next if content.strip.empty?
+        if content.length <= @chunk_size
+          chunks << Chunk.new(
+            text: content,
+            index: chunks.size,
+            offset: section[:offset],
+            metadata: metadata.dup
+          )
+        else
+          oversized_splitter.split(content, metadata: metadata).each do |sc|
+            chunks << Chunk.new(
+              text: sc.text,
+              index: chunks.size,
+              offset: section[:offset] + sc.offset,
+              metadata: sc.metadata
+            )
+          end
+        end
+      end
+      chunks
+    end
+    private
+    # Splitter for sections longer than +chunk_size+; created once per
+    # instance because its settings never change.
+    def oversized_splitter
+      @oversized_splitter ||= RecursiveCharacter.new(
+        chunk_size: @chunk_size,
+        chunk_overlap: @chunk_overlap
+      )
+    end
+    # Cuts +text+ into sections that each begin at a block-level tag.
+    #
+    # @return [Array<Hash>] hashes with +:text+ and the +:offset+ at which that
+    #   text starts in the input
+    def split_by_tags(text)
+      sections = []
+      current_text = +""
+      current_offset = 0
+      pos = 0
+      text.split(TAG_SPLIT_PATTERN).each do |part|
+        # A tag closes the running section only when that section holds
+        # something other than whitespace; otherwise the tag joins it.
+        if part.match?(TAG_PATTERN) && !current_text.strip.empty?
+          sections << { text: current_text, offset: current_offset }
+          current_text = part
+          current_offset = pos
+        else
+          current_text << part
+        end
+        pos += part.length
+      end
+      sections << { text: current_text, offset: current_offset } unless current_text.strip.empty?
+      sections
+    end
+    # Removes everything that looks like a tag (+<...>+).
+    def strip_html_tags(text)
+      text.gsub(/<[^>]+>/, "")
+    end
+  end
+end

data/lib/chunker_ruby/json_splitter.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+require "json"
+module ChunkerRuby
+  # JSON splitter: parses the document, breaks it into top-level pieces and
+  # groups consecutive pieces until their serialized size would pass
+  # +chunk_size+.
+  class JSONSplitter < BaseSplitter
+    # @param text [String, nil] JSON document; nil or empty yields []
+    # @param metadata [Hash] copied (shallow +dup+) into every chunk
+    # @return [Array<Chunk>] chunks whose text is JSON; every offset is 0
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      chunks = []
+      batch = []
+      batch_length = 0
+      extract_pieces(::JSON.parse(text)).each do |piece|
+        encoded_length = ::JSON.generate(piece).length
+        if !batch.empty? && batch_length + encoded_length > @chunk_size
+          chunks << chunk_for(batch, chunks.size, metadata)
+          batch = []
+          batch_length = 0
+        end
+        batch << piece
+        batch_length += encoded_length
+      end
+      chunks << chunk_for(batch, chunks.size, metadata) unless batch.empty?
+      chunks
+    end
+    private
+    # Serializes a batch into one chunk: a lone piece is emitted bare, several
+    # pieces are emitted as a JSON array.
+    def chunk_for(batch, index, metadata)
+      payload = batch.length == 1 ? batch.first : batch
+      Chunk.new(
+        text: ::JSON.generate(payload),
+        index: index,
+        offset: 0,
+        metadata: metadata.dup
+      )
+    end
+    # Arrays split per element, hashes per key/value pair (as single-entry
+    # hashes); any other value is a single piece.
+    def extract_pieces(parsed)
+      return parsed if parsed.is_a?(Array)
+      return parsed.map { |key, value| { key => value } } if parsed.is_a?(Hash)
+      [parsed]
+    end
+  end
+end

data/lib/chunker_ruby/markdown.rb ADDED Viewed

@@ -0,0 +1,114 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Markdown splitter: one section per ATX header (outside fenced code
+  # blocks), with the header trail recorded in each chunk's metadata.
+  # Sections longer than +chunk_size+ are re-split with RecursiveCharacter.
+  class Markdown < BaseSplitter
+    HEADER_PATTERN = /^(\#{1,6})\s+(.+)$/
+    # @param keep_headers [Boolean] keep the header line in the section text
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    def initialize(keep_headers: true, **kwargs)
+      super(**kwargs)
+      @keep_headers = keep_headers
+    end
+    # @param text [String, nil] markdown source; nil or empty yields []
+    # @param metadata [Hash] merged with +headers:+ (the enclosing header
+    #   lines, outermost first) for every chunk
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      chunks = []
+      split_by_headers(text).each do |section|
+        section_meta = metadata.merge(section[:metadata])
+        if section[:text].length <= @chunk_size
+          chunks << Chunk.new(
+            text: section[:text],
+            index: chunks.size,
+            offset: section[:offset],
+            metadata: section_meta
+          )
+        else
+          # Fall back to recursive splitting for large sections
+          sub_splitter = RecursiveCharacter.new(
+            chunk_size: @chunk_size,
+            chunk_overlap: @chunk_overlap,
+            separators: ["\n\n", "\n", ". ", " ", ""]
+          )
+          sub_splitter.split(section[:text], metadata: section_meta).each do |sc|
+            chunks << Chunk.new(
+              text: sc.text,
+              index: chunks.size,
+              offset: section[:offset] + sc.offset,
+              metadata: sc.metadata
+            )
+          end
+        end
+      end
+      chunks
+    end
+    private
+    # Walks the text line by line and cuts a new section at every header that
+    # is not inside a ``` fenced block.
+    #
+    # @return [Array<Hash>] hashes with +:text+, +:offset+ and +:metadata+
+    def split_by_headers(text)
+      sections = []
+      current_headers = []
+      current_text = +""
+      current_offset = 0
+      in_code_block = false
+      pos = 0
+      text.each_line do |line|
+        if line.start_with?("```")
+          # Fence lines toggle code-block mode and always stay in the section.
+          in_code_block = !in_code_block
+          current_text << line
+        elsif !in_code_block && (match = line.match(HEADER_PATTERN))
+          append_section(sections, current_text, current_offset, current_headers)
+          level = match[1].length
+          # Remove headers at same or deeper level
+          current_headers = current_headers.select { |h| header_level(h) < level }
+          current_headers << line.rstrip
+          current_text = @keep_headers ? line.dup : +""
+          current_offset = pos
+        else
+          current_text << line
+        end
+        pos += line.length
+      end
+      append_section(sections, current_text, current_offset, current_headers)
+      sections
+    end
+    # Records the running section unless it holds nothing but whitespace;
+    # such sections used to surface as empty-string chunks.
+    def append_section(sections, body, offset, headers)
+      return if body.strip.empty?
+      sections << {
+        text: body.rstrip,
+        offset: offset,
+        metadata: { headers: headers.dup }
+      }
+    end
+    # Header depth (1-6) of a header line; 7 when the line is not a header.
+    def header_level(header_line)
+      match = header_line.match(/^(\#{1,6})/)
+      match ? match[1].length : 7
+    end
+  end
+end

data/lib/chunker_ruby/rails/chunkable.rb ADDED Viewed

@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  module Rails
+    # Model mixin that keeps a +chunks+ association in sync with a text
+    # attribute. It relies on the including class providing +after_save+,
+    # +has_many+ and +transaction+ (ActiveRecord-style macros).
+    module Chunkable
+      # Strategy name => constant name under ChunkerRuby. Constants are looked
+      # up lazily so this file does not depend on load order.
+      STRATEGIES = {
+        character: "Character",
+        recursive_character: "RecursiveCharacter",
+        sentence: "Sentence",
+        separator: "Separator",
+        markdown: "Markdown",
+        html: "HTML",
+        code: "Code",
+        token: "Token",
+        json: "JSONSplitter",
+        semantic: "Semantic",
+        sliding_window: "SlidingWindow"
+      }.freeze
+      def self.included(base)
+        base.extend(ClassMethods)
+      end
+      module ClassMethods
+        # Declares +attribute+ as chunkable.
+        #
+        # Defines +#chunker+ and +#rechunk!+, a +chunks+ association backed by
+        # "<ModelName>Chunk", and an +after_save+ hook that re-chunks whenever
+        # the attribute changed.
+        #
+        # @param attribute [Symbol] attribute holding the text
+        # @param strategy [Symbol, String] key understood by resolve_strategy
+        # @param chunk_size [Integer] forwarded to the splitter
+        # @param chunk_overlap [Integer] forwarded to the splitter
+        # @param options [Hash] extra keywords forwarded to the splitter
+        def chunkable(attribute, strategy: :recursive_character, chunk_size: 1000, chunk_overlap: 200, **options)
+          config = {
+            attribute: attribute,
+            strategy: strategy,
+            chunk_size: chunk_size,
+            chunk_overlap: chunk_overlap,
+            options: options
+          }
+          @chunkable_config = config
+          # Class-level instance variables are not inherited, so subclasses of
+          # the declaring model fall back to the configuration captured here.
+          config_for = ->(record) { record.class.instance_variable_get(:@chunkable_config) || config }
+          after_save :rechunk!, if: -> { saved_change_to_attribute?(attribute) }
+          has_many :chunks,
+            class_name: "#{name}Chunk",
+            dependent: :destroy
+          define_method(:chunker) do
+            cfg = config_for.call(self)
+            splitter_class = ChunkerRuby::Rails::Chunkable.resolve_strategy(cfg[:strategy])
+            splitter_class.new(
+              chunk_size: cfg[:chunk_size],
+              chunk_overlap: cfg[:chunk_overlap],
+              **cfg[:options]
+            )
+          end
+          define_method(:rechunk!) do
+            cfg = config_for.call(self)
+            content = send(cfg[:attribute])
+            # Replace the chunks atomically. Old chunks are removed even when
+            # the content is now blank, so stale chunks never outlive it.
+            self.class.transaction do
+              chunks.destroy_all
+              unless content.nil? || content.empty?
+                chunker.split(content, metadata: { source_id: id, source_type: self.class.name }).each do |chunk|
+                  chunks.create!(
+                    text: chunk.text,
+                    chunk_index: chunk.index,
+                    offset: chunk.offset,
+                    metadata: chunk.metadata
+                  )
+                end
+              end
+            end
+          end
+        end
+      end
+      # Maps a strategy name to its splitter class.
+      #
+      # @param strategy [Symbol, String] one of the STRATEGIES keys
+      # @return [Class] the matching class under ChunkerRuby
+      # @raise [ArgumentError] when the strategy is unknown
+      def self.resolve_strategy(strategy)
+        const_name = STRATEGIES.fetch(strategy.to_sym) do
+          raise ArgumentError, "Unknown chunking strategy: #{strategy}"
+        end
+        ChunkerRuby.const_get(const_name, false)
+      end
+    end
+  end
+end

data/lib/chunker_ruby/recursive_character.rb ADDED Viewed

@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Recursive splitter: tries each separator in turn, descending to the next
+  # one only for pieces that are still longer than +chunk_size+, then packs
+  # the resulting pieces into chunks.
+  class RecursiveCharacter < BaseSplitter
+    DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""].freeze
+    # @param separators [Array<String>, nil] ordered separators; defaults to
+    #   DEFAULT_SEPARATORS
+    # @param keep_separator [Boolean] keep each separator at the end of the
+    #   piece that precedes it
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    def initialize(separators: nil, keep_separator: true, **kwargs)
+      super(**kwargs)
+      @separators = separators || DEFAULT_SEPARATORS
+      @keep_separator = keep_separator
+    end
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] copied into every chunk
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      merge_chunks(recursive_split(text, @separators), text, metadata: metadata)
+    end
+    private
+    # Breaks +text+ with the first separator and recurses with the remaining
+    # separators into every piece that is still too long.
+    def recursive_split(text, separators)
+      return [text] if text.length <= @chunk_size || separators.empty?
+      separator, *fallbacks = separators
+      split_by_separator(text, separator).flat_map do |piece|
+        if piece.length > @chunk_size && fallbacks.any?
+          recursive_split(piece, fallbacks)
+        else
+          [piece]
+        end
+      end
+    end
+    # Splits on one separator; the empty separator means "every character".
+    def split_by_separator(text, separator)
+      return text.chars if separator.empty?
+      parts = text.split(separator, -1)
+      return parts unless @keep_separator && parts.length > 1
+      last = parts.length - 1
+      kept = []
+      parts.each_with_index do |part, i|
+        # Empty parts are dropped together with their separator, except a
+        # leading one, which still contributes the bare separator.
+        next if part.empty? && (i == last || i > 0)
+        kept << (i == last ? part : part + separator)
+      end
+      kept.empty? ? [text] : kept
+    end
+    # Packs the pieces into chunks using the shared BaseSplitter logic.
+    def merge_chunks(pieces, original_text, metadata: {})
+      build_chunks(pieces, original_text, metadata: metadata)
+    end
+  end
+end

data/lib/chunker_ruby/semantic.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Semantic splitter: embeds every sentence and starts a new chunk wherever
+  # the cosine similarity of two neighbouring sentences drops below
+  # +threshold+.
+  class Semantic < BaseSplitter
+    # @param embed [#call] receives one sentence, returns its embedding as an
+    #   array of numbers
+    # @param threshold [Float] similarity below which a split is made
+    # @param min_chunk_size [Integer] shorter groups are merged into the
+    #   previous chunk when the result still fits +max_chunk_size+
+    # @param max_chunk_size [Integer] used as +chunk_size+; longer groups are
+    #   re-split with RecursiveCharacter
+    # @param kwargs [Hash] forwarded to BaseSplitter
+    def initialize(embed:, threshold: 0.5, min_chunk_size: 100, max_chunk_size: 2000, **kwargs)
+      super(chunk_size: max_chunk_size, chunk_overlap: 0, **kwargs)
+      @embed = embed
+      @threshold = threshold
+      @min_chunk_size = min_chunk_size
+    end
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] copied into every chunk
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      sentences = split_into_sentences(text)
+      # dup like every other path, so chunks never share the caller's hash
+      return [Chunk.new(text: text, index: 0, offset: 0, metadata: metadata.dup)] if sentences.length <= 1
+      embeddings = sentences.map { |s| @embed.call(s) }
+      split_points = find_split_points(embeddings)
+      build_semantic_chunks(sentences, split_points, text, metadata)
+    end
+    private
+    # Splits after ".", "!" or "?" followed by whitespace.
+    def split_into_sentences(text)
+      text.split(/(?<=[.!?])\s+/).reject(&:empty?)
+    end
+    # @return [Array<Integer>] indexes i such that sentence i and i + 1 are
+    #   less similar than the threshold
+    def find_split_points(embeddings)
+      (0...embeddings.length - 1).select do |i|
+        cosine_similarity(embeddings[i], embeddings[i + 1]) < @threshold
+      end
+    end
+    # Cosine similarity of two vectors; 0.0 when either has zero magnitude.
+    def cosine_similarity(a, b)
+      dot = a.zip(b).sum { |x, y| x * y }
+      mag_a = Math.sqrt(a.sum { |x| x * x })
+      mag_b = Math.sqrt(b.sum { |x| x * x })
+      return 0.0 if mag_a.zero? || mag_b.zero?
+      dot / (mag_a * mag_b)
+    end
+    # Turns the sentence groups delimited by +split_points+ into chunks.
+    # Offsets are found by searching +original_text+ and fall back to 0 when
+    # the re-joined text does not occur verbatim.
+    def build_semantic_chunks(sentences, split_points, original_text, metadata)
+      chunks = []
+      boundaries = [-1] + split_points + [sentences.length - 1]
+      boundaries.each_cons(2) do |previous_end, end_idx|
+        chunk_text = sentences[(previous_end + 1)..end_idx].join(" ")
+        if chunk_text.length > @chunk_size
+          # Enforce the upper size limit
+          sub_splitter = RecursiveCharacter.new(
+            chunk_size: @chunk_size,
+            chunk_overlap: @chunk_overlap
+          )
+          sub_splitter.split(chunk_text, metadata: metadata).each do |sc|
+            chunks << Chunk.new(
+              text: sc.text,
+              index: chunks.size,
+              offset: original_text.index(sc.text) || 0,
+              metadata: sc.metadata
+            )
+          end
+        elsif chunk_text.length < @min_chunk_size && mergeable?(chunks.last, chunk_text)
+          # Merge small chunk with previous
+          prev = chunks.pop
+          chunks << Chunk.new(
+            text: prev.text + " " + chunk_text,
+            index: prev.index,
+            offset: prev.offset,
+            metadata: prev.metadata
+          )
+        else
+          chunks << Chunk.new(
+            text: chunk_text,
+            index: chunks.size,
+            offset: original_text.index(chunk_text) || 0,
+            metadata: metadata.dup
+          )
+        end
+      end
+      chunks
+    end
+    # A small group may join the previous chunk only when that chunk exists
+    # and the merged text (joined by one space) stays within the size limit.
+    def mergeable?(previous, addition)
+      !previous.nil? && previous.text.length + 1 + addition.length <= @chunk_size
+    end
+  end
+end

data/lib/chunker_ruby/sentence.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+require "strscan"
+module ChunkerRuby
+  # Sentence splitter: detects sentence boundaries (ignoring common
+  # abbreviations, digits followed by a period, and ellipses) and packs whole
+  # sentences into chunks.
+  class Sentence < BaseSplitter
+    ABBREVIATIONS = %w[
+      Mr Mrs Ms Dr Prof Sr Jr St Gen Gov Sgt Cpl Pvt
+      Inc Corp Ltd Co vs etc al
+      Jan Feb Mar Apr Jun Jul Aug Sep Oct Nov Dec
+      Ave Blvd Dept Div Est Fig
+    ].freeze
+    # Matches text ending in a listed abbreviation plus period, but only when
+    # the abbreviation is a whole word: "et al." matches, "final." does not.
+    ABBREVIATION_PATTERN = /(?<![[:alnum:]])(?:#{ABBREVIATIONS.join("|")})\.\z/
+    # @param min_chunk_size [Integer, nil] stored (default: a third of the
+    #   chunk size); not consulted by #split in this class
+    # @param max_chunk_size [Integer, nil] takes precedence over +chunk_size+
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    def initialize(min_chunk_size: nil, max_chunk_size: nil, **kwargs)
+      chunk_size = max_chunk_size || kwargs[:chunk_size] || 1000
+      super(chunk_size: chunk_size, **kwargs.except(:chunk_size))
+      @min_chunk_size = min_chunk_size || (@chunk_size / 3)
+    end
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] copied into every chunk
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      sentences = split_into_sentences(text)
+      build_chunks(sentences, text, metadata: metadata)
+    end
+    private
+    # Scans the text in punctuation-terminated segments and glues segments
+    # together until one ends in a genuine sentence boundary.
+    def split_into_sentences(text)
+      sentences = []
+      current = +""
+      text.scan(/[^.!?]*[.!?]+[\s]*|[^.!?]+\s*/) do |segment|
+        current << segment
+        # Check if this looks like a real sentence end
+        if segment.match?(/[.!?]\s*\z/) && real_sentence_end?(current)
+          sentences << current
+          current = +""
+        end
+      end
+      sentences << current unless current.strip.empty?
+      sentences.empty? ? [text] : sentences
+    end
+    # @return [Boolean] false when the trailing punctuation belongs to an
+    #   abbreviation, follows a digit, or is an ellipsis
+    def real_sentence_end?(text)
+      stripped = text.rstrip
+      return false if stripped.empty?
+      # Check for abbreviations: "Dr.", "Mr.", etc. (whole words only)
+      return false if stripped.match?(ABBREVIATION_PATTERN)
+      # Check for decimal numbers: "3.14"
+      return false if stripped.match?(/\d\.\z/)
+      # Check for ellipsis
+      return false if stripped.end_with?("...")
+      true
+    end
+  end
+end

data/lib/chunker_ruby/separator.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Single-separator splitter: cuts the text on one String or Regexp and
+  # packs the pieces into chunks.
+  class Separator < BaseSplitter
+    # @param separator [String, Regexp] where to cut; "" means every character
+    # @param keep_separator [Boolean] keep each separator at the end of the
+    #   piece that precedes it
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    def initialize(separator: "\n\n", keep_separator: true, **kwargs)
+      super(**kwargs)
+      @separator = separator
+      @keep_separator = keep_separator
+    end
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] copied into every chunk
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      pieces = split_by_separator(text, @separator)
+      build_chunks(pieces, text, metadata: metadata)
+    end
+    private
+    # Dispatches on the separator type.
+    def split_by_separator(text, separator)
+      if separator.is_a?(Regexp)
+        split_with_regex(text, separator)
+      elsif separator.empty?
+        text.chars
+      else
+        split_with_string(text, separator)
+      end
+    end
+    # With +keep_separator+, empty parts are dropped and every part except
+    # the last gets the separator appended.
+    def split_with_string(text, separator)
+      parts = text.split(separator, -1)
+      return parts unless @keep_separator && parts.length > 1
+      last = parts.length - 1
+      result = parts.each_with_index.filter_map do |part, i|
+        next if part.empty?
+        i == last ? part : part + separator
+      end
+      result.empty? ? [text] : result
+    end
+    # With +keep_separator+, walks the matches by position so each piece ends
+    # with the text that matched. Pairing String#split with String#scan broke
+    # for patterns with capture groups: scan then yields arrays and split
+    # interleaves the captures.
+    def split_with_regex(text, separator)
+      return text.split(separator, -1) unless @keep_separator
+      result = []
+      cursor = 0
+      text.scan(separator) do
+        match = Regexp.last_match
+        piece = text[cursor...match.end(0)]
+        result << piece unless piece.empty?
+        cursor = match.end(0)
+      end
+      tail = text[cursor..]
+      result << tail unless tail.nil? || tail.empty?
+      result.empty? ? [text] : result
+    end
+  end
+end

data/lib/chunker_ruby/sliding_window.rb ADDED Viewed

@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Sliding-window splitter: windows of +chunk_size+ characters whose starts
+  # are +stride+ characters apart.
+  class SlidingWindow < BaseSplitter
+    # @param stride [Integer, nil] distance between window starts; defaults
+    #   to +chunk_size - chunk_overlap+
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    # @raise [ArgumentError] when the stride is not positive
+    def initialize(stride: nil, **kwargs)
+      super(**kwargs)
+      @stride = stride || (@chunk_size - @chunk_overlap)
+      raise ArgumentError, "stride must be positive" unless @stride > 0
+    end
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] copied (shallow +dup+) into every chunk
+    # @return [Array<Chunk>] windows in order, with exact character offsets
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      total = text.length
+      windows = []
+      cursor = 0
+      until cursor >= total
+        stop = [cursor + @chunk_size, total].min
+        windows << Chunk.new(
+          text: text[cursor...stop],
+          index: windows.size,
+          offset: cursor,
+          metadata: metadata.dup
+        )
+        # Stop once a window has reached the end of the text.
+        break if stop >= total
+        cursor += @stride
+      end
+      windows
+    end
+  end
+end

data/lib/chunker_ruby/token.rb ADDED Viewed

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  # Token-based splitter: sizes are counted in tokens when a tokenizer is
+  # available, otherwise approximated at four characters per token.
+  class Token < BaseSplitter
+    # @param tokenizer [nil, String, Symbol, #encode] nil tries the "gpt2"
+    #   tokenizer, a name is loaded through tokenizer_ruby, anything else is
+    #   used as-is
+    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
+    def initialize(tokenizer: nil, **kwargs)
+      super(**kwargs)
+      @tokenizer = resolve_tokenizer(tokenizer)
+    end
+    # @param text [String, nil] text to split; nil or empty yields []
+    # @param metadata [Hash] attached to every chunk
+    # @return [Array<Chunk>]
+    def split(text, metadata: {})
+      return [] if text.nil? || text.empty?
+      @tokenizer ? split_by_tokens(text, metadata) : split_by_estimation(text, metadata)
+    end
+    private
+    # Turns the +tokenizer+ argument into a tokenizer object, or nil when none
+    # could be loaded.
+    def resolve_tokenizer(tokenizer)
+      return try_load_default_tokenizer if tokenizer.nil?
+      return try_load_tokenizer(tokenizer.to_s) if tokenizer.is_a?(String) || tokenizer.is_a?(Symbol)
+      tokenizer # assume it responds to #encode and #decode
+    end
+    def try_load_default_tokenizer
+      try_load_tokenizer("gpt2")
+    end
+    # Loads the optional tokenizer_ruby gem on demand; returns nil when the
+    # gem is not installed.
+    def try_load_tokenizer(name)
+      begin
+        require "tokenizer_ruby"
+        TokenizerRuby::Tokenizer.new(name)
+      rescue LoadError
+        nil
+      end
+    end
+    # Windows over the encoded tokens; each window is decoded back to text
+    # and its token count is recorded in the chunk metadata.
+    def split_by_tokens(text, metadata)
+      token_ids = @tokenizer.encode(text)
+      total = token_ids.length
+      windows = []
+      cursor = 0
+      until cursor >= total
+        stop = [cursor + @chunk_size, total].min
+        window = token_ids[cursor...stop]
+        decoded = @tokenizer.decode(window)
+        # Offset is the first occurrence of the decoded text, 0 if not found.
+        position = text.index(decoded.strip) || 0
+        windows << Chunk.new(
+          text: decoded,
+          index: windows.size,
+          offset: position,
+          metadata: metadata.merge(token_count: window.length)
+        )
+        break if stop >= total
+        cursor += @chunk_size - @chunk_overlap
+      end
+      windows
+    end
+    # Fallback without a tokenizer: character windows four times as wide.
+    def split_by_estimation(text, metadata)
+      # Estimate ~4 chars per token
+      Character
+        .new(chunk_size: @chunk_size * 4, chunk_overlap: @chunk_overlap * 4)
+        .split(text, metadata: metadata)
+    end
+  end
+end

data/lib/chunker_ruby/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module ChunkerRuby
+  VERSION = "0.1.0"
+end

data/lib/chunker_ruby.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+require_relative "chunker_ruby/version"
+require_relative "chunker_ruby/chunk"
+require_relative "chunker_ruby/base_splitter"
+require_relative "chunker_ruby/character"
+require_relative "chunker_ruby/separator"
+require_relative "chunker_ruby/recursive_character"
+require_relative "chunker_ruby/sentence"
+require_relative "chunker_ruby/markdown"
+require_relative "chunker_ruby/html"
+require_relative "chunker_ruby/code"
+require_relative "chunker_ruby/json_splitter"
+require_relative "chunker_ruby/token"
+require_relative "chunker_ruby/semantic"
+require_relative "chunker_ruby/sliding_window"
+module ChunkerRuby
+  # Convenience entry point: splits +text+ with a RecursiveCharacter splitter.
+  #
+  # @param text [String, nil] text to split
+  # @param chunk_size [Integer] forwarded to the splitter
+  # @param chunk_overlap [Integer] forwarded to the splitter
+  # @param options [Hash] extra keywords forwarded to RecursiveCharacter.new
+  # @return [Array<Chunk>]
+  def self.split(text, chunk_size: 1000, chunk_overlap: 200, **options)
+    splitter = RecursiveCharacter.new(chunk_size: chunk_size, chunk_overlap: chunk_overlap, **options)
+    splitter.split(text)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,61 @@
+--- !ruby/object:Gem::Specification
+name: chunker-ruby
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Johannes Dwi Cahyo
+bindir: bin
+cert_chain: []
+date: 1980-01-02 00:00:00.000000000 Z
+dependencies: []
+description: Multiple chunking strategies to split documents into optimal pieces for
+  embedding and vector search. Supports character, recursive, sentence, markdown,
+  HTML, code, token, and semantic splitting.
+email: []
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- LICENSE
+- lib/chunker_ruby.rb
+- lib/chunker_ruby/base_splitter.rb
+- lib/chunker_ruby/character.rb
+- lib/chunker_ruby/chunk.rb
+- lib/chunker_ruby/code.rb
+- lib/chunker_ruby/html.rb
+- lib/chunker_ruby/json_splitter.rb
+- lib/chunker_ruby/markdown.rb
+- lib/chunker_ruby/rails/chunkable.rb
+- lib/chunker_ruby/recursive_character.rb
+- lib/chunker_ruby/semantic.rb
+- lib/chunker_ruby/sentence.rb
+- lib/chunker_ruby/separator.rb
+- lib/chunker_ruby/sliding_window.rb
+- lib/chunker_ruby/token.rb
+- lib/chunker_ruby/version.rb
+homepage: https://github.com/johannesdwicahyo/chunker-ruby
+licenses:
+- MIT
+metadata:
+  homepage_uri: https://github.com/johannesdwicahyo/chunker-ruby
+  source_code_uri: https://github.com/johannesdwicahyo/chunker-ruby
+  changelog_uri: https://github.com/johannesdwicahyo/chunker-ruby/blob/main/CHANGELOG.md
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 3.0.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.6.9
+specification_version: 4
+summary: Text chunking/splitting library for Ruby, designed for RAG pipelines
+test_files: []