chunker-ruby 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: fb1949806664ba1e447e440f5dff4a2e1c072e2a0ed44248d235772ab23d121d
4
+ data.tar.gz: 2b4eb4b750714e39ef11f6fa40fc82640797b52ebca069f5993be18fa9a705f9
5
+ SHA512:
6
+ metadata.gz: 7b2fc37c66650dfe14a035e36bc4dea38895a98f165520be4c077ca3f85ffb8a45f0b89e30b1f484184cf0e963e2e58227f2af26633896de09af2c4d13297f68
7
+ data.tar.gz: ae966e3dfa33ec187899018192fa80dbe78473e988bbeb91ccd165bc1e4e747d7cc97c12d6b22e37b9e298b5c35de2a8e581bf60f65516e93f6dbe7c656e70de
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johannes Dwi Cahyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Abstract base class for all splitters.
  #
  # Holds the shared size/overlap configuration and provides +build_chunks+,
  # which greedily packs small text pieces into chunks of at most
  # +chunk_size+ characters, carrying trailing pieces over as overlap.
  class BaseSplitter
    attr_reader :chunk_size, :chunk_overlap

    # @param chunk_size [Integer] maximum chunk length; must be positive
    # @param chunk_overlap [Integer] overlap budget carried between
    #   consecutive chunks; must be >= 0 and smaller than +chunk_size+
    # @param options [Hash] accepted for forward compatibility and ignored here
    # @raise [ArgumentError] when the size/overlap constraints are violated
    def initialize(chunk_size: 1000, chunk_overlap: 200, **options)
      raise ArgumentError, "chunk_size must be positive" unless chunk_size > 0
      raise ArgumentError, "chunk_overlap must be non-negative" unless chunk_overlap >= 0
      raise ArgumentError, "chunk_overlap must be less than chunk_size" unless chunk_overlap < chunk_size

      @chunk_size = chunk_size
      @chunk_overlap = chunk_overlap
    end

    # Splits +text+ into chunks. Subclasses must override this.
    #
    # @raise [NotImplementedError] always, in the base class
    def split(text, metadata: {})
      raise NotImplementedError, "#{self.class}#split must be implemented"
    end

    # Splits every text in +texts+ and returns one flat list of chunks.
    # Each chunk's metadata is seeded with +doc_index+, the position of its
    # source text within +texts+.
    def split_many(texts)
      texts.each_with_index.flat_map { |text, i| split(text, metadata: { doc_index: i }) }
    end

    private

    # Packs +pieces+ into chunks no longer than +chunk_size+ (a single piece
    # that is itself longer still becomes its own oversized chunk).
    #
    # Offsets are resolved with a running cursor rather than a global
    # first-occurrence search, so repeated content no longer maps every chunk
    # back to the first match, and the final chunk is located the same way as
    # the others.
    #
    # @param pieces [Array<String>] ordered fragments of +original_text+
    # @param original_text [String] the text the pieces were cut from
    # @param metadata [Hash] duplicated into every chunk
    # @return [Array<Chunk>]
    def build_chunks(pieces, original_text, metadata: {})
      chunks = []
      current_parts = []
      current_length = 0
      cursor = 0 # expected start of current_parts within original_text

      # Emits the buffered parts as a chunk and returns where it was found
      # (or the unchanged cursor when it could not be located at all).
      flush = lambda do
        chunk_text = current_parts.join
        found = locate_chunk(original_text, chunk_text, cursor)
        chunks << Chunk.new(
          text: chunk_text,
          index: chunks.size,
          offset: found || 0,
          metadata: metadata.dup
        )
        found || cursor
      end

      pieces.each do |piece|
        if current_length + piece.length > @chunk_size && !current_parts.empty?
          chunk_start = flush.call
          kept_parts = trailing_overlap(current_parts)
          kept_length = kept_parts.sum(&:length)

          # The next chunk begins after everything that was not carried over.
          cursor = chunk_start + (current_length - kept_length)
          current_parts = kept_parts
          current_length = kept_length
        end

        current_parts << piece
        current_length += piece.length
      end

      flush.call unless current_parts.empty?
      chunks
    end

    # Returns the longest run of trailing parts whose combined length fits
    # within +chunk_overlap+, in their original order.
    def trailing_overlap(parts)
      kept = []
      kept_length = 0
      parts.reverse_each do |part|
        break if kept_length + part.length > @chunk_overlap

        kept.unshift(part)
        kept_length += part.length
      end
      kept
    end

    # Finds +chunk_text+ in +original_text+: first at the expected cursor,
    # then searching forward from it, and finally anywhere in the text (the
    # cursor can overshoot when pieces are not contiguous).
    # Returns nil when the chunk text does not occur in the original.
    def locate_chunk(original_text, chunk_text, cursor)
      return cursor if original_text[cursor, chunk_text.length] == chunk_text

      original_text.index(chunk_text, cursor) || original_text.index(chunk_text)
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits text into fixed-width character windows. Consecutive windows
  # start +chunk_size - chunk_overlap+ characters apart.
  class Character < BaseSplitter
    # @param text [String, nil] text to split
    # @param metadata [Hash] duplicated into every chunk
    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      windows = []
      cursor = 0

      until cursor >= text.length
        stop = [cursor + @chunk_size, text.length].min
        windows << Chunk.new(
          text: text[cursor...stop],
          index: windows.size,
          offset: cursor,
          metadata: metadata.dup
        )

        # The window that reaches the end of the text is the last one.
        break if stop >= text.length

        cursor += @chunk_size - @chunk_overlap
      end

      windows
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # A single piece of split text together with its position information.
  #
  # +index+ is the chunk's ordinal within a split result, +offset+ the
  # position reported by the splitter, +length+ the character length of
  # +text+, and +metadata+ an arbitrary hash attached by the splitter.
  class Chunk
    attr_reader :text, :index, :offset, :length, :metadata

    # @param text [String] chunk content (its length is cached in +length+)
    # @param index [Integer] ordinal of the chunk
    # @param offset [Integer] position of the chunk as reported by the splitter
    # @param metadata [Hash] extra data carried with the chunk
    def initialize(text:, index:, offset:, metadata: {})
      @text = text
      @index = index
      @offset = offset
      @length = text.length
      @metadata = metadata
    end

    # Number of tokens in the chunk.
    #
    # @param tokenizer [#encode, nil] when given, the result of
    #   +tokenizer.encode(text)+ is counted; otherwise the count is estimated
    #   at roughly four characters per token
    # @return [Integer]
    def token_count(tokenizer = nil)
      return tokenizer.encode(text).length if tokenizer

      # Rough estimation: ~4 characters per token for English
      (text.length / 4.0).ceil
    end

    def to_s
      @text
    end

    def to_h
      { text: @text, index: @index, offset: @offset, length: @length, metadata: @metadata }
    end

    # Two chunks are equal when text, index and offset match; metadata is
    # deliberately not part of the comparison.
    def ==(other)
      other.is_a?(Chunk) && text == other.text && index == other.index && offset == other.offset
    end

    # Keep hash-based lookups (Hash keys, Set, Array#uniq) consistent with #==.
    alias eql? ==

    # Hash derived from the same fields that #== compares.
    def hash
      [Chunk, text, index, offset].hash
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits source code by delegating to RecursiveCharacter with a
  # language-specific separator list (definition keywords first, then blank
  # lines, newlines, spaces and finally single characters).
  class Code < BaseSplitter
    LANGUAGE_SEPARATORS = {
      ruby: [
        "\nclass ", "\nmodule ", "\ndef ", "\n\n", "\n", " ", ""
      ],
      python: [
        "\nclass ", "\ndef ", "\n\n", "\n", " ", ""
      ],
      javascript: [
        "\nfunction ", "\nclass ", "\nconst ", "\nlet ", "\nvar ",
        "\nexport ", "\n\n", "\n", " ", ""
      ],
      typescript: [
        "\ninterface ", "\ntype ", "\nfunction ", "\nclass ",
        "\nconst ", "\nlet ", "\nexport ", "\n\n", "\n", " ", ""
      ]
    }.freeze

    # @param language [Symbol, String] one of the LANGUAGE_SEPARATORS keys
    # @param kwargs [Hash] forwarded to BaseSplitter (chunk_size, chunk_overlap)
    # @raise [ArgumentError] when the language has no separator list
    def initialize(language: :ruby, **kwargs)
      super(**kwargs)
      @language = language.to_sym

      unless LANGUAGE_SEPARATORS.key?(@language)
        raise ArgumentError, "Unsupported language: #{language}. Supported: #{LANGUAGE_SEPARATORS.keys.join(", ")}"
      end

      @separators = LANGUAGE_SEPARATORS[@language]
    end

    # Splits +text+ and tags every chunk's metadata with +language+.
    #
    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      delegate = RecursiveCharacter.new(
        chunk_size: @chunk_size,
        chunk_overlap: @chunk_overlap,
        separators: @separators,
        keep_separator: true
      )
      delegate.split(text, metadata: metadata.merge(language: @language))
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits HTML at block-level tag boundaries. Sections that are still larger
  # than +chunk_size+ are broken down further with RecursiveCharacter.
  class HTML < BaseSplitter
    BLOCK_TAGS = %w[
      div p section article aside main header footer nav
      h1 h2 h3 h4 h5 h6 blockquote pre ul ol li table tr
      form fieldset details summary figure figcaption
    ].freeze

    # @param strip_tags [Boolean] when true, markup is removed from each
    #   section before it is chunked
    # @param kwargs [Hash] forwarded to BaseSplitter
    def initialize(strip_tags: false, **kwargs)
      super(**kwargs)
      @strip_tags = strip_tags
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty; sections that
    #   are blank after optional tag stripping are skipped
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []

      split_by_tags(text).each do |section|
        body = @strip_tags ? strip_html_tags(section[:text]) : section[:text]
        next if body.strip.empty?

        if body.length > @chunk_size
          fallback = RecursiveCharacter.new(
            chunk_size: @chunk_size,
            chunk_overlap: @chunk_overlap
          )
          # Sub-chunk offsets are relative to the section, so shift them.
          fallback.split(body, metadata: metadata).each do |piece|
            chunks << Chunk.new(
              text: piece.text,
              index: chunks.size,
              offset: section[:offset] + piece.offset,
              metadata: piece.metadata
            )
          end
        else
          chunks << Chunk.new(
            text: body,
            index: chunks.size,
            offset: section[:offset],
            metadata: metadata.dup
          )
        end
      end

      chunks
    end

    private

    # Cuts +text+ into sections, starting a new section at every opening or
    # closing block tag once the current section has non-blank content.
    #
    # @return [Array<Hash>] hashes with +:text+ and +:offset+ keys
    def split_by_tags(text)
      tag_pattern = /<\/?(?:#{BLOCK_TAGS.join("|")})\b[^>]*>/i
      buffer = +""
      buffer_start = 0
      cursor = 0

      # The capture group makes String#split return the tags as well.
      sections = text.split(/(#{tag_pattern})/i).each_with_object([]) do |fragment, found|
        if fragment.match?(tag_pattern) && !buffer.strip.empty?
          found << { text: buffer, offset: buffer_start }
          buffer = fragment
          buffer_start = cursor
        else
          buffer << fragment
        end
        cursor += fragment.length
      end

      sections << { text: buffer, offset: buffer_start } unless buffer.strip.empty?
      sections
    end

    # Removes anything that looks like a tag (+<...>+) from +text+.
    def strip_html_tags(text)
      text.gsub(/<[^>]+>/, "")
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module ChunkerRuby
  # Splits a JSON document into chunks of serialized top-level pieces:
  # array elements, single-key hashes for object members, or the scalar
  # itself. Every chunk is reported with offset 0.
  class JSONSplitter < BaseSplitter
    # @param text [String, nil] JSON source; parsed with ::JSON.parse
    # @param metadata [Hash] duplicated into every chunk
    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []
      batch = []
      batch_length = 0

      extract_pieces(::JSON.parse(text)).each do |piece|
        encoded_length = ::JSON.generate(piece).length

        # Close the running batch before it would exceed the size budget.
        if !batch.empty? && batch_length + encoded_length > @chunk_size
          chunks << batch_chunk(batch, chunks.size, metadata)
          batch = []
          batch_length = 0
        end

        batch << piece
        batch_length += encoded_length
      end

      chunks << batch_chunk(batch, chunks.size, metadata) unless batch.empty?
      chunks
    end

    private

    # Serializes a batch into a chunk. A batch holding exactly one piece is
    # serialized as that piece alone rather than as a one-element array.
    def batch_chunk(batch, index, metadata)
      payload = batch.length == 1 ? batch.first : batch
      Chunk.new(
        text: ::JSON.generate(payload),
        index: index,
        offset: 0,
        metadata: metadata.dup
      )
    end

    # Breaks the parsed document into its top-level pieces.
    def extract_pieces(parsed)
      if parsed.is_a?(Array)
        parsed
      elsif parsed.is_a?(Hash)
        parsed.map { |key, value| { key => value } }
      else
        [parsed]
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits Markdown into one section per header and records the chain of
  # enclosing headers in each chunk's metadata (+:headers+). Sections larger
  # than +chunk_size+ are broken down further with RecursiveCharacter.
  class Markdown < BaseSplitter
    HEADER_PATTERN = /^(\#{1,6})\s+(.+)$/

    # @param keep_headers [Boolean] when true the header line stays part of
    #   its section's text; when false it only appears in the metadata
    # @param kwargs [Hash] forwarded to BaseSplitter
    def initialize(keep_headers: true, **kwargs)
      super(**kwargs)
      @keep_headers = keep_headers
    end

    # @param text [String, nil] Markdown source
    # @param metadata [Hash] merged with each section's header metadata
    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []

      split_by_headers(text).each do |section|
        section_meta = metadata.merge(section[:metadata])

        if section[:text].length <= @chunk_size
          chunks << Chunk.new(
            text: section[:text],
            index: chunks.size,
            offset: section[:offset],
            metadata: section_meta
          )
          next
        end

        # Fall back to recursive splitting for large sections
        sub_splitter = RecursiveCharacter.new(
          chunk_size: @chunk_size,
          chunk_overlap: @chunk_overlap,
          separators: ["\n\n", "\n", ". ", " ", ""]
        )
        sub_splitter.split(section[:text], metadata: section_meta).each do |sc|
          chunks << Chunk.new(
            text: sc.text,
            index: chunks.size,
            offset: section[:offset] + sc.offset,
            metadata: sc.metadata
          )
        end
      end

      chunks
    end

    private

    # Walks the text line by line and starts a new section at every header
    # found outside a ``` fenced code block.
    #
    # @return [Array<Hash>] hashes with +:text+, +:offset+ and +:metadata+
    def split_by_headers(text)
      sections = []
      current_headers = []
      current_text = +""
      current_offset = 0
      in_code_block = false
      pos = 0

      text.each_line do |line|
        if line.match?(/\A```/)
          # Fence lines toggle code-block mode and are never headers.
          in_code_block = !in_code_block
          current_text << line
        elsif !in_code_block && (match = line.match(HEADER_PATTERN))
          push_section(sections, current_text, current_offset, current_headers)

          level = match[1].length
          # Remove headers at same or deeper level
          current_headers = current_headers.select { |h| header_level(h) < level }
          current_headers << line.rstrip

          current_text = @keep_headers ? line.dup : +""
          # When the header line is dropped, the section text starts after it.
          current_offset = @keep_headers ? pos : pos + line.length
        else
          current_text << line
        end

        pos += line.length
      end

      push_section(sections, current_text, current_offset, current_headers)
      sections
    end

    # Appends a section unless it holds only whitespace. Previously such
    # sections were right-stripped to "" and emitted as empty chunks.
    def push_section(sections, text, offset, headers)
      return if text.strip.empty?

      sections << {
        text: text.rstrip,
        offset: offset,
        metadata: { headers: headers.dup }
      }
    end

    # Number of leading '#' characters of a header line; 7 (deeper than any
    # real header) when the line does not start with one.
    def header_level(header_line)
      match = header_line.match(/^(\#{1,6})/)
      match ? match[1].length : 7
    end
  end
end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  module Rails
    # Model mixin that keeps a +chunks+ association in sync with a text
    # attribute. Including it adds the +chunkable+ class macro.
    module Chunkable
      def self.included(base)
        base.extend(ClassMethods)
      end

      module ClassMethods
        # Declares +attribute+ as chunkable.
        #
        # Registers an +after_save+ callback that re-chunks when the
        # attribute changed, a +has_many :chunks+ association backed by
        # "<ModelName>Chunk", and the instance methods +chunker+ and
        # +rechunk!+.
        #
        # @param attribute [Symbol] name of the attribute holding the text
        # @param strategy [Symbol, String] see Chunkable.resolve_strategy
        # @param chunk_size [Integer] forwarded to the splitter
        # @param chunk_overlap [Integer] forwarded to the splitter
        # @param options [Hash] extra keyword arguments for the splitter
        def chunkable(attribute, strategy: :recursive_character, chunk_size: 1000, chunk_overlap: 200, **options)
          config = {
            attribute: attribute,
            strategy: strategy,
            chunk_size: chunk_size,
            chunk_overlap: chunk_overlap,
            options: options
          }
          @chunkable_config = config

          after_save :rechunk!, if: -> { saved_change_to_attribute?(attribute) }

          has_many :chunks,
                   class_name: "#{name}Chunk",
                   dependent: :destroy

          # The generated methods close over +config+ instead of reading the
          # class-level instance variable: that variable is not visible on
          # subclasses, where the lookup used to return nil.
          define_method(:chunker) do
            splitter_class = ChunkerRuby::Rails::Chunkable.resolve_strategy(config[:strategy])
            splitter_class.new(
              chunk_size: config[:chunk_size],
              chunk_overlap: config[:chunk_overlap],
              **config[:options]
            )
          end

          define_method(:rechunk!) do
            # Always drop the old chunks first so that clearing the attribute
            # does not leave stale chunks behind.
            chunks.destroy_all

            content = send(config[:attribute])
            return if content.nil? || content.empty?

            result = chunker.split(content, metadata: { source_id: id, source_type: self.class.name })
            result.each do |chunk|
              chunks.create!(
                text: chunk.text,
                chunk_index: chunk.index,
                offset: chunk.offset,
                metadata: chunk.metadata
              )
            end
          end
        end
      end

      # Maps a strategy name to its splitter class.
      #
      # @param strategy [Symbol, String]
      # @return [Class]
      # @raise [ArgumentError] for an unknown strategy name
      def self.resolve_strategy(strategy)
        case strategy.to_sym
        when :character then ChunkerRuby::Character
        when :recursive_character then ChunkerRuby::RecursiveCharacter
        when :sentence then ChunkerRuby::Sentence
        when :separator then ChunkerRuby::Separator
        when :markdown then ChunkerRuby::Markdown
        when :html then ChunkerRuby::HTML
        when :code then ChunkerRuby::Code
        when :token then ChunkerRuby::Token
        when :sliding_window then ChunkerRuby::SlidingWindow
        when :json then ChunkerRuby::JSONSplitter
        when :semantic then ChunkerRuby::Semantic
        else raise ArgumentError, "Unknown chunking strategy: #{strategy}"
        end
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits text by trying a list of separators from coarsest to finest:
  # pieces that are still too large are split again with the next separator,
  # and the resulting pieces are then packed into chunks by +build_chunks+.
  class RecursiveCharacter < BaseSplitter
    DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""].freeze

    # @param separators [Array<String>, nil] ordered separators; defaults to
    #   DEFAULT_SEPARATORS
    # @param keep_separator [Boolean] when true each separator stays attached
    #   to the end of the piece preceding it
    # @param kwargs [Hash] forwarded to BaseSplitter
    def initialize(separators: nil, keep_separator: true, **kwargs)
      super(**kwargs)
      @separators = separators || DEFAULT_SEPARATORS
      @keep_separator = keep_separator
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      merge_chunks(recursive_split(text, @separators), text, metadata: metadata)
    end

    private

    # Returns pieces of +text+; a piece longer than +chunk_size+ is only
    # returned when no separators are left to split it with.
    def recursive_split(text, separators)
      return [text] if text.length <= @chunk_size
      return [text] if separators.empty?

      current, *finer = separators

      split_by_separator(text, current).flat_map do |piece|
        if piece.length > @chunk_size && finer.any?
          recursive_split(piece, finer)
        else
          [piece]
        end
      end
    end

    # Splits on one separator. The empty separator splits into characters.
    # With +keep_separator+, empty parts after the first are dropped together
    # with their separator, and an empty final part is dropped.
    def split_by_separator(text, separator)
      return text.chars if separator.empty?

      parts = text.split(separator, -1)
      return parts unless @keep_separator && parts.length > 1

      last = parts.length - 1
      pieces = parts.each_with_index.filter_map do |part, i|
        if i == last
          part unless part.empty?
        elsif i.zero? || !part.empty?
          part + separator
        end
      end

      pieces.empty? ? [text] : pieces
    end

    # Packs the pieces into chunks via the shared BaseSplitter logic.
    def merge_chunks(pieces, original_text, metadata: {})
      build_chunks(pieces, original_text, metadata: metadata)
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Groups sentences into chunks by embedding similarity: a chunk boundary is
  # placed between two neighbouring sentences whose cosine similarity falls
  # below +threshold+.
  class Semantic < BaseSplitter
    # @param embed [#call] called with one sentence, expected to return its
    #   embedding as an array of numbers
    # @param threshold [Numeric] similarity below which a boundary is placed
    # @param min_chunk_size [Integer] groups shorter than this are merged
    #   into the preceding chunk when there is one
    # @param max_chunk_size [Integer] used as +chunk_size+; longer groups are
    #   re-split with RecursiveCharacter
    # @param kwargs [Hash] forwarded to BaseSplitter
    def initialize(embed:, threshold: 0.5, min_chunk_size: 100, max_chunk_size: 2000, **kwargs)
      super(chunk_size: max_chunk_size, chunk_overlap: 0, **kwargs)
      @embed = embed
      @threshold = threshold
      @min_chunk_size = min_chunk_size
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      sentences = split_into_sentences(text)
      # Duplicate the metadata here as every other path does, so the chunk
      # never shares the caller's hash.
      return [Chunk.new(text: text, index: 0, offset: 0, metadata: metadata.dup)] if sentences.length <= 1

      embeddings = sentences.map { |s| @embed.call(s) }
      split_points = find_split_points(embeddings)

      build_semantic_chunks(sentences, split_points, text, metadata)
    end

    private

    # Splits after '.', '!' or '?' followed by whitespace.
    def split_into_sentences(text)
      text.split(/(?<=[.!?])\s+/).reject(&:empty?)
    end

    # Indices i such that sentence i and sentence i + 1 are dissimilar.
    def find_split_points(embeddings)
      embeddings.each_cons(2).each_with_index.filter_map do |(left, right), i|
        i if cosine_similarity(left, right) < @threshold
      end
    end

    # Cosine similarity of two vectors; 0.0 when either has zero magnitude.
    def cosine_similarity(a, b)
      dot = a.zip(b).sum { |x, y| x * y }
      mag_a = Math.sqrt(a.sum { |x| x * x })
      mag_b = Math.sqrt(b.sum { |x| x * x })
      return 0.0 if mag_a.zero? || mag_b.zero?

      dot / (mag_a * mag_b)
    end

    # Builds chunks from the sentence groups delimited by +split_points+.
    #
    # Offsets are searched forward from the previous match, so repeated
    # sentences are no longer all attributed to their first occurrence. When
    # the forward search fails, the first occurrence anywhere is used, and 0
    # when the text (sentences re-joined with single spaces) does not occur
    # in the original at all.
    def build_semantic_chunks(sentences, split_points, original_text, metadata)
      chunks = []
      cursor = 0
      locate = lambda do |needle|
        found = original_text.index(needle, cursor) || original_text.index(needle)
        cursor = found if found && found > cursor
        found || 0
      end

      boundaries = [-1] + split_points + [sentences.length - 1]

      boundaries.each_cons(2) do |previous_end, end_idx|
        chunk_text = sentences[(previous_end + 1)..end_idx].join(" ")

        # Enforce size constraints
        if chunk_text.length > @chunk_size
          sub_splitter = RecursiveCharacter.new(
            chunk_size: @chunk_size,
            chunk_overlap: @chunk_overlap
          )
          sub_splitter.split(chunk_text, metadata: metadata).each do |sc|
            chunks << Chunk.new(
              text: sc.text,
              index: chunks.size,
              offset: locate.call(sc.text),
              metadata: sc.metadata
            )
          end
        elsif chunk_text.length >= @min_chunk_size || chunks.empty?
          # A too-small group with nothing before it is emitted on its own.
          chunks << Chunk.new(
            text: chunk_text,
            index: chunks.size,
            offset: locate.call(chunk_text),
            metadata: metadata.dup
          )
        else
          # Merge small chunk with previous
          prev = chunks.pop
          chunks << Chunk.new(
            text: "#{prev.text} #{chunk_text}",
            index: prev.index,
            offset: prev.offset,
            metadata: prev.metadata
          )
        end
      end

      chunks
    end
  end
end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
module ChunkerRuby
  # Splits text into sentences and packs whole sentences into chunks.
  class Sentence < BaseSplitter
    ABBREVIATIONS = %w[
      Mr Mrs Ms Dr Prof Sr Jr St Gen Gov Sgt Cpl Pvt
      Inc Corp Ltd Co vs etc al
      Jan Feb Mar Apr Jun Jul Aug Sep Oct Nov Dec
      Ave Blvd Dept Div Est Fig
    ].freeze

    # Matches text ending in a known abbreviation followed by a period. The
    # leading word boundary keeps ordinary words that merely end in the same
    # letters ("animal.", "final.") from being mistaken for "al.".
    ABBREVIATION_ENDING = /\b(?:#{ABBREVIATIONS.join("|")})\.\z/
    private_constant :ABBREVIATION_ENDING

    # @param min_chunk_size [Integer, nil] stored (default: a third of the
    #   chunk size); not consulted by the splitting code in this class
    # @param max_chunk_size [Integer, nil] takes precedence over a
    #   +chunk_size+ keyword; 1000 when neither is given
    # @param kwargs [Hash] forwarded to BaseSplitter
    def initialize(min_chunk_size: nil, max_chunk_size: nil, **kwargs)
      chunk_size = max_chunk_size || kwargs[:chunk_size] || 1000
      super(chunk_size: chunk_size, **kwargs.except(:chunk_size))
      @min_chunk_size = min_chunk_size || (@chunk_size / 3)
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      build_chunks(split_into_sentences(text), text, metadata: metadata)
    end

    private

    # Scans the text in segments ending at runs of '.', '!' or '?' and glues
    # segments together until one ends at a real sentence boundary.
    def split_into_sentences(text)
      sentences = []
      current = +""

      text.scan(/[^.!?]*[.!?]+[\s]*|[^.!?]+\s*/) do |segment|
        current << segment

        # Check if this looks like a real sentence end
        if segment.match?(/[.!?]\s*\z/) && real_sentence_end?(current)
          sentences << current
          current = +""
        end
      end

      sentences << current unless current.strip.empty?
      sentences.empty? ? [text] : sentences
    end

    # False when the trailing punctuation belongs to an abbreviation, follows
    # a digit (treated as a possible decimal point), or is an ellipsis.
    def real_sentence_end?(text)
      stripped = text.rstrip
      return false if stripped.empty?

      # Check for abbreviations: "Dr.", "Mr.", etc. (whole words only)
      return false if stripped.match?(ABBREVIATION_ENDING)

      # Check for decimal numbers: "3.14"
      return false if stripped.match?(/\d\.\z/)

      # Check for ellipsis
      return false if stripped.end_with?("...")

      true
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits text on a single separator (String or Regexp) and packs the
  # resulting pieces into chunks.
  class Separator < BaseSplitter
    # @param separator [String, Regexp] where to cut; an empty string cuts
    #   between characters
    # @param keep_separator [Boolean] when true the matched separator stays
    #   attached to the end of the piece preceding it
    # @param kwargs [Hash] forwarded to BaseSplitter
    def initialize(separator: "\n\n", keep_separator: true, **kwargs)
      super(**kwargs)
      @separator = separator
      @keep_separator = keep_separator
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      pieces = split_by_separator(text, @separator)
      build_chunks(pieces, text, metadata: metadata)
    end

    private

    def split_by_separator(text, separator)
      if separator.is_a?(Regexp)
        split_with_regex(text, separator)
      elsif separator.empty?
        text.chars
      else
        split_with_string(text, separator)
      end
    end

    # With +keep_separator+, empty parts are dropped and every non-final part
    # gets the separator re-attached; otherwise the raw split is returned.
    def split_with_string(text, separator)
      parts = text.split(separator, -1)
      return parts unless @keep_separator && parts.length > 1

      last = parts.length - 1
      result = parts.each_with_index.filter_map do |part, i|
        next if part.empty?

        i == last ? part : part + separator
      end
      result.empty? ? [text] : result
    end

    # Walks the matches once and slices the text by match positions.
    #
    # The previous implementation paired String#split with String#scan. For a
    # pattern containing capture groups, scan returns arrays (raising a
    # TypeError on concatenation) and split injects the captures as pieces.
    # Using match positions handles such patterns correctly.
    def split_with_regex(text, separator)
      pieces = []
      cursor = 0

      text.scan(separator) do
        match = Regexp.last_match
        boundary = @keep_separator ? match.end(0) : match.begin(0)
        pieces << text[cursor...boundary]
        cursor = match.end(0)
      end
      pieces << text[cursor..]

      return pieces unless @keep_separator

      pieces.reject!(&:empty?)
      pieces.empty? ? [text] : pieces
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Slides a window of +chunk_size+ characters over the text, advancing by
  # +stride+ characters between windows.
  class SlidingWindow < BaseSplitter
    # @param stride [Integer, nil] distance between window starts; defaults
    #   to +chunk_size - chunk_overlap+
    # @param kwargs [Hash] forwarded to BaseSplitter
    # @raise [ArgumentError] when the stride is not positive
    def initialize(stride: nil, **kwargs)
      super(**kwargs)
      @stride = stride || (@chunk_size - @chunk_overlap)
      raise ArgumentError, "stride must be positive" unless @stride > 0
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      windows = []
      window_start = 0

      loop do
        break unless window_start < text.length

        window_end = [window_start + @chunk_size, text.length].min
        windows << Chunk.new(
          text: text[window_start...window_end],
          index: windows.size,
          offset: window_start,
          metadata: metadata.dup
        )

        # Stop once a window has reached the end of the text.
        break if window_end >= text.length

        window_start += @stride
      end

      windows
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Splits text by token count. Uses a tokenizer when one is available and
  # otherwise falls back to a characters-per-token estimate.
  class Token < BaseSplitter
    # @param tokenizer [nil, String, Symbol, #encode] nil tries to load the
    #   default ("gpt2") tokenizer, a name is looked up through the
    #   tokenizer_ruby library, and any other object is used as-is and must
    #   respond to #encode and #decode
    # @param kwargs [Hash] forwarded to BaseSplitter (sizes are in tokens)
    def initialize(tokenizer: nil, **kwargs)
      super(**kwargs)
      @tokenizer = resolve_tokenizer(tokenizer)
    end

    # @return [Array<Chunk>] empty when +text+ is nil or empty
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      if @tokenizer
        split_by_tokens(text, metadata)
      else
        split_by_estimation(text, metadata)
      end
    end

    private

    def resolve_tokenizer(tokenizer)
      case tokenizer
      when nil
        try_load_default_tokenizer
      when String, Symbol
        try_load_tokenizer(tokenizer.to_s)
      else
        tokenizer # assume it responds to #encode and #decode
      end
    end

    def try_load_default_tokenizer
      try_load_tokenizer("gpt2")
    end

    # Returns a tokenizer, or nil when the tokenizer_ruby library cannot be
    # loaded (the splitter then falls back to estimation).
    def try_load_tokenizer(name)
      require "tokenizer_ruby"
      TokenizerRuby::Tokenizer.new(name)
    rescue LoadError
      nil
    end

    # Encodes the text, cuts the token list into overlapping windows and
    # decodes each window back to text. Every chunk's metadata gets a
    # +token_count+ entry.
    #
    # The offset of each chunk is searched forward from the previous chunk's
    # offset, so repeated text is no longer always attributed to its first
    # occurrence. When that search fails the first occurrence anywhere is
    # used, and 0 when the decoded text does not occur in the original.
    def split_by_tokens(text, metadata)
      tokens = @tokenizer.encode(text)
      stride = @chunk_size - @chunk_overlap
      chunks = []
      start = 0
      search_from = 0

      while start < tokens.length
        end_pos = [start + @chunk_size, tokens.length].min
        chunk_tokens = tokens[start...end_pos]
        chunk_text = @tokenizer.decode(chunk_tokens)

        needle = chunk_text.strip
        found = text.index(needle, search_from) || text.index(needle)
        search_from = found if found && found > search_from

        chunks << Chunk.new(
          text: chunk_text,
          index: chunks.size,
          offset: found || 0,
          metadata: metadata.merge(token_count: chunk_tokens.length)
        )

        break if end_pos >= tokens.length

        start += stride
      end

      chunks
    end

    # Without a tokenizer, approximates token windows with character windows.
    def split_by_estimation(text, metadata)
      # Estimate ~4 chars per token
      char_splitter = Character.new(
        chunk_size: @chunk_size * 4,
        chunk_overlap: @chunk_overlap * 4
      )
      char_splitter.split(text, metadata: metadata)
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module ChunkerRuby
  # Version string of this gem release.
  VERSION = "0.1.0"
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "chunker_ruby/version"
4
+ require_relative "chunker_ruby/chunk"
5
+ require_relative "chunker_ruby/base_splitter"
6
+ require_relative "chunker_ruby/character"
7
+ require_relative "chunker_ruby/separator"
8
+ require_relative "chunker_ruby/recursive_character"
9
+ require_relative "chunker_ruby/sentence"
10
+ require_relative "chunker_ruby/markdown"
11
+ require_relative "chunker_ruby/html"
12
+ require_relative "chunker_ruby/code"
13
+ require_relative "chunker_ruby/json_splitter"
14
+ require_relative "chunker_ruby/token"
15
+ require_relative "chunker_ruby/semantic"
16
+ require_relative "chunker_ruby/sliding_window"
17
+
18
module ChunkerRuby
  class << self
    # Convenience entry point: splits +text+ with a RecursiveCharacter
    # splitter built from the given settings.
    #
    # @param text [String] text to split
    # @param chunk_size [Integer] forwarded to the splitter
    # @param chunk_overlap [Integer] forwarded to the splitter
    # @param options [Hash] extra keyword arguments for RecursiveCharacter.new
    # @return whatever RecursiveCharacter#split returns for +text+
    def split(text, chunk_size: 1000, chunk_overlap: 200, **options)
      splitter_options = options.merge(chunk_size: chunk_size, chunk_overlap: chunk_overlap)
      RecursiveCharacter.new(**splitter_options).split(text)
    end
  end
end
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: chunker-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Johannes Dwi Cahyo
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: Multiple chunking strategies to split documents into optimal pieces for
13
+ embedding and vector search. Supports character, recursive, sentence, markdown,
14
+ HTML, code, token, and semantic splitting.
15
+ email: []
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - LICENSE
21
+ - lib/chunker_ruby.rb
22
+ - lib/chunker_ruby/base_splitter.rb
23
+ - lib/chunker_ruby/character.rb
24
+ - lib/chunker_ruby/chunk.rb
25
+ - lib/chunker_ruby/code.rb
26
+ - lib/chunker_ruby/html.rb
27
+ - lib/chunker_ruby/json_splitter.rb
28
+ - lib/chunker_ruby/markdown.rb
29
+ - lib/chunker_ruby/rails/chunkable.rb
30
+ - lib/chunker_ruby/recursive_character.rb
31
+ - lib/chunker_ruby/semantic.rb
32
+ - lib/chunker_ruby/sentence.rb
33
+ - lib/chunker_ruby/separator.rb
34
+ - lib/chunker_ruby/sliding_window.rb
35
+ - lib/chunker_ruby/token.rb
36
+ - lib/chunker_ruby/version.rb
37
+ homepage: https://github.com/johannesdwicahyo/chunker-ruby
38
+ licenses:
39
+ - MIT
40
+ metadata:
41
+ homepage_uri: https://github.com/johannesdwicahyo/chunker-ruby
42
+ source_code_uri: https://github.com/johannesdwicahyo/chunker-ruby
43
+ changelog_uri: https://github.com/johannesdwicahyo/chunker-ruby/blob/main/CHANGELOG.md
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: 3.0.0
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ requirements: []
58
+ rubygems_version: 3.6.9
59
+ specification_version: 4
60
+ summary: Text chunking/splitting library for Ruby, designed for RAG pipelines
61
+ test_files: []