chunker-ruby 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fb1949806664ba1e447e440f5dff4a2e1c072e2a0ed44248d235772ab23d121d
4
- data.tar.gz: 2b4eb4b750714e39ef11f6fa40fc82640797b52ebca069f5993be18fa9a705f9
3
+ metadata.gz: 2ef1a60bf60351dc527abc378d992bfa05b0de0d9c64af3db4edbb63b9539c61
4
+ data.tar.gz: 01452e12091762a1dee9e86b2613e525a3dd9536cd51fd5d52da19e4a4f829dd
5
5
  SHA512:
6
- metadata.gz: 7b2fc37c66650dfe14a035e36bc4dea38895a98f165520be4c077ca3f85ffb8a45f0b89e30b1f484184cf0e963e2e58227f2af26633896de09af2c4d13297f68
7
- data.tar.gz: ae966e3dfa33ec187899018192fa80dbe78473e988bbeb91ccd165bc1e4e747d7cc97c12d6b22e37b9e298b5c35de2a8e581bf60f65516e93f6dbe7c656e70de
6
+ metadata.gz: a4b276bd94c9c0e7c6749223eecf8e78aa578f65866a4ab98f0d53e6245fc29ff622e6d8d262d3937c22b1f9e35e23f875c3ca52d5fc188138090c251ce1de29
7
+ data.tar.gz: d4c86c9423f92c20526a4f771c61e95f24edc7a11bf6920189d0eb408213c49e6f9828e02b1c6cb25121953a2f352a9b8c9f9e46db42af0ec5eca98b71820a2a
data/README.md ADDED
@@ -0,0 +1,235 @@
1
+ # chunker-ruby
2
+
3
+ Text chunking/splitting library for Ruby, designed for RAG (Retrieval-Augmented Generation) pipelines. Split documents into optimal pieces for embedding and vector search.
4
+
5
+ Bad chunking = bad retrieval = bad RAG. This gem solves that.
6
+
7
+ ## Installation
8
+
9
+ ```sh
10
+ gem install chunker-ruby
11
+ ```
12
+
13
+ Or add to your Gemfile:
14
+
15
+ ```ruby
16
+ gem "chunker-ruby"
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```ruby
22
+ require "chunker_ruby"
23
+
24
+ text = File.read("long_document.md")
25
+
26
+ # Simple split (uses RecursiveCharacter by default)
27
+ chunks = ChunkerRuby.split(text, chunk_size: 1000, chunk_overlap: 200)
28
+
29
+ chunks.each do |chunk|
30
+ chunk.text # => "The document begins..."
31
+ chunk.index # => 0
32
+ chunk.offset # => 0 (character offset in original)
33
+ chunk.length # => 342
34
+ chunk.metadata # => {}
35
+ end
36
+ ```
37
+
38
+ ## Strategies
39
+
40
+ ### Character
41
+
42
+ Fixed character count with overlap. Simplest strategy.
43
+
44
+ ```ruby
45
+ chunker = ChunkerRuby::Character.new(chunk_size: 1000, chunk_overlap: 200)
46
+ chunks = chunker.split(text)
47
+ ```
48
+
49
+ ### RecursiveCharacter
50
+
51
+ Tries splitting by paragraph, then sentence, then word, then character. The most generally useful strategy.
52
+
53
+ ```ruby
54
+ chunker = ChunkerRuby::RecursiveCharacter.new(
55
+ chunk_size: 1000,
56
+ chunk_overlap: 200,
57
+ separators: ["\n\n", "\n", ". ", ", ", " ", ""] # default
58
+ )
59
+ chunks = chunker.split(text)
60
+ ```
61
+
62
+ ### Sentence
63
+
64
+ Splits on sentence boundaries. Handles abbreviations (Dr., Mr., etc.) and decimal numbers.
65
+
66
+ ```ruby
67
+ chunker = ChunkerRuby::Sentence.new(
68
+ min_chunk_size: 500,
69
+ max_chunk_size: 1500
70
+ )
71
+ chunks = chunker.split(text)
72
+ ```
73
+
74
+ ### Separator
75
+
76
+ Split on a specific string or regex.
77
+
78
+ ```ruby
79
+ chunker = ChunkerRuby::Separator.new(
80
+ separator: "\n\n", # or a Regexp
81
+ keep_separator: true,
82
+ chunk_size: 1000
83
+ )
84
+ chunks = chunker.split(text)
85
+ ```
86
+
87
+ ### Markdown
88
+
89
+ Splits on markdown headers (h1-h6). Respects code blocks. Preserves header hierarchy in metadata.
90
+
91
+ ```ruby
92
+ chunker = ChunkerRuby::Markdown.new(chunk_size: 1000, chunk_overlap: 100)
93
+ chunks = chunker.split(markdown_text)
94
+
95
+ chunks.first.metadata[:headers] # => ["# Introduction", "## Background"]
96
+ ```
97
+
98
+ ### HTML
99
+
100
+ Splits on HTML block tags. Optionally strips tags.
101
+
102
+ ```ruby
103
+ chunker = ChunkerRuby::HTML.new(chunk_size: 1000, strip_tags: true)
104
+ chunks = chunker.split(html_text)
105
+ ```
106
+
107
+ ### Code
108
+
109
+ Splits on function/class/method boundaries. Supports Ruby, Python, JavaScript, and TypeScript.
110
+
111
+ ```ruby
112
+ chunker = ChunkerRuby::Code.new(language: :ruby, chunk_size: 1500)
113
+ chunks = chunker.split(source_code)
114
+
115
+ chunks.first.metadata[:language] # => :ruby
116
+ ```
117
+
118
+ ### JSON
119
+
120
+ Splits JSON arrays/objects into chunks. Each chunk is valid JSON.
121
+
122
+ ```ruby
123
+ chunker = ChunkerRuby::JSONSplitter.new(chunk_size: 1000, chunk_overlap: 0)
124
+ chunks = chunker.split(json_string)
125
+ ```
126
+
127
+ ### Token
128
+
129
+ Splits by token count. Uses `tokenizer-ruby` if available, falls back to character estimation (~4 chars/token).
130
+
131
+ ```ruby
132
+ chunker = ChunkerRuby::Token.new(
133
+ chunk_size: 512, # in tokens
134
+ chunk_overlap: 50,
135
+ tokenizer: "gpt2"
136
+ )
137
+ chunks = chunker.split(text)
138
+ ```
139
+
140
+ ### Semantic
141
+
142
+ Splits where embedding similarity drops (topic boundaries). Requires an embedding function.
143
+
144
+ ```ruby
145
+ chunker = ChunkerRuby::Semantic.new(
146
+ embed: ->(text) { my_embedding_function(text) },
147
+ threshold: 0.5,
148
+ min_chunk_size: 100,
149
+ max_chunk_size: 2000
150
+ )
151
+ chunks = chunker.split(text)
152
+ ```
153
+
154
+ ### Sliding Window
155
+
156
+ Fixed-size sliding window with configurable stride.
157
+
158
+ ```ruby
159
+ chunker = ChunkerRuby::SlidingWindow.new(
160
+ chunk_size: 500,
161
+ chunk_overlap: 100,
162
+ stride: 200 # optional, defaults to chunk_size - chunk_overlap
163
+ )
164
+ chunks = chunker.split(text)
165
+ ```
166
+
167
+ ## Chunk Object
168
+
169
+ Every strategy returns an array of `ChunkerRuby::Chunk` objects:
170
+
171
+ ```ruby
172
+ chunk.text # chunk content
173
+ chunk.index # position in sequence (0, 1, 2, ...)
174
+ chunk.offset # character offset in original document
175
+ chunk.length # character length
176
+ chunk.metadata # arbitrary metadata hash
177
+ chunk.token_count # estimated token count (or exact with tokenizer)
178
+ chunk.to_h # { text:, index:, offset:, length:, metadata: }
179
+ chunk.to_s # same as chunk.text
180
+ ```
181
+
182
+ ## Splitting Multiple Documents
183
+
184
+ ```ruby
185
+ splitter = ChunkerRuby::RecursiveCharacter.new(chunk_size: 1000)
186
+ chunks = splitter.split_many(["First document...", "Second document..."])
187
+
188
+ chunks.first.metadata[:doc_index] # => 0
189
+ ```
190
+
191
+ ## Rails Integration
192
+
193
+ ```ruby
194
+ class Document < ApplicationRecord
195
+ include ChunkerRuby::Rails::Chunkable
196
+
197
+ chunkable :content,
198
+ strategy: :markdown,
199
+ chunk_size: 1000,
200
+ chunk_overlap: 200
201
+ end
202
+
203
+ document = Document.create!(content: long_text)
204
+ document.chunks # => [#<DocumentChunk text="..." chunk_index=0>, ...]
205
+ ```
206
+
207
+ Requires a `DocumentChunk` model with `text`, `chunk_index`, `offset`, and `metadata` columns.
208
+
209
+ ## Choosing a Strategy
210
+
211
+ | Use Case | Recommended Strategy |
212
+ |---|---|
213
+ | General text | `RecursiveCharacter` |
214
+ | Markdown docs | `Markdown` |
215
+ | Source code | `Code` |
216
+ | HTML pages | `HTML` |
217
+ | LLM context window management | `Token` |
218
+ | Topic-based splitting | `Semantic` |
219
+ | Simple fixed-size | `Character` or `SlidingWindow` |
220
+
221
+ ## Chunk Size Guidelines
222
+
223
+ - **256-512 tokens**: Precise, fact-based retrieval (FAQ, definitions)
224
+ - **512-1024 tokens**: Good balance for most use cases (docs, articles)
225
+ - **1024-2048 tokens**: Complex topics needing more context (tutorials, guides)
226
+ - **10-20% overlap**: Prevents context loss at boundaries
227
+
228
+ ## Dependencies
229
+
230
+ - **Runtime**: None (pure Ruby)
231
+ - **Optional**: `tokenizer-ruby` for token-based chunking
232
+
233
+ ## License
234
+
235
+ MIT
@@ -25,6 +25,31 @@ module ChunkerRuby
25
25
 
26
26
  def build_chunks(pieces, original_text, metadata: {})
27
27
  chunks = []
28
+ current_pos = 0
29
+
30
+ merged = merge_pieces(pieces)
31
+
32
+ merged.each do |chunk_text|
33
+ next if chunk_text.strip.empty?
34
+
35
+ # Find the actual position starting from current_pos
36
+ offset = original_text.index(chunk_text, current_pos) || current_pos
37
+
38
+ chunks << Chunk.new(
39
+ text: chunk_text,
40
+ index: chunks.size,
41
+ offset: offset,
42
+ metadata: metadata.dup
43
+ )
44
+
45
+ current_pos = offset + chunk_text.length
46
+ end
47
+
48
+ chunks
49
+ end
50
+
51
+ def merge_pieces(pieces)
52
+ merged = []
28
53
  current_parts = []
29
54
  current_length = 0
30
55
 
@@ -32,14 +57,7 @@ module ChunkerRuby
32
57
  piece_len = piece.length
33
58
 
34
59
  if current_length + piece_len > @chunk_size && !current_parts.empty?
35
- chunk_text = current_parts.join
36
- offset = original_text.index(chunk_text) || 0
37
- chunks << Chunk.new(
38
- text: chunk_text,
39
- index: chunks.size,
40
- offset: offset,
41
- metadata: metadata.dup
42
- )
60
+ merged << current_parts.join
43
61
 
44
62
  # Handle overlap: keep trailing parts that fit within overlap size
45
63
  overlap_parts = []
@@ -61,18 +79,9 @@ module ChunkerRuby
61
79
  current_length += piece_len
62
80
  end
63
81
 
64
- unless current_parts.empty?
65
- chunk_text = current_parts.join
66
- offset = original_text.rindex(chunk_text) || 0
67
- chunks << Chunk.new(
68
- text: chunk_text,
69
- index: chunks.size,
70
- offset: offset,
71
- metadata: metadata.dup
72
- )
73
- end
82
+ merged << current_parts.join unless current_parts.empty?
74
83
 
75
- chunks
84
+ merged
76
85
  end
77
86
  end
78
87
  end
@@ -29,6 +29,16 @@ module ChunkerRuby
29
29
  { text: @text, index: @index, offset: @offset, length: @length, metadata: @metadata }
30
30
  end
31
31
 
32
+ def valid?(original_text = nil)
33
+ return false if text.nil? || text.empty?
34
+ return false if offset.negative?
35
+ return false if index.negative?
36
+ if original_text
37
+ return false unless original_text[offset, text.length] == text
38
+ end
39
+ true
40
+ end
41
+
32
42
  def ==(other)
33
43
  other.is_a?(Chunk) && text == other.text && index == other.index && offset == other.offset
34
44
  end
@@ -10,6 +10,7 @@ module ChunkerRuby
10
10
  parsed = ::JSON.parse(text)
11
11
  pieces = extract_pieces(parsed)
12
12
  chunks = []
13
+ current_pos = 0
13
14
 
14
15
  current_parts = []
15
16
  current_length = 0
@@ -19,10 +20,13 @@ module ChunkerRuby
19
20
 
20
21
  if current_length + json_str.length > @chunk_size && !current_parts.empty?
21
22
  chunk_text = ::JSON.generate(current_parts.length == 1 ? current_parts.first : current_parts)
23
+ # Search for a key or value from the first piece to approximate offset
24
+ offset = find_json_offset(text, current_parts.first, current_pos)
25
+ current_pos = offset + chunk_text.length
22
26
  chunks << Chunk.new(
23
27
  text: chunk_text,
24
28
  index: chunks.size,
25
- offset: 0,
29
+ offset: offset,
26
30
  metadata: metadata.dup
27
31
  )
28
32
  current_parts = []
@@ -35,10 +39,11 @@ module ChunkerRuby
35
39
 
36
40
  unless current_parts.empty?
37
41
  chunk_text = ::JSON.generate(current_parts.length == 1 ? current_parts.first : current_parts)
42
+ offset = find_json_offset(text, current_parts.first, current_pos)
38
43
  chunks << Chunk.new(
39
44
  text: chunk_text,
40
45
  index: chunks.size,
41
- offset: 0,
46
+ offset: offset,
42
47
  metadata: metadata.dup
43
48
  )
44
49
  end
@@ -48,6 +53,22 @@ module ChunkerRuby
48
53
 
49
54
  private
50
55
 
56
+ def find_json_offset(text, first_piece, current_pos)
57
+ # Try to find a recognizable key or value from the first piece in the original text
58
+ search_str = case first_piece
59
+ when Hash
60
+ first_piece.keys.first.to_s
61
+ when String
62
+ first_piece
63
+ else
64
+ first_piece.to_s
65
+ end
66
+
67
+ # Search for the key/value string as it would appear in JSON (quoted)
68
+ quoted = "\"#{search_str}\""
69
+ text.index(quoted, current_pos) || text.index(search_str, current_pos) || current_pos
70
+ end
71
+
51
72
  def extract_pieces(parsed)
52
73
  case parsed
53
74
  when Array
@@ -62,6 +62,9 @@ module ChunkerRuby
62
62
  when :html then ChunkerRuby::HTML
63
63
  when :code then ChunkerRuby::Code
64
64
  when :token then ChunkerRuby::Token
65
+ when :semantic then ChunkerRuby::Semantic
66
+ when :json then ChunkerRuby::JSONSplitter
67
+ when :sliding_window then ChunkerRuby::SlidingWindow
65
68
  else raise ArgumentError, "Unknown chunking strategy: #{strategy}"
66
69
  end
67
70
  end
@@ -48,6 +48,7 @@ module ChunkerRuby
48
48
 
49
49
  def build_semantic_chunks(sentences, split_points, original_text, metadata)
50
50
  chunks = []
51
+ current_pos = 0
51
52
  boundaries = [-1] + split_points + [sentences.length - 1]
52
53
 
53
54
  (0...boundaries.length - 1).each do |i|
@@ -64,15 +65,18 @@ module ChunkerRuby
64
65
  )
65
66
  sub_chunks = sub_splitter.split(chunk_text, metadata: metadata)
66
67
  sub_chunks.each do |sc|
68
+ offset = original_text.index(sc.text, current_pos) || current_pos
69
+ current_pos = offset + sc.text.length
67
70
  chunks << Chunk.new(
68
71
  text: sc.text,
69
72
  index: chunks.size,
70
- offset: original_text.index(sc.text) || 0,
73
+ offset: offset,
71
74
  metadata: sc.metadata
72
75
  )
73
76
  end
74
77
  elsif chunk_text.length >= @min_chunk_size
75
- offset = original_text.index(chunk_text) || 0
78
+ offset = original_text.index(chunk_text, current_pos) || current_pos
79
+ current_pos = offset + chunk_text.length
76
80
  chunks << Chunk.new(
77
81
  text: chunk_text,
78
82
  index: chunks.size,
@@ -89,8 +93,10 @@ module ChunkerRuby
89
93
  offset: prev.offset,
90
94
  metadata: prev.metadata
91
95
  )
96
+ current_pos = prev.offset + merged.length
92
97
  else
93
- offset = original_text.index(chunk_text) || 0
98
+ offset = original_text.index(chunk_text, current_pos) || current_pos
99
+ current_pos = offset + chunk_text.length
94
100
  chunks << Chunk.new(
95
101
  text: chunk_text,
96
102
  index: chunks.size,
@@ -45,15 +45,19 @@ module ChunkerRuby
45
45
  tokens = @tokenizer.encode(text)
46
46
  chunks = []
47
47
  start = 0
48
+ current_pos = 0
48
49
 
49
50
  while start < tokens.length
50
51
  end_pos = [start + @chunk_size, tokens.length].min
51
52
  chunk_tokens = tokens[start...end_pos]
52
- chunk_text = @tokenizer.decode(chunk_tokens)
53
+ raw_text = @tokenizer.decode(chunk_tokens)
54
+ stripped = raw_text.strip
55
+
56
+ offset = text.index(stripped, current_pos) || current_pos
57
+ current_pos = offset + stripped.length
53
58
 
54
- offset = text.index(chunk_text.strip) || 0
55
59
  chunks << Chunk.new(
56
- text: chunk_text,
60
+ text: raw_text,
57
61
  index: chunks.size,
58
62
  offset: offset,
59
63
  metadata: metadata.merge(token_count: chunk_tokens.length)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ChunkerRuby
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chunker-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -18,6 +18,7 @@ extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
20
  - LICENSE
21
+ - README.md
21
22
  - lib/chunker_ruby.rb
22
23
  - lib/chunker_ruby/base_splitter.rb
23
24
  - lib/chunker_ruby/character.rb