semchunk 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +308 -3
- data/lib/semchunk/chunker.rb +65 -0
- data/lib/semchunk/version.rb +1 -1
- data/lib/semchunk.rb +383 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ba78c1e8ca6f2d54a7345a1f2c33a4492224c121022db4fdc4dd46742e4891c8
+  data.tar.gz: f7c5e2d0a24964d97f59aa6b1b16649712108e5ce9a3253a178e36d2a3196ecd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e41759111d572eff3fe47d518f33524e2fac447a09992e96e76a488c27cb760cbe82352b7533680a8761f8360ddfc2613ac5c7b148e21c9800357dd9dcbbed94
+  data.tar.gz: e4b4acaf2229dcf85d16d1c3869a4978f17f69fe60b3c2b2a86c67354a4e90f9982691f88f8c6595189863109bb0df82be2232b5144ea036ff2db4e0ce0425df
data/README.md
CHANGED
@@ -1,29 +1,334 @@
-#
+# Semchunk
 
 [](https://rubygems.org/gems/semchunk)
 [](https://www.ruby-toolbox.com/projects/semchunk)
 [](https://github.com/philip-zhan/semchunk.rb/actions/workflows/ci.yml)
 
-
+Split text into semantically meaningful chunks of a specified size as determined by a provided token counter.
+
+This is a Ruby port of the Python [semchunk](https://github.com/umarbutler/semchunk) package.
+
+## Features
+
+- **Semantic chunking**: Splits text at natural boundaries (sentences, paragraphs, etc.) rather than at arbitrary character positions
+- **Token-aware**: Respects token limits from any tokenizer you provide
+- **Overlap support**: Create overlapping chunks for better context preservation
+- **Offset tracking**: Get the original positions of each chunk in the source text
+- **Flexible**: Works with any token counter (word count, character count, or tokenizers)
+- **Memoization**: Optional caching of token counts for improved performance
 
 ---
 
+- [Installation](#installation)
 - [Quick start](#quick-start)
+- [API Reference](#api-reference)
+- [Examples](#examples)
 - [Support](#support)
 - [License](#license)
 - [Code of conduct](#code-of-conduct)
 - [Contribution guide](#contribution-guide)
 
-##
+## Installation
 
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'semchunk'
 ```
+
+Or install it directly:
+
+```bash
 gem install semchunk
 ```
 
+## Quick start
+
+```ruby
+require "semchunk"
+
+# Define a simple token counter (or use a real tokenizer)
+token_counter = ->(text) { text.split.length }
+
+# Chunk some text
+text = "This is the first sentence. This is the second sentence. And this is the third sentence."
+chunks = Semchunk.chunk(text, chunk_size: 5, token_counter: token_counter)
+
+puts chunks.inspect
+# => ["This is the first sentence.", "This is the second sentence.", "And this is the third sentence."]
+```
+
+## API Reference
+
+### `Semchunk.chunk`
+
+Split a text into semantically meaningful chunks.
+
+```ruby
+Semchunk.chunk(
+  text,
+  chunk_size:,
+  token_counter:,
+  memoize: true,
+  offsets: false,
+  overlap: nil,
+  cache_maxsize: nil
+)
+```
+
+**Parameters:**
+- `text` (String): The text to be chunked
+- `chunk_size` (Integer): The maximum number of tokens a chunk may contain
+- `token_counter` (Proc, Lambda, Method): A callable that takes a string and returns the number of tokens in it
+- `memoize` (Boolean, optional): Whether to memoize the token counter. Defaults to `true`
+- `offsets` (Boolean, optional): Whether to return the start and end offsets of each chunk. Defaults to `false`
+- `overlap` (Float, Integer, nil, optional): The proportion of the chunk size (if < 1), or the number of tokens (if >= 1), by which chunks should overlap. Defaults to `nil`
+- `cache_maxsize` (Integer, nil, optional): The maximum number of text-token count pairs to cache. Defaults to `nil` (unbounded)
+
+**Returns:**
+- `Array<String>` if `offsets: false`: List of text chunks
+- `[Array<String>, Array<Array<Integer>>]` if `offsets: true`: List of chunks and their `[start, end]` offsets
+
+### `Semchunk.chunkerify`
+
+Create a reusable chunker object.
+
+```ruby
+Semchunk.chunkerify(
+  tokenizer_or_token_counter,
+  chunk_size: nil,
+  max_token_chars: nil,
+  memoize: true,
+  cache_maxsize: nil
+)
+```
+
+**Parameters:**
+- `tokenizer_or_token_counter`: A tokenizer object with an `encode` method, or a callable token counter
+- `chunk_size` (Integer, nil): Maximum tokens per chunk. If `nil`, will attempt to use tokenizer's `model_max_length`
+- `max_token_chars` (Integer, nil): Maximum characters per token (optimization parameter)
+- `memoize` (Boolean): Whether to cache token counts. Defaults to `true`
+- `cache_maxsize` (Integer, nil): Cache size limit. Defaults to `nil` (unbounded)
+
+**Returns:**
+- `Semchunk::Chunker`: A chunker instance
+
+### `Chunker#call`
+
+Process text(s) with the chunker.
+
+```ruby
+chunker.call(
+  text_or_texts,
+  processes: 1,
+  progress: false,
+  offsets: false,
+  overlap: nil
+)
+```
+
+**Parameters:**
+- `text_or_texts` (String, Array<String>): Single text or array of texts to chunk
+- `processes` (Integer): Number of processes for parallel chunking (not yet implemented)
+- `progress` (Boolean): Show progress bar for multiple texts (not yet implemented)
+- `offsets` (Boolean): Return offset information
+- `overlap` (Float, Integer, nil): Overlap configuration
+
+**Returns:**
+- For single text: `Array<String>` or `[Array<String>, Array<Array<Integer>>]`
+- For multiple texts: `Array<Array<String>>` or `[Array<Array<String>>, Array<Array<Array<Integer>>>]`
+
+## Examples
+
+### Basic Chunking
+
 ```ruby
 require "semchunk"
+
+text = "Natural language processing is fascinating. It allows computers to understand human language. This enables many applications."
+
+# Use word count as token counter
+token_counter = ->(text) { text.split.length }
+
+chunks = Semchunk.chunk(text, chunk_size: 8, token_counter: token_counter)
+
+chunks.each_with_index do |chunk, i|
+  puts "Chunk #{i + 1}: #{chunk}"
+end
+# => Chunk 1: Natural language processing is fascinating. It allows computers
+# => Chunk 2: to understand human language. This enables many applications.
+```
+
+### With Offsets
+
+Track where each chunk came from in the original text:
+
+```ruby
+text = "First paragraph here. Second paragraph here. Third paragraph here."
+token_counter = ->(text) { text.split.length }
+
+chunks, offsets = Semchunk.chunk(
+  text,
+  chunk_size: 5,
+  token_counter: token_counter,
+  offsets: true
+)
+
+chunks.zip(offsets).each do |chunk, (start_pos, end_pos)|
+  puts "Chunk: '#{chunk}'"
+  puts "Position: #{start_pos}...#{end_pos}"
+  puts "Verification: '#{text[start_pos...end_pos]}'"
+  puts
+end
 ```
 
+### With Overlap
+
+Create overlapping chunks to maintain context:
+
+```ruby
+text = "One two three four five six seven eight nine ten."
+token_counter = ->(text) { text.split.length }
+
+# 50% overlap
+chunks = Semchunk.chunk(
+  text,
+  chunk_size: 4,
+  token_counter: token_counter,
+  overlap: 0.5
+)
+
+puts "Overlapping chunks:"
+chunks.each { |chunk| puts "- #{chunk}" }
+
+# Fixed overlap of 2 tokens
+chunks = Semchunk.chunk(
+  text,
+  chunk_size: 6,
+  token_counter: token_counter,
+  overlap: 2
+)
+
+puts "\nWith 2-token overlap:"
+chunks.each { |chunk| puts "- #{chunk}" }
+```
+
+### Using Chunkerify for Reusable Chunkers
+
+```ruby
+# Create a chunker once
+token_counter = ->(text) { text.split.length }
+chunker = Semchunk.chunkerify(token_counter, chunk_size: 10)
+
+# Use it multiple times
+texts = [
+  "First document to process.",
+  "Second document to process.",
+  "Third document to process."
+]
+
+all_chunks = chunker.call(texts)
+
+all_chunks.each_with_index do |chunks, i|
+  puts "Document #{i + 1} chunks: #{chunks.inspect}"
+end
+```
+
+### Character-Level Chunking
+
+```ruby
+text = "abcdefghijklmnopqrstuvwxyz"
+
+# Character count as token counter
+token_counter = ->(text) { text.length }
+
+chunks = Semchunk.chunk(text, chunk_size: 5, token_counter: token_counter)
+
+puts chunks.inspect
+# => ["abcde", "fghij", "klmno", "pqrst", "uvwxy", "z"]
+```
+
+### Custom Token Counter
+
+```ruby
+# Token counter that counts punctuation as separate tokens
+def custom_token_counter(text)
+  text.scan(/\w+|[^\w\s]/).length
+end
+
+text = "Hello, world! How are you?"
+
+chunks = Semchunk.chunk(
+  text,
+  chunk_size: 5,
+  token_counter: method(:custom_token_counter)
+)
+
+puts chunks.inspect
+```
+
+### Working with Real Tokenizers
+
+If you have a tokenizer that implements an `encode` method:
+
+```ruby
+# Example with a hypothetical tokenizer
+class MyTokenizer
+  def encode(text, add_special_tokens: true)
+    # Your tokenization logic here
+    text.split.map { |word| word.hash }
+  end
+
+  def model_max_length
+    512
+  end
+end
+
+tokenizer = MyTokenizer.new
+
+# chunkerify will automatically extract the token counter
+chunker = Semchunk.chunkerify(tokenizer, chunk_size: 100)
+
+text = "Your long text here..."
+chunks = chunker.call(text)
+```
+
+## How It Works
+
+Semchunk uses a hierarchical splitting strategy:
+
+1. **Primary split**: Tries to split on paragraph breaks (newlines)
+2. **Secondary split**: Falls back to sentences (periods, question marks, etc.)
+3. **Tertiary split**: Uses clauses (commas, semicolons) if needed
+4. **Final split**: Character-level splitting as last resort
+
+This ensures that chunks are semantically meaningful while respecting your token limits.
+
+The algorithm uses binary search to efficiently find the optimal split points, making it fast even for large documents.
+
+## Running the Examples
+
+This gem includes example scripts that demonstrate various features:
+
+```bash
+# Basic usage examples
+ruby examples/basic_usage.rb
+
+# Advanced usage with longer documents
+ruby examples/advanced_usage.rb
+```
+
+## Differences from Python Version
+
+This Ruby port maintains feature parity with the Python version, with a few notes:
+
+- Multiprocessing support is not yet implemented (`processes` parameter)
+- Progress bar support is not yet implemented (`progress` parameter)
+- String tokenizer names (like `"gpt-4"`) are not yet supported
+- Otherwise, the API and behavior match the Python version
+
+See [MIGRATION.md](MIGRATION.md) for a detailed guide on migrating from the Python version.
+
 ## Support
 
 If you want to report a bug, or have ideas, feedback or questions about the gem, [let me know via GitHub issues](https://github.com/philip-zhan/semchunk.rb/issues/new) and I will do my best to provide a helpful answer. Happy hacking!
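As a side note, because `Semchunk.chunk` only needs a callable that returns a token count, a real BPE tokenizer can be wired in exactly like the word counters in the README above. A minimal sketch, assuming the third-party `tiktoken_ruby` gem and its `Tiktoken.encoding_for_model` / `#encode` API (an assumption here; semchunk itself does not depend on or document this gem):

```ruby
# Minimal sketch: counting tokens with tiktoken_ruby (assumed API, not a
# semchunk dependency) and feeding the count into Semchunk.chunk.
require "semchunk"
require "tiktoken_ruby"

encoder = Tiktoken.encoding_for_model("gpt-4")

# semchunk only needs a callable returning the token count of a string.
token_counter = ->(text) { encoder.encode(text).length }

chunks = Semchunk.chunk(
  "Your long text here...",
  chunk_size: 512,
  token_counter: token_counter
)
```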
data/lib/semchunk/chunker.rb
ADDED
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+require "set"
+
+module Semchunk
+  # A class for chunking one or more texts into semantically meaningful chunks
+  class Chunker
+    attr_reader :chunk_size, :token_counter
+
+    def initialize(chunk_size:, token_counter:)
+      @chunk_size = chunk_size
+      @token_counter = token_counter
+    end
+
+    # Split text or texts into semantically meaningful chunks
+    #
+    # @param text_or_texts [String, Array<String>] The text or texts to be chunked
+    # @param processes [Integer] The number of processes to use when chunking multiple texts (not yet implemented)
+    # @param progress [Boolean] Whether to display a progress bar when chunking multiple texts (not yet implemented)
+    # @param offsets [Boolean] Whether to return the start and end offsets of each chunk
+    # @param overlap [Float, Integer, nil] The proportion of the chunk size, or, if >=1, the number of tokens, by which chunks should overlap
+    #
+    # @return [Array<String>, Array<Array>, Hash] Depending on the input and options, returns chunks and optionally offsets
+    def call(text_or_texts, processes: 1, progress: false, offsets: false, overlap: nil)
+      chunk_function = make_chunk_function(offsets: offsets, overlap: overlap)
+
+      # Handle single text
+      if text_or_texts.is_a?(String)
+        return chunk_function.call(text_or_texts)
+      end
+
+      # Handle multiple texts
+      if processes == 1
+        # TODO: Add progress bar support
+        chunks_and_offsets = text_or_texts.map { |text| chunk_function.call(text) }
+      else
+        # TODO: Add parallel processing support
+        raise NotImplementedError, "Parallel processing not yet implemented. Please use processes: 1"
+      end
+
+      # Return results
+      if offsets
+        chunks, offsets_arr = chunks_and_offsets.transpose
+        return [chunks.to_a, offsets_arr.to_a]
+      end
+
+      chunks_and_offsets
+    end
+
+    private
+
+    def make_chunk_function(offsets:, overlap:)
+      lambda do |text|
+        Semchunk.chunk(
+          text,
+          chunk_size: chunk_size,
+          token_counter: token_counter,
+          memoize: false,
+          offsets: offsets,
+          overlap: overlap
+        )
+      end
+    end
+  end
+end
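A quick illustration of the dispatch in `Chunker#call` above: a `String` argument returns the chunks for that one text, an `Array` returns one entry per text, and `offsets: true` additionally returns `[start, end]` positions. A minimal sketch with a word-count token counter:

```ruby
require "semchunk"

chunker = Semchunk.chunkerify(->(text) { text.split.length }, chunk_size: 4)

# String input: an Array of chunks for that single text.
single = chunker.call("One two three four five six.")

# Array input: an Array of chunk Arrays, one per text, in order.
batch = chunker.call(["First text here.", "Second text here."])

# offsets: true additionally returns [start, end] positions for each chunk.
chunks, offsets = chunker.call("One two three four five six.", offsets: true)
```

Note that `make_chunk_function` passes `memoize: false` to `Semchunk.chunk`, presumably because `chunkerify` has already wrapped the token counter in a memoized version when the chunker was constructed.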
data/lib/semchunk/version.rb
CHANGED
data/lib/semchunk.rb
CHANGED
@@ -1,5 +1,387 @@
 # frozen_string_literal: true
 
+require_relative "semchunk/version"
+require_relative "semchunk/chunker"
+
 module Semchunk
-
+  # A map of token counters to their memoized versions
+  @memoized_token_counters = {}
+
+  class << self
+    attr_reader :memoized_token_counters
+
+    # Split a text into semantically meaningful chunks of a specified size as determined by the provided token counter.
+    #
+    # @param text [String] The text to be chunked.
+    # @param chunk_size [Integer] The maximum number of tokens a chunk may contain.
+    # @param token_counter [Proc, Method, #call] A callable that takes a string and returns the number of tokens in it.
+    # @param memoize [Boolean] Whether to memoize the token counter. Defaults to true.
+    # @param offsets [Boolean] Whether to return the start and end offsets of each chunk. Defaults to false.
+    # @param overlap [Float, Integer, nil] The proportion of the chunk size, or, if >=1, the number of tokens, by which chunks should overlap. Defaults to nil.
+    # @param cache_maxsize [Integer, nil] The maximum number of text-token count pairs that can be stored in the token counter's cache. Defaults to nil (unbounded).
+    # @param recursion_depth [Integer] Internal parameter for tracking recursion depth.
+    # @param start [Integer] Internal parameter for tracking character offset.
+    #
+    # @return [Array<String>, Array<Array>] A list of chunks up to chunk_size-tokens-long, with any whitespace used to split the text removed, and, if offsets is true, a list of tuples [start, end].
+    def chunk(text, chunk_size:, token_counter:, memoize: true, offsets: false, overlap: nil, cache_maxsize: nil, recursion_depth: 0, start: 0)
+      # Rename variables for clarity
+      return_offsets = offsets
+      local_chunk_size = chunk_size
+
+      # If this is the first call, memoize the token counter if memoization is enabled and reduce the effective chunk size if overlapping chunks
+      is_first_call = recursion_depth.zero?
+
+      if is_first_call
+        if memoize
+          token_counter = memoize_token_counter(token_counter, cache_maxsize)
+        end
+
+        if overlap
+          # Make relative overlaps absolute and floor both relative and absolute overlaps
+          overlap = if overlap < 1
+                      (chunk_size * overlap).floor
+                    else
+                      [overlap, chunk_size - 1].min
+                    end
+
+          # If the overlap has not been zeroed, compute the effective chunk size
+          if overlap.positive?
+            unoverlapped_chunk_size = chunk_size - overlap
+            local_chunk_size = [overlap, unoverlapped_chunk_size].min
+          end
+        end
+      end
+
+      # Split the text using the most semantically meaningful splitter possible
+      splitter, splitter_is_whitespace, splits = split_text(text)
+
+      offsets_arr = []
+      splitter_len = splitter.length
+      split_lens = splits.map(&:length)
+      cum_lens = [0]
+      split_lens.each { |len| cum_lens << cum_lens.last + len }
+
+      split_starts = [0]
+      split_lens.each_with_index do |split_len, i|
+        split_starts << split_starts[i] + split_len + splitter_len
+      end
+      split_starts = split_starts.map { |s| s + start }
+
+      num_splits_plus_one = splits.length + 1
+
+      chunks = []
+      skips = Set.new
+
+      # Iterate through the splits
+      splits.each_with_index do |split, i|
+        # Skip the split if it has already been added to a chunk
+        next if skips.include?(i)
+
+        split_start = split_starts[i]
+
+        # If the split is over the chunk size, recursively chunk it
+        if token_counter.call(split) > local_chunk_size
+          new_chunks, new_offsets = chunk(
+            split,
+            chunk_size: local_chunk_size,
+            token_counter: token_counter,
+            offsets: true,
+            recursion_depth: recursion_depth + 1,
+            start: split_start
+          )
+
+          chunks.concat(new_chunks)
+          offsets_arr.concat(new_offsets)
+        else
+          # Merge the split with subsequent splits until the chunk size is reached
+          final_split_in_chunk_i, new_chunk = merge_splits(
+            splits: splits,
+            cum_lens: cum_lens,
+            chunk_size: local_chunk_size,
+            splitter: splitter,
+            token_counter: token_counter,
+            start: i,
+            high: num_splits_plus_one
+          )
+
+          # Mark any splits included in the new chunk for exclusion from future chunks
+          ((i + 1)...final_split_in_chunk_i).each { |j| skips.add(j) }
+
+          # Add the chunk
+          chunks << new_chunk
+
+          # Add the chunk's offsets
+          split_end = split_starts[final_split_in_chunk_i] - splitter_len
+          offsets_arr << [split_start, split_end]
+        end
+
+        # If the splitter is not whitespace and the split is not the last split, add the splitter to the end of the latest chunk
+        unless splitter_is_whitespace || (i == splits.length - 1 || ((i + 1)...splits.length).all? { |j| skips.include?(j) })
+          last_chunk_with_splitter = chunks[-1] + splitter
+          if token_counter.call(last_chunk_with_splitter) <= local_chunk_size
+            chunks[-1] = last_chunk_with_splitter
+            offset_start, offset_end = offsets_arr[-1]
+            offsets_arr[-1] = [offset_start, offset_end + splitter_len]
+          else
+            offset_start = offsets_arr.empty? ? split_start : offsets_arr[-1][1]
+            chunks << splitter
+            offsets_arr << [offset_start, offset_start + splitter_len]
+          end
+        end
+      end
+
+      # If this is the first call, remove empty chunks and overlap if desired
+      if is_first_call
+        # Remove empty chunks and chunks comprised entirely of whitespace
+        chunks_and_offsets = chunks.zip(offsets_arr).reject { |chunk, _| chunk.empty? || chunk.strip.empty? }
+
+        if chunks_and_offsets.any?
+          chunks, offsets_arr = chunks_and_offsets.transpose
+        else
+          chunks = []
+          offsets_arr = []
+        end
+
+        # Overlap chunks if desired and there are chunks to overlap
+        if overlap && overlap.positive? && chunks.any?
+          # Rename variables for clarity
+          subchunk_size = local_chunk_size
+          subchunks = chunks
+          suboffsets = offsets_arr
+          num_subchunks = subchunks.length
+
+          # Merge the subchunks into overlapping chunks
+          subchunks_per_chunk = (chunk_size.to_f / subchunk_size).floor
+          subchunk_stride = (unoverlapped_chunk_size.to_f / subchunk_size).floor
+
+          num_overlapping_chunks = [1, ((num_subchunks - subchunks_per_chunk).to_f / subchunk_stride).ceil + 1].max
+
+          offsets_arr = (0...num_overlapping_chunks).map do |i|
+            start_idx = i * subchunk_stride
+            end_idx = [start_idx + subchunks_per_chunk, num_subchunks].min - 1
+            [suboffsets[start_idx][0], suboffsets[end_idx][1]]
+          end
+
+          chunks = offsets_arr.map { |s, e| text[s...e] }
+        end
+
+        # Return offsets if desired
+        return [chunks, offsets_arr] if return_offsets
+
+        return chunks
+      end
+
+      # Always return chunks and offsets if this is a recursive call
+      [chunks, offsets_arr]
+    end
+
+    # Construct a chunker that splits one or more texts into semantically meaningful chunks
+    #
+    # @param tokenizer_or_token_counter [String, #encode, Proc, Method, #call] Either: the name of a tokenizer; a tokenizer that possesses an encode method; or a token counter.
+    # @param chunk_size [Integer, nil] The maximum number of tokens a chunk may contain. Defaults to nil.
+    # @param max_token_chars [Integer, nil] The maximum number of characters a token may contain. Defaults to nil.
+    # @param memoize [Boolean] Whether to memoize the token counter. Defaults to true.
+    # @param cache_maxsize [Integer, nil] The maximum number of text-token count pairs that can be stored in the token counter's cache.
+    #
+    # @return [Chunker] A chunker instance
+    def chunkerify(tokenizer_or_token_counter, chunk_size: nil, max_token_chars: nil, memoize: true, cache_maxsize: nil)
+      # Handle string tokenizer names (would require tiktoken/transformers Ruby equivalents)
+      if tokenizer_or_token_counter.is_a?(String)
+        raise NotImplementedError, "String tokenizer names not yet supported in Ruby. Please pass a tokenizer object or token counter proc."
+      end
+
+      # Determine max_token_chars if not provided
+      if max_token_chars.nil?
+        if tokenizer_or_token_counter.respond_to?(:token_byte_values)
+          vocab = tokenizer_or_token_counter.token_byte_values
+          max_token_chars = vocab.map(&:length).max if vocab.respond_to?(:map)
+        elsif tokenizer_or_token_counter.respond_to?(:get_vocab)
+          vocab = tokenizer_or_token_counter.get_vocab
+          max_token_chars = vocab.keys.map(&:length).max if vocab.respond_to?(:keys)
+        end
+      end
+
+      # Determine chunk_size if not provided
+      if chunk_size.nil?
+        if tokenizer_or_token_counter.respond_to?(:model_max_length) && tokenizer_or_token_counter.model_max_length.is_a?(Integer)
+          chunk_size = tokenizer_or_token_counter.model_max_length
+
+          # Attempt to reduce the chunk size by the number of special characters
+          if tokenizer_or_token_counter.respond_to?(:encode)
+            begin
+              chunk_size -= tokenizer_or_token_counter.encode("").length
+            rescue StandardError
+              # Ignore errors
+            end
+          end
+        else
+          raise ArgumentError, "chunk_size not provided and tokenizer lacks model_max_length attribute"
+        end
+      end
+
+      # Construct token counter from tokenizer if needed
+      if tokenizer_or_token_counter.respond_to?(:encode)
+        tokenizer = tokenizer_or_token_counter
+        # Check if encode accepts add_special_tokens parameter
+        encode_params = tokenizer.method(:encode).parameters rescue []
+        has_special_tokens = encode_params.any? { |type, name| name == :add_special_tokens }
+
+        token_counter = if has_special_tokens
+                          ->(text) { tokenizer.encode(text, add_special_tokens: false).length }
+                        else
+                          ->(text) { tokenizer.encode(text).length }
+                        end
+      else
+        token_counter = tokenizer_or_token_counter
+      end
+
+      # Add fast token counter optimization if max_token_chars is known
+      if max_token_chars
+        max_token_chars -= 1
+        original_token_counter = token_counter
+
+        token_counter = lambda do |text|
+          heuristic = chunk_size * 6
+          if text.length > heuristic && original_token_counter.call(text[0...(heuristic + max_token_chars)]) > chunk_size
+            chunk_size + 1
+          else
+            original_token_counter.call(text)
+          end
+        end
+      end
+
+      # Memoize the token counter if necessary
+      if memoize
+        token_counter = memoize_token_counter(token_counter, cache_maxsize)
+      end
+
+      # Construct and return the chunker
+      Chunker.new(chunk_size: chunk_size, token_counter: token_counter)
+    end
+
+    private
+
+    # A tuple of semantically meaningful non-whitespace splitters
+    NON_WHITESPACE_SEMANTIC_SPLITTERS = [
+      # Sentence terminators
+      ".", "?", "!", "*",
+      # Clause separators
+      ";", ",", "(", ")", "[", "]", "“", "”", "‘", "’", "'", '"', "`",
+      # Sentence interrupters
+      ":", "—", "…",
+      # Word joiners
+      "/", "\\", "–", "&", "-"
+    ].freeze
+
+    def split_text(text)
+      splitter_is_whitespace = true
+
+      # Try splitting at various levels
+      if text.include?("\n") || text.include?("\r")
+        newline_matches = text.scan(/[\r\n]+/)
+        splitter = newline_matches.max_by(&:length)
+      elsif text.include?("\t")
+        tab_matches = text.scan(/\t+/)
+        splitter = tab_matches.max_by(&:length)
+      elsif text.match?(/\s/)
+        whitespace_matches = text.scan(/\s+/)
+        splitter = whitespace_matches.max_by(&:length)
+
+        # If the splitter is only a single character, see if we can target whitespace preceded by semantic splitters
+        if splitter.length == 1
+          NON_WHITESPACE_SEMANTIC_SPLITTERS.each do |preceder|
+            escaped_preceder = Regexp.escape(preceder)
+            if (match = text.match(/#{escaped_preceder}(\s)/))
+              splitter = match[1]
+              escaped_splitter = Regexp.escape(splitter)
+              return [splitter, splitter_is_whitespace, text.split(/(?<=#{escaped_preceder})#{escaped_splitter}/)]
+            end
+          end
+        end
+      else
+        # Find the most desirable semantically meaningful non-whitespace splitter
+        splitter = NON_WHITESPACE_SEMANTIC_SPLITTERS.find { |s| text.include?(s) }
+
+        if splitter
+          splitter_is_whitespace = false
+        else
+          # No semantic splitter found, return characters
+          return ["", splitter_is_whitespace, text.chars]
+        end
+      end
+
+      [splitter, splitter_is_whitespace, text.split(splitter)]
+    end
+
+    def bisect_left(sorted, target, low, high)
+      while low < high
+        mid = (low + high) / 2
+        if sorted[mid] < target
+          low = mid + 1
+        else
+          high = mid
+        end
+      end
+      low
+    end
+
+    def merge_splits(splits:, cum_lens:, chunk_size:, splitter:, token_counter:, start:, high:)
+      average = 0.2
+      low = start
+
+      offset = cum_lens[start]
+      target = offset + (chunk_size * average)
+
+      while low < high
+        i = bisect_left(cum_lens, target, low, high)
+        midpoint = [i, high - 1].min
+
+        tokens = token_counter.call(splits[start...midpoint].join(splitter))
+
+        local_cum = cum_lens[midpoint] - offset
+
+        if local_cum.positive? && tokens.positive?
+          average = local_cum.to_f / tokens
+          target = offset + (chunk_size * average)
+        end
+
+        if tokens > chunk_size
+          high = midpoint
+        else
+          low = midpoint + 1
+        end
+      end
+
+      last_split_index = low - 1
+      [last_split_index, splits[start...last_split_index].join(splitter)]
+    end
+
+    def memoize_token_counter(token_counter, maxsize = nil)
+      return @memoized_token_counters[token_counter] if @memoized_token_counters.key?(token_counter)
+
+      cache = {}
+      queue = []
+
+      memoized = lambda do |text|
+        if cache.key?(text)
+          cache[text]
+        else
+          result = token_counter.call(text)
+          cache[text] = result
+
+          if maxsize
+            queue << text
+            if queue.length > maxsize
+              oldest = queue.shift
+              cache.delete(oldest)
+            end
+          end
+
+          result
+        end
+      end
+
+      @memoized_token_counters[token_counter] = memoized
+    end
+  end
 end
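The overlap arithmetic in `Semchunk.chunk` above is easiest to follow with concrete numbers. The sketch below is an illustration only; it traces the README's `chunk_size: 4, overlap: 0.5` example through the same expressions the code uses.

```ruby
# Worked example of the overlap math from Semchunk.chunk (illustration only).
chunk_size = 4
overlap    = (chunk_size * 0.5).floor                      # => 2 (relative overlap made absolute)

unoverlapped_chunk_size = chunk_size - overlap             # => 2
local_chunk_size = [overlap, unoverlapped_chunk_size].min  # => 2

# The text is first chunked into subchunks of at most local_chunk_size tokens,
# which are then recombined into the final overlapping chunks:
subchunks_per_chunk = (chunk_size.to_f / local_chunk_size).floor               # => 2
subchunk_stride     = (unoverlapped_chunk_size.to_f / local_chunk_size).floor  # => 1

# Each emitted chunk spans 2 consecutive subchunks and starts 1 subchunk after
# the previous one, so adjacent chunks share roughly 2 tokens (~50% overlap).
```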
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: semchunk
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Philip Zhan
@@ -18,6 +18,7 @@ files:
 - LICENSE.txt
 - README.md
 - lib/semchunk.rb
+- lib/semchunk/chunker.rb
 - lib/semchunk/version.rb
 homepage: https://github.com/philip-zhan/semchunk.rb
 licenses: