chunker-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/lib/chunker_ruby/base_splitter.rb +78 -0
- data/lib/chunker_ruby/character.rb +30 -0
- data/lib/chunker_ruby/chunk.rb +36 -0
- data/lib/chunker_ruby/code.rb +43 -0
- data/lib/chunker_ruby/html.rb +83 -0
- data/lib/chunker_ruby/json_splitter.rb +62 -0
- data/lib/chunker_ruby/markdown.rb +114 -0
- data/lib/chunker_ruby/rails/chunkable.rb +70 -0
- data/lib/chunker_ruby/recursive_character.rb +68 -0
- data/lib/chunker_ruby/semantic.rb +106 -0
- data/lib/chunker_ruby/sentence.rb +65 -0
- data/lib/chunker_ruby/separator.rb +60 -0
- data/lib/chunker_ruby/sliding_window.rb +36 -0
- data/lib/chunker_ruby/token.rb +82 -0
- data/lib/chunker_ruby/version.rb +5 -0
- data/lib/chunker_ruby.rb +22 -0
- metadata +61 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: fb1949806664ba1e447e440f5dff4a2e1c072e2a0ed44248d235772ab23d121d
|
|
4
|
+
data.tar.gz: 2b4eb4b750714e39ef11f6fa40fc82640797b52ebca069f5993be18fa9a705f9
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 7b2fc37c66650dfe14a035e36bc4dea38895a98f165520be4c077ca3f85ffb8a45f0b89e30b1f484184cf0e963e2e58227f2af26633896de09af2c4d13297f68
|
|
7
|
+
data.tar.gz: ae966e3dfa33ec187899018192fa80dbe78473e988bbeb91ccd165bc1e4e747d7cc97c12d6b22e37b9e298b5c35de2a8e581bf60f65516e93f6dbe7c656e70de
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Johannes Dwi Cahyo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Abstract base class for all splitters.
  #
  # Validates the chunk-size/overlap invariants and provides the shared
  # piece-merging algorithm (+build_chunks+) used by subclasses that first
  # cut text into small pieces and then pack them into size-bounded chunks
  # with a trailing overlap.
  class BaseSplitter
    attr_reader :chunk_size, :chunk_overlap

    # @param chunk_size    [Integer] maximum chunk length (characters); must be > 0
    # @param chunk_overlap [Integer] characters carried over between chunks;
    #   must be >= 0 and strictly less than chunk_size
    # @raise [ArgumentError] when the invariants above are violated
    def initialize(chunk_size: 1000, chunk_overlap: 200, **options)
      raise ArgumentError, "chunk_size must be positive" unless chunk_size > 0
      raise ArgumentError, "chunk_overlap must be non-negative" unless chunk_overlap >= 0
      raise ArgumentError, "chunk_overlap must be less than chunk_size" unless chunk_overlap < chunk_size

      @chunk_size = chunk_size
      @chunk_overlap = chunk_overlap
    end

    # Subclass contract: return an Array of Chunk for +text+.
    # @raise [NotImplementedError] always, on the base class
    def split(text, metadata: {})
      raise NotImplementedError, "#{self.class}#split must be implemented"
    end

    # Splits several documents, tagging each chunk with its document index.
    def split_many(texts)
      texts.flat_map.with_index { |t, i| split(t, metadata: { doc_index: i }) }
    end

    private

    # Packs +pieces+ (ordered fragments of +original_text+) into chunks of at
    # most @chunk_size characters, carrying up to @chunk_overlap trailing
    # characters of each emitted chunk into the next one.
    #
    # FIX: offsets are now located with a forward search cursor. The previous
    # implementation used `original_text.index(chunk_text)`, which always
    # returns the FIRST occurrence — repeated identical chunks (e.g. in
    # documents with duplicated paragraphs) all received the same offset.
    def build_chunks(pieces, original_text, metadata: {})
      chunks = []
      current_parts = []
      current_length = 0
      # Search cursor: chunks appear in document order, so each chunk starts
      # at or after (previous offset + 1). Overlapping chunks may start before
      # the previous chunk ENDS, hence +1 rather than +length.
      search_from = 0

      pieces.each do |piece|
        piece_len = piece.length

        if current_length + piece_len > @chunk_size && !current_parts.empty?
          chunk_text = current_parts.join
          offset = original_text.index(chunk_text, search_from) ||
                   original_text.index(chunk_text) || 0
          search_from = offset + 1
          chunks << Chunk.new(
            text: chunk_text,
            index: chunks.size,
            offset: offset,
            metadata: metadata.dup
          )

          # Keep trailing parts that fit within the overlap budget.
          overlap_parts = []
          overlap_length = 0
          current_parts.reverse_each do |part|
            break if overlap_length + part.length > @chunk_overlap

            overlap_parts.unshift(part)
            overlap_length += part.length
          end

          current_parts = overlap_parts
          current_length = overlap_length
        end

        current_parts << piece
        current_length += piece_len
      end

      # Flush whatever remains as the final chunk.
      unless current_parts.empty?
        chunk_text = current_parts.join
        offset = original_text.index(chunk_text, search_from) ||
                 original_text.rindex(chunk_text) || 0
        chunks << Chunk.new(
          text: chunk_text,
          index: chunks.size,
          offset: offset,
          metadata: metadata.dup
        )
      end

      chunks
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Simplest splitter: fixed-size character windows with a fixed overlap,
  # ignoring all structure in the text.
  class Character < BaseSplitter
    # @return [Array<Chunk>] windows of at most chunk_size characters; each
    #   window starts (chunk_size - chunk_overlap) after the previous one.
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []
      cursor = 0

      loop do
        stop = [cursor + @chunk_size, text.length].min

        chunks << Chunk.new(
          text: text[cursor...stop],
          index: chunks.size,
          offset: cursor,
          metadata: metadata.dup
        )

        # Last window reached the end of the text — done.
        break if stop >= text.length

        cursor += @chunk_size - @chunk_overlap
      end

      chunks
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Value object representing one piece of a split document.
  #
  # Carries the chunk text, its positional index within the split result,
  # its character offset into the original document, and caller metadata.
  class Chunk
    attr_reader :text, :index, :offset, :length, :metadata

    # @param text     [String]  chunk contents
    # @param index    [Integer] position within the split result
    # @param offset   [Integer] character offset into the source document
    # @param metadata [Hash]    arbitrary caller-supplied data
    def initialize(text:, index:, offset:, metadata: {})
      @text = text
      @index = index
      @offset = offset
      @length = text.length
      @metadata = metadata
    end

    # Number of tokens in this chunk.
    #
    # @param tokenizer [#encode, nil] when given, its encoding length is used;
    #   otherwise a rough ~4-characters-per-token estimate (English heuristic).
    def token_count(tokenizer = nil)
      return tokenizer.encode(text).length if tokenizer

      (text.length / 4.0).ceil
    end

    def to_s
      @text
    end

    def to_h
      { text: @text, index: @index, offset: @offset, length: @length, metadata: @metadata }
    end

    # Equality is by text, index and offset (metadata is intentionally ignored).
    def ==(other)
      other.is_a?(Chunk) && text == other.text && index == other.index && offset == other.offset
    end

    # FIX: `==` alone is not enough for Hash/Set membership — Ruby requires
    # matching `eql?` and `hash` as well. Without these, two equal chunks
    # hashed to different buckets.
    alias eql? ==

    def hash
      [Chunk, text, index, offset].hash
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Source-code splitter: delegates to RecursiveCharacter with a
  # language-specific separator hierarchy (definitions first, then blank
  # lines, lines, spaces, characters).
  class Code < BaseSplitter
    LANGUAGE_SEPARATORS = {
      ruby: [
        "\nclass ", "\nmodule ", "\ndef ", "\n\n", "\n", " ", ""
      ],
      python: [
        "\nclass ", "\ndef ", "\n\n", "\n", " ", ""
      ],
      javascript: [
        "\nfunction ", "\nclass ", "\nconst ", "\nlet ", "\nvar ",
        "\nexport ", "\n\n", "\n", " ", ""
      ],
      typescript: [
        "\ninterface ", "\ntype ", "\nfunction ", "\nclass ",
        "\nconst ", "\nlet ", "\nexport ", "\n\n", "\n", " ", ""
      ]
    }.freeze

    # @param language [Symbol, String] one of LANGUAGE_SEPARATORS.keys
    # @raise [ArgumentError] for an unsupported language
    def initialize(language: :ruby, **kwargs)
      super(**kwargs)
      @language = language.to_sym
      @separators = LANGUAGE_SEPARATORS.fetch(@language) do
        raise ArgumentError, "Unsupported language: #{language}. Supported: #{LANGUAGE_SEPARATORS.keys.join(", ")}"
      end
    end

    # Splits +text+ along definition boundaries; each chunk's metadata
    # gains a :language key.
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      delegate = RecursiveCharacter.new(
        chunk_size: @chunk_size,
        chunk_overlap: @chunk_overlap,
        separators: @separators,
        keep_separator: true
      )
      delegate.split(text, metadata: metadata.merge(language: @language))
    end
  end
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Regex-based HTML splitter: sections the document at block-level tag
  # boundaries, optionally strips tags, and recursively re-splits sections
  # that exceed the chunk size.
  #
  # NOTE(review): tag detection is regex-based, not a real HTML parse —
  # malformed markup or tags inside attributes/comments may mis-section.
  class HTML < BaseSplitter
    BLOCK_TAGS = %w[
      div p section article aside main header footer nav
      h1 h2 h3 h4 h5 h6 blockquote pre ul ol li table tr
      form fieldset details summary figure figcaption
    ].freeze

    # @param strip_tags [Boolean] when true, chunk text has all tags removed
    def initialize(strip_tags: false, **kwargs)
      super(**kwargs)
      @strip_tags = strip_tags
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []

      split_by_tags(text).each do |section|
        content = @strip_tags ? strip_html_tags(section[:text]) : section[:text]
        next if content.strip.empty?

        if content.length <= @chunk_size
          chunks << Chunk.new(
            text: content,
            index: chunks.size,
            offset: section[:offset],
            metadata: metadata.dup
          )
          next
        end

        # Oversized section: fall back to recursive character splitting,
        # re-basing each sub-chunk's offset onto the section's offset.
        sub_splitter = RecursiveCharacter.new(
          chunk_size: @chunk_size,
          chunk_overlap: @chunk_overlap
        )
        sub_splitter.split(content, metadata: metadata).each do |sub|
          chunks << Chunk.new(
            text: sub.text,
            index: chunks.size,
            offset: section[:offset] + sub.offset,
            metadata: sub.metadata
          )
        end
      end

      chunks
    end

    private

    # Partitions +text+ into { text:, offset: } sections, starting a new
    # section at every block-level tag (open or close).
    def split_by_tags(text)
      sections = []
      tag_pattern = /<\/?(?:#{BLOCK_TAGS.join("|")})\b[^>]*>/i

      buffer = +""
      buffer_offset = 0
      cursor = 0

      # Capturing split keeps the tags themselves as fragments.
      text.split(/(#{tag_pattern})/i).each do |fragment|
        if fragment.match?(tag_pattern) && !buffer.strip.empty?
          # A block tag begins a new section; flush the current one.
          sections << { text: buffer, offset: buffer_offset }
          buffer = fragment.dup
          buffer_offset = cursor
        else
          buffer << fragment
        end
        cursor += fragment.length
      end

      sections << { text: buffer, offset: buffer_offset } unless buffer.strip.empty?
      sections
    end

    def strip_html_tags(markup)
      markup.gsub(/<[^>]+>/, "")
    end
  end
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "json"

module ChunkerRuby
  # Splits a JSON document into valid-JSON chunks: top-level array elements
  # (or single-key hashes for objects) are packed until their serialized
  # length would exceed the chunk size.
  #
  # NOTE(review): chunk offsets are always 0 — pieces are re-serialized, so
  # they have no stable position in the original text.
  class JSONSplitter < BaseSplitter
    # @raise [JSON::ParserError] when +text+ is not valid JSON
    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []
      buffer = []
      buffered_length = 0

      extract_pieces(::JSON.parse(text)).each do |piece|
        encoded = ::JSON.generate(piece)

        # Flush the buffer before it would overflow the chunk size.
        if buffered_length + encoded.length > @chunk_size && !buffer.empty?
          chunks << buffer_to_chunk(buffer, chunks.size, metadata)
          buffer = []
          buffered_length = 0
        end

        buffer << piece
        buffered_length += encoded.length
      end

      chunks << buffer_to_chunk(buffer, chunks.size, metadata) unless buffer.empty?
      chunks
    end

    private

    # Serializes the buffered pieces back to JSON. A single piece is emitted
    # bare; multiple pieces are wrapped in an array.
    def buffer_to_chunk(buffer, index, metadata)
      payload = buffer.length == 1 ? buffer.first : buffer
      Chunk.new(
        text: ::JSON.generate(payload),
        index: index,
        offset: 0,
        metadata: metadata.dup
      )
    end

    # Top-level decomposition: arrays split per element, hashes per key,
    # scalars stay whole.
    def extract_pieces(parsed)
      case parsed
      when Array then parsed
      when Hash then parsed.map { |k, v| { k => v } }
      else [parsed]
      end
    end
  end
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Markdown splitter: sections the document at ATX headers (# .. ######),
  # attaching the active header trail to each section's metadata. Headers
  # inside fenced code blocks (```) are ignored. Oversized sections fall
  # back to recursive character splitting.
  class Markdown < BaseSplitter
    HEADER_PATTERN = /^(\#{1,6})\s+(.+)$/

    # @param keep_headers [Boolean] when true, the header line stays at the
    #   top of its section's text
    def initialize(keep_headers: true, **kwargs)
      super(**kwargs)
      @keep_headers = keep_headers
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []

      split_by_headers(text).each do |section|
        section_meta = metadata.merge(section[:metadata])

        if section[:text].length <= @chunk_size
          chunks << Chunk.new(
            text: section[:text],
            index: chunks.size,
            offset: section[:offset],
            metadata: section_meta
          )
          next
        end

        # Section too large: delegate to recursive splitting, re-basing
        # sub-chunk offsets onto the section offset.
        fallback = RecursiveCharacter.new(
          chunk_size: @chunk_size,
          chunk_overlap: @chunk_overlap,
          separators: ["\n\n", "\n", ". ", " ", ""]
        )
        fallback.split(section[:text], metadata: section_meta).each do |sub|
          chunks << Chunk.new(
            text: sub.text,
            index: chunks.size,
            offset: section[:offset] + sub.offset,
            metadata: sub.metadata
          )
        end
      end

      chunks
    end

    private

    # Walks the document line by line, cutting a new section at every header
    # (outside code fences) and tracking the hierarchical header trail.
    # Returns [{ text:, offset:, metadata: { headers: [...] } }, ...].
    def split_by_headers(text)
      sections = []
      header_trail = []
      body = +""
      section_start = 0
      inside_fence = false
      cursor = 0

      text.lines.each do |line|
        # Toggle fenced code blocks so '#' lines inside them are not headers.
        if line.match?(/\A```/)
          inside_fence = !inside_fence
          body << line
          cursor += line.length
          next
        end

        if !inside_fence && (match = line.match(HEADER_PATTERN))
          # Flush the section accumulated so far.
          unless body.empty?
            sections << {
              text: body.rstrip,
              offset: section_start,
              metadata: { headers: header_trail.dup }
            }
          end

          level = match[1].length
          # Drop trail entries at the same or deeper level, then record this header.
          header_trail = header_trail.select { |h| header_level(h) < level }
          header_trail << line.rstrip

          body = @keep_headers ? line.dup : +""
          section_start = cursor
        else
          body << line
        end

        cursor += line.length
      end

      unless body.empty?
        sections << {
          text: body.rstrip,
          offset: section_start,
          metadata: { headers: header_trail.dup }
        }
      end

      sections
    end

    # Header depth of a raw header line; 7 (deeper than any real header)
    # when the line does not parse as a header.
    def header_level(header_line)
      match = header_line.match(/^(\#{1,6})/)
      match ? match[1].length : 7
    end
  end
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  module Rails
    # ActiveRecord mixin: `include ChunkerRuby::Rails::Chunkable` and declare
    # `chunkable :body` to automatically (re)chunk a text column on save.
    #
    # NOTE(review): assumes a companion "#{name}Chunk" model exists with
    # text/chunk_index/offset/metadata columns — confirm against the app schema.
    module Chunkable
      def self.included(base)
        base.extend(ClassMethods)
      end

      module ClassMethods
        # Declares +attribute+ as the chunkable column and configures the
        # splitting strategy. Stores the config in a class instance variable
        # (note: not inherited by subclasses of the model).
        #
        # @param attribute     [Symbol] column holding the text to chunk
        # @param strategy      [Symbol] see Chunkable.resolve_strategy
        # @param chunk_size    [Integer]
        # @param chunk_overlap [Integer]
        # @param options       [Hash] extra splitter-specific options
        def chunkable(attribute, strategy: :recursive_character, chunk_size: 1000, chunk_overlap: 200, **options)
          @chunkable_config = {
            attribute: attribute,
            strategy: strategy,
            chunk_size: chunk_size,
            chunk_overlap: chunk_overlap,
            options: options
          }

          # Re-chunk only when the tracked column actually changed.
          after_save :rechunk!, if: -> { saved_change_to_attribute?(attribute) }

          has_many :chunks,
                   class_name: "#{name}Chunk",
                   dependent: :destroy

          # Builds a fresh splitter instance from the stored configuration.
          define_method(:chunker) do
            config = self.class.instance_variable_get(:@chunkable_config)
            splitter_class = ChunkerRuby::Rails::Chunkable.resolve_strategy(config[:strategy])
            splitter_class.new(
              chunk_size: config[:chunk_size],
              chunk_overlap: config[:chunk_overlap],
              **config[:options]
            )
          end

          # Destroys existing chunk rows and recreates them from the current
          # attribute value. Skips records with a nil/empty attribute
          # (existing chunks are then left untouched).
          define_method(:rechunk!) do
            config = self.class.instance_variable_get(:@chunkable_config)
            content = send(config[:attribute])
            return if content.nil? || content.empty?

            chunks.destroy_all
            result = chunker.split(content, metadata: { source_id: id, source_type: self.class.name })
            result.each do |chunk|
              chunks.create!(
                text: chunk.text,
                chunk_index: chunk.index,
                offset: chunk.offset,
                metadata: chunk.metadata
              )
            end
          end
        end
      end

      # Maps a strategy symbol to its splitter class.
      # @raise [ArgumentError] for an unknown strategy
      def self.resolve_strategy(strategy)
        case strategy.to_sym
        when :character then ChunkerRuby::Character
        when :recursive_character then ChunkerRuby::RecursiveCharacter
        when :sentence then ChunkerRuby::Sentence
        when :separator then ChunkerRuby::Separator
        when :markdown then ChunkerRuby::Markdown
        when :html then ChunkerRuby::HTML
        when :code then ChunkerRuby::Code
        when :token then ChunkerRuby::Token
        else raise ArgumentError, "Unknown chunking strategy: #{strategy}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Recursively splits text on an ordered separator list (coarse to fine),
  # then re-packs the resulting pieces into size-bounded chunks with overlap
  # via BaseSplitter#build_chunks. The empty-string separator means
  # "split into individual characters" and is the ultimate fallback.
  class RecursiveCharacter < BaseSplitter
    DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", ", ", " ", ""].freeze

    # @param separators     [Array<String>, nil] try-in-order separators
    # @param keep_separator [Boolean] re-attach separators to the piece
    #   preceding them instead of discarding them
    def initialize(separators: nil, keep_separator: true, **kwargs)
      super(**kwargs)
      @separators = separators || DEFAULT_SEPARATORS
      @keep_separator = keep_separator
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      merge_chunks(recursive_split(text, @separators), text, metadata: metadata)
    end

    private

    # Splits +text+ with the first separator; any piece still larger than the
    # chunk size is re-split with the remaining (finer) separators. Pieces
    # that stay oversized after the last separator are kept as-is.
    def recursive_split(text, separators)
      return [text] if text.length <= @chunk_size || separators.empty?

      head, *tail = separators
      split_by_separator(text, head).flat_map do |piece|
        if piece.length <= @chunk_size || tail.empty?
          [piece]
        else
          recursive_split(piece, tail)
        end
      end
    end

    # Splits on a single separator; with @keep_separator, the separator is
    # appended to every piece except the last.
    def split_by_separator(text, separator)
      return text.chars if separator.empty?

      parts = text.split(separator, -1)
      return parts unless @keep_separator && parts.length > 1

      last = parts.length - 1
      pieces = []
      parts.each_with_index do |part, i|
        if i < last
          # A leading empty part still yields the bare separator (i == 0).
          pieces << (part + separator) unless part.empty? && i > 0
        elsif !part.empty?
          pieces << part
        end
      end
      pieces.empty? ? [text] : pieces
    end

    def merge_chunks(pieces, original_text, metadata: {})
      build_chunks(pieces, original_text, metadata: metadata)
    end
  end
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Embedding-driven splitter: sentences are grouped into chunks, with a
  # boundary wherever the cosine similarity between adjacent sentence
  # embeddings drops below +threshold+.
  class Semantic < BaseSplitter
    # @param embed          [#call] callable mapping a sentence String to a
    #   numeric embedding vector (Array of Numerics)
    # @param threshold      [Float] similarity below this starts a new chunk
    # @param min_chunk_size [Integer] chunks shorter than this merge backward
    # @param max_chunk_size [Integer] becomes the inherited chunk_size cap
    def initialize(embed:, threshold: 0.5, min_chunk_size: 100, max_chunk_size: 2000, **kwargs)
      # Overlap is meaningless for semantic boundaries, so it is fixed at 0.
      super(chunk_size: max_chunk_size, chunk_overlap: 0, **kwargs)
      @embed = embed
      @threshold = threshold
      @min_chunk_size = min_chunk_size
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      sentences = split_into_sentences(text)
      return [Chunk.new(text: text, index: 0, offset: 0, metadata: metadata)] if sentences.length <= 1

      vectors = sentences.map { |sentence| @embed.call(sentence) }
      build_semantic_chunks(sentences, find_split_points(vectors), text, metadata)
    end

    private

    # Naive sentence boundary: whitespace preceded by ., ! or ?.
    def split_into_sentences(text)
      text.split(/(?<=[.!?])\s+/).reject(&:empty?)
    end

    # Indices i such that similarity(sentence i, sentence i+1) < threshold.
    def find_split_points(vectors)
      (0...vectors.length - 1).select do |i|
        cosine_similarity(vectors[i], vectors[i + 1]) < @threshold
      end
    end

    # Standard cosine similarity; 0.0 for a zero-magnitude vector.
    def cosine_similarity(a, b)
      dot = a.zip(b).sum { |x, y| x * y }
      mag_a = Math.sqrt(a.sum { |x| x * x })
      mag_b = Math.sqrt(b.sum { |x| x * x })
      return 0.0 if mag_a.zero? || mag_b.zero?

      dot / (mag_a * mag_b)
    end

    # Groups sentences between split points into chunks, enforcing the
    # min/max size constraints.
    def build_semantic_chunks(sentences, split_points, original_text, metadata)
      chunks = []
      boundaries = [-1] + split_points + [sentences.length - 1]

      (0...boundaries.length - 1).each do |i|
        group = sentences[(boundaries[i] + 1)..boundaries[i + 1]]
        chunk_text = group.join(" ")

        if chunk_text.length > @chunk_size
          # Group still too large: re-split recursively.
          sub_splitter = RecursiveCharacter.new(
            chunk_size: @chunk_size,
            chunk_overlap: @chunk_overlap
          )
          sub_splitter.split(chunk_text, metadata: metadata).each do |sub|
            chunks << Chunk.new(
              text: sub.text,
              index: chunks.size,
              offset: original_text.index(sub.text) || 0,
              metadata: sub.metadata
            )
          end
        elsif chunk_text.length >= @min_chunk_size
          chunks << Chunk.new(
            text: chunk_text,
            index: chunks.size,
            offset: original_text.index(chunk_text) || 0,
            metadata: metadata.dup
          )
        elsif !chunks.empty?
          # Below the minimum: fold into the previous chunk.
          prev = chunks.pop
          chunks << Chunk.new(
            text: prev.text + " " + chunk_text,
            index: prev.index,
            offset: prev.offset,
            metadata: prev.metadata
          )
        else
          # First group may be under-sized with nothing to merge into; keep it.
          chunks << Chunk.new(
            text: chunk_text,
            index: chunks.size,
            offset: original_text.index(chunk_text) || 0,
            metadata: metadata.dup
          )
        end
      end

      chunks
    end
  end
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "strscan"

module ChunkerRuby
  # Sentence-aware splitter: detects sentence boundaries (guarding against
  # common abbreviations, decimals and ellipses) and packs sentences into
  # chunks via BaseSplitter#build_chunks.
  class Sentence < BaseSplitter
    # Terminal periods after these tokens do NOT end a sentence.
    ABBREVIATIONS = %w[
      Mr Mrs Ms Dr Prof Sr Jr St Gen Gov Sgt Cpl Pvt
      Inc Corp Ltd Co vs etc al
      Jan Feb Mar Apr Jun Jul Aug Sep Oct Nov Dec
      Ave Blvd Dept Div Est Fig
    ].freeze

    # @param min_chunk_size [Integer, nil] defaults to chunk_size / 3
    # @param max_chunk_size [Integer, nil] alias for chunk_size; wins over
    #   an explicit chunk_size keyword when both are given
    def initialize(min_chunk_size: nil, max_chunk_size: nil, **kwargs)
      effective_size = max_chunk_size || kwargs[:chunk_size] || 1000
      super(chunk_size: effective_size, **kwargs.except(:chunk_size))
      @min_chunk_size = min_chunk_size || (@chunk_size / 3)
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      build_chunks(split_into_sentences(text), text, metadata: metadata)
    end

    private

    # Accumulates scanned segments until one ends with genuine sentence
    # punctuation; falls back to the whole text when nothing matched.
    def split_into_sentences(text)
      sentences = []
      pending = +""

      text.scan(/[^.!?]*[.!?]+[\s]*|[^.!?]+\s*/) do |segment|
        pending << segment

        if segment.match?(/[.!?]\s*\z/) && real_sentence_end?(pending)
          sentences << pending
          pending = +""
        end
      end

      sentences << pending unless pending.strip.empty?
      sentences.empty? ? [text] : sentences
    end

    # False for abbreviation periods ("Dr."), trailing decimals ("3.") and
    # ellipses ("..."); true otherwise.
    def real_sentence_end?(candidate)
      stripped = candidate.rstrip
      return false if stripped.empty?
      return false if ABBREVIATIONS.any? { |abbr| stripped.end_with?("#{abbr}.") }
      return false if stripped.match?(/\d\.\z/)
      return false if stripped.end_with?("...")

      true
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Splits on one fixed separator — a String or a Regexp — then packs the
  # pieces via BaseSplitter#build_chunks. With keep_separator, each piece
  # retains the separator that followed it.
  class Separator < BaseSplitter
    # @param separator      [String, Regexp] boundary to split on; an empty
    #   String means per-character splitting
    # @param keep_separator [Boolean] re-attach separators to pieces
    def initialize(separator: "\n\n", keep_separator: true, **kwargs)
      super(**kwargs)
      @separator = separator
      @keep_separator = keep_separator
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      build_chunks(split_by_separator(text, @separator), text, metadata: metadata)
    end

    private

    def split_by_separator(text, separator)
      return split_with_regex(text, separator) if separator.is_a?(Regexp)
      return text.chars if separator.empty?

      split_with_string(text, separator)
    end

    # String separator: drop empty parts; append the separator to every
    # surviving part except the last.
    def split_with_string(text, separator)
      parts = text.split(separator, -1)
      return parts unless @keep_separator && parts.length > 1

      last = parts.length - 1
      rebuilt = parts.each_with_index.filter_map do |part, i|
        next if part.empty?

        i == last ? part : part + separator
      end
      rebuilt.empty? ? [text] : rebuilt
    end

    # Regexp separator: re-attach each actual matched separator text to the
    # fragment it followed.
    def split_with_regex(text, separator)
      fragments = text.split(separator, -1)
      return fragments unless @keep_separator

      matched = text.scan(separator)
      rebuilt = fragments.each_with_index.filter_map do |part, i|
        joined = i < matched.length ? part + matched[i] : part
        joined unless joined.empty?
      end
      rebuilt.empty? ? [text] : rebuilt
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Like Character, but with an explicitly configurable stride between
  # window starts (defaults to chunk_size - chunk_overlap).
  class SlidingWindow < BaseSplitter
    # @param stride [Integer, nil] distance between consecutive window
    #   starts; must be positive
    # @raise [ArgumentError] when the (derived) stride is not positive
    def initialize(stride: nil, **kwargs)
      super(**kwargs)
      @stride = stride || (@chunk_size - @chunk_overlap)
      raise ArgumentError, "stride must be positive" unless @stride > 0
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      chunks = []
      cursor = 0

      loop do
        stop = [cursor + @chunk_size, text.length].min

        chunks << Chunk.new(
          text: text[cursor...stop],
          index: chunks.size,
          offset: cursor,
          metadata: metadata.dup
        )

        break if stop >= text.length

        cursor += @stride
      end

      chunks
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ChunkerRuby
  # Token-budget splitter: chunk_size/chunk_overlap are measured in TOKENS.
  # Uses a tokenizer when one is available; otherwise falls back to a
  # ~4-characters-per-token estimate via Character splitting.
  class Token < BaseSplitter
    # @param tokenizer [nil, String, Symbol, #encode&#decode]
    #   nil loads the default model; a name loads that model; any other
    #   object is used directly (duck-typed: must respond to encode/decode).
    def initialize(tokenizer: nil, **kwargs)
      super(**kwargs)
      @tokenizer = resolve_tokenizer(tokenizer)
    end

    def split(text, metadata: {})
      return [] if text.nil? || text.empty?

      @tokenizer ? split_by_tokens(text, metadata) : split_by_estimation(text, metadata)
    end

    private

    def resolve_tokenizer(tokenizer)
      case tokenizer
      when nil
        try_load_default_tokenizer
      when String, Symbol
        try_load_tokenizer(tokenizer.to_s)
      else
        tokenizer # caller-supplied object; assumed to encode/decode
      end
    end

    def try_load_default_tokenizer
      try_load_tokenizer("gpt2")
    end

    # Returns nil (triggering the estimation fallback) when the optional
    # tokenizer gem is not installed.
    def try_load_tokenizer(name)
      require "tokenizer_ruby"
      TokenizerRuby::Tokenizer.new(name)
    rescue LoadError
      nil
    end

    # Windows over the encoded token stream; decoded text is located in the
    # original via index(strip) as a best-effort offset.
    def split_by_tokens(text, metadata)
      tokens = @tokenizer.encode(text)
      chunks = []
      cursor = 0

      loop do
        stop = [cursor + @chunk_size, tokens.length].min
        window = tokens[cursor...stop]
        decoded = @tokenizer.decode(window)

        chunks << Chunk.new(
          text: decoded,
          index: chunks.size,
          offset: text.index(decoded.strip) || 0,
          metadata: metadata.merge(token_count: window.length)
        )

        break if stop >= tokens.length

        cursor += @chunk_size - @chunk_overlap
      end

      chunks
    end

    # No tokenizer: approximate 4 characters per token and delegate to the
    # plain character splitter.
    def split_by_estimation(text, metadata)
      Character.new(
        chunk_size: @chunk_size * 4,
        chunk_overlap: @chunk_overlap * 4
      ).split(text, metadata: metadata)
    end
  end
end
|
data/lib/chunker_ruby.rb
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "chunker_ruby/version"
|
|
4
|
+
require_relative "chunker_ruby/chunk"
|
|
5
|
+
require_relative "chunker_ruby/base_splitter"
|
|
6
|
+
require_relative "chunker_ruby/character"
|
|
7
|
+
require_relative "chunker_ruby/separator"
|
|
8
|
+
require_relative "chunker_ruby/recursive_character"
|
|
9
|
+
require_relative "chunker_ruby/sentence"
|
|
10
|
+
require_relative "chunker_ruby/markdown"
|
|
11
|
+
require_relative "chunker_ruby/html"
|
|
12
|
+
require_relative "chunker_ruby/code"
|
|
13
|
+
require_relative "chunker_ruby/json_splitter"
|
|
14
|
+
require_relative "chunker_ruby/token"
|
|
15
|
+
require_relative "chunker_ruby/semantic"
|
|
16
|
+
require_relative "chunker_ruby/sliding_window"
|
|
17
|
+
|
|
18
|
+
module ChunkerRuby
  # Convenience entry point: chunk +text+ with the recursive character
  # strategy using sensible defaults.
  #
  # @param text [String] the text to split
  # @param chunk_size [Integer] target chunk size (default 1000)
  # @param chunk_overlap [Integer] overlap between chunks (default 200)
  # @param options [Hash] extra options forwarded to RecursiveCharacter
  # @return [Array<Chunk>]
  def self.split(text, chunk_size: 1000, chunk_overlap: 200, **options)
    splitter = RecursiveCharacter.new(
      chunk_size: chunk_size,
      chunk_overlap: chunk_overlap,
      **options
    )
    splitter.split(text)
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: chunker-ruby
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Johannes Dwi Cahyo
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies: []
|
|
12
|
+
description: Multiple chunking strategies to split documents into optimal pieces for
|
|
13
|
+
embedding and vector search. Supports character, recursive, sentence, markdown,
|
|
14
|
+
HTML, code, token, and semantic splitting.
|
|
15
|
+
email: []
|
|
16
|
+
executables: []
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- LICENSE
|
|
21
|
+
- lib/chunker_ruby.rb
|
|
22
|
+
- lib/chunker_ruby/base_splitter.rb
|
|
23
|
+
- lib/chunker_ruby/character.rb
|
|
24
|
+
- lib/chunker_ruby/chunk.rb
|
|
25
|
+
- lib/chunker_ruby/code.rb
|
|
26
|
+
- lib/chunker_ruby/html.rb
|
|
27
|
+
- lib/chunker_ruby/json_splitter.rb
|
|
28
|
+
- lib/chunker_ruby/markdown.rb
|
|
29
|
+
- lib/chunker_ruby/rails/chunkable.rb
|
|
30
|
+
- lib/chunker_ruby/recursive_character.rb
|
|
31
|
+
- lib/chunker_ruby/semantic.rb
|
|
32
|
+
- lib/chunker_ruby/sentence.rb
|
|
33
|
+
- lib/chunker_ruby/separator.rb
|
|
34
|
+
- lib/chunker_ruby/sliding_window.rb
|
|
35
|
+
- lib/chunker_ruby/token.rb
|
|
36
|
+
- lib/chunker_ruby/version.rb
|
|
37
|
+
homepage: https://github.com/johannesdwicahyo/chunker-ruby
|
|
38
|
+
licenses:
|
|
39
|
+
- MIT
|
|
40
|
+
metadata:
|
|
41
|
+
homepage_uri: https://github.com/johannesdwicahyo/chunker-ruby
|
|
42
|
+
source_code_uri: https://github.com/johannesdwicahyo/chunker-ruby
|
|
43
|
+
changelog_uri: https://github.com/johannesdwicahyo/chunker-ruby/blob/main/CHANGELOG.md
|
|
44
|
+
rdoc_options: []
|
|
45
|
+
require_paths:
|
|
46
|
+
- lib
|
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
48
|
+
requirements:
|
|
49
|
+
- - ">="
|
|
50
|
+
- !ruby/object:Gem::Version
|
|
51
|
+
version: 3.0.0
|
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
|
+
requirements:
|
|
54
|
+
- - ">="
|
|
55
|
+
- !ruby/object:Gem::Version
|
|
56
|
+
version: '0'
|
|
57
|
+
requirements: []
|
|
58
|
+
rubygems_version: 3.6.9
|
|
59
|
+
specification_version: 4
|
|
60
|
+
summary: Text chunking/splitting library for Ruby, designed for RAG pipelines
|
|
61
|
+
test_files: []
|