chunker-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/chunker_ruby/base_splitter.rb +38 -20
- data/lib/chunker_ruby/semantic.rb +29 -12
- data/lib/chunker_ruby/token.rb +1 -1
- data/lib/chunker_ruby/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '0296653d093ff4ab8cfb8752ba9ba000fce26878a116341940a0b5ae23d8bf0d'
|
|
4
|
+
data.tar.gz: 3de883b592a703d1190d83e0cc4c0250998fb195de3c3d672dd6c97a78688f33
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ad7ab58fcfa9abb0e5c6e1b97b7519ab24906ea16b9d641b25954a6d74c50a02ab852a60cc5e537e67b37f2668bc6e629b3635ff9015939e85923c3a6215fd19
|
|
7
|
+
data.tar.gz: 99f9997b05d00fb5f0406ff95ddd385b48446f6b0ed1c203e56e6bdf5018bab859d1ffe53d633238f7d573fae28f3b0aca611ef0dcdc3da683d608f1318515ed
|
|
data/lib/chunker_ruby/base_splitter.rb
CHANGED
|
@@ -24,47 +24,63 @@ module ChunkerRuby
|
|
|
24
24
|
private
|
|
25
25
|
|
|
26
26
|
def build_chunks(pieces, original_text, metadata: {})
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
# Pre-compute offsets for each piece to avoid re-searching (fixes duplicate text)
|
|
28
|
+
piece_offsets = compute_piece_offsets(pieces, original_text)
|
|
29
|
+
merged = merge_pieces_with_offsets(pieces, piece_offsets)
|
|
29
30
|
|
|
30
|
-
merged
|
|
31
|
+
merged.map.with_index do |entry|
|
|
32
|
+
next if entry[:text].strip.empty?
|
|
31
33
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
offset = original_text.index(chunk_text, current_pos) || current_pos
|
|
37
|
-
|
|
38
|
-
chunks << Chunk.new(
|
|
39
|
-
text: chunk_text,
|
|
40
|
-
index: chunks.size,
|
|
41
|
-
offset: offset,
|
|
34
|
+
Chunk.new(
|
|
35
|
+
text: entry[:text],
|
|
36
|
+
index: 0, # will be reindexed below
|
|
37
|
+
offset: entry[:offset],
|
|
42
38
|
metadata: metadata.dup
|
|
43
39
|
)
|
|
44
|
-
|
|
45
|
-
|
|
40
|
+
end.compact.each_with_index.map do |chunk, i|
|
|
41
|
+
Chunk.new(text: chunk.text, index: i, offset: chunk.offset, metadata: chunk.metadata)
|
|
46
42
|
end
|
|
43
|
+
end
|
|
47
44
|
|
|
48
|
-
|
|
45
|
+
def compute_piece_offsets(pieces, original_text)
|
|
46
|
+
offsets = []
|
|
47
|
+
pos = 0
|
|
48
|
+
pieces.each do |piece|
|
|
49
|
+
idx = original_text.index(piece, pos)
|
|
50
|
+
if idx
|
|
51
|
+
offsets << idx
|
|
52
|
+
pos = idx + piece.length
|
|
53
|
+
else
|
|
54
|
+
offsets << pos
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
offsets
|
|
49
58
|
end
|
|
50
59
|
|
|
51
60
|
def merge_pieces(pieces)
|
|
61
|
+
merge_pieces_with_offsets(pieces, nil).map { |e| e[:text] }
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def merge_pieces_with_offsets(pieces, piece_offsets)
|
|
52
65
|
merged = []
|
|
53
66
|
current_parts = []
|
|
67
|
+
current_offsets = []
|
|
54
68
|
current_length = 0
|
|
55
69
|
|
|
56
|
-
pieces.
|
|
70
|
+
pieces.each_with_index do |piece, i|
|
|
57
71
|
piece_len = piece.length
|
|
58
72
|
|
|
59
73
|
if current_length + piece_len > @chunk_size && !current_parts.empty?
|
|
60
|
-
merged << current_parts.join
|
|
74
|
+
merged << { text: current_parts.join, offset: current_offsets.first || 0 }
|
|
61
75
|
|
|
62
76
|
# Handle overlap: keep trailing parts that fit within overlap size
|
|
63
77
|
overlap_parts = []
|
|
78
|
+
overlap_offsets = []
|
|
64
79
|
overlap_length = 0
|
|
65
|
-
current_parts.reverse_each do |part|
|
|
80
|
+
current_parts.zip(current_offsets).reverse_each do |part, off|
|
|
66
81
|
if overlap_length + part.length <= @chunk_overlap
|
|
67
82
|
overlap_parts.unshift(part)
|
|
83
|
+
overlap_offsets.unshift(off)
|
|
68
84
|
overlap_length += part.length
|
|
69
85
|
else
|
|
70
86
|
break
|
|
@@ -72,14 +88,16 @@ module ChunkerRuby
|
|
|
72
88
|
end
|
|
73
89
|
|
|
74
90
|
current_parts = overlap_parts
|
|
91
|
+
current_offsets = overlap_offsets
|
|
75
92
|
current_length = overlap_length
|
|
76
93
|
end
|
|
77
94
|
|
|
78
95
|
current_parts << piece
|
|
96
|
+
current_offsets << (piece_offsets ? piece_offsets[i] : 0)
|
|
79
97
|
current_length += piece_len
|
|
80
98
|
end
|
|
81
99
|
|
|
82
|
-
merged << current_parts.join unless current_parts.empty?
|
|
100
|
+
merged << { text: current_parts.join, offset: current_offsets.first || 0 } unless current_parts.empty?
|
|
83
101
|
|
|
84
102
|
merged
|
|
85
103
|
end
|
|
data/lib/chunker_ruby/semantic.rb
CHANGED
|
@@ -24,7 +24,9 @@ module ChunkerRuby
|
|
|
24
24
|
private
|
|
25
25
|
|
|
26
26
|
def split_into_sentences(text)
|
|
27
|
-
|
|
27
|
+
# Use scan to preserve exact boundaries without losing whitespace info
|
|
28
|
+
parts = text.scan(/[^.!?]*[.!?]+\s*|[^.!?]+/)
|
|
29
|
+
parts.map! { |s| s.rstrip }
|
|
28
30
|
parts.reject(&:empty?)
|
|
29
31
|
end
|
|
30
32
|
|
|
@@ -48,14 +50,27 @@ module ChunkerRuby
|
|
|
48
50
|
|
|
49
51
|
def build_semantic_chunks(sentences, split_points, original_text, metadata)
|
|
50
52
|
chunks = []
|
|
51
|
-
current_pos = 0
|
|
52
53
|
boundaries = [-1] + split_points + [sentences.length - 1]
|
|
53
54
|
|
|
55
|
+
# Pre-compute sentence positions in original text
|
|
56
|
+
sent_offsets = []
|
|
57
|
+
spos = 0
|
|
58
|
+
sentences.each do |s|
|
|
59
|
+
idx = original_text.index(s, spos)
|
|
60
|
+
sent_offsets << (idx || spos)
|
|
61
|
+
spos = (idx || spos) + s.length
|
|
62
|
+
end
|
|
63
|
+
|
|
54
64
|
(0...boundaries.length - 1).each do |i|
|
|
55
65
|
start_idx = boundaries[i] + 1
|
|
56
66
|
end_idx = boundaries[i + 1]
|
|
57
67
|
chunk_sentences = sentences[start_idx..end_idx]
|
|
58
|
-
|
|
68
|
+
|
|
69
|
+
# Extract chunk from original text to preserve spacing
|
|
70
|
+
chunk_start = sent_offsets[start_idx]
|
|
71
|
+
chunk_end = sent_offsets[end_idx] + sentences[end_idx].length
|
|
72
|
+
chunk_text = original_text[chunk_start...chunk_end].rstrip
|
|
73
|
+
chunk_text = chunk_sentences.join(" ") if chunk_text.strip.empty?
|
|
59
74
|
|
|
60
75
|
# Enforce size constraints
|
|
61
76
|
if chunk_text.length > @chunk_size
|
|
@@ -65,8 +80,7 @@ module ChunkerRuby
|
|
|
65
80
|
)
|
|
66
81
|
sub_chunks = sub_splitter.split(chunk_text, metadata: metadata)
|
|
67
82
|
sub_chunks.each do |sc|
|
|
68
|
-
offset =
|
|
69
|
-
current_pos = offset + sc.text.length
|
|
83
|
+
offset = chunk_start + (sc.offset || 0)
|
|
70
84
|
chunks << Chunk.new(
|
|
71
85
|
text: sc.text,
|
|
72
86
|
index: chunks.size,
|
|
@@ -74,9 +88,10 @@ module ChunkerRuby
|
|
|
74
88
|
metadata: sc.metadata
|
|
75
89
|
)
|
|
76
90
|
end
|
|
91
|
+
current_pos = chunk_end
|
|
77
92
|
elsif chunk_text.length >= @min_chunk_size
|
|
78
|
-
offset =
|
|
79
|
-
current_pos =
|
|
93
|
+
offset = chunk_start
|
|
94
|
+
current_pos = chunk_end
|
|
80
95
|
chunks << Chunk.new(
|
|
81
96
|
text: chunk_text,
|
|
82
97
|
index: chunks.size,
|
|
@@ -86,17 +101,19 @@ module ChunkerRuby
|
|
|
86
101
|
elsif !chunks.empty?
|
|
87
102
|
# Merge small chunk with previous
|
|
88
103
|
prev = chunks.pop
|
|
89
|
-
|
|
104
|
+
merged_end = chunk_end
|
|
105
|
+
merged_text = original_text[prev.offset...merged_end].rstrip
|
|
106
|
+
merged_text = prev.text + " " + chunk_text if merged_text.strip.empty?
|
|
90
107
|
chunks << Chunk.new(
|
|
91
|
-
text:
|
|
108
|
+
text: merged_text,
|
|
92
109
|
index: prev.index,
|
|
93
110
|
offset: prev.offset,
|
|
94
111
|
metadata: prev.metadata
|
|
95
112
|
)
|
|
96
|
-
current_pos =
|
|
113
|
+
current_pos = merged_end
|
|
97
114
|
else
|
|
98
|
-
offset =
|
|
99
|
-
current_pos =
|
|
115
|
+
offset = chunk_start
|
|
116
|
+
current_pos = chunk_end
|
|
100
117
|
chunks << Chunk.new(
|
|
101
118
|
text: chunk_text,
|
|
102
119
|
index: chunks.size,
|
data/lib/chunker_ruby/token.rb
CHANGED
data/lib/chunker_ruby/version.rb
CHANGED