chunker-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2ef1a60bf60351dc527abc378d992bfa05b0de0d9c64af3db4edbb63b9539c61
4
- data.tar.gz: 01452e12091762a1dee9e86b2613e525a3dd9536cd51fd5d52da19e4a4f829dd
3
+ metadata.gz: '0296653d093ff4ab8cfb8752ba9ba000fce26878a116341940a0b5ae23d8bf0d'
4
+ data.tar.gz: 3de883b592a703d1190d83e0cc4c0250998fb195de3c3d672dd6c97a78688f33
5
5
  SHA512:
6
- metadata.gz: a4b276bd94c9c0e7c6749223eecf8e78aa578f65866a4ab98f0d53e6245fc29ff622e6d8d262d3937c22b1f9e35e23f875c3ca52d5fc188138090c251ce1de29
7
- data.tar.gz: d4c86c9423f92c20526a4f771c61e95f24edc7a11bf6920189d0eb408213c49e6f9828e02b1c6cb25121953a2f352a9b8c9f9e46db42af0ec5eca98b71820a2a
6
+ metadata.gz: ad7ab58fcfa9abb0e5c6e1b97b7519ab24906ea16b9d641b25954a6d74c50a02ab852a60cc5e537e67b37f2668bc6e629b3635ff9015939e85923c3a6215fd19
7
+ data.tar.gz: 99f9997b05d00fb5f0406ff95ddd385b48446f6b0ed1c203e56e6bdf5018bab859d1ffe53d633238f7d573fae28f3b0aca611ef0dcdc3da683d608f1318515ed
@@ -24,47 +24,63 @@ module ChunkerRuby
24
24
  private
25
25
 
26
26
  def build_chunks(pieces, original_text, metadata: {})
27
- chunks = []
28
- current_pos = 0
27
+ # Pre-compute offsets for each piece to avoid re-searching (fixes duplicate text)
28
+ piece_offsets = compute_piece_offsets(pieces, original_text)
29
+ merged = merge_pieces_with_offsets(pieces, piece_offsets)
29
30
 
30
- merged = merge_pieces(pieces)
31
+ merged.map.with_index do |entry|
32
+ next if entry[:text].strip.empty?
31
33
 
32
- merged.each do |chunk_text|
33
- next if chunk_text.strip.empty?
34
-
35
- # Find the actual position starting from current_pos
36
- offset = original_text.index(chunk_text, current_pos) || current_pos
37
-
38
- chunks << Chunk.new(
39
- text: chunk_text,
40
- index: chunks.size,
41
- offset: offset,
34
+ Chunk.new(
35
+ text: entry[:text],
36
+ index: 0, # will be reindexed below
37
+ offset: entry[:offset],
42
38
  metadata: metadata.dup
43
39
  )
44
-
45
- current_pos = offset + chunk_text.length
40
+ end.compact.each_with_index.map do |chunk, i|
41
+ Chunk.new(text: chunk.text, index: i, offset: chunk.offset, metadata: chunk.metadata)
46
42
  end
43
+ end
47
44
 
48
- chunks
45
+ def compute_piece_offsets(pieces, original_text)
46
+ offsets = []
47
+ pos = 0
48
+ pieces.each do |piece|
49
+ idx = original_text.index(piece, pos)
50
+ if idx
51
+ offsets << idx
52
+ pos = idx + piece.length
53
+ else
54
+ offsets << pos
55
+ end
56
+ end
57
+ offsets
49
58
  end
50
59
 
51
60
  def merge_pieces(pieces)
61
+ merge_pieces_with_offsets(pieces, nil).map { |e| e[:text] }
62
+ end
63
+
64
+ def merge_pieces_with_offsets(pieces, piece_offsets)
52
65
  merged = []
53
66
  current_parts = []
67
+ current_offsets = []
54
68
  current_length = 0
55
69
 
56
- pieces.each do |piece|
70
+ pieces.each_with_index do |piece, i|
57
71
  piece_len = piece.length
58
72
 
59
73
  if current_length + piece_len > @chunk_size && !current_parts.empty?
60
- merged << current_parts.join
74
+ merged << { text: current_parts.join, offset: current_offsets.first || 0 }
61
75
 
62
76
  # Handle overlap: keep trailing parts that fit within overlap size
63
77
  overlap_parts = []
78
+ overlap_offsets = []
64
79
  overlap_length = 0
65
- current_parts.reverse_each do |part|
80
+ current_parts.zip(current_offsets).reverse_each do |part, off|
66
81
  if overlap_length + part.length <= @chunk_overlap
67
82
  overlap_parts.unshift(part)
83
+ overlap_offsets.unshift(off)
68
84
  overlap_length += part.length
69
85
  else
70
86
  break
@@ -72,14 +88,16 @@ module ChunkerRuby
72
88
  end
73
89
 
74
90
  current_parts = overlap_parts
91
+ current_offsets = overlap_offsets
75
92
  current_length = overlap_length
76
93
  end
77
94
 
78
95
  current_parts << piece
96
+ current_offsets << (piece_offsets ? piece_offsets[i] : 0)
79
97
  current_length += piece_len
80
98
  end
81
99
 
82
- merged << current_parts.join unless current_parts.empty?
100
+ merged << { text: current_parts.join, offset: current_offsets.first || 0 } unless current_parts.empty?
83
101
 
84
102
  merged
85
103
  end
@@ -24,7 +24,9 @@ module ChunkerRuby
24
24
  private
25
25
 
26
26
  def split_into_sentences(text)
27
- parts = text.split(/(?<=[.!?])\s+/)
27
+ # Use scan to preserve exact boundaries without losing whitespace info
28
+ parts = text.scan(/[^.!?]*[.!?]+\s*|[^.!?]+/)
29
+ parts.map! { |s| s.rstrip }
28
30
  parts.reject(&:empty?)
29
31
  end
30
32
 
@@ -48,14 +50,27 @@ module ChunkerRuby
48
50
 
49
51
  def build_semantic_chunks(sentences, split_points, original_text, metadata)
50
52
  chunks = []
51
- current_pos = 0
52
53
  boundaries = [-1] + split_points + [sentences.length - 1]
53
54
 
55
+ # Pre-compute sentence positions in original text
56
+ sent_offsets = []
57
+ spos = 0
58
+ sentences.each do |s|
59
+ idx = original_text.index(s, spos)
60
+ sent_offsets << (idx || spos)
61
+ spos = (idx || spos) + s.length
62
+ end
63
+
54
64
  (0...boundaries.length - 1).each do |i|
55
65
  start_idx = boundaries[i] + 1
56
66
  end_idx = boundaries[i + 1]
57
67
  chunk_sentences = sentences[start_idx..end_idx]
58
- chunk_text = chunk_sentences.join(" ")
68
+
69
+ # Extract chunk from original text to preserve spacing
70
+ chunk_start = sent_offsets[start_idx]
71
+ chunk_end = sent_offsets[end_idx] + sentences[end_idx].length
72
+ chunk_text = original_text[chunk_start...chunk_end].rstrip
73
+ chunk_text = chunk_sentences.join(" ") if chunk_text.strip.empty?
59
74
 
60
75
  # Enforce size constraints
61
76
  if chunk_text.length > @chunk_size
@@ -65,8 +80,7 @@ module ChunkerRuby
65
80
  )
66
81
  sub_chunks = sub_splitter.split(chunk_text, metadata: metadata)
67
82
  sub_chunks.each do |sc|
68
- offset = original_text.index(sc.text, current_pos) || current_pos
69
- current_pos = offset + sc.text.length
83
+ offset = chunk_start + (sc.offset || 0)
70
84
  chunks << Chunk.new(
71
85
  text: sc.text,
72
86
  index: chunks.size,
@@ -74,9 +88,10 @@ module ChunkerRuby
74
88
  metadata: sc.metadata
75
89
  )
76
90
  end
91
+ current_pos = chunk_end
77
92
  elsif chunk_text.length >= @min_chunk_size
78
- offset = original_text.index(chunk_text, current_pos) || current_pos
79
- current_pos = offset + chunk_text.length
93
+ offset = chunk_start
94
+ current_pos = chunk_end
80
95
  chunks << Chunk.new(
81
96
  text: chunk_text,
82
97
  index: chunks.size,
@@ -86,17 +101,19 @@ module ChunkerRuby
86
101
  elsif !chunks.empty?
87
102
  # Merge small chunk with previous
88
103
  prev = chunks.pop
89
- merged = prev.text + " " + chunk_text
104
+ merged_end = chunk_end
105
+ merged_text = original_text[prev.offset...merged_end].rstrip
106
+ merged_text = prev.text + " " + chunk_text if merged_text.strip.empty?
90
107
  chunks << Chunk.new(
91
- text: merged,
108
+ text: merged_text,
92
109
  index: prev.index,
93
110
  offset: prev.offset,
94
111
  metadata: prev.metadata
95
112
  )
96
- current_pos = prev.offset + merged.length
113
+ current_pos = merged_end
97
114
  else
98
- offset = original_text.index(chunk_text, current_pos) || current_pos
99
- current_pos = offset + chunk_text.length
115
+ offset = chunk_start
116
+ current_pos = chunk_end
100
117
  chunks << Chunk.new(
101
118
  text: chunk_text,
102
119
  index: chunks.size,
@@ -54,7 +54,7 @@ module ChunkerRuby
54
54
  stripped = raw_text.strip
55
55
 
56
56
  offset = text.index(stripped, current_pos) || current_pos
57
- current_pos = offset + stripped.length
57
+ current_pos = offset + [stripped.length, 1].max
58
58
 
59
59
  chunks << Chunk.new(
60
60
  text: raw_text,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ChunkerRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chunker-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo