semantic_text_chunker 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +45 -5
- data/lib/semantic_text_chunker/boundary_detector.rb +16 -6
- data/lib/semantic_text_chunker/chunk_builder.rb +7 -1
- data/lib/semantic_text_chunker/chunker.rb +18 -5
- data/lib/semantic_text_chunker/splitters/sentence_splitter.rb +20 -7
- data/lib/semantic_text_chunker/splitters/structure_splitter.rb +69 -0
- data/lib/semantic_text_chunker/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ab921bdd7c9c704dbfbb565d42b5a2d6b4bf120d026e4fede72b97df7192508f
|
|
4
|
+
data.tar.gz: 3dce3cdcd0b84ec89eb296e2c12c7ba27eac8363675266bf7420927e8035a66f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 05dd40f830c2c7e43643b99ef7e739b87acd933cbf24bdf8f115df5121e044ab206abe84b73cfd1121d4a095dd45beb96eb5ff71415b6c2f054f7e79253faeca
|
|
7
|
+
data.tar.gz: f6b20bdf6f581e29885513f9aaa97982ce4031593065969766a93b5fec8c52b2525c16866d2df1f715c36bc8897c7e9b5e4543a3062db404922dcc1b041aa114
|
data/README.md
CHANGED
|
@@ -82,13 +82,17 @@ embedder = SemanticTextChunker::Embedders::Null.new
|
|
|
82
82
|
| `threshold` | `0.75` | Cosine similarity threshold for detecting boundaries |
|
|
83
83
|
| `max_tokens` | `512` | Maximum tokens per chunk (estimated at ~4 chars/token) |
|
|
84
84
|
| `overlap_sentences` | `2` | Number of sentences to overlap between chunks |
|
|
85
|
+
| `respect_structure` | `true` | Treat paragraph breaks and markdown headings as hard chunk boundaries |
|
|
86
|
+
| `extra_abbreviations` | `[]` | Additional abbreviations the sentence splitter should not split on |
|
|
85
87
|
|
|
86
88
|
```ruby
|
|
87
89
|
chunks = SemanticTextChunker.chunk(text,
|
|
88
90
|
embedder: embedder,
|
|
89
91
|
threshold: 0.8,
|
|
90
92
|
max_tokens: 1024,
|
|
91
|
-
overlap_sentences: 3
|
|
93
|
+
overlap_sentences: 3,
|
|
94
|
+
respect_structure: true,
|
|
95
|
+
extra_abbreviations: ["Inc", "Ltd"]
|
|
92
96
|
)
|
|
93
97
|
```
|
|
94
98
|
|
|
@@ -135,12 +139,48 @@ end
|
|
|
135
139
|
|
|
136
140
|
The base class provides a `cosine_similarity` method used for boundary detection.
|
|
137
141
|
|
|
142
|
+
## Sentence Splitting
|
|
143
|
+
|
|
144
|
+
Sentences are detected with punctuation-aware rules that:
|
|
145
|
+
|
|
146
|
+
- Keep common abbreviations intact (`Mr.`, `Dr.`, `U.S.A.`, `e.g.`, etc.)
|
|
147
|
+
- Keep decimal numbers intact (`3.14`, `v1.2.3`)
|
|
148
|
+
- Split dialogue ending in a closing quote (`"Stop!" He ran.`)
|
|
149
|
+
- Start new sentences on digits, opening quotes, or opening brackets, not just capital letters
|
|
150
|
+
|
|
151
|
+
To recognize domain-specific abbreviations, pass `extra_abbreviations` (also accepted
|
|
152
|
+
directly by `SemanticTextChunker.chunk`):
|
|
153
|
+
|
|
154
|
+
```ruby
|
|
155
|
+
splitter = SemanticTextChunker::Splitters::SentenceSplitter.new(
|
|
156
|
+
extra_abbreviations: ["Inc", "Ltd", "cf", "al"]
|
|
157
|
+
)
|
|
158
|
+
splitter.split("Acme Inc. shipped it. Done.")
|
|
159
|
+
# => ["Acme Inc. shipped it.", "Done."]
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Structure-Aware Chunking
|
|
163
|
+
|
|
164
|
+
By default (`respect_structure: true`), the chunker respects document structure so that
|
|
165
|
+
chunks never blur across obvious boundaries:
|
|
166
|
+
|
|
167
|
+
- **Paragraph breaks** (blank lines) end a chunk — two paragraphs are never merged into one.
|
|
168
|
+
- **Markdown headings** (`# ...` through `###### ...`) start a new section. A standalone
|
|
169
|
+
heading is attached to the content that follows it, so each section's chunk carries its
|
|
170
|
+
heading for context.
|
|
171
|
+
- **Overlap never crosses a structural boundary**, so a section's chunk won't leak the tail
|
|
172
|
+
of the previous section.
|
|
173
|
+
|
|
174
|
+
Semantic similarity and the token limit still apply *within* each structural block. Set
|
|
175
|
+
`respect_structure: false` to disable this and chunk purely by similarity and token count.
|
|
176
|
+
|
|
138
177
|
## How It Works
|
|
139
178
|
|
|
140
|
-
1. **
|
|
141
|
-
2. **
|
|
142
|
-
3. **
|
|
143
|
-
4. **
|
|
179
|
+
1. **Structure splitting** - Text is broken into blocks on paragraph breaks and markdown headings, which become hard boundaries that chunks are never merged across (when `respect_structure` is enabled)
|
|
180
|
+
2. **Sentence splitting** - Each block is split into sentences using punctuation-aware rules that handle abbreviations (Mr., Dr., U.S., etc.), decimals, and dialogue
|
|
181
|
+
3. **Embedding** - Each sentence is embedded using the configured embedder
|
|
182
|
+
4. **Boundary detection** - Consecutive sentences are grouped. A new chunk boundary is created at a structural boundary, when the cosine similarity between the accumulated chunk embedding and the next sentence drops below the threshold, or when the token limit is exceeded
|
|
183
|
+
5. **Chunk building** - Sentences are assembled into chunks with configurable overlap for context continuity (overlap never crosses a structural boundary)
|
|
144
184
|
|
|
145
185
|
## License
|
|
146
186
|
|
|
@@ -1,31 +1,41 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
1
3
|
module SemanticTextChunker
|
|
2
4
|
class BoundaryDetector
|
|
3
|
-
def initialize(sentences:, embeddings:, threshold:, max_tokens:, embedder:)
|
|
5
|
+
def initialize(sentences:, embeddings:, threshold:, max_tokens:, embedder:, forced: [])
|
|
4
6
|
@sentences = sentences
|
|
5
7
|
@embeddings = embeddings
|
|
6
8
|
@threshold = threshold
|
|
7
9
|
@max_tokens = max_tokens
|
|
8
10
|
@embedder = embedder
|
|
11
|
+
@forced = forced.to_set
|
|
9
12
|
end
|
|
10
13
|
|
|
11
14
|
# Returns array of sentence indices where chunks end
|
|
12
15
|
def boundaries
|
|
13
16
|
return [] if @sentences.size <= 1
|
|
14
17
|
|
|
15
|
-
boundaries
|
|
16
|
-
chunk_start
|
|
17
|
-
current_text = ""
|
|
18
|
+
boundaries = []
|
|
19
|
+
chunk_start = 0
|
|
18
20
|
|
|
19
21
|
@sentences.each_with_index do |sentence, i|
|
|
20
22
|
next if i == 0
|
|
23
|
+
|
|
24
|
+
# Hard structural boundary: the previous sentence ended a block, so the
|
|
25
|
+
# chunk must end there regardless of similarity or token count.
|
|
26
|
+
if @forced.include?(i - 1)
|
|
27
|
+
boundaries << (i - 1) unless boundaries.last == (i - 1)
|
|
28
|
+
chunk_start = i
|
|
29
|
+
next
|
|
30
|
+
end
|
|
31
|
+
|
|
21
32
|
current_text = @sentences[chunk_start..i - 1].join(" ")
|
|
22
33
|
next_text = current_text + " " + sentence
|
|
23
34
|
|
|
24
35
|
# Force boundary if adding this sentence exceeds token limit
|
|
25
36
|
if tokens(next_text) > @max_tokens
|
|
26
37
|
boundaries << i - 1
|
|
27
|
-
chunk_start
|
|
28
|
-
current_text = ""
|
|
38
|
+
chunk_start = i
|
|
29
39
|
next
|
|
30
40
|
end
|
|
31
41
|
|
|
@@ -1,9 +1,12 @@
|
|
|
1
|
+
require "set"
|
|
2
|
+
|
|
1
3
|
module SemanticTextChunker
|
|
2
4
|
class ChunkBuilder
|
|
3
|
-
def initialize(sentences:, boundaries:, overlap_sentences:)
|
|
5
|
+
def initialize(sentences:, boundaries:, overlap_sentences:, hard_boundaries: [])
|
|
4
6
|
@sentences = sentences
|
|
5
7
|
@boundaries = boundaries
|
|
6
8
|
@overlap_sentences = overlap_sentences
|
|
9
|
+
@hard_boundaries = hard_boundaries.to_set
|
|
7
10
|
end
|
|
8
11
|
|
|
9
12
|
def build
|
|
@@ -17,6 +20,9 @@ module SemanticTextChunker
|
|
|
17
20
|
split_points.each_with_index do |boundary, idx|
|
|
18
21
|
start = if idx == 0
|
|
19
22
|
0
|
|
23
|
+
elsif @hard_boundaries.include?(prev_end)
|
|
24
|
+
# Don't carry overlap across a structural boundary.
|
|
25
|
+
prev_end + 1
|
|
20
26
|
else
|
|
21
27
|
# Overlap: go back N sentences from previous boundary
|
|
22
28
|
[prev_end - @overlap_sentences + 1, 0].max
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
require_relative "splitters/sentence_splitter"
|
|
2
|
+
require_relative "splitters/structure_splitter"
|
|
2
3
|
require_relative "embedders/base"
|
|
3
4
|
require_relative "embedders/null"
|
|
4
5
|
require_relative "boundary_detector"
|
|
@@ -11,19 +12,29 @@ module SemanticTextChunker
|
|
|
11
12
|
embedder: Embedders::Null.new,
|
|
12
13
|
threshold: 0.75,
|
|
13
14
|
max_tokens: 512,
|
|
14
|
-
overlap_sentences: 2
|
|
15
|
+
overlap_sentences: 2,
|
|
16
|
+
respect_structure: true,
|
|
17
|
+
extra_abbreviations: []
|
|
15
18
|
)
|
|
16
19
|
@embedder = embedder
|
|
17
20
|
@threshold = threshold
|
|
18
21
|
@max_tokens = max_tokens
|
|
19
22
|
@overlap_sentences = overlap_sentences
|
|
20
|
-
@
|
|
23
|
+
@respect_structure = respect_structure
|
|
24
|
+
@splitter = Splitters::SentenceSplitter.new(extra_abbreviations: extra_abbreviations)
|
|
25
|
+
@structure_splitter = Splitters::StructureSplitter.new(sentence_splitter: @splitter)
|
|
21
26
|
end
|
|
22
27
|
|
|
23
28
|
def chunk(text)
|
|
24
29
|
return [] if text.nil? || text.strip.empty?
|
|
25
30
|
|
|
26
|
-
|
|
31
|
+
if @respect_structure
|
|
32
|
+
sentences, hard = @structure_splitter.split(text)
|
|
33
|
+
else
|
|
34
|
+
sentences = @splitter.split(text)
|
|
35
|
+
hard = []
|
|
36
|
+
end
|
|
37
|
+
|
|
27
38
|
embeddings = @embedder.embed(sentences)
|
|
28
39
|
|
|
29
40
|
boundaries = BoundaryDetector.new(
|
|
@@ -31,13 +42,15 @@ module SemanticTextChunker
|
|
|
31
42
|
embeddings: embeddings,
|
|
32
43
|
threshold: @threshold,
|
|
33
44
|
max_tokens: @max_tokens,
|
|
34
|
-
embedder: @embedder
|
|
45
|
+
embedder: @embedder,
|
|
46
|
+
forced: hard
|
|
35
47
|
).boundaries
|
|
36
48
|
|
|
37
49
|
ChunkBuilder.new(
|
|
38
50
|
sentences: sentences,
|
|
39
51
|
boundaries: boundaries,
|
|
40
|
-
overlap_sentences: @overlap_sentences
|
|
52
|
+
overlap_sentences: @overlap_sentences,
|
|
53
|
+
hard_boundaries: hard
|
|
41
54
|
).build
|
|
42
55
|
end
|
|
43
56
|
|
|
@@ -2,18 +2,31 @@ module SemanticTextChunker
|
|
|
2
2
|
module Splitters
|
|
3
3
|
class SentenceSplitter
|
|
4
4
|
ABBREVS = %w[Mr Mrs Dr Prof Sr Jr vs etc e.g i.e U.S U.K U.S.A Fig Vol No].freeze
|
|
5
|
-
|
|
5
|
+
|
|
6
|
+
# Split after a terminator (optionally followed by a closing quote/bracket)
|
|
7
|
+
# and whitespace, when the next sentence starts with an opening quote or bracket,
|
|
8
|
+
# an uppercase letter, or a digit.
|
|
9
|
+
SPLIT_PATTERN = /(?<=[.?!]|[.?!]["')\]])\s+(?=["'(\[A-Z0-9])/
|
|
10
|
+
|
|
11
|
+
ABBREV_PLACEHOLDER = "__STC_ABBREV__".freeze
|
|
12
|
+
DECIMAL_PLACEHOLDER = "__STC_DEC__".freeze
|
|
13
|
+
|
|
14
|
+
def initialize(extra_abbreviations: [])
|
|
15
|
+
@abbrevs = (ABBREVS + extra_abbreviations).freeze
|
|
16
|
+
@abbrev_pattern = /\b(#{@abbrevs.map { |a| Regexp.escape(a) }.join("|")})\.\s/
|
|
17
|
+
end
|
|
6
18
|
|
|
7
19
|
def split(text)
|
|
20
|
+
# Protect periods inside decimal numbers (e.g. 3.14, v1.2.3)
|
|
21
|
+
protected = text.gsub(/(\d)\.(\d)/) { "#{$1}#{DECIMAL_PLACEHOLDER}#{$2}" }
|
|
22
|
+
|
|
8
23
|
# Temporarily replace abbreviation periods
|
|
9
|
-
protected =
|
|
24
|
+
protected = protected.gsub(@abbrev_pattern) { "#{$1}#{ABBREV_PLACEHOLDER} " }
|
|
10
25
|
|
|
11
|
-
|
|
12
|
-
.split(
|
|
13
|
-
.map { |s| s.gsub("
|
|
26
|
+
protected
|
|
27
|
+
.split(SPLIT_PATTERN)
|
|
28
|
+
.map { |s| s.gsub(ABBREV_PLACEHOLDER, ".").gsub(DECIMAL_PLACEHOLDER, ".").strip }
|
|
14
29
|
.reject(&:empty?)
|
|
15
|
-
|
|
16
|
-
sentences
|
|
17
30
|
end
|
|
18
31
|
end
|
|
19
32
|
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
require_relative "sentence_splitter"
|
|
2
|
+
|
|
3
|
+
module SemanticTextChunker
|
|
4
|
+
module Splitters
|
|
5
|
+
# Splits text while respecting document structure. Blank-line paragraph
|
|
6
|
+
# breaks and markdown headings produce "hard" boundaries that chunks are
|
|
7
|
+
# never merged across. A standalone heading is attached to the block of
|
|
8
|
+
# content that follows it, so the heading travels with its section.
|
|
9
|
+
class StructureSplitter
|
|
10
|
+
# ATX markdown heading line, e.g. "## Section title"
|
|
11
|
+
HEADING_LINE = /\A\#{1,6}\s+\S/
|
|
12
|
+
|
|
13
|
+
def initialize(sentence_splitter: SentenceSplitter.new)
|
|
14
|
+
@sentence_splitter = sentence_splitter
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Returns [sentences, hard_boundaries] where:
|
|
18
|
+
# sentences - flat array of sentence strings across all blocks
|
|
19
|
+
# hard_boundaries - sentence indices that must end a chunk
|
|
20
|
+
def split(text)
|
|
21
|
+
sentences = []
|
|
22
|
+
hard = []
|
|
23
|
+
|
|
24
|
+
segment(text).each do |block|
|
|
25
|
+
block_sentences = @sentence_splitter.split(block)
|
|
26
|
+
next if block_sentences.empty?
|
|
27
|
+
|
|
28
|
+
sentences.concat(block_sentences)
|
|
29
|
+
hard << sentences.size - 1
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# The last block's trailing boundary is the document end, not a split.
|
|
33
|
+
hard.pop
|
|
34
|
+
|
|
35
|
+
[sentences, hard]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
private
|
|
39
|
+
|
|
40
|
+
# Break text into blocks on blank lines, merging a standalone heading
|
|
41
|
+
# into the following block.
|
|
42
|
+
def segment(text)
|
|
43
|
+
raw = text.split(/\n[ \t]*\n+/).map(&:strip).reject(&:empty?)
|
|
44
|
+
|
|
45
|
+
merged = []
|
|
46
|
+
pending_heading = nil
|
|
47
|
+
|
|
48
|
+
raw.each do |block|
|
|
49
|
+
if heading_only?(block)
|
|
50
|
+
pending_heading = pending_heading ? "#{pending_heading}\n#{block}" : block
|
|
51
|
+
elsif pending_heading
|
|
52
|
+
merged << "#{pending_heading}\n#{block}"
|
|
53
|
+
pending_heading = nil
|
|
54
|
+
else
|
|
55
|
+
merged << block
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
merged << pending_heading if pending_heading
|
|
60
|
+
merged
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def heading_only?(block)
|
|
64
|
+
lines = block.lines.map(&:strip).reject(&:empty?)
|
|
65
|
+
lines.size == 1 && lines.first.match?(HEADING_LINE)
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: semantic_text_chunker
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Vlad Tigănilă
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-06-02 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description: Detects topic boundaries using embedding similarity to produce semantically
|
|
14
14
|
coherent chunks from books, articles, and documents. Supports Cohere, OpenAI, and
|
|
@@ -32,6 +32,7 @@ files:
|
|
|
32
32
|
- lib/semantic_text_chunker/embedders/openai.rb
|
|
33
33
|
- lib/semantic_text_chunker/metadata.rb
|
|
34
34
|
- lib/semantic_text_chunker/splitters/sentence_splitter.rb
|
|
35
|
+
- lib/semantic_text_chunker/splitters/structure_splitter.rb
|
|
35
36
|
- lib/semantic_text_chunker/version.rb
|
|
36
37
|
homepage: https://github.com/VladTZY/semantic_text_chunker
|
|
37
38
|
licenses:
|