semantic_text_chunker 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +147 -0
- data/lib/semantic_text_chunker/boundary_detector.rb +60 -0
- data/lib/semantic_text_chunker/chunk_builder.rb +33 -0
- data/lib/semantic_text_chunker/chunker.rb +49 -0
- data/lib/semantic_text_chunker/embedders/base.rb +17 -0
- data/lib/semantic_text_chunker/embedders/cohere.rb +41 -0
- data/lib/semantic_text_chunker/embedders/null.rb +15 -0
- data/lib/semantic_text_chunker/embedders/open_router.rb +45 -0
- data/lib/semantic_text_chunker/embedders/openai.rb +41 -0
- data/lib/semantic_text_chunker/metadata.rb +22 -0
- data/lib/semantic_text_chunker/splitters/sentence_splitter.rb +20 -0
- data/lib/semantic_text_chunker/version.rb +3 -0
- data/lib/semantic_text_chunker.rb +22 -0
- metadata +59 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: bd5a74c09a20b7a87ae6d2a3c8a4cfda92315de9d4f18ffe54de76ee35b0db4f
|
|
4
|
+
data.tar.gz: cfe5098ea9fcd78e0fd8297755f2ac1c568d34656decf4b98b31a7c7f2e69c66
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 8cc50f43d68240887b1af0498ac50a320a3c77fa97656cc60ecb8b2baa4c829c381bb2e66763d53b65dbd32def3f54952b33fd9e0278bfa681b752e046aa1d27
|
|
7
|
+
data.tar.gz: 391aba7c0d6dc181fc29a9b383dd3f07f1196dd4f4e5487e724b0893f80a6ca9a207f9f9e6cb8c6f003a91d800bdd074feb000bed985ba656cab4289f2dbbc53
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Vlad Tigănilă
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# SemanticTextChunker
|
|
2
|
+
|
|
3
|
+
Embedding-aware semantic chunking for Ruby RAG pipelines. Splits text into coherent chunks by detecting topic boundaries using embedding similarity, rather than blindly splitting on character count.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Add to your Gemfile:
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
gem "semantic_text_chunker"
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Then run:
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
bundle install
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or install directly:
|
|
20
|
+
|
|
21
|
+
```sh
|
|
22
|
+
gem install semantic_text_chunker
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```ruby
|
|
28
|
+
require "semantic_text_chunker"
|
|
29
|
+
|
|
30
|
+
text = "Your long document text here..."
|
|
31
|
+
|
|
32
|
+
# Using OpenAI embeddings
|
|
33
|
+
chunks = SemanticTextChunker.chunk(text,
|
|
34
|
+
embedder: SemanticTextChunker::Embedders::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
chunks.each { |chunk| puts chunk, "---" }
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Embedders
|
|
41
|
+
|
|
42
|
+
### OpenAI
|
|
43
|
+
|
|
44
|
+
```ruby
|
|
45
|
+
embedder = SemanticTextChunker::Embedders::OpenAI.new(
|
|
46
|
+
api_key: ENV["OPENAI_API_KEY"],
|
|
47
|
+
model: "text-embedding-3-small" # default
|
|
48
|
+
)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Cohere
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
embedder = SemanticTextChunker::Embedders::Cohere.new(
|
|
55
|
+
api_key: ENV["COHERE_API_KEY"],
|
|
56
|
+
model: "embed-english-v3.0" # default
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### OpenRouter
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
embedder = SemanticTextChunker::Embedders::OpenRouter.new(
|
|
64
|
+
api_key: ENV["OPENROUTER_API_KEY"],
|
|
65
|
+
model: "openai/text-embedding-3-small" # default
|
|
66
|
+
)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Null (no API required)
|
|
70
|
+
|
|
71
|
+
A hash-based bag-of-words embedder useful for testing and development. It makes no external API calls; similarity reflects shared vocabulary between sentences rather than meaning.
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
embedder = SemanticTextChunker::Embedders::Null.new
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Options
|
|
78
|
+
|
|
79
|
+
| Option | Default | Description |
|
|
80
|
+
|---------------------|---------|-----------------------------------------------------|
|
|
81
|
+
| `embedder` | `Null` | Embedder instance to use for generating embeddings |
|
|
82
|
+
| `threshold` | `0.75` | Minimum cosine similarity between the current chunk and the next sentence; a lower score starts a new chunk |
|
|
83
|
+
| `max_tokens` | `512` | Maximum tokens per chunk before overlap sentences are added (estimated at ~4 chars/token) |
|
|
84
|
+
| `overlap_sentences` | `2` | Number of sentences to overlap between chunks |
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
chunks = SemanticTextChunker.chunk(text,
|
|
88
|
+
embedder: embedder,
|
|
89
|
+
threshold: 0.8,
|
|
90
|
+
max_tokens: 1024,
|
|
91
|
+
overlap_sentences: 3
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Metadata
|
|
96
|
+
|
|
97
|
+
Prepend metadata to each chunk for better retrieval context:
|
|
98
|
+
|
|
99
|
+
```ruby
|
|
100
|
+
chunks = SemanticTextChunker.chunk_with_metadata(text,
|
|
101
|
+
embedder: embedder,
|
|
102
|
+
title: "The Great Gatsby",
|
|
103
|
+
author: "F. Scott Fitzgerald",
|
|
104
|
+
chapter: "Chapter 1",
|
|
105
|
+
section: "Opening",
|
|
106
|
+
source: "gutenberg.org"
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Each chunk will be prefixed with:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
Title: The Great Gatsby
|
|
114
|
+
Author: F. Scott Fitzgerald
|
|
115
|
+
Chapter: Chapter 1
|
|
116
|
+
Section: Opening
|
|
117
|
+
Source: gutenberg.org
|
|
118
|
+
|
|
119
|
+
<chunk text>
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Custom Embedders
|
|
123
|
+
|
|
124
|
+
Create your own embedder by subclassing `SemanticTextChunker::Embedders::Base`:
|
|
125
|
+
|
|
126
|
+
```ruby
|
|
127
|
+
class MyEmbedder < SemanticTextChunker::Embedders::Base
|
|
128
|
+
def embed(texts)
|
|
129
|
+
# texts is an array of strings
|
|
130
|
+
# Return an array of embedding vectors (arrays of floats)
|
|
131
|
+
texts.map { |t| your_embedding_logic(t) }
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
The base class provides a `cosine_similarity` method used for boundary detection.
|
|
137
|
+
|
|
138
|
+
## How It Works
|
|
139
|
+
|
|
140
|
+
1. **Sentence splitting** - Text is split into sentences using punctuation-aware rules that handle abbreviations (Mr., Dr., U.S., etc.)
|
|
141
|
+
2. **Embedding** - Each sentence is embedded using the configured embedder
|
|
142
|
+
3. **Boundary detection** - Consecutive sentences are grouped. A new chunk boundary is created when the cosine similarity between the accumulated chunk embedding and the next sentence drops below the threshold, or when the token limit is exceeded
|
|
143
|
+
4. **Chunk building** - Sentences are assembled into chunks with configurable overlap for context continuity
|
|
144
|
+
|
|
145
|
+
## License
|
|
146
|
+
|
|
147
|
+
MIT
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
module SemanticTextChunker
  # Walks the sentences in order and decides after which of them a chunk
  # should end, based on an estimated token budget and on how similar the
  # next sentence is to the chunk accumulated so far.
  class BoundaryDetector
    # Rough size heuristic: one token per four characters.
    CHARS_PER_TOKEN = 4.0

    # @param sentences [Array<String>] sentences in document order
    # @param embeddings [Array<Array<Numeric>>] one vector per sentence, same order
    # @param threshold [Numeric] similarity below which a new chunk is started
    # @param max_tokens [Numeric] estimated token budget for a chunk
    # @param embedder [#cosine_similarity] object used to compare two vectors
    def initialize(sentences:, embeddings:, threshold:, max_tokens:, embedder:)
      @sentences = sentences
      @embeddings = embeddings
      @threshold = threshold
      @max_tokens = max_tokens
      @embedder = embedder
    end

    # Returns the indices of the sentences that close a chunk. The final
    # sentence is never listed; an input of zero or one sentence yields [].
    #
    # @return [Array<Integer>]
    # @raise [ArgumentError] when the number of embeddings differs from the
    #   number of sentences
    def boundaries
      return [] if @sentences.size <= 1

      ensure_one_embedding_per_sentence!

      chunk_ends = []
      chunk_start = 0
      # Length the chunk would have if its sentences were joined by single
      # spaces; tracked incrementally instead of re-joining on every step.
      chunk_chars = @sentences.first.length

      (1...@sentences.size).each do |i|
        sentence_chars = @sentences[i].length
        grown_chars = chunk_chars + 1 + sentence_chars

        # The size check comes first so that no similarity is computed for a
        # sentence that cannot fit anyway.
        if tokens_for(grown_chars) > @max_tokens || topic_shift?(chunk_start, i)
          chunk_ends << i - 1
          chunk_start = i
          chunk_chars = sentence_chars
        else
          chunk_chars = grown_chars
        end
      end

      chunk_ends
    end

    private

    # Fails early with a clear message instead of a nil error deep in the loop.
    def ensure_one_embedding_per_sentence!
      count = @embeddings.respond_to?(:size) ? @embeddings.size : 0
      return if count == @sentences.size

      raise ArgumentError, "expected #{@sentences.size} embeddings (one per sentence), got #{count}"
    end

    # True when sentence +index+ is less similar than the threshold to the
    # mean embedding of the chunk that started at +chunk_start+.
    def topic_shift?(chunk_start, index)
      chunk_embedding = mean_embedding(@embeddings[chunk_start..index - 1])
      @embedder.cosine_similarity(chunk_embedding, @embeddings[index]) < @threshold
    end

    # Component-wise average of the given vectors.
    def mean_embedding(embeddings)
      return embeddings.first if embeddings.size == 1

      sum = Array.new(embeddings.first.size, 0.0)
      embeddings.each { |vector| vector.each_with_index { |value, i| sum[i] += value } }
      sum.map { |value| value / embeddings.size }
    end

    # Estimated token count for a text of +char_count+ characters.
    def tokens_for(char_count)
      (char_count / CHARS_PER_TOKEN).ceil
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
module SemanticTextChunker
  # Turns sentences plus chunk-end indices into chunk strings, repeating the
  # last few sentences before each boundary at the start of the next chunk.
  class ChunkBuilder
    # @param sentences [Array<String>] sentences in document order
    # @param boundaries [Array<Integer>] index of the last sentence of every
    #   chunk except the final one
    # @param overlap_sentences [Integer] how many sentences preceding a
    #   boundary are repeated at the start of the following chunk
    def initialize(sentences:, boundaries:, overlap_sentences:)
      @sentences = sentences
      @boundaries = boundaries
      @overlap_sentences = overlap_sentences
    end

    # Builds the chunk strings. Sentences are joined with single spaces, each
    # chunk is stripped, and empty chunks are dropped, so empty input yields [].
    #
    # @return [Array<String>]
    def build
      # A negative overlap would move the start past the boundary and drop
      # sentences, so it is treated as "no overlap".
      overlap = [@overlap_sentences.to_i, 0].max
      # The final sentence always closes the last chunk.
      chunk_ends = @boundaries + [@sentences.size - 1]
      previous_end = nil

      chunk_ends.each_with_object([]) do |chunk_end, chunks|
        first = previous_end ? [previous_end - overlap + 1, 0].max : 0
        text = @sentences[first..chunk_end].join(" ").strip
        chunks << text unless text.empty?
        previous_end = chunk_end
      end
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require_relative "splitters/sentence_splitter"
|
|
2
|
+
require_relative "embedders/base"
|
|
3
|
+
require_relative "embedders/null"
|
|
4
|
+
require_relative "boundary_detector"
|
|
5
|
+
require_relative "chunk_builder"
|
|
6
|
+
require_relative "metadata"
|
|
7
|
+
|
|
8
|
+
module SemanticTextChunker
  # Runs the whole pipeline: split text into sentences, embed them, detect
  # chunk boundaries and assemble the chunk strings.
  class Chunker
    # @param embedder [#embed, #cosine_similarity] embedding backend
    # @param threshold [Numeric] similarity threshold passed to BoundaryDetector
    # @param max_tokens [Numeric] token budget passed to BoundaryDetector
    # @param overlap_sentences [Integer] overlap passed to ChunkBuilder
    def initialize(embedder: Embedders::Null.new, threshold: 0.75, max_tokens: 512, overlap_sentences: 2)
      @embedder = embedder
      @threshold = threshold
      @max_tokens = max_tokens
      @overlap_sentences = overlap_sentences
      @splitter = Splitters::SentenceSplitter.new
    end

    # Splits +text+ into chunks.
    #
    # @param text [String, nil]
    # @return [Array<String>] chunk strings; [] for nil or blank text
    def chunk(text)
      return [] if text.nil? || text.strip.empty?

      sentences = @splitter.split(text)

      ChunkBuilder.new(
        sentences: sentences,
        boundaries: boundaries_for(sentences),
        overlap_sentences: @overlap_sentences
      ).build
    end

    # Same as #chunk, with the text produced by Metadata.prefix placed in
    # front of every chunk.
    #
    # @param text [String, nil]
    # @param metadata [Hash] keyword arguments forwarded to Metadata.prefix
    # @return [Array<String>]
    def chunk_with_metadata(text, **metadata)
      header = Metadata.prefix(**metadata)
      chunk(text).map { |body| header + body }
    end

    private

    # Embeds the sentences and asks BoundaryDetector where chunks end.
    def boundaries_for(sentences)
      detector = BoundaryDetector.new(
        sentences: sentences,
        embeddings: @embedder.embed(sentences),
        threshold: @threshold,
        max_tokens: @max_tokens,
        embedder: @embedder
      )
      detector.boundaries
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module SemanticTextChunker
  module Embedders
    # Common superclass for embedders: subclasses supply #embed, this class
    # supplies the vector comparison.
    class Base
      # Converts texts into embedding vectors. Must be overridden.
      #
      # @param texts [Array<String>]
      # @return [Array<Array<Numeric>>] one vector per text
      # @raise [NotImplementedError] always, in this base implementation
      def embed(texts)
        raise NotImplementedError, "#{self.class} must implement #embed"
      end

      # Cosine similarity of two vectors.
      #
      # @param a [Array<Numeric>]
      # @param b [Array<Numeric>]
      # @return [Float] dot(a, b) / (|a| * |b|), or 0.0 when either vector
      #   has zero magnitude
      # @raise [ArgumentError] when the vectors have different dimensions
      def cosine_similarity(a, b)
        # zip would otherwise pad the shorter side with nil or silently ignore
        # the tail of the longer one, giving an error or a wrong score.
        unless a.size == b.size
          raise ArgumentError, "embedding dimensions differ (#{a.size} vs #{b.size})"
        end

        dot = a.zip(b).sum { |x, y| x * y }
        mag_a = Math.sqrt(a.sum { |x| x**2 })
        mag_b = Math.sqrt(b.sum { |x| x**2 })
        return 0.0 if mag_a.zero? || mag_b.zero?

        dot / (mag_a * mag_b)
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "json"
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module SemanticTextChunker
  module Embedders
    # Embedder that posts texts to the Cohere embed endpoint.
    class Cohere < Base
      # Number of texts sent per request.
      BATCH_SIZE = 96
      ENDPOINT = "https://api.cohere.com/v1/embed"

      # @param api_key [String] sent as a Bearer token
      # @param model [String] model name sent with every request
      def initialize(api_key:, model: "embed-english-v3.0")
        @api_key = api_key
        @model = model
      end

      # Embeds the texts in batches of BATCH_SIZE and concatenates the results.
      #
      # @param texts [Array<String>]
      # @return [Array] the "embeddings" entries of all responses, in request order
      # @raise [EmbedderError] on a non-success HTTP status or a response
      #   without an "embeddings" entry
      def embed(texts)
        texts.each_slice(BATCH_SIZE).flat_map { |batch| embeddings_in(post(batch)) }
      end

      private

      # Returns the "embeddings" entry of a parsed response or raises.
      def embeddings_in(response)
        found = response["embeddings"]
        return found if found

        raise(EmbedderError, "No embeddings in response: #{response}")
      end

      # Sends one batch and returns the parsed JSON body.
      def post(texts)
        uri = URI(ENDPOINT)
        http = Net::HTTP.new(uri.host, uri.port).tap { |client| client.use_ssl = true }

        req = Net::HTTP::Post.new(uri)
        req["Authorization"] = "Bearer #{@api_key}"
        req["Content-Type"] = "application/json"
        req.body = { texts: texts, model: @model, input_type: "search_document" }.to_json

        res = http.request(req)
        return JSON.parse(res.body) if res.is_a?(Net::HTTPSuccess)

        raise EmbedderError, "Cohere #{res.code}: #{res.body}"
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
require "zlib"

module SemanticTextChunker
  module Embedders
    # Offline embedder: hashes each distinct word of a text into one of
    # DIMENSIONS buckets and returns the unit-length bucket-count vector.
    class Null < Base
      DIMENSIONS = 512

      # @param texts [Array<String>]
      # @return [Array<Array<Float>>] one DIMENSIONS-sized vector per text;
      #   an all-zero vector for a text without words
      def embed(texts)
        texts.map do |text|
          vector = Array.new(DIMENSIONS, 0.0)
          # CRC32 rather than String#hash: String#hash is seeded per process,
          # which made the vectors differ from one run to the next.
          text.downcase.split.uniq.each { |word| vector[Zlib.crc32(word) % DIMENSIONS] += 1.0 }

          norm = Math.sqrt(vector.sum { |x| x**2 })
          norm.positive? ? vector.map { |x| x / norm } : vector
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "json"
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module SemanticTextChunker
  module Embedders
    # Embedder that posts texts to the OpenRouter embeddings endpoint.
    class OpenRouter < Base
      # Number of texts sent per request.
      BATCH_SIZE = 100
      ENDPOINT = "https://openrouter.ai/api/v1/embeddings"

      # @param api_key [String] sent as a Bearer token
      # @param model [String] model name sent with every request
      # @param site_url [String, nil] value for the HTTP-Referer header;
      #   defaults to the gem's repository URL
      # @param site_name [String, nil] value for the X-Title header;
      #   defaults to the gem name
      def initialize(api_key:, model: "openai/text-embedding-3-small", site_url: nil, site_name: nil)
        @api_key = api_key
        @model = model
        @site_url = site_url || "https://github.com/VladTZY/semantic_text_chunker"
        @site_name = site_name || "semantic_text_chunker"
      end

      # Embeds the texts in batches of BATCH_SIZE and concatenates the results.
      #
      # @param texts [Array<String>]
      # @return [Array<Array<Numeric>>] one vector per text, in input order
      # @raise [EmbedderError] on a non-success HTTP status or a response that
      #   does not carry exactly one embedding per submitted text
      def embed(texts)
        texts.each_slice(BATCH_SIZE).flat_map { |batch| embeddings_from(post(batch), batch.size) }
      end

      private

      # Validates a parsed response and returns its vectors in input order.
      def embeddings_from(response, expected)
        data = response["data"] if response.is_a?(Hash)
        unless data.is_a?(Array) && data.size == expected
          raise EmbedderError, "OpenRouter response is missing embeddings for #{expected} inputs: #{response}"
        end

        # Order by the reported "index" when present; otherwise keep the
        # order in which the entries arrived.
        ordered = data.each_with_index.sort_by { |item, position| item["index"] || position }
        vectors = ordered.map { |item, _position| item["embedding"] }
        if vectors.any?(&:nil?)
          raise EmbedderError, "OpenRouter response contains entries without an embedding: #{response}"
        end

        vectors
      end

      # Sends one batch and returns the parsed JSON body.
      def post(texts)
        uri = URI(ENDPOINT)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true

        req = Net::HTTP::Post.new(uri)
        req["Authorization"] = "Bearer #{@api_key}"
        req["Content-Type"] = "application/json"
        # App attribution headers. NOTE(review): the original comment called
        # HTTP-Referer required by OpenRouter — confirm against their docs.
        req["HTTP-Referer"] = @site_url
        req["X-Title"] = @site_name
        req.body = { input: texts, model: @model }.to_json

        res = http.request(req)
        raise EmbedderError, "OpenRouter #{res.code}: #{res.body}" unless res.is_a?(Net::HTTPSuccess)

        JSON.parse(res.body)
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "json"
|
|
3
|
+
require "uri"
|
|
4
|
+
|
|
5
|
+
module SemanticTextChunker
  module Embedders
    # Embedder that posts texts to the OpenAI embeddings endpoint.
    class OpenAI < Base
      # Number of texts sent per request.
      BATCH_SIZE = 100
      ENDPOINT = "https://api.openai.com/v1/embeddings"

      # @param api_key [String] sent as a Bearer token
      # @param model [String] model name sent with every request
      def initialize(api_key:, model: "text-embedding-3-small")
        @api_key = api_key
        @model = model
      end

      # Embeds the texts in batches of BATCH_SIZE and concatenates the results.
      #
      # @param texts [Array<String>]
      # @return [Array<Array<Numeric>>] one vector per text, in input order
      # @raise [EmbedderError] on a non-success HTTP status or a response that
      #   does not carry exactly one embedding per submitted text
      def embed(texts)
        texts.each_slice(BATCH_SIZE).flat_map { |batch| embeddings_from(post(batch), batch.size) }
      end

      private

      # Validates a parsed response and returns its vectors in input order.
      def embeddings_from(response, expected)
        data = response["data"] if response.is_a?(Hash)
        unless data.is_a?(Array) && data.size == expected
          raise EmbedderError, "OpenAI response is missing embeddings for #{expected} inputs: #{response}"
        end

        # Order by the reported "index" when present; otherwise keep the
        # order in which the entries arrived.
        ordered = data.each_with_index.sort_by { |item, position| item["index"] || position }
        vectors = ordered.map { |item, _position| item["embedding"] }
        if vectors.any?(&:nil?)
          raise EmbedderError, "OpenAI response contains entries without an embedding: #{response}"
        end

        vectors
      end

      # Sends one batch and returns the parsed JSON body.
      def post(texts)
        uri = URI(ENDPOINT)
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true

        req = Net::HTTP::Post.new(uri)
        req["Authorization"] = "Bearer #{@api_key}"
        req["Content-Type"] = "application/json"
        req.body = { input: texts, model: @model }.to_json

        res = http.request(req)
        raise EmbedderError, "OpenAI #{res.code}: #{res.body}" unless res.is_a?(Net::HTTPSuccess)

        JSON.parse(res.body)
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
module SemanticTextChunker
  # Formats document metadata as a text header to place in front of a chunk.
  module Metadata
    # Keys that are always listed first, in this order.
    KNOWN_KEYS = %i[title author chapter section source].freeze

    # Builds a "Label: value" header, one line per entry, followed by a blank
    # line. KNOWN_KEYS come first in their fixed order, then any other keys
    # in the order given. Labels are the key with underscores turned into
    # spaces and each word capitalized. Entries whose value is nil, false or
    # renders as an empty string are skipped, for known and extra keys alike.
    #
    # @param kwargs [Hash{Symbol => Object}] metadata entries
    # @return [String] the header, or "" when no entry has a value
    def self.prefix(**kwargs)
      ordered_keys = KNOWN_KEYS + (kwargs.keys - KNOWN_KEYS)

      lines = ordered_keys.filter_map do |key|
        value = kwargs[key]
        next unless value && !value.to_s.empty?

        label = key.to_s.split("_").map(&:capitalize).join(" ")
        "#{label}: #{value}"
      end

      lines.empty? ? "" : "#{lines.join("\n")}\n\n"
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module SemanticTextChunker
  module Splitters
    # Regex-based sentence splitter that avoids breaking after a fixed list
    # of abbreviations.
    class SentenceSplitter
      ABBREVS = %w[Mr Mrs Dr Prof Sr Jr vs etc e.g i.e U.S U.K U.S.A Fig Vol No].freeze
      # An abbreviation, its trailing period and the whitespace character after it.
      ABBREV_PATTERN = /\b(#{ABBREVS.map { |a| Regexp.escape(a) }.join("|")})\.\s/
      # Stand-in for an abbreviation's period while the text is being split.
      PLACEHOLDER = "__ABBREV__"
      # Whitespace that follows ., ? or ! and precedes an uppercase letter
      # (any script), optionally behind an opening quote or bracket.
      BOUNDARY_PATTERN = /(?<=[.?!])\s+(?=["'(\[]?\p{Lu})/

      # Splits text into sentences.
      #
      # @param text [String, nil]
      # @return [Array<String>] stripped, non-empty sentences in order;
      #   [] for nil or blank text
      def split(text)
        return [] if text.nil?

        # Hide abbreviation periods so they are not taken for sentence ends.
        # The whitespace character after the period is carried over as-is, so
        # a line break there stays a line break.
        shielded = text.gsub(ABBREV_PATTERN) { |hit| "#{hit[0...-2]}#{PLACEHOLDER}#{hit[-1]}" }

        shielded
          .split(BOUNDARY_PATTERN)
          .map { |sentence| sentence.gsub(PLACEHOLDER, ".").strip }
          .reject(&:empty?)
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
require_relative "semantic_text_chunker/version"
|
|
2
|
+
require_relative "semantic_text_chunker/chunker"
|
|
3
|
+
require_relative "semantic_text_chunker/embedders/cohere"
|
|
4
|
+
require_relative "semantic_text_chunker/embedders/openai"
|
|
5
|
+
require_relative "semantic_text_chunker/embedders/open_router"
|
|
6
|
+
require_relative "semantic_text_chunker/embedders/null"
|
|
7
|
+
|
|
8
|
+
module SemanticTextChunker
  # Raised by the bundled HTTP embedders when a request fails or a response
  # carries no embeddings.
  class EmbedderError < StandardError; end

  # Keyword arguments understood by Chunker.new; chunk_with_metadata treats
  # every other keyword as metadata.
  CHUNKER_OPTION_KEYS = %i[embedder threshold max_tokens overlap_sentences].freeze

  # Chunks +text+ with a Chunker built from +opts+.
  #
  # @param text [String, nil]
  # @param opts [Hash] keyword arguments for Chunker.new
  # @return [Array<String>]
  def self.chunk(text, **opts)
    Chunker.new(**opts).chunk(text)
  end

  # Chunks +text+ and prefixes every chunk with a metadata header.
  #
  # Keywords listed in CHUNKER_OPTION_KEYS configure the Chunker; all others
  # (title, author, ... and any custom key) are passed on as metadata. Custom
  # keys previously reached Chunker.new and raised ArgumentError. The flip
  # side is that a misspelled chunker option is now treated as metadata
  # rather than rejected.
  #
  # @param text [String, nil]
  # @param opts [Hash] chunker options and metadata entries, mixed
  # @return [Array<String>]
  def self.chunk_with_metadata(text, **opts)
    chunker_opts = opts.slice(*CHUNKER_OPTION_KEYS)
    metadata = opts.except(*CHUNKER_OPTION_KEYS)

    Chunker.new(**chunker_opts).chunk_with_metadata(text, **metadata)
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: semantic_text_chunker
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Vlad Tigănilă
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-05-19 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: Detects topic boundaries using embedding similarity to produce semantically
|
|
14
|
+
coherent chunks from books, articles, and documents. Supports Cohere, OpenAI, and
|
|
15
|
+
OpenRouter embedders.
|
|
16
|
+
email:
|
|
17
|
+
- tiganilavlad@gmail.com
|
|
18
|
+
executables: []
|
|
19
|
+
extensions: []
|
|
20
|
+
extra_rdoc_files: []
|
|
21
|
+
files:
|
|
22
|
+
- LICENSE
|
|
23
|
+
- README.md
|
|
24
|
+
- lib/semantic_text_chunker.rb
|
|
25
|
+
- lib/semantic_text_chunker/boundary_detector.rb
|
|
26
|
+
- lib/semantic_text_chunker/chunk_builder.rb
|
|
27
|
+
- lib/semantic_text_chunker/chunker.rb
|
|
28
|
+
- lib/semantic_text_chunker/embedders/base.rb
|
|
29
|
+
- lib/semantic_text_chunker/embedders/cohere.rb
|
|
30
|
+
- lib/semantic_text_chunker/embedders/null.rb
|
|
31
|
+
- lib/semantic_text_chunker/embedders/open_router.rb
|
|
32
|
+
- lib/semantic_text_chunker/embedders/openai.rb
|
|
33
|
+
- lib/semantic_text_chunker/metadata.rb
|
|
34
|
+
- lib/semantic_text_chunker/splitters/sentence_splitter.rb
|
|
35
|
+
- lib/semantic_text_chunker/version.rb
|
|
36
|
+
homepage: https://github.com/VladTZY/semantic_text_chunker
|
|
37
|
+
licenses:
|
|
38
|
+
- MIT
|
|
39
|
+
metadata: {}
|
|
40
|
+
post_install_message:
|
|
41
|
+
rdoc_options: []
|
|
42
|
+
require_paths:
|
|
43
|
+
- lib
|
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
45
|
+
requirements:
|
|
46
|
+
- - ">="
|
|
47
|
+
- !ruby/object:Gem::Version
|
|
48
|
+
version: 3.0.0
|
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0'
|
|
54
|
+
requirements: []
|
|
55
|
+
rubygems_version: 3.5.11
|
|
56
|
+
signing_key:
|
|
57
|
+
specification_version: 4
|
|
58
|
+
summary: Embedding-aware semantic chunking for Ruby RAG pipelines
|
|
59
|
+
test_files: []
|