semantic_chunker 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/semantic_chunker/adapters/base.rb +10 -0
- data/lib/semantic_chunker/adapters/hugging_face_adapter.rb +56 -0
- data/lib/semantic_chunker/adapters/openai_adapter.rb +44 -0
- data/lib/semantic_chunker/adapters/test_adapter.rb +17 -0
- data/lib/semantic_chunker/chunker.rb +107 -0
- data/lib/semantic_chunker/version.rb +3 -0
- data/lib/semantic_chunker.rb +33 -0
- metadata +120 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: '0782ee5ae0f80488a985b3c12afad2eb95252ecd45849ed98a8446aef4dbfc66'
|
|
4
|
+
data.tar.gz: 988a17459d404db90f460527d105e00579a44a442deabeba3c3e6462a4e440de
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 29b84d713798dabc248986ad2040da8bffe029c74dcba9e7f35aef783a8ac5b1b944c4ebdba1f1af50c79b78d92a9fb06c66c9f71a901a9c457032edd1508865
|
|
7
|
+
data.tar.gz: 26a6d3ac345c0a6d88cbffc7877a2fd387293c75d81d2cb85771b2d8f66778645944c46608f9de405eb787a4c48546f6efe859790963c129d2dfdd1ded8dcd32
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# lib/semantic_chunker/adapters/hugging_face_adapter.rb
#
# Embedding adapter for the Hugging Face Inference API.
require "net/http"
require "json"
require "uri"

module SemanticChunker
  module Adapters
    class HuggingFaceAdapter < Base
      # %{model} is substituted with the configured model id.
      BASE_URL = "https://router.huggingface.co/hf-inference/models/%{model}"

      # Bound the warm-up retry loop: at most 6 retries, 10s apart (~1 min).
      # The original retried forever, which could hang the caller indefinitely.
      MAX_WARMUP_RETRIES = 6
      WARMUP_DELAY_SECONDS = 10

      # @param api_key [String] Hugging Face API token
      # @param model [String] model id on the HF hub; other embedding models
      #   that work here include 'sentence-transformers/all-MiniLM-L6-v2'
      #   and 'BAAI/bge-small-en-v1.5'
      def initialize(api_key:, model: 'intfloat/multilingual-e5-large')
        @api_key = api_key
        @model = model
      end

      # Embeds the given sentences.
      #
      # @param sentences [Array<String>]
      # @return [Array<Array<Float>>] parsed JSON response (one vector per sentence)
      # @raise [RuntimeError] on non-JSON responses, API errors, or when the
      #   model is still loading after MAX_WARMUP_RETRIES attempts
      def embed(sentences)
        retries_left = MAX_WARMUP_RETRIES

        loop do
          response = post_request(sentences)

          unless response.content_type == "application/json"
            raise "HuggingFace Error: Expected JSON, got #{response.content_type}. Body: #{response.body}"
          end

          parsed = JSON.parse(response.body)
          return parsed if response.is_a?(Net::HTTPSuccess)

          # The API reports "... is currently loading" while the model warms up;
          # retry a bounded number of times instead of recursing forever.
          if parsed.is_a?(Hash) && parsed["error"]&.include?("loading") && retries_left.positive?
            retries_left -= 1
            # warn (stderr) instead of puts: library code should not write to stdout.
            warn "Model warming up... retrying in #{WARMUP_DELAY_SECONDS}s"
            sleep WARMUP_DELAY_SECONDS
            next
          end

          raise "HuggingFace Error: #{parsed['error'] || parsed}"
        end
      end

      private

      # Builds and sends the POST request; returns the raw Net::HTTP response.
      def post_request(sentences)
        uri = URI(BASE_URL % { model: @model })
        request = Net::HTTP::Post.new(uri)

        request["Authorization"] = "Bearer #{@api_key}"
        request["Content-Type"] = "application/json"
        # Ask the router to block until the model is available when possible.
        request["X-Wait-For-Model"] = "true"

        request.body = {
          inputs: sentences
        }.to_json

        Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
          http.read_timeout = 60
          http.request(request)
        end
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# lib/semantic_chunker/adapters/openai_adapter.rb
require "net/http"
require "json"
require "uri"

module SemanticChunker
  module Adapters
    # Embedding adapter for the OpenAI embeddings endpoint.
    class OpenAIAdapter < Base
      ENDPOINT = "https://api.openai.com/v1/embeddings"

      # @param api_key [String] OpenAI API key
      # @param model [String] embedding model name
      def initialize(api_key:, model: "text-embedding-3-small")
        @api_key = api_key
        @model = model
      end

      # Embeds the given sentences and returns one vector per sentence,
      # preserving input order (the API returns data in request order).
      #
      # @param sentences [Array<String>]
      # @return [Array<Array<Float>>]
      # @raise [RuntimeError] when the API responds with an error
      def embed(sentences)
        http_response = post_request(sentences)
        payload = JSON.parse(http_response.body)

        unless http_response.is_a?(Net::HTTPSuccess)
          raise "OpenAI Error: #{payload.dig('error', 'message') || http_response.code}"
        end

        # Keep only the raw embedding arrays from each data entry.
        payload["data"].map { |entry| entry["embedding"] }
      end

      private

      # Issues the HTTPS POST and returns the raw response object.
      def post_request(sentences)
        endpoint = URI(ENDPOINT)

        post = Net::HTTP::Post.new(endpoint)
        post["Authorization"] = "Bearer #{@api_key}"
        post["Content-Type"] = "application/json"
        post.body = JSON.generate(input: sentences, model: @model)

        Net::HTTP.start(endpoint.hostname, endpoint.port, use_ssl: true) do |connection|
          connection.request(post)
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# lib/semantic_chunker/adapters/test_adapter.rb
module SemanticChunker
  module Adapters
    # Stub adapter for test suites: callers can inject fixed vectors to
    # simulate specific "topics" without hitting any real embedding API.
    class TestAdapter < Base
      # @param predefined_vectors [Array<Array<Float>>, nil] vectors to return
      #   verbatim from #embed; nil means "generate random vectors instead"
      def initialize(predefined_vectors = nil)
        @predefined_vectors = predefined_vectors
      end

      # Returns the injected vectors when present; otherwise one random
      # 3-dimensional vector per input sentence.
      def embed(sentences)
        return @predefined_vectors if @predefined_vectors

        sentences.map { Array.new(3) { rand } }
      end
    end
  end
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# lib/semantic_chunker/chunker.rb
require 'matrix'
require 'pragmatic_segmenter'

module SemanticChunker
  # Splits text into semantically coherent chunks: sentences are embedded
  # (with a sliding context window for stability), then greedily grouped
  # while they stay similar to the running chunk centroid and under a
  # maximum character size.
  class Chunker
    DEFAULT_THRESHOLD = 0.82
    DEFAULT_BUFFER = 1
    DEFAULT_MAX_SIZE = 1500 # Characters

    # @param embedding_provider [#embed, nil] adapter responding to #embed;
    #   falls back to the globally configured provider
    # @param threshold [Float] cosine-similarity cutoff for starting a new chunk
    # @param buffer_size [Integer, :auto] sentences of context on each side,
    #   or :auto to pick from average sentence length
    # @param max_chunk_size [Integer] maximum chunk length in characters
    # @param segmenter_options [Hash] passed to PragmaticSegmenter,
    #   e.g. { language: 'hy', doc_type: 'pdf' }
    # @raise [ArgumentError] when no provider is available
    def initialize(embedding_provider: nil, threshold: DEFAULT_THRESHOLD, buffer_size: DEFAULT_BUFFER, max_chunk_size: DEFAULT_MAX_SIZE, segmenter_options: {})
      @provider = embedding_provider || SemanticChunker.configuration&.provider
      @threshold = threshold
      @buffer_size = buffer_size
      @max_chunk_size = max_chunk_size
      @segmenter_options = segmenter_options

      raise ArgumentError, "A provider must be configured" if @provider.nil?
    end

    # @param text [String, nil]
    # @return [Array<String>] semantically grouped chunks (empty for blank input)
    def chunks_for(text)
      return [] if text.nil? || text.strip.empty?

      sentences = split_sentences(text)
      # Guard: the segmenter can yield nothing (e.g. punctuation-only input);
      # without this, determine_buffer would divide by zero and
      # calculate_groups would index sentences[0] as nil.
      return [] if sentences.empty?

      # Step 1: Logic to determine the best buffer window
      effective_buffer = determine_buffer(sentences)

      # Step 2: Create overlapping "context groups" for more stable embeddings
      context_groups = build_context_groups(sentences, effective_buffer)

      # Step 3: Embed the groups, not the raw sentences
      group_embeddings = @provider.embed(context_groups)

      calculate_groups(sentences, group_embeddings)
    end

    private

    # Selects buffer based on average sentence length if user passes :auto.
    def determine_buffer(sentences)
      return @buffer_size unless @buffer_size == :auto

      avg_length = sentences.map(&:length).sum / sentences.size.to_f

      # Strategy: If sentences are very short (< 50 chars), we need more context.
      # If they are long (> 150 chars), they are likely self-contained.
      case avg_length
      when 0..50 then 2   # Look 2 ahead and 2 behind
      when 51..150 then 1 # Standard
      else 0              # Long sentences don't need buffers
      end
    end

    # For each sentence, joins it with up to `buffer` neighbours on each side
    # so the embedding reflects local context, not just the lone sentence.
    def build_context_groups(sentences, buffer)
      sentences.each_with_index.map do |_, i|
        start_idx = [0, i - buffer].max
        end_idx = [sentences.size - 1, i + buffer].min
        sentences[start_idx..end_idx].join(" ")
      end
    end

    # Splits raw text into sentences via PragmaticSegmenter.
    def split_sentences(text)
      options = @segmenter_options.merge(text: text)
      PragmaticSegmenter::Segmenter.new(**options).segment
    end

    # Greedy grouping: append each sentence to the current chunk while it is
    # similar to the chunk's centroid and the chunk stays under max size.
    def calculate_groups(sentences, embeddings)
      chunks = []
      current_chunk_text = [sentences[0]]
      current_chunk_vectors = [Vector[*embeddings[0]]]
      # Track chunk length incrementally instead of re-joining the whole
      # chunk every iteration (the original was O(n^2) in characters).
      current_length = sentences[0].length

      (1...sentences.size).each do |i|
        new_sentence = sentences[i]
        new_vec = Vector[*embeddings[i]]

        # 1. Centroid of the vectors accumulated so far
        centroid = current_chunk_vectors.inject(:+) / current_chunk_vectors.size.to_f
        sim = cosine_similarity(centroid, new_vec)

        # 2. Size the chunk would have if we added this sentence (+1 for the
        # joining space).
        potential_size = current_length + new_sentence.length + 1

        if sim < @threshold || potential_size > @max_chunk_size
          # Split if the topic changed OR the chunk is getting too big
          chunks << current_chunk_text.join(" ")

          current_chunk_text = [new_sentence]
          current_chunk_vectors = [new_vec]
          current_length = new_sentence.length
        else
          # Keep grouping
          current_chunk_text << new_sentence
          current_chunk_vectors << new_vec
          current_length = potential_size
        end
      end

      chunks << current_chunk_text.join(" ")
      chunks
    end

    # Cosine similarity in [-1, 1]; returns 0.0 for zero-magnitude vectors
    # to avoid division by zero.
    def cosine_similarity(v1, v2)
      return 0.0 if v1.magnitude.zero? || v2.magnitude.zero?

      v1.inner_product(v2) / (v1.magnitude * v2.magnitude)
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# lib/semantic_chunker.rb
#
# Gem entry point: loads dependencies, the adapters and the chunker, and
# exposes the library-wide configuration API.

# 1. Require dependencies
require 'matrix'
require 'json'
require 'net/http'

# 2. Require the version and base modules.
# NOTE: the original guarded this with File.exist?('lib/semantic_chunker/version.rb'),
# which depends on the process working directory and silently skipped loading
# VERSION for installed gems. The file ships with the gem, so require it directly.
require_relative 'semantic_chunker/version'

# 3. Require the internal logic
require_relative 'semantic_chunker/adapters/base'
require_relative 'semantic_chunker/adapters/openai_adapter'
require_relative 'semantic_chunker/adapters/test_adapter'
require_relative 'semantic_chunker/chunker'
require_relative 'semantic_chunker/adapters/hugging_face_adapter'

module SemanticChunker
  class << self
    # Global Configuration instance (nil until .configure is first called).
    attr_accessor :configuration
  end

  # Yields the memoized configuration so callers can set a default provider:
  #
  #   SemanticChunker.configure { |config| config.provider = adapter }
  def self.configure
    self.configuration ||= Configuration.new
    yield(configuration)
  end

  # Library-wide settings; currently only the default embedding provider.
  class Configuration
    attr_accessor :provider

    def initialize
      @provider = nil # User must set this
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: semantic_chunker
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.5.3
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Daniele Frisanco
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-01-07 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: rake
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '13.0'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '13.0'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rspec
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '3.0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '3.0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: vcr
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: pragmatic_segmenter
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - "~>"
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0.3'
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - "~>"
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0.3'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: matrix
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - "~>"
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '0.4'
|
|
76
|
+
type: :runtime
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - "~>"
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0.4'
|
|
83
|
+
description: Split long text into chunks based on semantic meaning.
|
|
84
|
+
email:
|
|
85
|
+
- daniele.frisanco@gmail.com
|
|
86
|
+
executables: []
|
|
87
|
+
extensions: []
|
|
88
|
+
extra_rdoc_files: []
|
|
89
|
+
files:
|
|
90
|
+
- lib/semantic_chunker.rb
|
|
91
|
+
- lib/semantic_chunker/adapters/base.rb
|
|
92
|
+
- lib/semantic_chunker/adapters/hugging_face_adapter.rb
|
|
93
|
+
- lib/semantic_chunker/adapters/openai_adapter.rb
|
|
94
|
+
- lib/semantic_chunker/adapters/test_adapter.rb
|
|
95
|
+
- lib/semantic_chunker/chunker.rb
|
|
96
|
+
- lib/semantic_chunker/version.rb
|
|
97
|
+
homepage: https://github.com/danielefrisanco/semantic_chunker
|
|
98
|
+
licenses:
|
|
99
|
+
- MIT
|
|
100
|
+
metadata: {}
|
|
101
|
+
post_install_message:
|
|
102
|
+
rdoc_options: []
|
|
103
|
+
require_paths:
|
|
104
|
+
- lib
|
|
105
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - ">="
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: '0'
|
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
111
|
+
requirements:
|
|
112
|
+
- - ">="
|
|
113
|
+
- !ruby/object:Gem::Version
|
|
114
|
+
version: '0'
|
|
115
|
+
requirements: []
|
|
116
|
+
rubygems_version: 3.3.26
|
|
117
|
+
signing_key:
|
|
118
|
+
specification_version: 4
|
|
119
|
+
summary: Split long text into chunks based on semantic meaning.
|
|
120
|
+
test_files: []
|