semantic_chunker 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: '0782ee5ae0f80488a985b3c12afad2eb95252ecd45849ed98a8446aef4dbfc66'
+   data.tar.gz: 988a17459d404db90f460527d105e00579a44a442deabeba3c3e6462a4e440de
+ SHA512:
+   metadata.gz: 29b84d713798dabc248986ad2040da8bffe029c74dcba9e7f35aef783a8ac5b1b944c4ebdba1f1af50c79b78d92a9fb06c66c9f71a901a9c457032edd1508865
+   data.tar.gz: 26a6d3ac345c0a6d88cbffc7877a2fd387293c75d81d2cb85771b2d8f66778645944c46608f9de405eb787a4c48546f6efe859790963c129d2dfdd1ded8dcd32
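Reviewer's note: these digests can be checked against a locally fetched copy of the gem. A minimal sketch in Ruby, assuming semantic_chunker-0.5.3.gem has been fetched (e.g. with `gem fetch semantic_chunker -v 0.5.3`) and untarred — a .gem file is a plain tar archive — so that metadata.gz and data.tar.gz sit in the current directory:

    # Sketch: recompute the published digests from local files.
    # File names assume the .gem archive was already extracted here.
    require "digest"

    %w[metadata.gz data.tar.gz].each do |name|
      puts "#{name} SHA256: #{Digest::SHA256.file(name).hexdigest}"
      puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
    end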
lib/semantic_chunker/adapters/base.rb ADDED
@@ -0,0 +1,10 @@
+ # lib/semantic_chunker/adapters/base.rb
+ module SemanticChunker
+   module Adapters
+     class Base
+       def embed(sentences)
+         raise NotImplementedError, "#{self.class} must implement #embed"
+       end
+     end
+   end
+ end
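The Base class fixes the adapter contract: subclasses implement #embed, taking an array of strings and returning one embedding (an array of floats) per input. A hypothetical custom adapter might look like the sketch below; MyLocalModel is an assumed stand-in for whatever embedding backend you have, not part of the gem:

    # Hypothetical sketch of a third-party adapter built on this contract.
    module SemanticChunker
      module Adapters
        class MyLocalAdapter < Base
          # Must return one embedding per sentence, in input order.
          def embed(sentences)
            sentences.map { |s| MyLocalModel.embed(s) }
          end
        end
      end
    end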
lib/semantic_chunker/adapters/hugging_face_adapter.rb ADDED
@@ -0,0 +1,56 @@
+ # lib/semantic_chunker/adapters/hugging_face_adapter.rb
+ module SemanticChunker
+   module Adapters
+     class HuggingFaceAdapter < Base
+       BASE_URL = "https://router.huggingface.co/hf-inference/models/%{model}"
+
+       def initialize(api_key:, model: 'intfloat/multilingual-e5-large')
+         @api_key = api_key
+         @model = model
+         # @model = 'sentence-transformers/all-MiniLM-L6-v2'
+         # @model = 'BAAI/bge-small-en-v1.5'
+       end
+
+       def embed(sentences)
+         response = post_request(sentences)
+
+         unless response.content_type == "application/json"
+           raise "HuggingFace Error: Expected JSON, got #{response.content_type}. Body: #{response.body}"
+         end
+
+         parsed = JSON.parse(response.body)
+
+         if response.is_a?(Net::HTTPSuccess)
+           parsed
+         else
+           if parsed.is_a?(Hash) && parsed["error"]&.include?("loading")
+             puts "Model warming up... retrying in 10s"
+             sleep 10
+             return embed(sentences)
+           end
+           raise "HuggingFace Error: #{parsed['error'] || parsed}"
+         end
+       end
+
+       private
+
+       def post_request(sentences)
+         uri = URI(BASE_URL % { model: @model })
+         request = Net::HTTP::Post.new(uri)
+
+         request["Authorization"] = "Bearer #{@api_key}"
+         request["Content-Type"] = "application/json"
+         request["X-Wait-For-Model"] = "true"
+
+         request.body = {
+           inputs: sentences
+         }.to_json
+
+         Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
+           http.read_timeout = 60
+           http.request(request)
+         end
+       end
+     end
+   end
+ end
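Reviewer's note: this adapter relies on net/http and json being required by lib/semantic_chunker.rb rather than requiring them itself, and #embed retries every 10 seconds with no upper bound while the API reports the model is still loading. A minimal usage sketch; the HF_API_KEY environment variable name is illustrative:

    # Sketch: embedding two sentences through the Hugging Face router.
    adapter = SemanticChunker::Adapters::HuggingFaceAdapter.new(
      api_key: ENV.fetch("HF_API_KEY"), # assumed env var name
      model: "intfloat/multilingual-e5-large" # the adapter's default
    )
    vectors = adapter.embed(["First sentence.", "Second sentence."])
    # => one embedding (array of floats) per input sentence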
lib/semantic_chunker/adapters/openai_adapter.rb ADDED
@@ -0,0 +1,44 @@
+ # lib/semantic_chunker/adapters/openai_adapter.rb
+ require "net/http"
+ require "json"
+ require "uri"
+
+ module SemanticChunker
+   module Adapters
+     class OpenAIAdapter < Base
+       ENDPOINT = "https://api.openai.com/v1/embeddings"
+
+       def initialize(api_key:, model: "text-embedding-3-small")
+         @api_key = api_key
+         @model = model
+       end
+
+       def embed(sentences)
+         response = post_request(sentences)
+         parsed = JSON.parse(response.body)
+
+         if response.is_a?(Net::HTTPSuccess)
+           # OpenAI returns data in the same order as input
+           # We extract just the embedding arrays
+           parsed["data"].map { |entry| entry["embedding"] }
+         else
+           raise "OpenAI Error: #{parsed.dig('error', 'message') || response.code}"
+         end
+       end
+
+       private
+
+       def post_request(sentences)
+         uri = URI(ENDPOINT)
+         request = Net::HTTP::Post.new(uri)
+         request["Authorization"] = "Bearer #{@api_key}"
+         request["Content-Type"] = "application/json"
+         request.body = { input: sentences, model: @model }.to_json
+
+         Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
+           http.request(request)
+         end
+       end
+     end
+   end
+ end
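A minimal usage sketch for the OpenAI adapter; OPENAI_API_KEY is an illustrative environment variable name:

    # Sketch: batch-embedding two sentences via the embeddings endpoint.
    adapter = SemanticChunker::Adapters::OpenAIAdapter.new(
      api_key: ENV.fetch("OPENAI_API_KEY") # assumed env var name
    )
    vectors = adapter.embed(["Cats purr.", "Bond yields rose."])
    vectors.length # => 2, one embedding per input, in input order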
lib/semantic_chunker/adapters/test_adapter.rb ADDED
@@ -0,0 +1,17 @@
+ # lib/semantic_chunker/adapters/test_adapter.rb
+ module SemanticChunker
+   module Adapters
+     class TestAdapter < Base
+       # We can pass specific vectors to simulate "topics"
+       def initialize(predefined_vectors = nil)
+         @predefined_vectors = predefined_vectors
+       end
+
+       def embed(sentences)
+         # If we have specific vectors, use them;
+         # otherwise, return random vectors for each sentence
+         @predefined_vectors || sentences.map { [rand, rand, rand] }
+       end
+     end
+   end
+ end
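Because the chunker splits when the cosine similarity between a chunk's centroid and the next vector drops below the threshold, handing TestAdapter vectors that point in different directions simulates a topic change deterministically. A sketch; the vectors are illustrative:

    # Sketch: the first two vectors point roughly the same way, the
    # third is orthogonal to both, so its cosine similarity to the
    # running centroid is ~0 and a new chunk begins there.
    vectors = [
      [1.0, 0.0, 0.0],
      [0.9, 0.1, 0.0],
      [0.0, 0.0, 1.0]
    ]
    adapter = SemanticChunker::Adapters::TestAdapter.new(vectors)
    adapter.embed(["a", "b", "c"]) # => returns the three vectors above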
lib/semantic_chunker/chunker.rb ADDED
@@ -0,0 +1,107 @@
+ # lib/semantic_chunker/chunker.rb
+ require 'matrix'
+ require 'pragmatic_segmenter'
+
+ module SemanticChunker
+   class Chunker
+     DEFAULT_THRESHOLD = 0.82
+     DEFAULT_BUFFER = 1
+     DEFAULT_MAX_SIZE = 1500 # Characters
+
+     def initialize(embedding_provider: nil, threshold: DEFAULT_THRESHOLD, buffer_size: DEFAULT_BUFFER, max_chunk_size: DEFAULT_MAX_SIZE, segmenter_options: {})
+       @provider = embedding_provider || SemanticChunker.configuration&.provider
+       @threshold = threshold
+       @buffer_size = buffer_size
+       @max_chunk_size = max_chunk_size
+       @segmenter_options = segmenter_options # e.g., { language: 'hy', doc_type: 'pdf' }
+
+       raise ArgumentError, "A provider must be configured" if @provider.nil?
+     end
+
+     def chunks_for(text)
+       return [] if text.nil? || text.strip.empty?
+       sentences = split_sentences(text)
+
+       # Step 1: Determine the best buffer window
+       effective_buffer = determine_buffer(sentences)
+
+       # Step 2: Create overlapping "context groups" for more stable embeddings
+       context_groups = build_context_groups(sentences, effective_buffer)
+
+       # Step 3: Embed the groups, not the raw sentences
+       group_embeddings = @provider.embed(context_groups)
+
+       calculate_groups(sentences, group_embeddings)
+     end
+
+     private
+
+     # Selects a buffer based on average sentence length if the user passes :auto
+     def determine_buffer(sentences)
+       return @buffer_size unless @buffer_size == :auto
+
+       avg_length = sentences.map(&:length).sum / sentences.size.to_f
+
+       # Strategy: If sentences are very short (< 50 chars), we need more context.
+       # If they are long (> 150 chars), they are likely self-contained.
+       case avg_length
+       when 0..50 then 2    # Look 2 ahead and 2 behind
+       when 51..150 then 1  # Standard
+       else 0               # Long sentences don't need buffers
+       end
+     end
+
+     def build_context_groups(sentences, buffer)
+       sentences.each_with_index.map do |_, i|
+         start_idx = [0, i - buffer].max
+         end_idx = [sentences.size - 1, i + buffer].min
+         sentences[start_idx..end_idx].join(" ")
+       end
+     end
+
+     def split_sentences(text)
+       options = @segmenter_options.merge(text: text)
+       ps = PragmaticSegmenter::Segmenter.new(**options)
+       ps.segment
+     end
+
+     def calculate_groups(sentences, embeddings)
+       chunks = []
+       current_chunk_text = [sentences[0]]
+       current_chunk_vectors = [Vector[*embeddings[0]]]
+
+       (1...sentences.size).each do |i|
+         new_sentence = sentences[i]
+         new_vec = Vector[*embeddings[i]]
+
+         # 1. Calculate the centroid of the current chunk
+         centroid = current_chunk_vectors.inject(:+) / current_chunk_vectors.size.to_f
+         sim = cosine_similarity(centroid, new_vec)
+
+         # 2. Check constraints: similarity OR size.
+         # Calculate the potential size of the chunk if we added this sentence
+         potential_size = current_chunk_text.join(" ").length + new_sentence.length + 1
+
+         if sim < @threshold || potential_size > @max_chunk_size
+           # Split if the topic changed OR the chunk is getting too large
+           chunks << current_chunk_text.join(" ")
+
+           current_chunk_text = [new_sentence]
+           current_chunk_vectors = [new_vec]
+         else
+           # Keep grouping
+           current_chunk_text << new_sentence
+           current_chunk_vectors << new_vec
+         end
+       end
+
+       chunks << current_chunk_text.join(" ")
+       chunks
+     end
+
+     def cosine_similarity(v1, v2)
+       return 0.0 if v1.magnitude.zero? || v2.magnitude.zero?
+       v1.inner_product(v2) / (v1.magnitude * v2.magnitude)
+     end
+   end
+ end
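Putting it together: chunks_for segments the text, embeds overlapping context groups, then greedily grows a chunk while cosine_similarity(centroid, next_vector) stays at or above the threshold and the chunk stays under max_chunk_size. A self-contained sketch using the TestAdapter so it runs offline; the threshold and vectors are chosen for illustration, and it assumes pragmatic_segmenter splits the sample into exactly three sentences:

    # Sketch: two "cat" sentences group together, the "bond" sentence
    # is orthogonal to their centroid and starts a new chunk.
    vectors = [[1.0, 0.0, 0.0], [0.9, 0.1, 0.0], [0.0, 0.0, 1.0]]
    chunker = SemanticChunker::Chunker.new(
      embedding_provider: SemanticChunker::Adapters::TestAdapter.new(vectors),
      threshold: 0.5,   # illustrative, not a recommendation
      buffer_size: 0    # one embedding per sentence for this example
    )
    chunker.chunks_for("Cats purr. Cats also knead. Bond yields rose.")
    # => ["Cats purr. Cats also knead.", "Bond yields rose."]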
lib/semantic_chunker/version.rb ADDED
@@ -0,0 +1,3 @@
+ module SemanticChunker
+   VERSION = "0.5.3"
+ end
lib/semantic_chunker.rb ADDED
@@ -0,0 +1,33 @@
+ # lib/semantic_chunker.rb
+ # 1. Require dependencies
+ require 'matrix'
+ require 'json'
+ require 'net/http'
+
+ # 2. Require the version module
+ require_relative 'semantic_chunker/version'
+
+ # 3. Require the internal logic
+ require_relative 'semantic_chunker/adapters/base'
+ require_relative 'semantic_chunker/adapters/openai_adapter'
+ require_relative 'semantic_chunker/adapters/hugging_face_adapter'
+ require_relative 'semantic_chunker/adapters/test_adapter'
+ require_relative 'semantic_chunker/chunker'
+ module SemanticChunker
+   class << self
+     attr_accessor :configuration
+   end
+
+   def self.configure
+     self.configuration ||= Configuration.new
+     yield(configuration)
+   end
+
+   class Configuration
+     attr_accessor :provider
+
+     def initialize
+       @provider = nil # User must set this
+     end
+   end
+ end
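Reviewer's note: the previous release guarded the version require with a File.exist? check against a CWD-relative path, which silently skipped loading VERSION for installed gems; the plain require_relative above is the fix. The configure block is the global alternative to passing embedding_provider: per Chunker instance. A sketch, again with an illustrative environment variable name:

    # Sketch: set a global provider once, then construct chunkers freely.
    SemanticChunker.configure do |config|
      config.provider = SemanticChunker::Adapters::OpenAIAdapter.new(
        api_key: ENV.fetch("OPENAI_API_KEY") # assumed env var name
      )
    end

    chunker = SemanticChunker::Chunker.new # picks up the configured provider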
metadata ADDED
@@ -0,0 +1,120 @@
+ --- !ruby/object:Gem::Specification
+ name: semantic_chunker
+ version: !ruby/object:Gem::Version
+   version: 0.5.3
+ platform: ruby
+ authors:
+ - Daniele Frisanco
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2026-01-07 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '13.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ - !ruby/object:Gem::Dependency
+   name: vcr
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: pragmatic_segmenter
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.3'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.3'
+ - !ruby/object:Gem::Dependency
+   name: matrix
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.4'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.4'
+ description: Split long text into chunks based on semantic meaning.
+ email:
+ - daniele.frisanco@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/semantic_chunker.rb
+ - lib/semantic_chunker/adapters/base.rb
+ - lib/semantic_chunker/adapters/hugging_face_adapter.rb
+ - lib/semantic_chunker/adapters/openai_adapter.rb
+ - lib/semantic_chunker/adapters/test_adapter.rb
+ - lib/semantic_chunker/chunker.rb
+ - lib/semantic_chunker/version.rb
+ homepage: https://github.com/danielefrisanco/semantic_chunker
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.3.26
+ signing_key:
+ specification_version: 4
+ summary: Split long text into chunks based on semantic meaning.
+ test_files: []
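The gemspec declares pragmatic_segmenter (~> 0.3) and matrix (~> 0.4) as runtime dependencies, so both resolve automatically on install. A typical Gemfile entry for this release might be:

    # Sketch: pin to the 0.5 line, at or above this release.
    gem "semantic_chunker", "~> 0.5", ">= 0.5.3"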