robot_lab-document_store 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "fastembed"
4
- require_relative "document_store/version"
3
+ require_relative 'document_store/version'
5
4
 
6
5
  module RobotLab
7
6
  # Embedding-based document store for semantic search over arbitrary text.
@@ -13,6 +12,11 @@ module RobotLab
13
12
  # The embedding model is initialised lazily on first use — the ONNX model
14
13
  # file is downloaded on that first call (cached locally afterwards).
15
14
  #
15
+ # When fastembed is not installed, DocumentStore falls back to a lightweight
16
+ # term-frequency (bag-of-words) embedder. The fallback is lower quality (no semantic
17
+ # understanding, only lexical overlap) but works offline with no downloads,
18
+ # making it suitable for development and testing.
19
+ #
16
20
  # @example Standalone
17
21
  # store = RobotLab::DocumentStore.new
18
22
  # store.store(:q4_report, "Q4 revenue came in at $4.2M, up 18% YoY…")
@@ -26,15 +30,26 @@ module RobotLab
26
30
  # memory.search_documents("how to configure redis", limit: 3)
27
31
  #
28
32
  class DocumentStore
33
+ # @api private
34
+ FASTEMBED_AVAILABLE = begin
35
+ require 'fastembed'
36
+ true
37
+ # :nocov:
38
+ rescue LoadError
39
+ false
40
+ # :nocov:
41
+ end
42
+
29
43
  # Default embedding model used when none is specified.
30
- DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
44
+ DEFAULT_MODEL = 'BAAI/bge-small-en-v1.5'
31
45
 
32
- # @param model_name [String] fastembed model name
46
+ # @param model_name [String] fastembed model name (ignored when fastembed is unavailable)
33
47
  def initialize(model_name: DEFAULT_MODEL)
34
- @model_name = model_name
35
- @documents = {} # key (Symbol) => { text: String, vector: Array<Float> }
36
- @mutex = Mutex.new
37
- @model = nil # lazy: initialised on first embed call
48
+ @model_name = model_name
49
+ @documents = {} # key (Symbol) => { text: String, vector: Array<Float> }
50
+ @mutex = Mutex.new
51
+ @fastembed_model = nil # lazy: initialised on first embed call
52
+ @using_fastembed = FASTEMBED_AVAILABLE
38
53
  end
39
54
 
40
55
  # Embed +text+ and store it under +key+.
@@ -106,39 +121,117 @@ module RobotLab
106
121
  self
107
122
  end
108
123
 
124
+ STOP_WORDS = %w[
125
+ a an the is are was were be been being am do does did
126
+ to of in and or but for with on at by from as into
127
+ it its this that these those i you he she we they
128
+ not no nor so yet
129
+ ].to_set.freeze
130
+
109
131
  private
110
132
 
111
- def model
112
- @model ||= Fastembed::TextEmbedding.new(model_name: @model_name, show_progress: false)
133
+ # ── Fastembed path ──────────────────────────────────────────────────────
134
+
135
+ def fastembed_model
136
+ @fastembed_model ||= Fastembed::TextEmbedding.new(model_name: @model_name, show_progress: false)
113
137
  end
114
138
 
115
139
  def passage_vector(text)
116
- model.passage_embed([text]).to_a.first
140
+ if @using_fastembed
141
+ fastembed_model.passage_embed([text]).to_a.first
142
+ else
143
+ fallback_vector(text)
144
+ end
117
145
  end
118
146
 
119
147
  def query_vector(text)
120
- model.query_embed([text]).to_a.first
148
+ if @using_fastembed
149
+ fastembed_model.query_embed([text]).to_a.first
150
+ else
151
+ fallback_vector(text)
152
+ end
121
153
  end
122
154
 
123
155
  def cosine_similarity(vec_a, vec_b)
124
156
  return 0.0 unless vec_a && vec_b
157
+
158
+ if @using_fastembed
159
+ return 0.0 if vec_a.empty? || vec_b.empty?
160
+ return 0.0 if vec_a.length != vec_b.length
161
+
162
+ dot = 0.0
163
+ norm_a = 0.0
164
+ norm_b = 0.0
165
+
166
+ vec_a.each_with_index do |a, i|
167
+ b = vec_b[i]
168
+ dot += a * b
169
+ norm_a += a * a
170
+ norm_b += b * b
171
+ end
172
+
173
+ return 0.0 if norm_a.zero? || norm_b.zero?
174
+
175
+ dot / (Math.sqrt(norm_a) * Math.sqrt(norm_b))
176
+ else
177
+ sparse_cosine(vec_a, vec_b)
178
+ end
179
+ end
180
+
181
+ # ── Fallback term-frequency path ────────────────────────────────────────
182
+
183
+ # Returns a sparse Hash{String => Float} L2-normalised term-frequency vector.
184
+ def fallback_vector(text)
185
+ counts = Hash.new(0)
186
+ text.downcase.scan(/[a-z]+/).each do |w|
187
+ next if STOP_WORDS.include?(w)
188
+
189
+ counts[stem(w)] += 1
190
+ end
191
+
192
+ return {} if counts.empty?
193
+
194
+ norm = Math.sqrt(counts.values.sum { |c| c * c }.to_f)
195
+ counts.transform_values { |c| c / norm }
196
+ end
197
+
198
+ def sparse_cosine(vec_a, vec_b)
125
199
  return 0.0 if vec_a.empty? || vec_b.empty?
126
- return 0.0 if vec_a.length != vec_b.length
127
200
 
128
201
  dot = 0.0
129
202
  norm_a = 0.0
130
203
  norm_b = 0.0
131
204
 
132
- vec_a.each_with_index do |a, i|
133
- b = vec_b[i]
134
- dot += a * b
135
- norm_a += a * a
136
- norm_b += b * b
205
+ vec_a.each do |k, v|
206
+ dot += v * vec_b[k].to_f
207
+ norm_a += v * v
137
208
  end
209
+ vec_b.each_value { |v| norm_b += v * v }
138
210
 
211
+ # :nocov:
139
212
  return 0.0 if norm_a.zero? || norm_b.zero?
213
+ # :nocov:
140
214
 
141
215
  dot / (Math.sqrt(norm_a) * Math.sqrt(norm_b))
142
216
  end
217
+
218
+ # Very basic suffix-stripping stemmer for English: removes at most one common suffix (the first pattern that matches).
219
+ def stem(word)
220
+ word = word.dup
221
+ word.sub!(/ies$/, 'y') ||
222
+ word.sub!(/ness$/, '') ||
223
+ word.sub!(/ment$/, '') ||
224
+ word.sub!(/tion$/, '') ||
225
+ word.sub!(/ing$/, '') ||
226
+ word.sub!(/ed$/, '') ||
227
+ word.sub!(/er$/, '') ||
228
+ word.sub!(/ly$/, '') ||
229
+ word.sub!(/s$/, '')
230
+ word
231
+ end
143
232
  end
144
233
  end
234
+
235
+ if defined?(RobotLab) && RobotLab.respond_to?(:register_extension)
236
+ RobotLab.register_extension(:document_store, RobotLab::DocumentStore)
237
+ end
data/mkdocs.yml CHANGED
@@ -7,6 +7,7 @@ copyright: Copyright &copy; 2025 Dewayne VanHoozer
7
7
  repo_name: MadBomber/robot_lab-document_store
8
8
  repo_url: https://github.com/MadBomber/robot_lab-document_store
9
9
  edit_uri: edit/main/docs/
10
+ docs_dir: docs
10
11
 
11
12
  theme:
12
13
  name: material
@@ -111,3 +112,7 @@ extra:
111
112
 
112
113
  nav:
113
114
  - Home: index.md
115
+ - Getting Started: getting_started.md
116
+ - API Reference: api_reference.md
117
+ - How It Works: how_it_works.md
118
+ - RAG Patterns: rag_patterns.md
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: robot_lab-document_store
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dewayne VanHoozer
@@ -36,17 +36,24 @@ extra_rdoc_files: []
36
36
  files:
37
37
  - ".envrc"
38
38
  - ".github/workflows/deploy-github-pages.yml"
39
+ - ".rubocop.yml"
39
40
  - CHANGELOG.md
40
41
  - LICENSE.txt
41
42
  - README.md
42
43
  - Rakefile
44
+ - docs/api_reference.md
45
+ - docs/assets/architecture.svg
46
+ - docs/getting_started.md
47
+ - docs/how_it_works.md
43
48
  - docs/index.md
44
- - examples/26_document_store.rb
45
- - examples/26_document_store/api_versioning_adr.md
46
- - examples/26_document_store/incident_postmortem.md
47
- - examples/26_document_store/postgres_runbook.md
48
- - examples/26_document_store/redis_caching_guide.md
49
- - examples/26_document_store/sidekiq_guide.md
49
+ - docs/pluggable_backends_design.md
50
+ - docs/rag_patterns.md
51
+ - examples/01_basic_usage.rb
52
+ - examples/01_basic_usage/api_versioning_adr.md
53
+ - examples/01_basic_usage/incident_postmortem.md
54
+ - examples/01_basic_usage/postgres_runbook.md
55
+ - examples/01_basic_usage/redis_caching_guide.md
56
+ - examples/01_basic_usage/sidekiq_guide.md
50
57
  - lib/robot_lab/document_store.rb
51
58
  - lib/robot_lab/document_store/version.rb
52
59
  - mkdocs.yml