robot_lab-document_store 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +173 -0
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/Rakefile +111 -3
- data/docs/api_reference.md +186 -0
- data/docs/assets/architecture.svg +140 -0
- data/docs/getting_started.md +106 -0
- data/docs/how_it_works.md +141 -0
- data/docs/index.md +24 -41
- data/docs/pluggable_backends_design.md +66 -0
- data/docs/rag_patterns.md +198 -0
- data/examples/{26_document_store.rb → 01_basic_usage.rb} +13 -9
- data/lib/robot_lab/document_store/version.rb +1 -1
- data/lib/robot_lab/document_store.rb +111 -18
- data/mkdocs.yml +5 -0
- metadata +14 -7
- /data/examples/{26_document_store → 01_basic_usage}/api_versioning_adr.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/incident_postmortem.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/postgres_runbook.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/redis_caching_guide.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/sidekiq_guide.md +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
require_relative "document_store/version"
|
|
3
|
+
require_relative 'document_store/version'
|
|
5
4
|
|
|
6
5
|
module RobotLab
|
|
7
6
|
# Embedding-based document store for semantic search over arbitrary text.
|
|
@@ -13,6 +12,11 @@ module RobotLab
|
|
|
13
12
|
# The embedding model is initialised lazily on first use — the ONNX model
|
|
14
13
|
# file is downloaded on that first call (cached locally afterwards).
|
|
15
14
|
#
|
|
15
|
+
# When fastembed is not installed, DocumentStore falls back to a lightweight
|
|
16
|
+
# term-frequency (bag-of-words) embedder. The fallback is lower quality (no semantic
|
|
17
|
+
# understanding, only lexical overlap) but works offline with no downloads,
|
|
18
|
+
# making it suitable for development and testing.
|
|
19
|
+
#
|
|
16
20
|
# @example Standalone
|
|
17
21
|
# store = RobotLab::DocumentStore.new
|
|
18
22
|
# store.store(:q4_report, "Q4 revenue came in at $4.2M, up 18% YoY…")
|
|
@@ -26,15 +30,26 @@ module RobotLab
|
|
|
26
30
|
# memory.search_documents("how to configure redis", limit: 3)
|
|
27
31
|
#
|
|
28
32
|
class DocumentStore
|
|
33
|
+
# @api private
|
|
34
|
+
FASTEMBED_AVAILABLE = begin
|
|
35
|
+
require 'fastembed'
|
|
36
|
+
true
|
|
37
|
+
# :nocov:
|
|
38
|
+
rescue LoadError
|
|
39
|
+
false
|
|
40
|
+
# :nocov:
|
|
41
|
+
end
|
|
42
|
+
|
|
29
43
|
# Default embedding model used when none is specified.
|
|
30
|
-
DEFAULT_MODEL =
|
|
44
|
+
DEFAULT_MODEL = 'BAAI/bge-small-en-v1.5'
|
|
31
45
|
|
|
32
|
-
# @param model_name [String] fastembed model name
|
|
46
|
+
# @param model_name [String] fastembed model name (ignored when fastembed is unavailable)
|
|
33
47
|
def initialize(model_name: DEFAULT_MODEL)
|
|
34
|
-
@model_name
|
|
35
|
-
@documents
|
|
36
|
-
@mutex
|
|
37
|
-
@
|
|
48
|
+
@model_name = model_name
|
|
49
|
+
@documents = {} # key (Symbol) => { text: String, vector: Array<Float> }
|
|
50
|
+
@mutex = Mutex.new
|
|
51
|
+
@fastembed_model = nil # lazy: initialised on first embed call
|
|
52
|
+
@using_fastembed = FASTEMBED_AVAILABLE
|
|
38
53
|
end
|
|
39
54
|
|
|
40
55
|
# Embed +text+ and store it under +key+.
|
|
@@ -106,39 +121,117 @@ module RobotLab
|
|
|
106
121
|
self
|
|
107
122
|
end
|
|
108
123
|
|
|
124
|
+
STOP_WORDS = %w[
|
|
125
|
+
a an the is are was were be been being am do does did
|
|
126
|
+
to of in and or but for with on at by from as into
|
|
127
|
+
it its this that these those i you he she we they
|
|
128
|
+
not no nor so yet
|
|
129
|
+
].to_set.freeze
|
|
130
|
+
|
|
109
131
|
private
|
|
110
132
|
|
|
111
|
-
|
|
112
|
-
|
|
133
|
+
# ── Fastembed path ──────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
def fastembed_model
|
|
136
|
+
@fastembed_model ||= Fastembed::TextEmbedding.new(model_name: @model_name, show_progress: false)
|
|
113
137
|
end
|
|
114
138
|
|
|
115
139
|
def passage_vector(text)
|
|
116
|
-
|
|
140
|
+
if @using_fastembed
|
|
141
|
+
fastembed_model.passage_embed([text]).to_a.first
|
|
142
|
+
else
|
|
143
|
+
fallback_vector(text)
|
|
144
|
+
end
|
|
117
145
|
end
|
|
118
146
|
|
|
119
147
|
def query_vector(text)
|
|
120
|
-
|
|
148
|
+
if @using_fastembed
|
|
149
|
+
fastembed_model.query_embed([text]).to_a.first
|
|
150
|
+
else
|
|
151
|
+
fallback_vector(text)
|
|
152
|
+
end
|
|
121
153
|
end
|
|
122
154
|
|
|
123
155
|
def cosine_similarity(vec_a, vec_b)
|
|
124
156
|
return 0.0 unless vec_a && vec_b
|
|
157
|
+
|
|
158
|
+
if @using_fastembed
|
|
159
|
+
return 0.0 if vec_a.empty? || vec_b.empty?
|
|
160
|
+
return 0.0 if vec_a.length != vec_b.length
|
|
161
|
+
|
|
162
|
+
dot = 0.0
|
|
163
|
+
norm_a = 0.0
|
|
164
|
+
norm_b = 0.0
|
|
165
|
+
|
|
166
|
+
vec_a.each_with_index do |a, i|
|
|
167
|
+
b = vec_b[i]
|
|
168
|
+
dot += a * b
|
|
169
|
+
norm_a += a * a
|
|
170
|
+
norm_b += b * b
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
return 0.0 if norm_a.zero? || norm_b.zero?
|
|
174
|
+
|
|
175
|
+
dot / (Math.sqrt(norm_a) * Math.sqrt(norm_b))
|
|
176
|
+
else
|
|
177
|
+
sparse_cosine(vec_a, vec_b)
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
# ── Fallback term-frequency path (no IDF weighting) ─────────────────────
|
|
182
|
+
|
|
183
|
+
# Returns a sparse Hash{String => Float} L2-normalised term-frequency vector.
|
|
184
|
+
def fallback_vector(text)
|
|
185
|
+
counts = Hash.new(0)
|
|
186
|
+
text.downcase.scan(/[a-z]+/).each do |w|
|
|
187
|
+
next if STOP_WORDS.include?(w)
|
|
188
|
+
|
|
189
|
+
counts[stem(w)] += 1
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
return {} if counts.empty?
|
|
193
|
+
|
|
194
|
+
norm = Math.sqrt(counts.values.sum { |c| c * c }.to_f)
|
|
195
|
+
counts.transform_values { |c| c / norm }
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
def sparse_cosine(vec_a, vec_b)
|
|
125
199
|
return 0.0 if vec_a.empty? || vec_b.empty?
|
|
126
|
-
return 0.0 if vec_a.length != vec_b.length
|
|
127
200
|
|
|
128
201
|
dot = 0.0
|
|
129
202
|
norm_a = 0.0
|
|
130
203
|
norm_b = 0.0
|
|
131
204
|
|
|
132
|
-
vec_a.
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
norm_a += a * a
|
|
136
|
-
norm_b += b * b
|
|
205
|
+
vec_a.each do |k, v|
|
|
206
|
+
dot += v * vec_b[k].to_f
|
|
207
|
+
norm_a += v * v
|
|
137
208
|
end
|
|
209
|
+
vec_b.each_value { |v| norm_b += v * v }
|
|
138
210
|
|
|
211
|
+
# :nocov:
|
|
139
212
|
return 0.0 if norm_a.zero? || norm_b.zero?
|
|
213
|
+
# :nocov:
|
|
140
214
|
|
|
141
215
|
dot / (Math.sqrt(norm_a) * Math.sqrt(norm_b))
|
|
142
216
|
end
|
|
217
|
+
|
|
218
|
+
# Very basic suffix-stripping stemmer for English: removes at most one common suffix (first matching pattern wins).
|
|
219
|
+
def stem(word)
|
|
220
|
+
word = word.dup
|
|
221
|
+
word.sub!(/ies$/, 'y') ||
|
|
222
|
+
word.sub!(/ness$/, '') ||
|
|
223
|
+
word.sub!(/ment$/, '') ||
|
|
224
|
+
word.sub!(/tion$/, '') ||
|
|
225
|
+
word.sub!(/ing$/, '') ||
|
|
226
|
+
word.sub!(/ed$/, '') ||
|
|
227
|
+
word.sub!(/er$/, '') ||
|
|
228
|
+
word.sub!(/ly$/, '') ||
|
|
229
|
+
word.sub!(/s$/, '')
|
|
230
|
+
word
|
|
231
|
+
end
|
|
143
232
|
end
|
|
144
233
|
end
|
|
234
|
+
|
|
235
|
+
if defined?(RobotLab) && RobotLab.respond_to?(:register_extension)
|
|
236
|
+
RobotLab.register_extension(:document_store, RobotLab::DocumentStore)
|
|
237
|
+
end
|
data/mkdocs.yml
CHANGED
|
@@ -7,6 +7,7 @@ copyright: Copyright © 2025 Dewayne VanHoozer
|
|
|
7
7
|
repo_name: MadBomber/robot_lab-document_store
|
|
8
8
|
repo_url: https://github.com/MadBomber/robot_lab-document_store
|
|
9
9
|
edit_uri: edit/main/docs/
|
|
10
|
+
docs_dir: docs
|
|
10
11
|
|
|
11
12
|
theme:
|
|
12
13
|
name: material
|
|
@@ -111,3 +112,7 @@ extra:
|
|
|
111
112
|
|
|
112
113
|
nav:
|
|
113
114
|
- Home: index.md
|
|
115
|
+
- Getting Started: getting_started.md
|
|
116
|
+
- API Reference: api_reference.md
|
|
117
|
+
- How It Works: how_it_works.md
|
|
118
|
+
- RAG Patterns: rag_patterns.md
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: robot_lab-document_store
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dewayne VanHoozer
|
|
@@ -36,17 +36,24 @@ extra_rdoc_files: []
|
|
|
36
36
|
files:
|
|
37
37
|
- ".envrc"
|
|
38
38
|
- ".github/workflows/deploy-github-pages.yml"
|
|
39
|
+
- ".rubocop.yml"
|
|
39
40
|
- CHANGELOG.md
|
|
40
41
|
- LICENSE.txt
|
|
41
42
|
- README.md
|
|
42
43
|
- Rakefile
|
|
44
|
+
- docs/api_reference.md
|
|
45
|
+
- docs/assets/architecture.svg
|
|
46
|
+
- docs/getting_started.md
|
|
47
|
+
- docs/how_it_works.md
|
|
43
48
|
- docs/index.md
|
|
44
|
-
-
|
|
45
|
-
-
|
|
46
|
-
- examples/
|
|
47
|
-
- examples/
|
|
48
|
-
- examples/
|
|
49
|
-
- examples/
|
|
49
|
+
- docs/pluggable_backends_design.md
|
|
50
|
+
- docs/rag_patterns.md
|
|
51
|
+
- examples/01_basic_usage.rb
|
|
52
|
+
- examples/01_basic_usage/api_versioning_adr.md
|
|
53
|
+
- examples/01_basic_usage/incident_postmortem.md
|
|
54
|
+
- examples/01_basic_usage/postgres_runbook.md
|
|
55
|
+
- examples/01_basic_usage/redis_caching_guide.md
|
|
56
|
+
- examples/01_basic_usage/sidekiq_guide.md
|
|
50
57
|
- lib/robot_lab/document_store.rb
|
|
51
58
|
- lib/robot_lab/document_store/version.rb
|
|
52
59
|
- mkdocs.yml
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|