robot_lab-document_store 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +173 -0
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/Rakefile +111 -3
- data/docs/api_reference.md +186 -0
- data/docs/assets/architecture.svg +140 -0
- data/docs/getting_started.md +106 -0
- data/docs/how_it_works.md +141 -0
- data/docs/index.md +24 -41
- data/docs/pluggable_backends_design.md +66 -0
- data/docs/rag_patterns.md +198 -0
- data/examples/{26_document_store.rb → 01_basic_usage.rb} +13 -9
- data/lib/robot_lab/document_store/version.rb +1 -1
- data/lib/robot_lab/document_store.rb +111 -18
- data/mkdocs.yml +5 -0
- metadata +14 -7
- /data/examples/{26_document_store → 01_basic_usage}/api_versioning_adr.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/incident_postmortem.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/postgres_runbook.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/redis_caching_guide.md +0 -0
- /data/examples/{26_document_store → 01_basic_usage}/sidekiq_guide.md +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5add23edc59a87fac16aaab24cc061392c241b6d0e0065aa36832dc5681342d6
|
|
4
|
+
data.tar.gz: 9fb1b9c37b3dcdee87ede9c3e0ce43e6013b6964d1268c39c9cb83a1c0a389ff
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3834e2af8b84030daa4b8641f1e86697dbb9d88d37e9ddc975f81a40df3d6b632e48a0459da012466dc2a695b1e9857ea7fbdb0f5524724578e9754b3b6556bc
|
|
7
|
+
data.tar.gz: c9d07c4c8b6d0f5bffc7c1e7a27c7d5c434a4ad85e6ed2336b6e751bd6e23c6a5ef1c6612c2650947a90928aa30561e0b6e5725ebb5bb2ed8611a23123633ff1
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
NewCops: enable
|
|
3
|
+
SuggestExtensions: false
|
|
4
|
+
TargetRubyVersion: 4.0
|
|
5
|
+
Exclude:
|
|
6
|
+
- 'examples/**/*'
|
|
7
|
+
- 'vendor/**/*'
|
|
8
|
+
- 'dead_code/**/*'
|
|
9
|
+
|
|
10
|
+
# ── Style: disabled cops ───────────────────────────────────────────────────
|
|
11
|
+
Style/StringLiterals:
|
|
12
|
+
Enabled: false
|
|
13
|
+
|
|
14
|
+
Style/StringLiteralsInInterpolation:
|
|
15
|
+
Enabled: false
|
|
16
|
+
|
|
17
|
+
Style/Documentation:
|
|
18
|
+
Enabled: false
|
|
19
|
+
|
|
20
|
+
# Ruby 4.0 freezes string literals by default
|
|
21
|
+
Style/FrozenStringLiteralComment:
|
|
22
|
+
Enabled: false
|
|
23
|
+
|
|
24
|
+
Style/IfUnlessModifier:
|
|
25
|
+
Enabled: false
|
|
26
|
+
|
|
27
|
+
Style/RescueModifier:
|
|
28
|
+
Enabled: false
|
|
29
|
+
|
|
30
|
+
Style/TrivialAccessors:
|
|
31
|
+
Enabled: false
|
|
32
|
+
|
|
33
|
+
Style/MultilineTernaryOperator:
|
|
34
|
+
Enabled: false
|
|
35
|
+
|
|
36
|
+
Style/SafeNavigation:
|
|
37
|
+
Enabled: false
|
|
38
|
+
|
|
39
|
+
Style/EmptyClassDefinition:
|
|
40
|
+
Enabled: false
|
|
41
|
+
|
|
42
|
+
Style/ClassAndModuleChildren:
|
|
43
|
+
Enabled: false
|
|
44
|
+
|
|
45
|
+
Style/RescueStandardError:
|
|
46
|
+
Enabled: false
|
|
47
|
+
|
|
48
|
+
Style/OneClassPerFile:
|
|
49
|
+
Enabled: false
|
|
50
|
+
|
|
51
|
+
# Both % and format/sprintf are acceptable
|
|
52
|
+
Style/FormatString:
|
|
53
|
+
Enabled: false
|
|
54
|
+
|
|
55
|
+
# String concatenation and interpolation are both acceptable
|
|
56
|
+
Style/StringConcatenation:
|
|
57
|
+
Enabled: false
|
|
58
|
+
|
|
59
|
+
# ── Layout ─────────────────────────────────────────────────────────────────
|
|
60
|
+
Layout/LineLength:
|
|
61
|
+
Max: 140
|
|
62
|
+
|
|
63
|
+
Layout/ExtraSpacing:
|
|
64
|
+
Enabled: false
|
|
65
|
+
|
|
66
|
+
Layout/HashAlignment:
|
|
67
|
+
Enabled: false
|
|
68
|
+
|
|
69
|
+
Layout/FirstHashElementIndentation:
|
|
70
|
+
Enabled: false
|
|
71
|
+
|
|
72
|
+
Layout/EmptyLineAfterGuardClause:
|
|
73
|
+
Enabled: false
|
|
74
|
+
|
|
75
|
+
# ── Naming ─────────────────────────────────────────────────────────────────
|
|
76
|
+
# Single-char params (c, e, n) are acceptable throughout
|
|
77
|
+
Naming/MethodParameterName:
|
|
78
|
+
Enabled: false
|
|
79
|
+
|
|
80
|
+
Naming/VariableNumber:
|
|
81
|
+
Exclude:
|
|
82
|
+
- 'test/**/*'
|
|
83
|
+
|
|
84
|
+
Naming/RescuedExceptionsVariableName:
|
|
85
|
+
Enabled: false
|
|
86
|
+
|
|
87
|
+
# set_results and similar explicit setters are clear and conventional
|
|
88
|
+
Naming/AccessorMethodName:
|
|
89
|
+
Enabled: false
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# has_tool_calls? and similar are clear and conventional
|
|
93
|
+
Naming/PredicatePrefix:
|
|
94
|
+
Enabled: false
|
|
95
|
+
|
|
96
|
+
# Test helper methods don't need to follow predicate naming rules
|
|
97
|
+
Naming/PredicateMethod:
|
|
98
|
+
Exclude:
|
|
99
|
+
- 'test/**/*'
|
|
100
|
+
|
|
101
|
+
# ── Lint: relax noisy cops on intentional patterns ─────────────────────────
|
|
102
|
+
# Library and framework methods commonly accept args for API/documentation purposes
|
|
103
|
+
Lint/UnusedMethodArgument:
|
|
104
|
+
Enabled: false
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
Lint/EmptyBlock:
|
|
108
|
+
Exclude:
|
|
109
|
+
- 'test/**/*'
|
|
110
|
+
|
|
111
|
+
Lint/ConstantDefinitionInBlock:
|
|
112
|
+
Exclude:
|
|
113
|
+
- 'Rakefile'
|
|
114
|
+
- 'test/**/*'
|
|
115
|
+
|
|
116
|
+
# ── Gemspec ────────────────────────────────────────────────────────────────
|
|
117
|
+
Gemspec/DevelopmentDependencies:
|
|
118
|
+
EnforcedStyle: Gemfile
|
|
119
|
+
|
|
120
|
+
Gemspec/RequiredRubyVersion:
|
|
121
|
+
Enabled: false
|
|
122
|
+
|
|
123
|
+
Gemspec/OrderedDependencies:
|
|
124
|
+
Enabled: false
|
|
125
|
+
|
|
126
|
+
# ── Metrics ────────────────────────────────────────────────────────────────
|
|
127
|
+
# Framework-level code (routers, parsers, orchestrators) is inherently complex.
|
|
128
|
+
# Flog is the primary complexity gate — these RuboCop thresholds catch only
|
|
129
|
+
# egregious outliers without false-positiving every dispatch method.
|
|
130
|
+
|
|
131
|
+
Metrics/MethodLength:
|
|
132
|
+
Max: 35
|
|
133
|
+
CountAsOne:
|
|
134
|
+
- heredoc
|
|
135
|
+
- array
|
|
136
|
+
- hash
|
|
137
|
+
Exclude:
|
|
138
|
+
- 'test/**/*'
|
|
139
|
+
|
|
140
|
+
Metrics/AbcSize:
|
|
141
|
+
Max: 40
|
|
142
|
+
Exclude:
|
|
143
|
+
- 'test/**/*'
|
|
144
|
+
|
|
145
|
+
Metrics/ClassLength:
|
|
146
|
+
Max: 600
|
|
147
|
+
Exclude:
|
|
148
|
+
- 'test/**/*'
|
|
149
|
+
|
|
150
|
+
Metrics/ModuleLength:
|
|
151
|
+
Max: 200
|
|
152
|
+
Exclude:
|
|
153
|
+
- 'test/**/*'
|
|
154
|
+
|
|
155
|
+
Metrics/CyclomaticComplexity:
|
|
156
|
+
Max: 20
|
|
157
|
+
Exclude:
|
|
158
|
+
- 'test/**/*'
|
|
159
|
+
|
|
160
|
+
Metrics/PerceivedComplexity:
|
|
161
|
+
Max: 20
|
|
162
|
+
Exclude:
|
|
163
|
+
- 'test/**/*'
|
|
164
|
+
|
|
165
|
+
# Long method signatures with keyword args are a Ruby framework idiom
|
|
166
|
+
Metrics/ParameterLists:
|
|
167
|
+
Enabled: false
|
|
168
|
+
|
|
169
|
+
Metrics/BlockLength:
|
|
170
|
+
Exclude:
|
|
171
|
+
- 'Rakefile'
|
|
172
|
+
- '*.gemspec'
|
|
173
|
+
- 'test/**/*'
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,24 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
### Fixed
|
|
4
|
+
- Model name in README and docs corrected to `BAAI/bge-small-en-v1.5` (was incorrectly listed as `bge-base`)
|
|
5
|
+
- `register_extension` call guarded with `defined?(RobotLab) && RobotLab.respond_to?(:register_extension)` so the file loads safely without robot_lab core
|
|
6
|
+
- Instance variable `@model` renamed to `@fastembed_model` to eliminate shadowing risk
|
|
7
|
+
- `FASTEMBED_AVAILABLE` constant moved into `DocumentStore` class (was at module level)
|
|
8
|
+
- `STOP_WORDS` constant moved before `private` keyword (was defined after it)
|
|
9
|
+
- `sparse_cosine` parameter names corrected to `vec_a`/`vec_b`; uses `each_value` for the second vector
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- Full test suite covering fastembed path, TF-IDF fallback path, and cosine edge cases (27 tests, 44 assertions)
|
|
13
|
+
- SimpleCov branch coverage with thresholds (line: 95%, branch: 75%)
|
|
14
|
+
- `quality` Rake task: runs tests + coverage, RuboCop, and Flog in sequence
|
|
15
|
+
- Complete RBS type signatures in `sig/robot_lab/document_store.rbs`
|
|
16
|
+
- Example script `examples/01_basic_usage.rb` with companion Markdown documents
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
- Development dependencies moved from gemspec to Gemfile (per `Gemspec/DevelopmentDependencies` cop)
|
|
20
|
+
- Example renamed from `26_document_store.rb` to `01_basic_usage.rb`
|
|
21
|
+
|
|
3
22
|
## [0.1.0] - 2026-05-07
|
|
4
23
|
|
|
5
24
|
- Initial release
|
data/README.md
CHANGED
data/Rakefile
CHANGED
|
@@ -1,8 +1,116 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require
|
|
4
|
-
require
|
|
3
|
+
require 'bundler/gem_tasks'
|
|
4
|
+
require 'rake/testtask'
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
Rake::TestTask.new(:test) do |t|
|
|
7
|
+
t.libs << 'test'
|
|
8
|
+
t.libs << 'lib'
|
|
9
|
+
t.test_files = FileList['test/**/*_test.rb', 'test/**/test_*.rb'].exclude('**/*_helper.rb')
|
|
10
|
+
t.verbose = true
|
|
11
|
+
t.ruby_opts << '-rtest_helper'
|
|
12
|
+
end
|
|
7
13
|
|
|
8
14
|
task default: :test
|
|
15
|
+
|
|
16
|
+
desc 'Run tests with verbose output'
|
|
17
|
+
task :test_verbose do
|
|
18
|
+
ENV['TESTOPTS'] = '--verbose'
|
|
19
|
+
Rake::Task[:test].invoke
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
desc 'Run a single test file'
|
|
23
|
+
task :test_file, [:file] do |_t, args|
|
|
24
|
+
ruby "test/#{args[:file]}"
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
desc 'Check code style with RuboCop'
|
|
28
|
+
task :rubocop do
|
|
29
|
+
sh 'bundle exec rubocop'
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
desc 'Auto-correct RuboCop offenses'
|
|
33
|
+
task :rubocop_fix do
|
|
34
|
+
sh 'bundle exec rubocop -a'
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
desc 'Check code complexity with Flog (warn >=20, fail >=50)'
|
|
38
|
+
task :flog_check do
|
|
39
|
+
require 'flog'
|
|
40
|
+
|
|
41
|
+
method_warn = 20.0
|
|
42
|
+
method_fail = 50.0
|
|
43
|
+
|
|
44
|
+
flogger = Flog.new(all: true)
|
|
45
|
+
flogger.flog(*Dir.glob('lib/**/*.rb'))
|
|
46
|
+
|
|
47
|
+
warnings = []
|
|
48
|
+
failures = []
|
|
49
|
+
|
|
50
|
+
flogger.each_by_score do |method, score|
|
|
51
|
+
next if method.end_with?('#none')
|
|
52
|
+
|
|
53
|
+
if score > method_fail
|
|
54
|
+
failures << "#{format('%.1f', score)}: #{method}"
|
|
55
|
+
elsif score > method_warn
|
|
56
|
+
warnings << "#{format('%.1f', score)}: #{method}"
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
unless warnings.empty?
|
|
61
|
+
puts "\nFlog warnings (#{method_warn}–#{method_fail}) — target for future refactoring:"
|
|
62
|
+
warnings.each { |v| puts " #{v}" }
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
if failures.empty?
|
|
66
|
+
puts "\nFlog: no methods exceed the failure threshold (>=#{method_fail})"
|
|
67
|
+
else
|
|
68
|
+
puts "\nFlog failures (>=#{method_fail}) — must be refactored:"
|
|
69
|
+
failures.each { |v| puts " #{v}" }
|
|
70
|
+
abort "\nFlog quality gate failed: #{failures.size} method(s) exceed #{method_fail}"
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
desc 'Run all quality checks: tests (with coverage), RuboCop, and Flog'
|
|
75
|
+
task :quality do
|
|
76
|
+
results = {}
|
|
77
|
+
|
|
78
|
+
puts "\n#{'=' * 60}"
|
|
79
|
+
puts 'Quality Gate: Tests + Coverage'
|
|
80
|
+
puts '=' * 60
|
|
81
|
+
results[:tests] = system('bundle exec rake test') ? :pass : :fail
|
|
82
|
+
|
|
83
|
+
puts "\n#{'=' * 60}"
|
|
84
|
+
puts 'Quality Gate: RuboCop'
|
|
85
|
+
puts '=' * 60
|
|
86
|
+
results[:rubocop] = system('bundle exec rubocop') ? :pass : :fail
|
|
87
|
+
|
|
88
|
+
puts "\n#{'=' * 60}"
|
|
89
|
+
puts 'Quality Gate: Flog Complexity'
|
|
90
|
+
puts '=' * 60
|
|
91
|
+
results[:flog] = system('bundle exec rake flog_check') ? :pass : :fail
|
|
92
|
+
|
|
93
|
+
puts "\n#{'=' * 60}"
|
|
94
|
+
puts 'Quality Summary'
|
|
95
|
+
puts '=' * 60
|
|
96
|
+
results.each do |gate, status|
|
|
97
|
+
icon = status == :pass ? 'PASS' : 'FAIL'
|
|
98
|
+
puts " [#{icon}] #{gate}"
|
|
99
|
+
end
|
|
100
|
+
puts '=' * 60
|
|
101
|
+
|
|
102
|
+
abort "\nQuality gate failed" if results.values.any?(:fail)
|
|
103
|
+
puts "\nAll quality gates passed."
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
namespace :docs do
|
|
107
|
+
desc 'Build MkDocs documentation'
|
|
108
|
+
task :build do
|
|
109
|
+
sh 'mkdocs build'
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
desc 'Serve MkDocs documentation locally on http://localhost:8000'
|
|
113
|
+
task :serve do
|
|
114
|
+
sh 'mkdocs serve'
|
|
115
|
+
end
|
|
116
|
+
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# API Reference
|
|
2
|
+
|
|
3
|
+
All public methods of `RobotLab::DocumentStore`.
|
|
4
|
+
|
|
5
|
+
## Constructor
|
|
6
|
+
|
|
7
|
+
### `new(model_name: DEFAULT_MODEL)`
|
|
8
|
+
|
|
9
|
+
Creates a new, empty document store.
|
|
10
|
+
|
|
11
|
+
| Parameter | Type | Default | Description |
|
|
12
|
+
|-----------|------|---------|-------------|
|
|
13
|
+
| `model_name` | `String` | `"BAAI/bge-small-en-v1.5"` | fastembed model name. Ignored when fastembed is unavailable. |
|
|
14
|
+
|
|
15
|
+
```ruby
|
|
16
|
+
# Default model
|
|
17
|
+
store = RobotLab::DocumentStore.new
|
|
18
|
+
|
|
19
|
+
# Custom model
|
|
20
|
+
store = RobotLab::DocumentStore.new(model_name: "BAAI/bge-base-en-v1.5")
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
The embedding model is initialised lazily — no download or computation happens
|
|
24
|
+
at construction time.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Writing Documents
|
|
29
|
+
|
|
30
|
+
### `store(key, text) → self`
|
|
31
|
+
|
|
32
|
+
Embeds `text` and stores it under `key`. If a document already exists under that
|
|
33
|
+
key it is replaced. Embedding happens synchronously before the method returns.
|
|
34
|
+
|
|
35
|
+
| Parameter | Type | Description |
|
|
36
|
+
|-----------|------|-------------|
|
|
37
|
+
| `key` | `Symbol` \| `String` | Identifier for the document. Strings are converted to `Symbol` internally. |
|
|
38
|
+
| `text` | `String` | The document text to embed and store. |
|
|
39
|
+
|
|
40
|
+
**Returns:** `self` — supports method chaining.
|
|
41
|
+
|
|
42
|
+
```ruby
|
|
43
|
+
store.store(:readme, File.read("README.md"))
|
|
44
|
+
.store(:changelog, File.read("CHANGELOG.md"))
|
|
45
|
+
.store(:guide, File.read("GUIDE.md"))
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Searching
|
|
51
|
+
|
|
52
|
+
### `search(query, limit: 5) → Array<Hash>`
|
|
53
|
+
|
|
54
|
+
Embeds `query` and returns the `limit` most similar documents ranked by cosine
|
|
55
|
+
similarity score descending.
|
|
56
|
+
|
|
57
|
+
| Parameter | Type | Default | Description |
|
|
58
|
+
|-----------|------|---------|-------------|
|
|
59
|
+
| `query` | `String` | — | Natural-language search query. |
|
|
60
|
+
| `limit` | `Integer` | `5` | Maximum number of results to return. |
|
|
61
|
+
|
|
62
|
+
**Returns:** `Array` of result hashes, each containing:
|
|
63
|
+
|
|
64
|
+
| Key | Type | Description |
|
|
65
|
+
|-----|------|-------------|
|
|
66
|
+
| `:key` | `Symbol` | The document key |
|
|
67
|
+
| `:text` | `String` | The stored document text |
|
|
68
|
+
| `:score` | `Float` | Cosine similarity score, range `0.0..1.0` |
|
|
69
|
+
|
|
70
|
+
Results are sorted by `:score` descending (most similar first). Returns `[]` if
|
|
71
|
+
the store is empty.
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
results = store.search("database connection pool exhausted", limit: 3)
|
|
75
|
+
|
|
76
|
+
results.each do |r|
|
|
77
|
+
puts "#{r[:key].to_s.ljust(24)} score=#{r[:score].round(3)}"
|
|
78
|
+
puts " #{r[:text][0, 80]}…"
|
|
79
|
+
end
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
!!! tip "Score interpretation"
|
|
83
|
+
Scores above `0.7` indicate strong semantic similarity. Scores below `0.3`
|
|
84
|
+
typically indicate weak or no relationship. The exact thresholds depend on
|
|
85
|
+
the model and your document corpus.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Reading Metadata
|
|
90
|
+
|
|
91
|
+
### `size → Integer`
|
|
92
|
+
|
|
93
|
+
Returns the number of stored documents.
|
|
94
|
+
|
|
95
|
+
```ruby
|
|
96
|
+
store.size # => 0
|
|
97
|
+
store.store(:a, "text")
|
|
98
|
+
store.size # => 1
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### `keys → Array<Symbol>`
|
|
102
|
+
|
|
103
|
+
Returns the keys of all stored documents in insertion order.
|
|
104
|
+
|
|
105
|
+
```ruby
|
|
106
|
+
store.store(:alpha, "…")
|
|
107
|
+
store.store(:beta, "…")
|
|
108
|
+
store.keys # => [:alpha, :beta]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### `empty? → Boolean`
|
|
112
|
+
|
|
113
|
+
Returns `true` if no documents are stored.
|
|
114
|
+
|
|
115
|
+
```ruby
|
|
116
|
+
store.empty? # => true
|
|
117
|
+
store.store(:a, "text")
|
|
118
|
+
store.empty? # => false
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Removing Documents
|
|
124
|
+
|
|
125
|
+
### `delete(key) → self`
|
|
126
|
+
|
|
127
|
+
Removes the document stored under `key`. No-op if the key does not exist.
|
|
128
|
+
|
|
129
|
+
| Parameter | Type | Description |
|
|
130
|
+
|-----------|------|-------------|
|
|
131
|
+
| `key` | `Symbol` \| `String` | Key to remove. |
|
|
132
|
+
|
|
133
|
+
**Returns:** `self`.
|
|
134
|
+
|
|
135
|
+
```ruby
|
|
136
|
+
store.delete(:outdated_doc)
|
|
137
|
+
store.delete("also_works_with_strings")
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### `clear → self`
|
|
141
|
+
|
|
142
|
+
Removes all stored documents.
|
|
143
|
+
|
|
144
|
+
**Returns:** `self`.
|
|
145
|
+
|
|
146
|
+
```ruby
|
|
147
|
+
store.clear
|
|
148
|
+
store.empty? # => true
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Constants
|
|
154
|
+
|
|
155
|
+
### `DEFAULT_MODEL`
|
|
156
|
+
|
|
157
|
+
```ruby
|
|
158
|
+
RobotLab::DocumentStore::DEFAULT_MODEL # => "BAAI/bge-small-en-v1.5"
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
The fastembed model used when no `model_name:` is specified.
|
|
162
|
+
|
|
163
|
+
### `STOP_WORDS`
|
|
164
|
+
|
|
165
|
+
A frozen `Set<String>` of common English words excluded from TF-IDF indexing
|
|
166
|
+
(`a`, `an`, `the`, `is`, `are`, …). Only relevant when fastembed is unavailable.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Thread Safety
|
|
171
|
+
|
|
172
|
+
All public methods are thread-safe. An internal `Mutex` serialises access to
|
|
173
|
+
the document hash. You can safely share a single `DocumentStore` instance across
|
|
174
|
+
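Puma threads or Sidekiq worker threads. Ractor-based agents are the exception: a mutable store cannot be shared across Ractors, so give each Ractor its own instance.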
|
|
175
|
+
|
|
176
|
+
```ruby
|
|
177
|
+
# Safe: multiple threads can store and search concurrently
|
|
178
|
+
store = RobotLab::DocumentStore.new
|
|
179
|
+
|
|
180
|
+
threads = 10.times.map do |i|
|
|
181
|
+
Thread.new { store.store(:"doc_#{i}", "Document #{i} text content") }
|
|
182
|
+
end
|
|
183
|
+
threads.each(&:join)
|
|
184
|
+
|
|
185
|
+
store.size # => 10
|
|
186
|
+
```
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 820 540" font-family="Roboto Mono, monospace">
|
|
2
|
+
<!-- transparent background -->
|
|
3
|
+
|
|
4
|
+
<!-- ── Title ── -->
|
|
5
|
+
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#e2e8f0">DocumentStore — Embedding Pipeline</text>
|
|
6
|
+
|
|
7
|
+
<!-- ══════════════════════════════════════════════════════
|
|
8
|
+
LEFT COLUMN — STORE PATH
|
|
9
|
+
═══════════════════════════════════════════════════════ -->
|
|
10
|
+
|
|
11
|
+
<!-- store() label -->
|
|
12
|
+
<text x="160" y="72" text-anchor="middle" font-size="13" font-weight="bold" fill="#94a3b8">store(key, text)</text>
|
|
13
|
+
|
|
14
|
+
<!-- Input text box -->
|
|
15
|
+
<rect x="60" y="82" width="200" height="44" rx="6" fill="#1e293b" stroke="#475569" stroke-width="1.5"/>
|
|
16
|
+
<text x="160" y="100" text-anchor="middle" font-size="11" fill="#94a3b8">Document Text</text>
|
|
17
|
+
<text x="160" y="116" text-anchor="middle" font-size="10" fill="#64748b">"Postgres query slow…"</text>
|
|
18
|
+
|
|
19
|
+
<!-- arrow down -->
|
|
20
|
+
<line x1="160" y1="126" x2="160" y2="150" stroke="#475569" stroke-width="1.5" marker-end="url(#arr)"/>
|
|
21
|
+
|
|
22
|
+
<!-- fastembed decision diamond -->
|
|
23
|
+
<polygon points="160,152 210,176 160,200 110,176" fill="#1e3a5f" stroke="#3b82f6" stroke-width="1.5"/>
|
|
24
|
+
<text x="160" y="172" text-anchor="middle" font-size="10" fill="#93c5fd">fastembed</text>
|
|
25
|
+
<text x="160" y="185" text-anchor="middle" font-size="10" fill="#93c5fd">available?</text>
|
|
26
|
+
|
|
27
|
+
<!-- YES branch → passage_embed -->
|
|
28
|
+
<line x1="210" y1="176" x2="270" y2="176" stroke="#22c55e" stroke-width="1.5" marker-end="url(#arrGreen)"/>
|
|
29
|
+
<text x="237" y="169" text-anchor="middle" font-size="9" fill="#22c55e">yes</text>
|
|
30
|
+
|
|
31
|
+
<rect x="270" y="158" width="130" height="36" rx="6" fill="#14532d" stroke="#22c55e" stroke-width="1.5"/>
|
|
32
|
+
<text x="335" y="173" text-anchor="middle" font-size="10" fill="#86efac">passage_embed()</text>
|
|
33
|
+
<text x="335" y="187" text-anchor="middle" font-size="9" fill="#4ade80">dense Float[] (384d)</text>
|
|
34
|
+
|
|
35
|
+
<!-- NO branch → fallback_vector -->
|
|
36
|
+
<line x1="160" y1="200" x2="160" y2="224" stroke="#f59e0b" stroke-width="1.5" marker-end="url(#arrAmber)"/>
|
|
37
|
+
<text x="172" y="216" font-size="9" fill="#f59e0b">no</text>
|
|
38
|
+
|
|
39
|
+
<rect x="60" y="224" width="200" height="36" rx="6" fill="#451a03" stroke="#f59e0b" stroke-width="1.5"/>
|
|
40
|
+
<text x="160" y="239" text-anchor="middle" font-size="10" fill="#fcd34d">fallback_vector()</text>
|
|
41
|
+
<text x="160" y="253" text-anchor="middle" font-size="9" fill="#fbbf24">sparse Hash TF-IDF L2</text>
|
|
42
|
+
|
|
43
|
+
<!-- merge to store -->
|
|
44
|
+
<line x1="335" y1="194" x2="335" y2="310" stroke="#22c55e" stroke-width="1.2" stroke-dasharray="4,3"/>
|
|
45
|
+
<line x1="160" y1="260" x2="160" y2="310" stroke="#f59e0b" stroke-width="1.2" stroke-dasharray="4,3"/>
|
|
46
|
+
<line x1="160" y1="310" x2="248" y2="310" stroke="#475569" stroke-width="1.2"/>
|
|
47
|
+
<line x1="335" y1="310" x2="248" y2="310" stroke="#475569" stroke-width="1.2"/>
|
|
48
|
+
<line x1="248" y1="310" x2="248" y2="326" stroke="#475569" stroke-width="1.5" marker-end="url(#arr)"/>
|
|
49
|
+
|
|
50
|
+
<!-- @documents store -->
|
|
51
|
+
<rect x="148" y="326" width="200" height="44" rx="6" fill="#1e1b4b" stroke="#818cf8" stroke-width="1.5"/>
|
|
52
|
+
<text x="248" y="344" text-anchor="middle" font-size="11" fill="#a5b4fc">@documents</text>
|
|
53
|
+
<text x="248" y="360" text-anchor="middle" font-size="9" fill="#6366f1">{ key → { text, vector } }</text>
|
|
54
|
+
|
|
55
|
+
<!-- Mutex badge -->
|
|
56
|
+
<rect x="334" y="330" width="50" height="18" rx="4" fill="#312e81" stroke="#6366f1" stroke-width="1"/>
|
|
57
|
+
<text x="359" y="343" text-anchor="middle" font-size="9" fill="#c7d2fe">Mutex</text>
|
|
58
|
+
|
|
59
|
+
<!-- ══════════════════════════════════════════════════════
|
|
60
|
+
RIGHT COLUMN — SEARCH PATH
|
|
61
|
+
═══════════════════════════════════════════════════════ -->
|
|
62
|
+
|
|
63
|
+
<!-- search() label -->
|
|
64
|
+
<text x="640" y="72" text-anchor="middle" font-size="13" font-weight="bold" fill="#94a3b8">search(query, limit:)</text>
|
|
65
|
+
|
|
66
|
+
<!-- Query text box -->
|
|
67
|
+
<rect x="540" y="82" width="200" height="44" rx="6" fill="#1e293b" stroke="#475569" stroke-width="1.5"/>
|
|
68
|
+
<text x="640" y="100" text-anchor="middle" font-size="11" fill="#94a3b8">Query String</text>
|
|
69
|
+
<text x="640" y="116" text-anchor="middle" font-size="10" fill="#64748b">"Why is my query slow?"</text>
|
|
70
|
+
|
|
71
|
+
<!-- arrow down -->
|
|
72
|
+
<line x1="640" y1="126" x2="640" y2="150" stroke="#475569" stroke-width="1.5" marker-end="url(#arr)"/>
|
|
73
|
+
|
|
74
|
+
<!-- decision diamond -->
|
|
75
|
+
<polygon points="640,152 690,176 640,200 590,176" fill="#1e3a5f" stroke="#3b82f6" stroke-width="1.5"/>
|
|
76
|
+
<text x="640" y="172" text-anchor="middle" font-size="10" fill="#93c5fd">fastembed</text>
|
|
77
|
+
<text x="640" y="185" text-anchor="middle" font-size="10" fill="#93c5fd">available?</text>
|
|
78
|
+
|
|
79
|
+
<!-- YES → query_embed -->
|
|
80
|
+
<line x1="690" y1="176" x2="750" y2="176" stroke="#22c55e" stroke-width="1.5" marker-end="url(#arrGreen)"/>
|
|
81
|
+
<text x="717" y="169" text-anchor="middle" font-size="9" fill="#22c55e">yes</text>
|
|
82
|
+
|
|
83
|
+
<rect x="750" y="158" width="56" height="36" rx="6" fill="#14532d" stroke="#22c55e" stroke-width="1.5"/>
|
|
84
|
+
<text x="778" y="173" text-anchor="middle" font-size="10" fill="#86efac">query_</text>
|
|
85
|
+
<text x="778" y="187" text-anchor="middle" font-size="10" fill="#86efac">embed()</text>
|
|
86
|
+
|
|
87
|
+
<!-- NO → fallback_vector -->
|
|
88
|
+
<line x1="640" y1="200" x2="640" y2="224" stroke="#f59e0b" stroke-width="1.5" marker-end="url(#arrAmber)"/>
|
|
89
|
+
<text x="652" y="216" font-size="9" fill="#f59e0b">no</text>
|
|
90
|
+
|
|
91
|
+
<rect x="540" y="224" width="200" height="36" rx="6" fill="#451a03" stroke="#f59e0b" stroke-width="1.5"/>
|
|
92
|
+
<text x="640" y="239" text-anchor="middle" font-size="10" fill="#fcd34d">fallback_vector()</text>
|
|
93
|
+
<text x="640" y="253" text-anchor="middle" font-size="9" fill="#fbbf24">sparse Hash TF-IDF L2</text>
|
|
94
|
+
|
|
95
|
+
<!-- query_vec arrow to cosine_similarity -->
|
|
96
|
+
<line x1="640" y1="260" x2="640" y2="326" stroke="#475569" stroke-width="1.5" marker-end="url(#arr)"/>
|
|
97
|
+
<line x1="778" y1="194" x2="778" y2="310" stroke="#22c55e" stroke-width="1.2" stroke-dasharray="4,3"/>
|
|
98
|
+
<line x1="778" y1="310" x2="700" y2="310" stroke="#475569" stroke-width="1.2"/>
|
|
99
|
+
<line x1="700" y1="310" x2="700" y2="326" stroke="#475569" stroke-width="1.5" marker-end="url(#arr)"/>
|
|
100
|
+
|
|
101
|
+
<!-- cosine_similarity box -->
|
|
102
|
+
<rect x="530" y="326" width="240" height="44" rx="6" fill="#1c1917" stroke="#d97706" stroke-width="1.5"/>
|
|
103
|
+
<text x="650" y="344" text-anchor="middle" font-size="11" fill="#fbbf24">cosine_similarity()</text>
|
|
104
|
+
<text x="650" y="360" text-anchor="middle" font-size="9" fill="#92400e">dot(q,p) / (‖q‖ · ‖p‖) → score 0..1</text>
|
|
105
|
+
|
|
106
|
+
<!-- @documents feeds cosine via arrow -->
|
|
107
|
+
<line x1="348" y1="348" x2="530" y2="348" stroke="#818cf8" stroke-width="1.5" marker-end="url(#arrPurple)"/>
|
|
108
|
+
<text x="435" y="341" text-anchor="middle" font-size="9" fill="#818cf8">stored vectors</text>
|
|
109
|
+
|
|
110
|
+
<!-- Results box -->
|
|
111
|
+
<rect x="530" y="412" width="240" height="44" rx="6" fill="#1e293b" stroke="#38bdf8" stroke-width="1.5"/>
|
|
112
|
+
<text x="650" y="430" text-anchor="middle" font-size="11" fill="#7dd3fc">Ranked Results</text>
|
|
113
|
+
<text x="650" y="446" text-anchor="middle" font-size="9" fill="#38bdf8">[{ key:, text:, score: }, …] sorted desc</text>
|
|
114
|
+
|
|
115
|
+
<line x1="650" y1="370" x2="650" y2="412" stroke="#475569" stroke-width="1.5" marker-end="url(#arr)"/>
|
|
116
|
+
|
|
117
|
+
<!-- ══════════════════════════════════════════════════════
|
|
118
|
+
BOTTOM — FALLBACK DETAIL
|
|
119
|
+
═══════════════════════════════════════════════════════ -->
|
|
120
|
+
|
|
121
|
+
<text x="410" y="490" text-anchor="middle" font-size="11" fill="#64748b">
|
|
122
|
+
TF-IDF fallback: tokenise → strip stop words → Porter stem → L2-normalise → sparse cosine dot product
|
|
123
|
+
</text>
|
|
124
|
+
|
|
125
|
+
<!-- ── arrowhead markers ── -->
|
|
126
|
+
<defs>
|
|
127
|
+
<marker id="arr" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto">
|
|
128
|
+
<path d="M0,0 L0,6 L8,3 z" fill="#475569"/>
|
|
129
|
+
</marker>
|
|
130
|
+
<marker id="arrGreen" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto">
|
|
131
|
+
<path d="M0,0 L0,6 L8,3 z" fill="#22c55e"/>
|
|
132
|
+
</marker>
|
|
133
|
+
<marker id="arrAmber" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto">
|
|
134
|
+
<path d="M0,0 L0,6 L8,3 z" fill="#f59e0b"/>
|
|
135
|
+
</marker>
|
|
136
|
+
<marker id="arrPurple" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto">
|
|
137
|
+
<path d="M0,0 L0,6 L8,3 z" fill="#818cf8"/>
|
|
138
|
+
</marker>
|
|
139
|
+
</defs>
|
|
140
|
+
</svg>
|