documentrix 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +80 -0
- data/documentrix.gemspec +5 -5
- data/lib/documentrix/documents/cache/common.rb +63 -11
- data/lib/documentrix/documents/cache/records.rb +1 -1
- data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
- data/lib/documentrix/documents/cache/sqlite_cache.rb +132 -33
- data/lib/documentrix/documents/splitters/character.rb +56 -4
- data/lib/documentrix/documents/splitters/common.rb +38 -0
- data/lib/documentrix/documents/splitters/semantic.rb +67 -8
- data/lib/documentrix/documents.rb +133 -29
- data/lib/documentrix/utils/colorize_texts.rb +25 -21
- data/lib/documentrix/utils/digests.rb +78 -0
- data/lib/documentrix/utils.rb +1 -0
- data/lib/documentrix/version.rb +1 -1
- data/spec/documentrix/documents/cache/interface_spec.rb +16 -3
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +64 -2
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +68 -19
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +169 -2
- data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
- data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
- data/spec/documents_spec.rb +59 -3
- data/spec/utils/colorize_texts_spec.rb +0 -2
- data/spec/utils/digests_spec.rb +97 -0
- data/spec/utils/tags_spec.rb +0 -2
- metadata +7 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c3ab97426ab9fbd4ec832a422a18d574f4c03b11f22391cbc6eded16b9ba0609
|
|
4
|
+
data.tar.gz: 991215fb26b4165a4b3562075d9742e3756218e2a24f3f32a550e232f59fb92f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d67e453e27428bcd8364349e988556af844caab5faa8a8edb3dcfba9650554b8dbfc66ad322a6f9f59d5ac64350d7006bf899a16b6c83f92909793301b6954d5
|
|
7
|
+
data.tar.gz: 49706d4984b27ee7b3d0f8ef8a9efec4a8e74d8bef78d78f15ed5c76f5bacc0faf20c36cef739aa78c1f68e5733696d16999bbd14228ae7be8e6cf9c9baf0a75
|
data/CHANGES.md
CHANGED
|
@@ -1,5 +1,85 @@
|
|
|
1
1
|
# Changes
|
|
2
2
|
|
|
3
|
+
## 2026-05-18 v0.3.1
|
|
4
|
+
|
|
5
|
+
- Fixed scoping bugs in `clear_by_source` and `source_exist?` by implementing a
|
|
6
|
+
`key LIKE ?` constraint.
|
|
7
|
+
- Added regression tests for prefix isolation in
|
|
8
|
+
`spec/documentrix/documents/cache/sqlite_cache_spec.rb`.
|
|
9
|
+
- Introduced the `start_with_prefix` method in
|
|
10
|
+
`lib/documentrix/documents/cache/sqlite_cache.rb` to unify prefix patterns.
|
|
11
|
+
- Updated `tags`, `size`, `clear_all_with_prefix`, and `each` to utilize the
|
|
12
|
+
new `start_with_prefix` method.
|
|
13
|
+
|
|
14
|
+
## 2026-05-17 v0.3.0
|
|
15
|
+
|
|
16
|
+
### New Features
|
|
17
|
+
|
|
18
|
+
- **Source Tracking & Versioning**:
|
|
19
|
+
- Introduced `Documentrix::Utils::Digests` for SHA256 hashing of strings
|
|
20
|
+
and files, including an `mtime`-based cache.
|
|
21
|
+
- Implemented source-based document management in `Documentrix::Documents`
|
|
22
|
+
via `normalize_source`, `source_exist?`, `source_modified?`,
|
|
23
|
+
`source_update`, and `source_remove`.
|
|
24
|
+
- Updated `Documentrix::Documents#add` and
|
|
25
|
+
`Documentrix::Documents#source_update` to support `digest` for version
|
|
26
|
+
tracking.
|
|
27
|
+
- **Text Splitting**:
|
|
28
|
+
- Added `Documentrix::Documents::Splitters::Common` to implement
|
|
29
|
+
`force_split` behavior.
|
|
30
|
+
- Integrated `force` splitting into `Character`, `RecursiveCharacter`, and
|
|
31
|
+
`Semantic` splitters.
|
|
32
|
+
- **Cache Enhancements**:
|
|
33
|
+
- Added `each_source` to `Documentrix::Documents::Cache::Common` and an
|
|
34
|
+
optimized `SELECT DISTINCT source` implementation in
|
|
35
|
+
`Documentrix::Documents::Cache::SQLiteCache`.
|
|
36
|
+
- Added a SQLite trigger `delete_embedding_after_record` to automatically
|
|
37
|
+
clean the `embeddings` table.
|
|
38
|
+
|
|
39
|
+
### Improvements & Refactorings
|
|
40
|
+
|
|
41
|
+
- **Search & Retrieval**:
|
|
42
|
+
- Added `min_similarity` parameter to `Documentrix::Documents#find`,
|
|
43
|
+
`Documentrix::Documents::Cache::Common#find_records`, and
|
|
44
|
+
`Documentrix::Documents::Cache::SQLiteCache#find_records`.
|
|
45
|
+
- Optimized `Documentrix::Documents::Cache::SQLiteCache#find_records` by
|
|
46
|
+
moving similarity calculations into the SQL query using `1 -
|
|
47
|
+
vec_distance_cosine`.
|
|
48
|
+
- Simplified `Documentrix::Documents#find_where` by streamlining
|
|
49
|
+
`take_while` logic and utilizing `opts[:max_records]`.
|
|
50
|
+
- **Cache Implementations**:
|
|
51
|
+
- Made `object_class` a required keyword argument in
|
|
52
|
+
`Documentrix::Documents::RedisCache#initialize`.
|
|
53
|
+
- Refactored `Documentrix::Documents::Cache::Common#clear_by_source` and
|
|
54
|
+
`Documentrix::Documents::Cache::Common#source_exist?` to use ternary
|
|
55
|
+
operators.
|
|
56
|
+
- Improved `Documentrix::Documents::Cache::SQLiteCache#each_source` and
|
|
57
|
+
`Documentrix::Documents::Cache::SQLiteCache#find_records` for better
|
|
58
|
+
robustness and formatting.
|
|
59
|
+
- **Documentation & Tooling**:
|
|
60
|
+
- Expanded YARD documentation for
|
|
61
|
+
`Documentrix::Documents::Splitters::Character`, `RecursiveCharacter`,
|
|
62
|
+
`Semantic`, and `Documentrix::Utils::ColorizeTexts`.
|
|
63
|
+
- Centralized RSpec configuration via a `.rspec` file.
|
|
64
|
+
|
|
65
|
+
### Bug Fixes
|
|
66
|
+
|
|
67
|
+
- Fixed an issue in `Documentrix::Documents#find` where `max_records` was
|
|
68
|
+
hardcoded to `nil` when calling the cache.
|
|
69
|
+
- Adjusted default handling of `min_similarity` in
|
|
70
|
+
`Documentrix::Documents#find` to use `min_similarity ||= -1` within the
|
|
71
|
+
method body.
|
|
72
|
+
|
|
73
|
+
### Testing
|
|
74
|
+
|
|
75
|
+
- Significantly expanded test suites for `SQLiteCache`, `MemoryCache`, and
|
|
76
|
+
`RedisCache`, specifically covering `each_source`, `tags`, `clear_for_tags`,
|
|
77
|
+
and digest-based checks.
|
|
78
|
+
- Added new test cases in `spec/documents_spec.rb` for source management and
|
|
79
|
+
`Documentrix::Documents#source_update`.
|
|
80
|
+
- Added `spec/utils/digests_spec.rb` and updated splitter specs to verify
|
|
81
|
+
`force` splitting behavior.
|
|
82
|
+
|
|
3
83
|
## 2026-05-12 v0.2.0
|
|
4
84
|
|
|
5
85
|
### Added
|
data/documentrix.gemspec
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
-
# stub: documentrix 0.
|
|
2
|
+
# stub: documentrix 0.3.1 ruby lib
|
|
3
3
|
|
|
4
4
|
Gem::Specification.new do |s|
|
|
5
5
|
s.name = "documentrix".freeze
|
|
6
|
-
s.version = "0.
|
|
6
|
+
s.version = "0.3.1".freeze
|
|
7
7
|
|
|
8
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
|
9
9
|
s.require_paths = ["lib".freeze]
|
|
@@ -11,15 +11,15 @@ Gem::Specification.new do |s|
|
|
|
11
11
|
s.date = "1980-01-02"
|
|
12
12
|
s.description = "The Ruby library, Documentrix, is designed to provide a way to build and\nquery vector databases for applications in natural language processing\n(NLP) and large language models (LLMs). It allows users to store and\nretrieve dense vector embeddings for text strings.\n".freeze
|
|
13
13
|
s.email = "flori@ping.de".freeze
|
|
14
|
-
s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze]
|
|
15
|
-
s.files = [".envrc".freeze, ".utilsrc".freeze, ".yardopts".freeze, "CHANGES.md".freeze, "Gemfile".freeze, "LICENSE".freeze, "README.md".freeze, "Rakefile".freeze, "docker-compose.yml".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze, "redis/redis.conf".freeze, "spec/assets/embeddings.json".freeze, "spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
14
|
+
s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/common.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/digests.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze]
|
|
15
|
+
s.files = [".envrc".freeze, ".utilsrc".freeze, ".yardopts".freeze, "CHANGES.md".freeze, "Gemfile".freeze, "LICENSE".freeze, "README.md".freeze, "Rakefile".freeze, "docker-compose.yml".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/common.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/digests.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze, "redis/redis.conf".freeze, "spec/assets/embeddings.json".freeze, "spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/digests_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
16
16
|
s.homepage = "https://github.com/flori/documentrix".freeze
|
|
17
17
|
s.licenses = ["MIT".freeze]
|
|
18
18
|
s.rdoc_options = ["--title".freeze, "Documentrix - Ruby library for embedding vector database".freeze, "--main".freeze, "README.md".freeze]
|
|
19
19
|
s.required_ruby_version = Gem::Requirement.new(">= 3.1".freeze)
|
|
20
20
|
s.rubygems_version = "4.0.10".freeze
|
|
21
21
|
s.summary = "Ruby library for embedding vector database".freeze
|
|
22
|
-
s.test_files = ["spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
22
|
+
s.test_files = ["spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/digests_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
23
23
|
|
|
24
24
|
s.specification_version = 4
|
|
25
25
|
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# memory, Redis, and SQLite.
|
|
13
13
|
module Documentrix::Documents::Cache::Common
|
|
14
14
|
include Documentrix::Utils::Math
|
|
15
|
+
include Documentrix::Utils::Digests
|
|
15
16
|
include Enumerable
|
|
16
17
|
|
|
17
18
|
# The initialize method sets up the Documentrix::Documents::Cache instance's
|
|
@@ -62,27 +63,29 @@ module Documentrix::Documents::Cache::Common
|
|
|
62
63
|
# @param needle [ Array ] an array containing the embedding vector
|
|
63
64
|
# @param tags [ String, Array ] a string or array of strings representing the tags to search for
|
|
64
65
|
# @param max_records [ Integer ] the maximum number of records to return
|
|
66
|
+
# @param min_similarity [ Float ] the minimum similarity score required for a record to be returned (defaults to -1)
|
|
65
67
|
#
|
|
66
|
-
# @
|
|
67
|
-
|
|
68
|
-
# @return [ Array<Documentrix::Documents::Records> ] an array containing the matching records
|
|
69
|
-
def find_records(needle, tags: nil, max_records: nil)
|
|
68
|
+
# @return [ Array<Documentrix::Documents::Record> ] an array containing the matching records
|
|
69
|
+
def find_records(needle, tags: nil, max_records: nil, min_similarity: -1)
|
|
70
70
|
tags = Documentrix::Utils::Tags.new(Array(tags)).to_a
|
|
71
71
|
records = self
|
|
72
72
|
if tags.present?
|
|
73
73
|
records = records.select { |_key, record| (tags & record.tags).size >= 1 }
|
|
74
74
|
end
|
|
75
|
+
|
|
75
76
|
needle_norm = norm(needle)
|
|
76
|
-
records = records.
|
|
77
|
+
records = records.map do |key, record|
|
|
77
78
|
record.key = key
|
|
78
79
|
record.similarity = cosine_similarity(
|
|
79
|
-
a:
|
|
80
|
-
b:
|
|
80
|
+
a: needle,
|
|
81
|
+
b: record.embedding,
|
|
81
82
|
a_norm: needle_norm,
|
|
82
83
|
b_norm: record.norm,
|
|
83
84
|
)
|
|
84
|
-
|
|
85
|
-
|
|
85
|
+
record
|
|
86
|
+
end.sort_by(&:similarity).reverse.select { _1.similarity >= min_similarity }
|
|
87
|
+
|
|
88
|
+
max_records ? records.take(max_records) : records
|
|
86
89
|
end
|
|
87
90
|
|
|
88
91
|
# Returns a set of unique tags found in the cache records.
|
|
@@ -116,19 +119,68 @@ module Documentrix::Documents::Cache::Common
|
|
|
116
119
|
self
|
|
117
120
|
end
|
|
118
121
|
|
|
122
|
+
# Yields each unique, full source present in the cache records.
|
|
123
|
+
#
|
|
124
|
+
# @yield [source] the full source string
|
|
125
|
+
# @return [Enumerator] an enumerator if no block is given, nil otherwise.
|
|
126
|
+
def each_source(&block)
|
|
127
|
+
block or return enum_for(__method__)
|
|
128
|
+
seen = {}
|
|
129
|
+
each do |_key, record|
|
|
130
|
+
source = record.source.full? or next
|
|
131
|
+
seen.key?(source) and next
|
|
132
|
+
seen[source] = true
|
|
133
|
+
block.(source)
|
|
134
|
+
end
|
|
135
|
+
nil
|
|
136
|
+
end
|
|
137
|
+
|
|
119
138
|
# The clear_by_source method removes all records from the cache that
|
|
120
139
|
# have a source matching the given source.
|
|
121
140
|
#
|
|
122
141
|
# @param source [String] the source to filter records by
|
|
142
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest of the source.
|
|
143
|
+
# @param operator [String] the operator to compare the digest with ('=' or '!='); any value other than the string '=' is treated as '!='
|
|
123
144
|
#
|
|
124
145
|
# @return [self] self
|
|
125
|
-
def clear_by_source(source)
|
|
146
|
+
def clear_by_source(source, digest: nil, operator: ?=)
|
|
147
|
+
operator = operator == '=' ? '==' : '!='
|
|
148
|
+
|
|
126
149
|
each do |key, record|
|
|
127
|
-
|
|
150
|
+
next unless record.source == source
|
|
151
|
+
if digest
|
|
152
|
+
should_delete = record.digest.send(operator, digest)
|
|
153
|
+
delete(unpre(key)) if should_delete
|
|
154
|
+
else
|
|
155
|
+
delete(unpre(key))
|
|
156
|
+
end
|
|
128
157
|
end
|
|
129
158
|
self
|
|
130
159
|
end
|
|
131
160
|
|
|
161
|
+
# Checks if any records associated with the given source exist in the cache.
|
|
162
|
+
#
|
|
163
|
+
# @param source [String] the source to check for existence
|
|
164
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest to compare against
|
|
165
|
+
# @param operator [String] the operator to compare the digest with ('=' or '!='); any value other than the string '=' is treated as '!='
|
|
166
|
+
#
|
|
167
|
+
# @return [Boolean] true if a matching record is found, false otherwise.
|
|
168
|
+
def source_exist?(source, digest: nil, operator: ?=)
|
|
169
|
+
operator = operator == '=' ? '==' : '!='
|
|
170
|
+
|
|
171
|
+
each do |_, record|
|
|
172
|
+
next unless record.source == source
|
|
173
|
+
if digest
|
|
174
|
+
if record.digest.send(operator, digest)
|
|
175
|
+
return true
|
|
176
|
+
end
|
|
177
|
+
else
|
|
178
|
+
return true
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
false
|
|
182
|
+
end
|
|
183
|
+
|
|
132
184
|
# The clear method removes cached records based on the provided tags or
|
|
133
185
|
# clears all records with the current prefix.
|
|
134
186
|
#
|
|
@@ -27,7 +27,7 @@ module Documentrix::Documents::Cache::Records
|
|
|
27
27
|
# The to_s method returns a string representation of the object.
|
|
28
28
|
#
|
|
29
29
|
# @return [String] A string containing the text and tags of the record,
|
|
30
|
-
#
|
|
30
|
+
# along with its similarity score.
|
|
31
31
|
def to_s
|
|
32
32
|
my_tags = tags_set
|
|
33
33
|
my_tags.empty? or my_tags = " #{my_tags}"
|
|
@@ -23,7 +23,7 @@ class Documentrix::Documents::RedisCache
|
|
|
23
23
|
# @param [String] prefix the string to be used as the prefix for this cache
|
|
24
24
|
# @param [String] url the URL of the Redis server (default: ENV['REDIS_URL'])
|
|
25
25
|
# @param [Class] object_class the class of objects stored in Redis (required)
|
|
26
|
-
def initialize(prefix:, url: ENV['REDIS_URL'], object_class:
|
|
26
|
+
def initialize(prefix:, url: ENV['REDIS_URL'], object_class:)
|
|
27
27
|
super(prefix:)
|
|
28
28
|
url or raise ArgumentError, 'require redis url'
|
|
29
29
|
@url, @object_class = url, object_class
|
|
@@ -46,7 +46,7 @@ class Documentrix::Documents::RedisCache
|
|
|
46
46
|
def [](key)
|
|
47
47
|
value = redis.get(pre(key))
|
|
48
48
|
unless value.nil?
|
|
49
|
-
|
|
49
|
+
JSON.parse(value, object_class:)
|
|
50
50
|
end
|
|
51
51
|
end
|
|
52
52
|
|
|
@@ -153,7 +153,7 @@ class Documentrix::Documents::RedisCache
|
|
|
153
153
|
|
|
154
154
|
redis.scan_each(match: prefix + ?*) do |key|
|
|
155
155
|
value = redis.get(key) or next
|
|
156
|
-
value =
|
|
156
|
+
value = JSON.parse(value, object_class:)
|
|
157
157
|
block.(key, value)
|
|
158
158
|
end
|
|
159
159
|
end
|
|
@@ -46,17 +46,17 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
46
46
|
result = execute(
|
|
47
47
|
%{
|
|
48
48
|
SELECT records.key, records.text, records.norm, records.source,
|
|
49
|
-
records.tags, embeddings.embedding
|
|
49
|
+
records.digest, records.tags, embeddings.embedding
|
|
50
50
|
FROM records
|
|
51
51
|
INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
|
|
52
52
|
WHERE records.key = ?
|
|
53
53
|
},
|
|
54
54
|
pre(key)
|
|
55
55
|
)&.first or return
|
|
56
|
-
key, text, norm, source, tags, embedding = *result
|
|
56
|
+
key, text, norm, source, digest, tags, embedding = *result
|
|
57
57
|
embedding = embedding.unpack("f*")
|
|
58
58
|
tags = Documentrix::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
|
|
59
|
-
convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
|
|
59
|
+
convert_value_to_record(key:, text:, norm:, source:, digest:, tags:, embedding:)
|
|
60
60
|
end
|
|
61
61
|
|
|
62
62
|
# The []= method sets the value for a given key by inserting it into the
|
|
@@ -66,15 +66,16 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
66
66
|
# @param [Hash, Documentrix::Documents::Record] value the hash or record
|
|
67
67
|
# containing the text, embedding, and other metadata
|
|
68
68
|
def []=(key, value)
|
|
69
|
-
value
|
|
69
|
+
value = convert_value_to_record(value)
|
|
70
|
+
digest = compute_file_digest(value.source)
|
|
70
71
|
embedding = value.embedding.pack("f*")
|
|
71
72
|
execute(%{BEGIN})
|
|
72
73
|
execute(%{INSERT INTO embeddings(embedding) VALUES(?)}, [ embedding ])
|
|
73
74
|
embedding_id, = execute(%{ SELECT last_insert_rowid() }).flatten
|
|
74
75
|
execute(%{
|
|
75
|
-
INSERT INTO records(key,text,embedding_id,norm,source,tags)
|
|
76
|
-
VALUES(
|
|
77
|
-
}, [ pre(key), value.text, embedding_id, value.norm, value.source, JSON(value.tags) ])
|
|
76
|
+
INSERT INTO records(key,text,embedding_id,norm,source,digest,tags)
|
|
77
|
+
VALUES(?,?,?,?,?,?,?)
|
|
78
|
+
}, [ pre(key), value.text, embedding_id, value.norm, value.source, digest, JSON(value.tags) ])
|
|
78
79
|
execute(%{COMMIT})
|
|
79
80
|
end
|
|
80
81
|
|
|
@@ -113,7 +114,7 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
113
114
|
result = Documentrix::Utils::Tags.new
|
|
114
115
|
execute(%{
|
|
115
116
|
SELECT DISTINCT(tags) FROM records WHERE key LIKE ?
|
|
116
|
-
}, [
|
|
117
|
+
}, [ start_with_prefix ]
|
|
117
118
|
).flatten.each do
|
|
118
119
|
JSON(_1).each { |t| result.add(t) }
|
|
119
120
|
end
|
|
@@ -125,7 +126,10 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
125
126
|
#
|
|
126
127
|
# @return [ Integer ] the count of records
|
|
127
128
|
def size
|
|
128
|
-
execute(
|
|
129
|
+
execute(
|
|
130
|
+
%{SELECT COUNT(*) FROM records WHERE key LIKE ?},
|
|
131
|
+
[ start_with_prefix ]
|
|
132
|
+
).flatten.first
|
|
129
133
|
end
|
|
130
134
|
|
|
131
135
|
# The clear_for_tags method clears the cache for specific tags by deleting
|
|
@@ -153,19 +157,93 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
153
157
|
#
|
|
154
158
|
# @return [ Documentrix::Documents::Cache::SQLiteCache ] self
|
|
155
159
|
def clear_all_with_prefix
|
|
156
|
-
execute(%{DELETE FROM records WHERE key LIKE ?}, [
|
|
160
|
+
execute(%{DELETE FROM records WHERE key LIKE ?}, [ start_with_prefix ])
|
|
157
161
|
self
|
|
158
162
|
end
|
|
159
163
|
|
|
160
|
-
#
|
|
161
|
-
#
|
|
164
|
+
# Removes all records associated with the specified source from the cache.
|
|
165
|
+
#
|
|
166
|
+
# If a digest is provided, only records whose stored digest satisfies the given
|
|
167
|
+
# operator are removed: '=' (the default) removes records matching this digest;
|
|
168
|
+
# '!=' removes the others, wiping old versions while preserving up-to-date records.
|
|
169
|
+
#
|
|
170
|
+
# @param source [String] the source identifier used to filter records
|
|
171
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest of the source.
|
|
172
|
+
# Compared with each record's digest using operator ('=' or '!=', default '=').
|
|
173
|
+
#
|
|
174
|
+
# @return [self] the cache instance for method chaining
|
|
175
|
+
def clear_by_source(source, digest: nil, operator: ?=)
|
|
176
|
+
operator = '!=' if operator != ?=
|
|
177
|
+
if digest
|
|
178
|
+
execute(
|
|
179
|
+
%{
|
|
180
|
+
DELETE FROM records
|
|
181
|
+
WHERE key LIKE ? AND source = ? AND digest #{operator} ?
|
|
182
|
+
},
|
|
183
|
+
[ start_with_prefix, source, digest ]
|
|
184
|
+
)
|
|
185
|
+
else
|
|
186
|
+
execute(
|
|
187
|
+
%{
|
|
188
|
+
DELETE FROM records
|
|
189
|
+
WHERE key LIKE ? AND source = ?
|
|
190
|
+
},
|
|
191
|
+
[ start_with_prefix, source ]
|
|
192
|
+
)
|
|
193
|
+
end
|
|
194
|
+
self
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# The source_exist? method checks if any records associated with the given
|
|
198
|
+
# source exist in the cache. If a digest is provided, it verifies if the
|
|
199
|
+
# source exists and matches the specified digest using the provided operator.
|
|
200
|
+
#
|
|
201
|
+
# @param source [#to_s] the source to check for existence
|
|
202
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest to compare
|
|
203
|
+
# against the stored source digest (optional)
|
|
204
|
+
# @param operator [String] the operator to use for comparison ('=' or '!=').
|
|
205
|
+
# Defaults to '='.
|
|
206
|
+
#
|
|
207
|
+
# @return [Boolean] true if the source exists (and matches the digest
|
|
208
|
+
# condition if provided), false otherwise.
|
|
209
|
+
def source_exist?(source, digest: nil, operator: ?=)
|
|
210
|
+
operator = '!=' if operator != ?=
|
|
211
|
+
if digest
|
|
212
|
+
!!execute(
|
|
213
|
+
%{
|
|
214
|
+
SELECT 1 FROM records WHERE key LIKE ? AND source = ? AND digest #{operator} ?
|
|
215
|
+
},
|
|
216
|
+
[ start_with_prefix, source, digest ]
|
|
217
|
+
).first
|
|
218
|
+
else
|
|
219
|
+
!!execute(
|
|
220
|
+
%{
|
|
221
|
+
SELECT 1 FROM records WHERE key LIKE ? AND source = ?
|
|
222
|
+
},
|
|
223
|
+
[ start_with_prefix, source ]
|
|
224
|
+
).first
|
|
225
|
+
end
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
# Yields each unique, full source present in the cache records.
|
|
162
229
|
#
|
|
163
|
-
#
|
|
230
|
+
# This is a high-performance override for SQLite that avoids loading
|
|
231
|
+
# embeddings and parsing JSON for every record.
|
|
164
232
|
#
|
|
165
|
-
# @
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
233
|
+
# @yield [source] the full source string
|
|
234
|
+
# @return [Enumerator] an enumerator if no block is given, nil otherwise.
|
|
235
|
+
def each_source(&block)
|
|
236
|
+
block or return enum_for(__method__)
|
|
237
|
+
|
|
238
|
+
execute(%{
|
|
239
|
+
SELECT DISTINCT source FROM records
|
|
240
|
+
WHERE key LIKE ? AND source IS NOT NULL
|
|
241
|
+
}, [ start_with_prefix ]).each do |source,|
|
|
242
|
+
source = source.full? or next
|
|
243
|
+
|
|
244
|
+
block.(source)
|
|
245
|
+
end
|
|
246
|
+
nil
|
|
169
247
|
end
|
|
170
248
|
|
|
171
249
|
# Move a key prefix in the cache.
|
|
@@ -203,19 +281,19 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
203
281
|
# cache.each do |key, value|
|
|
204
282
|
# puts "#{key}: #{value}"
|
|
205
283
|
# end
|
|
206
|
-
def each(prefix:
|
|
284
|
+
def each(prefix: start_with_prefix, &block)
|
|
207
285
|
block or return enum_for(__method__, prefix:)
|
|
208
286
|
|
|
209
287
|
execute(%{
|
|
210
288
|
SELECT records.key, records.text, records.norm, records.source,
|
|
211
|
-
records.tags, embeddings.embedding
|
|
289
|
+
records.digest, records.tags, embeddings.embedding
|
|
212
290
|
FROM records
|
|
213
291
|
INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
|
|
214
292
|
WHERE records.key LIKE ?
|
|
215
|
-
}, [ prefix ]).each do |key, text, norm, source, tags, embedding|
|
|
293
|
+
}, [ prefix ]).each do |key, text, norm, source, digest, tags, embedding|
|
|
216
294
|
embedding = embedding.unpack("f*")
|
|
217
295
|
tags = Documentrix::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
|
|
218
|
-
value = convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
|
|
296
|
+
value = convert_value_to_record(key:, text:, norm:, source:, digest:, tags:, embedding:)
|
|
219
297
|
block.(key, value)
|
|
220
298
|
end
|
|
221
299
|
self
|
|
@@ -261,7 +339,7 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
261
339
|
SELECT key, tags, embedding_id
|
|
262
340
|
FROM records
|
|
263
341
|
WHERE key LIKE ?#{tags_where}
|
|
264
|
-
}, [
|
|
342
|
+
}, [ start_with_prefix ])
|
|
265
343
|
if tags_filter
|
|
266
344
|
records = records.select { |key, tags, embedding_id|
|
|
267
345
|
(tags_filter & JSON(tags.to_s).to_a).size >= 1
|
|
@@ -275,39 +353,52 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
275
353
|
# @param needle [ Array ] the embedding vector
|
|
276
354
|
# @param tags [ Array ] the list of tags to filter by (optional)
|
|
277
355
|
# @param max_records [ Integer ] the maximum number of records to return (optional)
|
|
356
|
+
# @param min_similarity [ Float ] the minimum similarity score to include (defaults to -1)
|
|
278
357
|
#
|
|
279
358
|
# @yield [ key, value ]
|
|
280
359
|
#
|
|
281
360
|
# @raise [ ArgumentError ] if needle size does not match embedding length
|
|
282
361
|
#
|
|
283
362
|
# @example
|
|
284
|
-
# documents.find_records([ 0.1 ] * 1_024, tags: %w[ test ])
|
|
363
|
+
# documents.find_records([ 0.1 ] * 1_024, tags: %w[ test ], min_similarity: 0.7)
|
|
285
364
|
#
|
|
286
365
|
# @return [ Array<Documentrix::Documents::Record> ] the list of matching records
|
|
287
|
-
def find_records(needle, tags: nil, max_records: nil)
|
|
366
|
+
def find_records(needle, tags: nil, max_records: nil, min_similarity: -1)
|
|
288
367
|
needle.size != @embedding_length and
|
|
289
368
|
raise ArgumentError, "needle embedding length != %s" % @embedding_length
|
|
290
369
|
needle_binary = needle.pack("f*")
|
|
291
370
|
max_records = [ max_records, size, 4_096 ].compact.min
|
|
292
371
|
records = find_records_for_tags(tags)
|
|
293
372
|
rowids_where = '(%s)' % records.transpose.last&.join(?,)
|
|
294
|
-
execute(
|
|
295
|
-
|
|
296
|
-
records.
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
373
|
+
execute(
|
|
374
|
+
%{
|
|
375
|
+
SELECT records.key, records.text, records.norm, records.source,
|
|
376
|
+
records.digest, records.tags, embeddings.embedding,
|
|
377
|
+
1 - vec_distance_cosine(?, vec_f32(embeddings.embedding)) AS similarity
|
|
378
|
+
FROM records
|
|
379
|
+
INNER JOIN embeddings ON records.embedding_id = embeddings.rowid
|
|
380
|
+
WHERE embeddings.rowid IN #{rowids_where}
|
|
381
|
+
AND embeddings.embedding MATCH ? AND similarity >= ?
|
|
382
|
+
AND embeddings.k = ?
|
|
383
|
+
ORDER BY similarity DESC
|
|
384
|
+
}, [ needle_binary, needle_binary, min_similarity, max_records ]
|
|
385
|
+
).map do |key, text, norm, source, digest, tags, embedding, similarity|
|
|
302
386
|
key = unpre(key)
|
|
303
387
|
embedding = embedding.unpack("f*")
|
|
304
388
|
tags = Documentrix::Utils::Tags.new(JSON(tags.to_s).to_a, source:)
|
|
305
|
-
convert_value_to_record(key:, text:, norm:, source:, tags:, embedding:)
|
|
389
|
+
convert_value_to_record(key:, text:, norm:, source:, digest:, tags:, embedding:, similarity:)
|
|
306
390
|
end
|
|
307
391
|
end
|
|
308
392
|
|
|
309
393
|
private
|
|
310
394
|
|
|
395
|
+
# Returns the SQL LIKE pattern for records starting with the current prefix.
|
|
396
|
+
#
|
|
397
|
+
# @return [ String ] the prefix pattern used in SQL WHERE clauses
|
|
398
|
+
def start_with_prefix
|
|
399
|
+
"#@prefix%"
|
|
400
|
+
end
|
|
401
|
+
|
|
311
402
|
# The execute method executes an SQL query on the database by calling the
|
|
312
403
|
# \@database.execute method.
|
|
313
404
|
#
|
|
@@ -362,10 +453,18 @@ class Documentrix::Documents::Cache::SQLiteCache
|
|
|
362
453
|
embedding_id integer,
|
|
363
454
|
norm float NOT NULL DEFAULT 0.0,
|
|
364
455
|
source text,
|
|
456
|
+
digest text,
|
|
365
457
|
tags json NOT NULL DEFAULT [],
|
|
366
458
|
FOREIGN KEY(embedding_id) REFERENCES embeddings(id) ON DELETE CASCADE
|
|
367
459
|
)
|
|
368
460
|
}
|
|
461
|
+
execute %{
|
|
462
|
+
CREATE TRIGGER IF NOT EXISTS delete_embedding_after_record AFTER DELETE ON records
|
|
463
|
+
FOR EACH ROW
|
|
464
|
+
BEGIN
|
|
465
|
+
DELETE FROM embeddings WHERE rowid = OLD.embedding_id;
|
|
466
|
+
END
|
|
467
|
+
}
|
|
369
468
|
nil
|
|
370
469
|
end
|
|
371
470
|
|