documentrix 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.utilsrc +107 -0
- data/CHANGES.md +28 -0
- data/Rakefile +5 -1
- data/docker-compose.yml +1 -1
- data/documentrix.gemspec +8 -8
- data/lib/documentrix/documents/cache/common.rb +102 -6
- data/lib/documentrix/documents/cache/memory_cache.rb +36 -8
- data/lib/documentrix/documents/cache/records.rb +14 -90
- data/lib/documentrix/documents/cache/redis_cache.rb +57 -25
- data/lib/documentrix/documents/cache/sqlite_cache.rb +41 -5
- data/lib/documentrix/documents.rb +62 -6
- data/lib/documentrix/utils/colorize_texts.rb +11 -0
- data/lib/documentrix/utils/math.rb +5 -0
- data/lib/documentrix/utils/tags.rb +34 -0
- data/lib/documentrix/utils.rb +10 -0
- data/lib/documentrix/version.rb +1 -1
- data/lib/documentrix.rb +6 -0
- data/spec/documentrix/documents/cache/interface_spec.rb +188 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +29 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +13 -11
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +34 -0
- data/spec/documents_spec.rb +22 -0
- data/spec/spec_helper.rb +2 -6
- metadata +11 -12
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +0 -64
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +0 -121
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c76bf8eb224b18de775f6c7652cde2dcf4409423f79b2006a7e2c50f158d2769
|
|
4
|
+
data.tar.gz: c59dac8defeea81cae51ada232c3778d8ba95a0cc6c3afc19595309cc33d5a83
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 27758319636b90ff0772ed866187bd2cef9673e203e79fc6ff72bd53ad2b43906029b011a738eb5e695f744e4c2cf733cef1ba46640488df91b23f314f3c6d80
|
|
7
|
+
data.tar.gz: 9c115acc0ec7e43eb7330f0422a69d4f0d8afea592947b994382d40d73f54502b143d43b939b924dcfdd053ea824847f044052aef89176f646ab9e1108c592e2
|
data/.utilsrc
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# vim: set ft=ruby:
|
|
2
|
+
|
|
3
|
+
search do
|
|
4
|
+
prune_dirs /\A(\.svn|\.git|\.terraform|CVS|tmp|coverage|corpus|pkg|\.yardoc|doc)\z/
|
|
5
|
+
skip_files /(\A\.|\.sw[pon]\z|\.(log|fnm|jpg|jpeg|png|pdf|svg)\z|\A(tags|cscope\.out)\z|~\z)/i
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
discover do
|
|
9
|
+
prune_dirs /\A(\.svn|\.git|\.terraform|\.yardoc|CVS|tmp|coverage|corpus|pkg|\.yardoc|doc)\z/
|
|
10
|
+
skip_files /(\A\.|\.sw[pon]\z|\.log\z|~\z)/
|
|
11
|
+
index_expire_after 3_600
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
strip_spaces do
|
|
15
|
+
prune_dirs /\A(\..*|CVS|pkg|\.yardoc)\z/
|
|
16
|
+
skip_files /(\A\.|\.sw[pon]\z|\.log\z|~\z)/
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
probe do
|
|
20
|
+
test_framework :rspec
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
ssh_tunnel do
|
|
24
|
+
terminal_multiplexer :tmux
|
|
25
|
+
login_session "/home/#{ENV['USER']}"
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
classify do
|
|
29
|
+
shift_path_by_default 1
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
code_indexer do
|
|
33
|
+
verbose false
|
|
34
|
+
|
|
35
|
+
gems = %w[
|
|
36
|
+
all_images
|
|
37
|
+
amatch
|
|
38
|
+
base64
|
|
39
|
+
bigdecimal
|
|
40
|
+
complex_config
|
|
41
|
+
connection_pool
|
|
42
|
+
date
|
|
43
|
+
debug
|
|
44
|
+
diff-lcs
|
|
45
|
+
docile
|
|
46
|
+
documentrix
|
|
47
|
+
erb
|
|
48
|
+
excon
|
|
49
|
+
fileutils
|
|
50
|
+
gem_hadar
|
|
51
|
+
infobar
|
|
52
|
+
io-console
|
|
53
|
+
irb
|
|
54
|
+
json
|
|
55
|
+
kramdown
|
|
56
|
+
kramdown-ansi
|
|
57
|
+
kramdown-parser-gfm
|
|
58
|
+
logger
|
|
59
|
+
mize
|
|
60
|
+
more_math
|
|
61
|
+
net-http
|
|
62
|
+
nokogiri
|
|
63
|
+
numo-narray-alt
|
|
64
|
+
ollama-ruby
|
|
65
|
+
openssl
|
|
66
|
+
ostruct
|
|
67
|
+
pp
|
|
68
|
+
prettyprint
|
|
69
|
+
prism
|
|
70
|
+
psych
|
|
71
|
+
racc
|
|
72
|
+
rake
|
|
73
|
+
rdoc
|
|
74
|
+
readline
|
|
75
|
+
redis
|
|
76
|
+
redis-client
|
|
77
|
+
reline
|
|
78
|
+
rexml
|
|
79
|
+
rspec
|
|
80
|
+
rspec-core
|
|
81
|
+
rspec-expectations
|
|
82
|
+
rspec-mocks
|
|
83
|
+
rspec-support
|
|
84
|
+
search_ui
|
|
85
|
+
shellwords
|
|
86
|
+
simplecov
|
|
87
|
+
simplecov-html
|
|
88
|
+
simplecov_json_formatter
|
|
89
|
+
sqlite-vec
|
|
90
|
+
sqlite3
|
|
91
|
+
stringio
|
|
92
|
+
sync
|
|
93
|
+
term-ansicolor
|
|
94
|
+
terminal-table
|
|
95
|
+
tins
|
|
96
|
+
tsort
|
|
97
|
+
unicode-display_width
|
|
98
|
+
unicode-emoji
|
|
99
|
+
uri
|
|
100
|
+
yaml
|
|
101
|
+
yard
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
paths {
|
|
105
|
+
%w[ lib ] + gems.map { `bundle show #{it}` }.map(&:chomp)
|
|
106
|
+
}
|
|
107
|
+
end
|
data/CHANGES.md
CHANGED
|
@@ -1,5 +1,33 @@
|
|
|
1
1
|
# Changes
|
|
2
2
|
|
|
3
|
+
## 2026-03-29 v0.1.0
|
|
4
|
+
|
|
5
|
+
- Added `Documentrix::Documents#rename_collection` to rename a collection and
|
|
6
|
+
delegate key moving to the cache.
|
|
7
|
+
- Extended `Documentrix::Documents::Cache::Common#pre` and `#unpre` to accept
|
|
8
|
+
an optional `prefix` argument.
|
|
9
|
+
- Implemented `Documentrix::Documents::Cache::MemoryCache#move_prefix`,
|
|
10
|
+
`RedisCache#move_prefix`, and `SQLiteCache#move_prefix` for atomic key
|
|
11
|
+
mass‑move operations.
|
|
12
|
+
- Updated unit tests to cover the new rename and prefix‑moving functionality.
|
|
13
|
+
- Switched to `GemHadar::SimpleCov` for test coverage.
|
|
14
|
+
- Removed the `redis_backed_memory_cache` implementation and its spec.
|
|
15
|
+
- Added a `.utilsrc` configuration file defining blocks for `search`,
|
|
16
|
+
`discover`, `strip_spaces`, `probe`, `ssh_tunnel`, `classify`, and
|
|
17
|
+
`code_indexer`.
|
|
18
|
+
- Reordered Docker `apk add` commands and added friendly echo messages during
|
|
19
|
+
image build and test phases.
|
|
20
|
+
- Added `fail_fast: true` flag to the Docker file definition.
|
|
21
|
+
- Added interface spec for cache implementations (`memory`, `Redis`, `SQLite`).
|
|
22
|
+
- Included `Set` requirement for older Ruby versions.
|
|
23
|
+
- Removed unused expiry functionality from `RedisCache`.
|
|
24
|
+
- Refactored cache system to use explicit inheritance, moving common methods
|
|
25
|
+
into `Documentrix::Documents::Cache::Common` and renaming `clear` to
|
|
26
|
+
`clear_all_with_prefix`.
|
|
27
|
+
- Updated documentation with detailed RDoc comments.
|
|
28
|
+
- Updated CI configuration to use `bundle exec` and cleaned up `Gemfile.lock`.
|
|
29
|
+
- Added `changelog` configuration in `Rakefile` to support `CHANGES.md`.
|
|
30
|
+
|
|
3
31
|
## 2025-12-20 v0.0.4
|
|
4
32
|
|
|
5
33
|
- Added `openssl-dev` to the package list in `.all_images.yml` for Docker
|
data/Rakefile
CHANGED
|
@@ -24,6 +24,10 @@ GemHadar do
|
|
|
24
24
|
'.rspec'
|
|
25
25
|
readme 'README.md'
|
|
26
26
|
|
|
27
|
+
changelog do
|
|
28
|
+
filename 'CHANGES.md'
|
|
29
|
+
end
|
|
30
|
+
|
|
27
31
|
required_ruby_version '>= 3.1'
|
|
28
32
|
|
|
29
33
|
dependency 'infobar', '~> 0.9'
|
|
@@ -36,7 +40,7 @@ GemHadar do
|
|
|
36
40
|
dependency 'redis', '~> 5.0'
|
|
37
41
|
dependency 'more_math', '~> 1.1'
|
|
38
42
|
|
|
39
|
-
development_dependency 'all_images', '~> 0.
|
|
43
|
+
development_dependency 'all_images', '~> 0.12'
|
|
40
44
|
development_dependency 'rspec', '~> 3.2'
|
|
41
45
|
development_dependency 'kramdown', '~> 2.0'
|
|
42
46
|
development_dependency 'debug'
|
data/docker-compose.yml
CHANGED
data/documentrix.gemspec
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
-
# stub: documentrix 0.0
|
|
2
|
+
# stub: documentrix 0.1.0 ruby lib
|
|
3
3
|
|
|
4
4
|
Gem::Specification.new do |s|
|
|
5
5
|
s.name = "documentrix".freeze
|
|
6
|
-
s.version = "0.0
|
|
6
|
+
s.version = "0.1.0".freeze
|
|
7
7
|
|
|
8
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
|
9
9
|
s.require_paths = ["lib".freeze]
|
|
@@ -11,20 +11,20 @@ Gem::Specification.new do |s|
|
|
|
11
11
|
s.date = "1980-01-02"
|
|
12
12
|
s.description = "The Ruby library, Documentrix, is designed to provide a way to build and\nquery vector databases for applications in natural language processing\n(NLP) and large language models (LLMs). It allows users to store and\nretrieve dense vector embeddings for text strings.\n".freeze
|
|
13
13
|
s.email = "flori@ping.de".freeze
|
|
14
|
-
s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/
|
|
15
|
-
s.files = [".envrc".freeze, ".yardopts".freeze, "CHANGES.md".freeze, "Gemfile".freeze, "LICENSE".freeze, "README.md".freeze, "Rakefile".freeze, "docker-compose.yml".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/
|
|
14
|
+
s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze]
|
|
15
|
+
s.files = [".envrc".freeze, ".utilsrc".freeze, ".yardopts".freeze, "CHANGES.md".freeze, "Gemfile".freeze, "LICENSE".freeze, "README.md".freeze, "Rakefile".freeze, "docker-compose.yml".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze, "redis/redis.conf".freeze, "spec/assets/embeddings.json".freeze, "spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
16
16
|
s.homepage = "https://github.com/flori/documentrix".freeze
|
|
17
17
|
s.licenses = ["MIT".freeze]
|
|
18
18
|
s.rdoc_options = ["--title".freeze, "Documentrix - Ruby library for embedding vector database".freeze, "--main".freeze, "README.md".freeze]
|
|
19
19
|
s.required_ruby_version = Gem::Requirement.new(">= 3.1".freeze)
|
|
20
|
-
s.rubygems_version = "4.0.
|
|
20
|
+
s.rubygems_version = "4.0.8".freeze
|
|
21
21
|
s.summary = "Ruby library for embedding vector database".freeze
|
|
22
|
-
s.test_files = ["spec/documentrix/documents/cache/
|
|
22
|
+
s.test_files = ["spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
23
23
|
|
|
24
24
|
s.specification_version = 4
|
|
25
25
|
|
|
26
|
-
s.add_development_dependency(%q<gem_hadar>.freeze, ["
|
|
27
|
-
s.add_development_dependency(%q<all_images>.freeze, ["~> 0.
|
|
26
|
+
s.add_development_dependency(%q<gem_hadar>.freeze, [">= 2.17.0".freeze])
|
|
27
|
+
s.add_development_dependency(%q<all_images>.freeze, ["~> 0.12".freeze])
|
|
28
28
|
s.add_development_dependency(%q<rspec>.freeze, ["~> 3.2".freeze])
|
|
29
29
|
s.add_development_dependency(%q<kramdown>.freeze, ["~> 2.0".freeze])
|
|
30
30
|
s.add_development_dependency(%q<debug>.freeze, [">= 0".freeze])
|
|
@@ -1,5 +1,18 @@
|
|
|
1
|
+
# Common interface for document caches
|
|
2
|
+
#
|
|
3
|
+
# This module defines the standard interface that all document cache
|
|
4
|
+
# implementations must adhere to. It provides shared functionality for managing
|
|
5
|
+
# cached document embeddings, including methods for setting, retrieving, and
|
|
6
|
+
# deleting cache entries, as well as querying and filtering cached data based
|
|
7
|
+
# on tags and similarity searches.
|
|
8
|
+
#
|
|
9
|
+
# The module includes methods for prefix management, collection enumeration,
|
|
10
|
+
# tag extraction, and cache clearing operations, ensuring consistent behavior
|
|
11
|
+
# across different cache backends such as
|
|
12
|
+
# memory, Redis, and SQLite.
|
|
1
13
|
module Documentrix::Documents::Cache::Common
|
|
2
14
|
include Documentrix::Utils::Math
|
|
15
|
+
include Enumerable
|
|
3
16
|
|
|
4
17
|
# The initialize method sets up the Documentrix::Documents::Cache instance
|
|
5
18
|
# by setting its prefix attribute to the given value.
|
|
@@ -27,17 +40,100 @@ module Documentrix::Documents::Cache::Common
|
|
|
27
40
|
# Returns a string representing the given `key` prefixed with the defined
|
|
28
41
|
# prefix.
|
|
29
42
|
#
|
|
30
|
-
# @param key [String] the key to
|
|
31
|
-
# @
|
|
32
|
-
|
|
33
|
-
|
|
43
|
+
# @param key [String] the key to prefix
|
|
44
|
+
# @param prefix [String] the prefix to use (defaults to the cache's prefix)
|
|
45
|
+
# @return [String] the prefixed key
|
|
46
|
+
def pre(key, prefix: @prefix)
|
|
47
|
+
[ prefix, key ].join
|
|
34
48
|
end
|
|
35
49
|
|
|
36
50
|
# Returns a string with the prefix removed from the given `key`.
|
|
37
51
|
#
|
|
38
52
|
# @param key [String] the input string containing the prefix.
|
|
53
|
+
# @param prefix [String] the prefix to use (defaults to the cache's prefix)
|
|
39
54
|
# @return [String] the input string without the prefix.
|
|
40
|
-
def unpre(key)
|
|
41
|
-
key.sub(/\A
|
|
55
|
+
def unpre(key, prefix: @prefix)
|
|
56
|
+
key.sub(/\A#{prefix}/, '')
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# The find_records method finds records that match the given needle and
|
|
60
|
+
# tags.
|
|
61
|
+
#
|
|
62
|
+
# @param needle [ Array ] an array containing the embedding vector
|
|
63
|
+
# @param tags [ String, Array ] a string or array of strings representing the tags to search for
|
|
64
|
+
# @param max_records [ Integer ] the maximum number of records to return (accepted but not applied in this method)
|
|
65
|
+
#
|
|
66
|
+
# @yield [ record ]
|
|
67
|
+
#
|
|
68
|
+
# @return [ Array<Documentrix::Documents::Record> ] an array containing the matching records
|
|
69
|
+
def find_records(needle, tags: nil, max_records: nil)
|
|
70
|
+
tags = Documentrix::Utils::Tags.new(Array(tags)).to_a
|
|
71
|
+
records = self
|
|
72
|
+
if tags.present?
|
|
73
|
+
records = records.select { |_key, record| (tags & record.tags).size >= 1 }
|
|
74
|
+
end
|
|
75
|
+
needle_norm = norm(needle)
|
|
76
|
+
records = records.sort_by { |key, record|
|
|
77
|
+
record.key = key
|
|
78
|
+
record.similarity = cosine_similarity(
|
|
79
|
+
a: needle,
|
|
80
|
+
b: record.embedding,
|
|
81
|
+
a_norm: needle_norm,
|
|
82
|
+
b_norm: record.norm,
|
|
83
|
+
)
|
|
84
|
+
}
|
|
85
|
+
records.transpose.last&.reverse.to_a
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Returns a set of unique tags found in the cache records.
|
|
89
|
+
#
|
|
90
|
+
# This method iterates through all records in the cache and collects unique
|
|
91
|
+
# tags from each record's tags collection. It constructs a new
|
|
92
|
+
# Documentrix::Utils::Tags object containing all the unique tags encountered.
|
|
93
|
+
#
|
|
94
|
+
# @return [Documentrix::Utils::Tags] a set of unique tags from all records in
|
|
95
|
+
# the cache
|
|
96
|
+
def tags
|
|
97
|
+
each_with_object(Documentrix::Utils::Tags.new) do |(_, record), t|
|
|
98
|
+
record.tags.each do |tag|
|
|
99
|
+
t.add(tag, source: record.source)
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# The clear_for_tags method removes all records from the cache that have tags
|
|
105
|
+
# matching any of the provided tags.
|
|
106
|
+
#
|
|
107
|
+
# @param tags [Array<String>] an array of tag names to filter records by
|
|
108
|
+
#
|
|
109
|
+
# @return [self] self
|
|
110
|
+
def clear_for_tags(tags)
|
|
111
|
+
each do |key, record|
|
|
112
|
+
if (tags & record.tags.to_a).size >= 1
|
|
113
|
+
delete(unpre(key))
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
self
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
# The clear method removes cached records based on the provided tags or
|
|
120
|
+
# clears all records with the current prefix.
|
|
121
|
+
#
|
|
122
|
+
# When tags are provided, it removes only the records that have matching
|
|
123
|
+
# tags. If no tags are provided, it removes all records that have keys
|
|
124
|
+
# starting with the current prefix.
|
|
125
|
+
#
|
|
126
|
+
# @param tags [NilClass, Array<String>] an array of tag names to filter
|
|
127
|
+
# records by, or nil to clear all records
|
|
128
|
+
#
|
|
129
|
+
# @return [self] returns the cache instance for method chaining
|
|
130
|
+
def clear(tags: nil)
|
|
131
|
+
tags = Documentrix::Utils::Tags.new(tags).to_a
|
|
132
|
+
if tags.present?
|
|
133
|
+
clear_for_tags(tags)
|
|
134
|
+
else
|
|
135
|
+
clear_all_with_prefix
|
|
136
|
+
end
|
|
137
|
+
self
|
|
42
138
|
end
|
|
43
139
|
end
|
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
require 'documentrix/documents/cache/common'
|
|
2
2
|
|
|
3
|
+
# MemoryCache is an in-memory cache implementation for document embeddings.
|
|
4
|
+
#
|
|
5
|
+
# This class provides a cache store for document embeddings using a hash-based
|
|
6
|
+
# in-memory storage mechanism. It implements the common cache interface
|
|
7
|
+
# defined in Documentrix::Documents::Cache::Common and supports operations
|
|
8
|
+
# such as setting, retrieving, and deleting cached entries, as well as
|
|
9
|
+
# iterating over cached items.
|
|
10
|
+
#
|
|
11
|
+
# The cache uses a prefix to namespace keys and supports clearing entries
|
|
12
|
+
# based on prefixes or specific tags. It is designed to be used as a
|
|
13
|
+
# temporary storage mechanism during processing and is not persistent
|
|
14
|
+
# across application restarts.
|
|
3
15
|
class Documentrix::Documents::MemoryCache
|
|
4
16
|
include Documentrix::Documents::Cache::Common
|
|
5
17
|
|
|
@@ -59,32 +71,48 @@ class Documentrix::Documents::MemoryCache
|
|
|
59
71
|
count
|
|
60
72
|
end
|
|
61
73
|
|
|
62
|
-
# The
|
|
63
|
-
# with the prefix `prefix`.
|
|
74
|
+
# The clear_all_with_prefix method removes all records from the cache that
|
|
75
|
+
# have keys starting with the prefix `prefix`.
|
|
64
76
|
#
|
|
65
77
|
# @return [ Documentrix::Documents::MemoryCache ] self
|
|
66
|
-
def
|
|
78
|
+
def clear_all_with_prefix
|
|
67
79
|
@data.delete_if { |key, _| key.start_with?(@prefix) }
|
|
68
80
|
self
|
|
69
81
|
end
|
|
70
82
|
|
|
83
|
+
# Move all keys that start with +old_prefix+ into a new prefix.
|
|
84
|
+
#
|
|
85
|
+
# This helper is used when a collection is renamed. It iterates over all
|
|
86
|
+
# cached entries, selects those whose keys start with *old_prefix*, strips
|
|
87
|
+
# that prefix, then re‑inserts the entry with the *new_prefix* instead.
|
|
88
|
+
#
|
|
89
|
+
# @param old_prefix [String] The prefix to look for on existing keys.
|
|
90
|
+
# @param new_prefix [String] The prefix that will replace *old_prefix*.
|
|
91
|
+
# @return [Documentrix::Documents::MemoryCache] Returns `self` to allow
|
|
92
|
+
# chaining.
|
|
93
|
+
def move_prefix(old_prefix, new_prefix)
|
|
94
|
+
new_data = @data.dup
|
|
95
|
+
full_each do |key, value|
|
|
96
|
+
key.start_with?(old_prefix) or next
|
|
97
|
+
unpre_key = unpre(key, prefix: old_prefix)
|
|
98
|
+
new_data[pre(unpre_key, prefix: new_prefix)] = new_data.delete(key)
|
|
99
|
+
end
|
|
100
|
+
@data.replace(new_data)
|
|
101
|
+
self
|
|
102
|
+
end
|
|
103
|
+
|
|
71
104
|
# The each method iterates over the cache's keys and values under a given
|
|
72
105
|
# prefix `prefix`.
|
|
73
106
|
#
|
|
74
107
|
# @yield [key, value] Each key-value pair in the cache
|
|
75
|
-
#
|
|
76
|
-
# @return [void]
|
|
77
108
|
def each(&block)
|
|
78
109
|
@data.select { |key,| key.start_with?(@prefix) }.each(&block)
|
|
79
110
|
end
|
|
80
|
-
include Enumerable
|
|
81
111
|
|
|
82
112
|
# The full_each method iterates over the data hash and yields each key-value
|
|
83
113
|
# pair to the given block regardless of the prefix `prefix`.
|
|
84
114
|
#
|
|
85
115
|
# @yield [key, value] Each key-value pair in the data hash
|
|
86
|
-
#
|
|
87
|
-
# @return [void]
|
|
88
116
|
def full_each(&block)
|
|
89
117
|
@data.each(&block)
|
|
90
118
|
end
|
|
@@ -1,4 +1,18 @@
|
|
|
1
|
+
# Module for cache record definitions used in Documentrix document caching.
|
|
2
|
+
#
|
|
3
|
+
# This module provides the Record class for managing
|
|
4
|
+
# cached document embeddings and their associated metadata in the Documentrix
|
|
5
|
+
# library's caching system.
|
|
1
6
|
module Documentrix::Documents::Cache::Records
|
|
7
|
+
# A record class for caching document embeddings and their associated
|
|
8
|
+
# metadata.
|
|
9
|
+
#
|
|
10
|
+
# This class extends JSON::GenericObject and is used to represent cached
|
|
11
|
+
# document entries in the Documentrix library. It stores text content,
|
|
12
|
+
# embedding vectors, normalization values, source information, and tags
|
|
13
|
+
# associated with each document record. The class provides methods for string
|
|
14
|
+
# representation, tag handling, and equality comparison based on text
|
|
15
|
+
# content.
|
|
2
16
|
class Record < JSON::GenericObject
|
|
3
17
|
# The initialize method sets default values for the text and norm
|
|
4
18
|
# attributes.
|
|
@@ -41,94 +55,4 @@ module Documentrix::Documents::Cache::Records
|
|
|
41
55
|
|
|
42
56
|
alias inspect to_s
|
|
43
57
|
end
|
|
44
|
-
|
|
45
|
-
module RedisFullEach
|
|
46
|
-
# The full_each method iterates over all records in the cache and yields
|
|
47
|
-
# them to the block.
|
|
48
|
-
#
|
|
49
|
-
# @yield [ key, value ] where key is the record's key and value is the record itself
|
|
50
|
-
def full_each(&block)
|
|
51
|
-
redis.scan_each(match: [ Documentrix::Documents, ?* ] * ?-) do |key|
|
|
52
|
-
value = redis.get(key) or next
|
|
53
|
-
value = JSON(value, object_class: Documentrix::Documents::Record)
|
|
54
|
-
block.(key, value)
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
module FindRecords
|
|
60
|
-
# The find_records method finds records that match the given needle and
|
|
61
|
-
# tags.
|
|
62
|
-
#
|
|
63
|
-
# @param needle [ Array ] an array containing the embedding vector
|
|
64
|
-
# @param tags [ String, Array ] a string or array of strings representing the tags to search for
|
|
65
|
-
# @param max_records [ Integer ] the maximum number of records to return
|
|
66
|
-
#
|
|
67
|
-
# @yield [ record ]
|
|
68
|
-
#
|
|
69
|
-
# @return [ Array<Documentrix::Documents::Records> ] an array containing the matching records
|
|
70
|
-
def find_records(needle, tags: nil, max_records: nil)
|
|
71
|
-
tags = Documentrix::Utils::Tags.new(Array(tags)).to_a
|
|
72
|
-
records = self
|
|
73
|
-
if tags.present?
|
|
74
|
-
records = records.select { |_key, record| (tags & record.tags).size >= 1 }
|
|
75
|
-
end
|
|
76
|
-
needle_norm = norm(needle)
|
|
77
|
-
records = records.sort_by { |key, record|
|
|
78
|
-
record.key = key
|
|
79
|
-
record.similarity = cosine_similarity(
|
|
80
|
-
a: needle,
|
|
81
|
-
b: record.embedding,
|
|
82
|
-
a_norm: needle_norm,
|
|
83
|
-
b_norm: record.norm,
|
|
84
|
-
)
|
|
85
|
-
}
|
|
86
|
-
records.transpose.last&.reverse.to_a
|
|
87
|
-
end
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
module Tags
|
|
91
|
-
# The clear method removes all records that match the given tags from the
|
|
92
|
-
# cache.
|
|
93
|
-
#
|
|
94
|
-
# @param tags [ Array<String> ] an array of tag names
|
|
95
|
-
#
|
|
96
|
-
# @example
|
|
97
|
-
# documents.clear(tags: %w[ foo bar ])
|
|
98
|
-
#
|
|
99
|
-
# @return [ self ]
|
|
100
|
-
def clear(tags: nil)
|
|
101
|
-
tags = Documentrix::Utils::Tags.new(tags).to_a
|
|
102
|
-
if tags.present?
|
|
103
|
-
if respond_to?(:clear_for_tags)
|
|
104
|
-
clear_for_tags(tags)
|
|
105
|
-
else
|
|
106
|
-
each do |key, record|
|
|
107
|
-
if (tags & record.tags.to_a).size >= 1
|
|
108
|
-
delete(unpre(key))
|
|
109
|
-
end
|
|
110
|
-
end
|
|
111
|
-
end
|
|
112
|
-
else
|
|
113
|
-
super()
|
|
114
|
-
end
|
|
115
|
-
self
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
# The tags method returns an array of unique tags from all records.
|
|
119
|
-
#
|
|
120
|
-
# @return [Documentrix::Utils::Tags] An instance of
|
|
121
|
-
# Documentrix::Utils::Tags containing the unique tags.
|
|
122
|
-
def tags
|
|
123
|
-
if defined? super
|
|
124
|
-
super
|
|
125
|
-
else
|
|
126
|
-
each_with_object(Documentrix::Utils::Tags.new) do |(_, record), t|
|
|
127
|
-
record.tags.each do |tag|
|
|
128
|
-
t.add(tag, source: record.source)
|
|
129
|
-
end
|
|
130
|
-
end
|
|
131
|
-
end
|
|
132
|
-
end
|
|
133
|
-
end
|
|
134
58
|
end
|
|
@@ -1,6 +1,18 @@
|
|
|
1
1
|
require 'documentrix/documents/cache/common'
|
|
2
2
|
require 'redis'
|
|
3
3
|
|
|
4
|
+
# RedisCache is a cache implementation that uses Redis for storing document
|
|
5
|
+
# embeddings and related metadata.
|
|
6
|
+
#
|
|
7
|
+
# This class provides a persistent cache storage solution for document
|
|
8
|
+
# embeddings, leveraging Redis's capabilities to store both the embedding
|
|
9
|
+
# vectors and associated text data, tags, and source information. It supports
|
|
10
|
+
# similarity searches via the find_records method shared through Cache::Common.
|
|
11
|
+
#
|
|
12
|
+
# @example
|
|
13
|
+
# cache = Documentrix::Documents::RedisCache.new(prefix: 'docs-', url: 'redis://localhost:6379')
|
|
14
|
+
# cache['key'] = { text: 'example', embedding: [0.1, 0.2, 0.3] }
|
|
15
|
+
# value = cache['key']
|
|
4
16
|
class Documentrix::Documents::RedisCache
|
|
5
17
|
include Documentrix::Documents::Cache::Common
|
|
6
18
|
|
|
@@ -11,11 +23,10 @@ class Documentrix::Documents::RedisCache
|
|
|
11
23
|
# @param [String] prefix the string to be used as the prefix for this cache
|
|
12
24
|
# @param [String] url the URL of the Redis server (default: ENV['REDIS_URL'])
|
|
13
25
|
# @param [Class] object_class the class of objects stored in Redis (default: nil)
|
|
14
|
-
|
|
15
|
-
def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil, ex: nil)
|
|
26
|
+
def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil)
|
|
16
27
|
super(prefix:)
|
|
17
28
|
url or raise ArgumentError, 'require redis url'
|
|
18
|
-
@url, @object_class
|
|
29
|
+
@url, @object_class = url, object_class
|
|
19
30
|
end
|
|
20
31
|
|
|
21
32
|
attr_reader :object_class # the class of objects stored in the cache
|
|
@@ -35,7 +46,7 @@ class Documentrix::Documents::RedisCache
|
|
|
35
46
|
def [](key)
|
|
36
47
|
value = redis.get(pre(key))
|
|
37
48
|
unless value.nil?
|
|
38
|
-
object_class ? JSON(value, object_class:) : JSON(value)
|
|
49
|
+
object_class ? JSON.parse(value, object_class:) : JSON.parse(value)
|
|
39
50
|
end
|
|
40
51
|
end
|
|
41
52
|
|
|
@@ -53,28 +64,13 @@ class Documentrix::Documents::RedisCache
|
|
|
53
64
|
#
|
|
54
65
|
# @param [String] key the string representation of the key
|
|
55
66
|
# @param [Object] value the object to be stored under the given key
|
|
56
|
-
# @option ex [Integer] ex the expiration time in seconds (default: nil)
|
|
57
67
|
#
|
|
58
68
|
# @return [Object] self
|
|
59
|
-
def set(key, value
|
|
60
|
-
|
|
61
|
-
if !ex.nil? && ex < 1
|
|
62
|
-
redis.del(pre(key))
|
|
63
|
-
else
|
|
64
|
-
redis.set(pre(key), JSON.generate(value), ex:)
|
|
65
|
-
end
|
|
69
|
+
def set(key, value)
|
|
70
|
+
redis.set(pre(key), JSON.generate(value))
|
|
66
71
|
value
|
|
67
72
|
end
|
|
68
73
|
|
|
69
|
-
# The ttl method returns the time-to-live (TTL) value for the given key
|
|
70
|
-
#
|
|
71
|
-
# @param [String] key the string representation of the key
|
|
72
|
-
#
|
|
73
|
-
# @return [Integer, nil] the TTL value if it exists in Redis, or nil otherwise
|
|
74
|
-
def ttl(key)
|
|
75
|
-
redis.ttl(pre(key))
|
|
76
|
-
end
|
|
77
|
-
|
|
78
74
|
# The key? method checks if the given key exists in Redis by calling the
|
|
79
75
|
# redis.exists? method
|
|
80
76
|
#
|
|
@@ -105,12 +101,33 @@ class Documentrix::Documents::RedisCache
|
|
|
105
101
|
s
|
|
106
102
|
end
|
|
107
103
|
|
|
108
|
-
# The
|
|
109
|
-
# prefix from this cache instance.
|
|
104
|
+
# The clear_all_with_prefix method removes all key-value pairs associated
|
|
105
|
+
# with the given prefix from this cache instance.
|
|
110
106
|
#
|
|
111
107
|
# @return [Documentrix::Documents::RedisCache] self
|
|
112
|
-
def
|
|
108
|
+
def clear_all_with_prefix
|
|
113
109
|
redis.scan_each(match: "#@prefix*") { |key| redis.del(key) }
|
|
110
|
+
defined? super and super
|
|
111
|
+
self
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
# Renames all keys that start with <tt>old_prefix</tt> to use
|
|
115
|
+
# <tt>new_prefix</tt>. The method iterates over every affected key,
|
|
116
|
+
# reconstructs the new key name (preserving the part of the key that follows
|
|
117
|
+
# the old prefix), writes the value under the new name, and deletes the old
|
|
118
|
+
# key.
|
|
119
|
+
#
|
|
120
|
+
# @param old_prefix [String] The prefix that currently identifies the target keys.
|
|
121
|
+
# @param new_prefix [String] The prefix that should replace <tt>old_prefix</tt>.
|
|
122
|
+
#
|
|
123
|
+
# @return [self] The cache instance, facilitating method chaining.
|
|
124
|
+
def move_prefix(old_prefix, new_prefix)
|
|
125
|
+
full_each(prefix: '') do |key, value|
|
|
126
|
+
key.start_with?(old_prefix) or next
|
|
127
|
+
unpre_key = unpre(key, prefix: old_prefix)
|
|
128
|
+
redis.set(pre(unpre_key, prefix: new_prefix), JSON.generate(value))
|
|
129
|
+
redis.del(key)
|
|
130
|
+
end
|
|
114
131
|
self
|
|
115
132
|
end
|
|
116
133
|
|
|
@@ -121,8 +138,23 @@ class Documentrix::Documents::RedisCache
|
|
|
121
138
|
#
|
|
122
139
|
# @return [self] self
|
|
123
140
|
def each(&block)
|
|
141
|
+
block or return enum_for(__method__)
|
|
142
|
+
|
|
124
143
|
redis.scan_each(match: "#@prefix*") { |key| block.(key, self[unpre(key)]) }
|
|
125
144
|
self
|
|
126
145
|
end
|
|
127
|
-
|
|
146
|
+
|
|
147
|
+
# The full_each method iterates over all records in the cache and yields
|
|
148
|
+
# them to the block.
|
|
149
|
+
#
|
|
150
|
+
# @yield [ key, value ] where key is the record's key and value is the record itself
|
|
151
|
+
def full_each(prefix: 'Documents-', &block)
|
|
152
|
+
block or return enum_for(__method__, prefix:)
|
|
153
|
+
|
|
154
|
+
redis.scan_each(match: prefix + ?*) do |key|
|
|
155
|
+
value = redis.get(key) or next
|
|
156
|
+
value = object_class ? JSON.parse(value, object_class:) : JSON.parse(value)
|
|
157
|
+
block.(key, value)
|
|
158
|
+
end
|
|
159
|
+
end
|
|
128
160
|
end
|