documentrix 0.0.4 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.utilsrc +107 -0
- data/CHANGES.md +33 -0
- data/Rakefile +6 -2
- data/docker-compose.yml +1 -1
- data/documentrix.gemspec +9 -9
- data/lib/documentrix/documents/cache/common.rb +102 -6
- data/lib/documentrix/documents/cache/memory_cache.rb +36 -8
- data/lib/documentrix/documents/cache/records.rb +14 -90
- data/lib/documentrix/documents/cache/redis_cache.rb +57 -25
- data/lib/documentrix/documents/cache/sqlite_cache.rb +41 -5
- data/lib/documentrix/documents.rb +62 -6
- data/lib/documentrix/utils/colorize_texts.rb +11 -0
- data/lib/documentrix/utils/math.rb +5 -0
- data/lib/documentrix/utils/tags.rb +34 -0
- data/lib/documentrix/utils.rb +10 -0
- data/lib/documentrix/version.rb +1 -1
- data/lib/documentrix.rb +6 -0
- data/spec/documentrix/documents/cache/interface_spec.rb +188 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +29 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +13 -11
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +34 -0
- data/spec/documents_spec.rb +22 -0
- data/spec/spec_helper.rb +2 -6
- metadata +15 -16
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +0 -64
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +0 -121
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ef2b0d8150c99cc22cc17d7d29d0331729b06d116e593fd32289852eb7739228
|
|
4
|
+
data.tar.gz: 386b85971a711fd6ba73aad746e98df33c14109c5524b48dd8fa20f3059fc875
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7b1abe342b523199e58724d1b9ec66ce56654389dc010fc59c2847647825876b8c03e1511c5b007dd97ee8f19f4e79caf132760dbb549906d5febdc9ac403bc1
|
|
7
|
+
data.tar.gz: 5b638e7af884173350e48dcdb83a81d2adeb6c6a4f90dc20243b690d888f083d95edcd34e00fd3bffc9435aa8a29db151b0c1c6311f696d39379e397379ef9df
|
data/.utilsrc
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# vim: set ft=ruby:
|
|
2
|
+
|
|
3
|
+
search do
|
|
4
|
+
prune_dirs /\A(\.svn|\.git|\.terraform|CVS|tmp|coverage|corpus|pkg|\.yardoc|doc)\z/
|
|
5
|
+
skip_files /(\A\.|\.sw[pon]\z|\.(log|fnm|jpg|jpeg|png|pdf|svg)\z|\A(tags|cscope\.out)\z|~\z)/i
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
discover do
|
|
9
|
+
prune_dirs /\A(\.svn|\.git|\.terraform|\.yardoc|CVS|tmp|coverage|corpus|pkg|\.yardoc|doc)\z/
|
|
10
|
+
skip_files /(\A\.|\.sw[pon]\z|\.log\z|~\z)/
|
|
11
|
+
index_expire_after 3_600
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
strip_spaces do
|
|
15
|
+
prune_dirs /\A(\..*|CVS|pkg|\.yardoc)\z/
|
|
16
|
+
skip_files /(\A\.|\.sw[pon]\z|\.log\z|~\z)/
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
probe do
|
|
20
|
+
test_framework :rspec
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
ssh_tunnel do
|
|
24
|
+
terminal_multiplexer :tmux
|
|
25
|
+
login_session "/home/#{ENV['USER']}"
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
classify do
|
|
29
|
+
shift_path_by_default 1
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
code_indexer do
|
|
33
|
+
verbose false
|
|
34
|
+
|
|
35
|
+
gems = %w[
|
|
36
|
+
all_images
|
|
37
|
+
amatch
|
|
38
|
+
base64
|
|
39
|
+
bigdecimal
|
|
40
|
+
complex_config
|
|
41
|
+
connection_pool
|
|
42
|
+
date
|
|
43
|
+
debug
|
|
44
|
+
diff-lcs
|
|
45
|
+
docile
|
|
46
|
+
documentrix
|
|
47
|
+
erb
|
|
48
|
+
excon
|
|
49
|
+
fileutils
|
|
50
|
+
gem_hadar
|
|
51
|
+
infobar
|
|
52
|
+
io-console
|
|
53
|
+
irb
|
|
54
|
+
json
|
|
55
|
+
kramdown
|
|
56
|
+
kramdown-ansi
|
|
57
|
+
kramdown-parser-gfm
|
|
58
|
+
logger
|
|
59
|
+
mize
|
|
60
|
+
more_math
|
|
61
|
+
net-http
|
|
62
|
+
nokogiri
|
|
63
|
+
numo-narray-alt
|
|
64
|
+
ollama-ruby
|
|
65
|
+
openssl
|
|
66
|
+
ostruct
|
|
67
|
+
pp
|
|
68
|
+
prettyprint
|
|
69
|
+
prism
|
|
70
|
+
psych
|
|
71
|
+
racc
|
|
72
|
+
rake
|
|
73
|
+
rdoc
|
|
74
|
+
readline
|
|
75
|
+
redis
|
|
76
|
+
redis-client
|
|
77
|
+
reline
|
|
78
|
+
rexml
|
|
79
|
+
rspec
|
|
80
|
+
rspec-core
|
|
81
|
+
rspec-expectations
|
|
82
|
+
rspec-mocks
|
|
83
|
+
rspec-support
|
|
84
|
+
search_ui
|
|
85
|
+
shellwords
|
|
86
|
+
simplecov
|
|
87
|
+
simplecov-html
|
|
88
|
+
simplecov_json_formatter
|
|
89
|
+
sqlite-vec
|
|
90
|
+
sqlite3
|
|
91
|
+
stringio
|
|
92
|
+
sync
|
|
93
|
+
term-ansicolor
|
|
94
|
+
terminal-table
|
|
95
|
+
tins
|
|
96
|
+
tsort
|
|
97
|
+
unicode-display_width
|
|
98
|
+
unicode-emoji
|
|
99
|
+
uri
|
|
100
|
+
yaml
|
|
101
|
+
yard
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
paths {
|
|
105
|
+
%w[ lib ] + gems.map { `bundle show #{it}` }.map(&:chomp)
|
|
106
|
+
}
|
|
107
|
+
end
|
data/CHANGES.md
CHANGED
|
@@ -1,5 +1,38 @@
|
|
|
1
1
|
# Changes
|
|
2
2
|
|
|
3
|
+
## 2026-03-31 v0.1.1
|
|
4
|
+
|
|
5
|
+
- Improved compatibility and reliability by ensuring the gem uses a stable,
|
|
6
|
+
newer version **0.1.8** of `sqlite-vec`.
|
|
7
|
+
|
|
8
|
+
## 2026-03-29 v0.1.0
|
|
9
|
+
|
|
10
|
+
- Added `Documentrix::Documents#rename_collection` to rename a collection and
|
|
11
|
+
delegate key moving to the cache.
|
|
12
|
+
- Extended `Documentrix::Documents::Cache::Common#pre` and `#unpre` to accept
|
|
13
|
+
an optional `prefix` argument.
|
|
14
|
+
- Implemented `Documentrix::Documents::Cache::MemoryCache#move_prefix`,
|
|
15
|
+
`RedisCache#move_prefix`, and `SQLiteCache#move_prefix` for atomic key
|
|
16
|
+
mass‑move operations.
|
|
17
|
+
- Updated unit tests to cover the new rename and prefix‑moving functionality.
|
|
18
|
+
- Switched to `GemHadar::SimpleCov` for test coverage.
|
|
19
|
+
- Removed the `redis_backed_memory_cache` implementation and its spec.
|
|
20
|
+
- Added a `.utilsrc` configuration file defining blocks for `search`,
|
|
21
|
+
`discover`, `strip_spaces`, `probe`, `ssh_tunnel`, `classify`, and
|
|
22
|
+
`code_indexer`.
|
|
23
|
+
- Reordered Docker `apk add` commands and added friendly echo messages during
|
|
24
|
+
image build and test phases.
|
|
25
|
+
- Added `fail_fast: true` flag to the Docker file definition.
|
|
26
|
+
- Added interface spec for cache implementations (`memory`, `Redis`, `SQLite`).
|
|
27
|
+
- Included `Set` requirement for older Ruby versions.
|
|
28
|
+
- Removed unused expiry functionality from `RedisCache`.
|
|
29
|
+
- Refactored cache system to use explicit inheritance, moving common methods
|
|
30
|
+
into `Documentrix::Documents::Cache::Common` and renaming `clear` to
|
|
31
|
+
`clear_all_with_prefix`.
|
|
32
|
+
- Updated documentation with detailed RDoc comments.
|
|
33
|
+
- Updated CI configuration to use `bundle exec` and cleaned up `Gemfile.lock`.
|
|
34
|
+
- Added `changelog` configuration in `Rakefile` to support `CHANGES.md`.
|
|
35
|
+
|
|
3
36
|
## 2025-12-20 v0.0.4
|
|
4
37
|
|
|
5
38
|
- Added `openssl-dev` to the package list in `.all_images.yml` for Docker
|
data/Rakefile
CHANGED
|
@@ -24,19 +24,23 @@ GemHadar do
|
|
|
24
24
|
'.rspec'
|
|
25
25
|
readme 'README.md'
|
|
26
26
|
|
|
27
|
+
changelog do
|
|
28
|
+
filename 'CHANGES.md'
|
|
29
|
+
end
|
|
30
|
+
|
|
27
31
|
required_ruby_version '>= 3.1'
|
|
28
32
|
|
|
29
33
|
dependency 'infobar', '~> 0.9'
|
|
30
34
|
dependency 'json', '~> 2.0'
|
|
31
35
|
dependency 'tins', '~> 1.34'
|
|
32
|
-
dependency 'sqlite-vec', '
|
|
36
|
+
dependency 'sqlite-vec', '>= 0.1.8'
|
|
33
37
|
dependency 'sqlite3', '~> 2.0', '>= 2.0.1'
|
|
34
38
|
dependency 'kramdown-ansi', '~> 0.0', '>= 0.0.1'
|
|
35
39
|
dependency 'numo-narray-alt', '~> 0.9'
|
|
36
40
|
dependency 'redis', '~> 5.0'
|
|
37
41
|
dependency 'more_math', '~> 1.1'
|
|
38
42
|
|
|
39
|
-
development_dependency 'all_images', '~> 0.
|
|
43
|
+
development_dependency 'all_images', '~> 0.12'
|
|
40
44
|
development_dependency 'rspec', '~> 3.2'
|
|
41
45
|
development_dependency 'kramdown', '~> 2.0'
|
|
42
46
|
development_dependency 'debug'
|
data/docker-compose.yml
CHANGED
data/documentrix.gemspec
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
|
2
|
-
# stub: documentrix 0.
|
|
2
|
+
# stub: documentrix 0.1.1 ruby lib
|
|
3
3
|
|
|
4
4
|
Gem::Specification.new do |s|
|
|
5
5
|
s.name = "documentrix".freeze
|
|
6
|
-
s.version = "0.
|
|
6
|
+
s.version = "0.1.1".freeze
|
|
7
7
|
|
|
8
8
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
|
9
9
|
s.require_paths = ["lib".freeze]
|
|
@@ -11,20 +11,20 @@ Gem::Specification.new do |s|
|
|
|
11
11
|
s.date = "1980-01-02"
|
|
12
12
|
s.description = "The Ruby library, Documentrix, is designed to provide a way to build and\nquery vector databases for applications in natural language processing\n(NLP) and large language models (LLMs). It allows users to store and\nretrieve dense vector embeddings for text strings.\n".freeze
|
|
13
13
|
s.email = "flori@ping.de".freeze
|
|
14
|
-
s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/
|
|
15
|
-
s.files = [".envrc".freeze, ".yardopts".freeze, "CHANGES.md".freeze, "Gemfile".freeze, "LICENSE".freeze, "README.md".freeze, "Rakefile".freeze, "docker-compose.yml".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/
|
|
14
|
+
s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze]
|
|
15
|
+
s.files = [".envrc".freeze, ".utilsrc".freeze, ".yardopts".freeze, "CHANGES.md".freeze, "Gemfile".freeze, "LICENSE".freeze, "README.md".freeze, "Rakefile".freeze, "docker-compose.yml".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze, "redis/redis.conf".freeze, "spec/assets/embeddings.json".freeze, "spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
16
16
|
s.homepage = "https://github.com/flori/documentrix".freeze
|
|
17
17
|
s.licenses = ["MIT".freeze]
|
|
18
18
|
s.rdoc_options = ["--title".freeze, "Documentrix - Ruby library for embedding vector database".freeze, "--main".freeze, "README.md".freeze]
|
|
19
19
|
s.required_ruby_version = Gem::Requirement.new(">= 3.1".freeze)
|
|
20
|
-
s.rubygems_version = "4.0.
|
|
20
|
+
s.rubygems_version = "4.0.8".freeze
|
|
21
21
|
s.summary = "Ruby library for embedding vector database".freeze
|
|
22
|
-
s.test_files = ["spec/documentrix/documents/cache/
|
|
22
|
+
s.test_files = ["spec/documentrix/documents/cache/interface_spec.rb".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
|
|
23
23
|
|
|
24
24
|
s.specification_version = 4
|
|
25
25
|
|
|
26
|
-
s.add_development_dependency(%q<gem_hadar>.freeze, ["
|
|
27
|
-
s.add_development_dependency(%q<all_images>.freeze, ["~> 0.
|
|
26
|
+
s.add_development_dependency(%q<gem_hadar>.freeze, [">= 2.17.0".freeze])
|
|
27
|
+
s.add_development_dependency(%q<all_images>.freeze, ["~> 0.12".freeze])
|
|
28
28
|
s.add_development_dependency(%q<rspec>.freeze, ["~> 3.2".freeze])
|
|
29
29
|
s.add_development_dependency(%q<kramdown>.freeze, ["~> 2.0".freeze])
|
|
30
30
|
s.add_development_dependency(%q<debug>.freeze, [">= 0".freeze])
|
|
@@ -32,7 +32,7 @@ Gem::Specification.new do |s|
|
|
|
32
32
|
s.add_runtime_dependency(%q<infobar>.freeze, ["~> 0.9".freeze])
|
|
33
33
|
s.add_runtime_dependency(%q<json>.freeze, ["~> 2.0".freeze])
|
|
34
34
|
s.add_runtime_dependency(%q<tins>.freeze, ["~> 1.34".freeze])
|
|
35
|
-
s.add_runtime_dependency(%q<sqlite-vec>.freeze, ["
|
|
35
|
+
s.add_runtime_dependency(%q<sqlite-vec>.freeze, [">= 0.1.8".freeze])
|
|
36
36
|
s.add_runtime_dependency(%q<sqlite3>.freeze, ["~> 2.0".freeze, ">= 2.0.1".freeze])
|
|
37
37
|
s.add_runtime_dependency(%q<kramdown-ansi>.freeze, ["~> 0.0".freeze, ">= 0.0.1".freeze])
|
|
38
38
|
s.add_runtime_dependency(%q<numo-narray-alt>.freeze, ["~> 0.9".freeze])
|
|
@@ -1,5 +1,18 @@
|
|
|
1
|
+
# Common interface for document caches
|
|
2
|
+
#
|
|
3
|
+
# This module defines the standard interface that all document cache
|
|
4
|
+
# implementations must adhere to. It provides shared functionality for managing
|
|
5
|
+
# cached document embeddings, including methods for setting, retrieving, and
|
|
6
|
+
# deleting cache entries, as well as querying and filtering cached data based
|
|
7
|
+
# on tags and similarity searches.
|
|
8
|
+
#
|
|
9
|
+
# The module includes methods for prefix management, collection enumeration,
|
|
10
|
+
# tag extraction, and cache clearing operations, ensuring consistent behavior
|
|
11
|
+
# across different cache backends such as
|
|
12
|
+
# memory, Redis, and SQLite.
|
|
1
13
|
module Documentrix::Documents::Cache::Common
|
|
2
14
|
include Documentrix::Utils::Math
|
|
15
|
+
include Enumerable
|
|
3
16
|
|
|
4
17
|
# The initialize method sets up the Documentrix::Documents::Cache instance
|
|
5
18
|
# by setting its prefix attribute to the given value.
|
|
@@ -27,17 +40,100 @@ module Documentrix::Documents::Cache::Common
|
|
|
27
40
|
# Returns a string representing the given `key` prefixed with the defined
|
|
28
41
|
# prefix.
|
|
29
42
|
#
|
|
30
|
-
# @param key [String] the key to
|
|
31
|
-
# @
|
|
32
|
-
|
|
33
|
-
|
|
43
|
+
# @param key [String] the key to prefix
|
|
44
|
+
# @param prefix [String] the prefix to use (defaults to the cache's prefix)
|
|
45
|
+
# @return [String] the prefixed key
|
|
46
|
+
def pre(key, prefix: @prefix)
|
|
47
|
+
[ prefix, key ].join
|
|
34
48
|
end
|
|
35
49
|
|
|
36
50
|
# Returns a string with the prefix removed from the given `key`.
|
|
37
51
|
#
|
|
38
52
|
# @param key [String] the input string containing the prefix.
|
|
53
|
+
# @param prefix [String] the prefix to use (defaults to the cache's prefix)
|
|
39
54
|
# @return [String] the input string without the prefix.
|
|
40
|
-
def unpre(key)
|
|
41
|
-
key.sub(/\A
|
|
55
|
+
def unpre(key, prefix: @prefix)
|
|
56
|
+
key.sub(/\A#{prefix}/, '')
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
# The find_records method finds records that match the given needle and
|
|
60
|
+
# tags.
|
|
61
|
+
#
|
|
62
|
+
# @param needle [ Array ] an array containing the embedding vector
|
|
63
|
+
# @param tags [ String, Array ] a string or array of strings representing the tags to search for
|
|
64
|
+
# @param max_records [ Integer ] the maximum number of records to return (NOTE: accepted but not applied by this method)
|
|
65
|
+
#
|
|
66
|
+
# @yield [ record ]
|
|
67
|
+
#
|
|
68
|
+
# @return [ Array<Documentrix::Documents::Record> ] an array containing the matching records
|
|
69
|
+
def find_records(needle, tags: nil, max_records: nil)
|
|
70
|
+
tags = Documentrix::Utils::Tags.new(Array(tags)).to_a
|
|
71
|
+
records = self
|
|
72
|
+
if tags.present?
|
|
73
|
+
records = records.select { |_key, record| (tags & record.tags).size >= 1 }
|
|
74
|
+
end
|
|
75
|
+
needle_norm = norm(needle)
|
|
76
|
+
records = records.sort_by { |key, record|
|
|
77
|
+
record.key = key
|
|
78
|
+
record.similarity = cosine_similarity(
|
|
79
|
+
a: needle,
|
|
80
|
+
b: record.embedding,
|
|
81
|
+
a_norm: needle_norm,
|
|
82
|
+
b_norm: record.norm,
|
|
83
|
+
)
|
|
84
|
+
}
|
|
85
|
+
records.transpose.last&.reverse.to_a
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# Returns a set of unique tags found in the cache records.
|
|
89
|
+
#
|
|
90
|
+
# This method iterates through all records in the cache and collects unique
|
|
91
|
+
# tags from each record's tags collection. It constructs a new
|
|
92
|
+
# Documentrix::Utils::Tags object containing all the unique tags encountered.
|
|
93
|
+
#
|
|
94
|
+
# @return [Documentrix::Utils::Tags] a set of unique tags from all records in
|
|
95
|
+
# the cache
|
|
96
|
+
def tags
|
|
97
|
+
each_with_object(Documentrix::Utils::Tags.new) do |(_, record), t|
|
|
98
|
+
record.tags.each do |tag|
|
|
99
|
+
t.add(tag, source: record.source)
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
# The clear_for_tags method removes all records from the cache that have tags
|
|
105
|
+
# matching any of the provided tags.
|
|
106
|
+
#
|
|
107
|
+
# @param tags [Array<String>] an array of tag names to filter records by
|
|
108
|
+
#
|
|
109
|
+
# @return [self] self
|
|
110
|
+
def clear_for_tags(tags)
|
|
111
|
+
each do |key, record|
|
|
112
|
+
if (tags & record.tags.to_a).size >= 1
|
|
113
|
+
delete(unpre(key))
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
self
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
# The clear method removes cached records based on the provided tags or
|
|
120
|
+
# clears all records with the current prefix.
|
|
121
|
+
#
|
|
122
|
+
# When tags are provided, it removes only the records that have matching
|
|
123
|
+
# tags. If no tags are provided, it removes all records that have keys
|
|
124
|
+
# starting with the current prefix.
|
|
125
|
+
#
|
|
126
|
+
# @param tags [NilClass, Array<String>] an array of tag names to filter
|
|
127
|
+
# records by, or nil to clear all records
|
|
128
|
+
#
|
|
129
|
+
# @return [self] returns the cache instance for method chaining
|
|
130
|
+
def clear(tags: nil)
|
|
131
|
+
tags = Documentrix::Utils::Tags.new(tags).to_a
|
|
132
|
+
if tags.present?
|
|
133
|
+
clear_for_tags(tags)
|
|
134
|
+
else
|
|
135
|
+
clear_all_with_prefix
|
|
136
|
+
end
|
|
137
|
+
self
|
|
42
138
|
end
|
|
43
139
|
end
|
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
require 'documentrix/documents/cache/common'
|
|
2
2
|
|
|
3
|
+
# MemoryCache is an in-memory cache implementation for document embeddings.
|
|
4
|
+
#
|
|
5
|
+
# This class provides a cache store for document embeddings using a hash-based
|
|
6
|
+
# in-memory storage mechanism. It implements the common cache interface
|
|
7
|
+
# defined in Documentrix::Documents::Cache::Common and supports operations
|
|
8
|
+
# such as setting, retrieving, and deleting cached entries, as well as
|
|
9
|
+
# iterating over cached items.
|
|
10
|
+
#
|
|
11
|
+
# The cache uses a prefix to namespace keys and supports clearing entries
|
|
12
|
+
# based on prefixes or specific tags. It is designed to be used as a
|
|
13
|
+
# temporary storage mechanism during processing and is not persistent
|
|
14
|
+
# across application restarts.
|
|
3
15
|
class Documentrix::Documents::MemoryCache
|
|
4
16
|
include Documentrix::Documents::Cache::Common
|
|
5
17
|
|
|
@@ -59,32 +71,48 @@ class Documentrix::Documents::MemoryCache
|
|
|
59
71
|
count
|
|
60
72
|
end
|
|
61
73
|
|
|
62
|
-
# The
|
|
63
|
-
# with the prefix `prefix`.
|
|
74
|
+
# The clear_all_with_prefix method removes all records from the cache that
|
|
75
|
+
# have keys starting with the prefix `prefix`.
|
|
64
76
|
#
|
|
65
77
|
# @return [ Documentrix::Documents::MemoryCache ] self
|
|
66
|
-
def
|
|
78
|
+
def clear_all_with_prefix
|
|
67
79
|
@data.delete_if { |key, _| key.start_with?(@prefix) }
|
|
68
80
|
self
|
|
69
81
|
end
|
|
70
82
|
|
|
83
|
+
# Move all keys that start with +old_prefix+ into a new prefix.
|
|
84
|
+
#
|
|
85
|
+
# This helper is used when a collection is renamed. It iterates over all
|
|
86
|
+
# cached entries, selects those whose keys start with *old_prefix*, strips
|
|
87
|
+
# that prefix, then re‑inserts the entry with the *new_prefix* instead.
|
|
88
|
+
#
|
|
89
|
+
# @param old_prefix [String] The prefix to look for on existing keys.
|
|
90
|
+
# @param new_prefix [String] The prefix that will replace *old_prefix*.
|
|
91
|
+
# @return [Documentrix::Documents::MemoryCache] Returns `self` to allow
|
|
92
|
+
# chaining.
|
|
93
|
+
def move_prefix(old_prefix, new_prefix)
|
|
94
|
+
new_data = @data.dup
|
|
95
|
+
full_each do |key, value|
|
|
96
|
+
key.start_with?(old_prefix) or next
|
|
97
|
+
unpre_key = unpre(key, prefix: old_prefix)
|
|
98
|
+
new_data[pre(unpre_key, prefix: new_prefix)] = new_data.delete(key)
|
|
99
|
+
end
|
|
100
|
+
@data.replace(new_data)
|
|
101
|
+
self
|
|
102
|
+
end
|
|
103
|
+
|
|
71
104
|
# The each method iterates over the cache's keys and values under a given
|
|
72
105
|
# prefix `prefix`.
|
|
73
106
|
#
|
|
74
107
|
# @yield [key, value] Each key-value pair in the cache
|
|
75
|
-
#
|
|
76
|
-
# @return [void]
|
|
77
108
|
def each(&block)
|
|
78
109
|
@data.select { |key,| key.start_with?(@prefix) }.each(&block)
|
|
79
110
|
end
|
|
80
|
-
include Enumerable
|
|
81
111
|
|
|
82
112
|
# The full_each method iterates over the data hash and yields each key-value
|
|
83
113
|
# pair to the given block regardless of the prefix `prefix`.
|
|
84
114
|
#
|
|
85
115
|
# @yield [key, value] Each key-value pair in the data hash
|
|
86
|
-
#
|
|
87
|
-
# @return [void]
|
|
88
116
|
def full_each(&block)
|
|
89
117
|
@data.each(&block)
|
|
90
118
|
end
|
|
@@ -1,4 +1,18 @@
|
|
|
1
|
+
# Module for cache record definitions used in Documentrix document caching.
|
|
2
|
+
#
|
|
3
|
+
# This module provides the Record class for managing
|
|
4
|
+
# cached document embeddings and their associated metadata in the Documentrix
|
|
5
|
+
# library's caching system.
|
|
1
6
|
module Documentrix::Documents::Cache::Records
|
|
7
|
+
# A record class for caching document embeddings and their associated
|
|
8
|
+
# metadata.
|
|
9
|
+
#
|
|
10
|
+
# This class extends JSON::GenericObject and is used to represent cached
|
|
11
|
+
# document entries in the Documentrix library. It stores text content,
|
|
12
|
+
# embedding vectors, normalization values, source information, and tags
|
|
13
|
+
# associated with each document record. The class provides methods for string
|
|
14
|
+
# representation, tag handling, and equality comparison based on text
|
|
15
|
+
# content.
|
|
2
16
|
class Record < JSON::GenericObject
|
|
3
17
|
# The initialize method sets default values for the text and norm
|
|
4
18
|
# attributes.
|
|
@@ -41,94 +55,4 @@ module Documentrix::Documents::Cache::Records
|
|
|
41
55
|
|
|
42
56
|
alias inspect to_s
|
|
43
57
|
end
|
|
44
|
-
|
|
45
|
-
module RedisFullEach
|
|
46
|
-
# The full_each method iterates over all records in the cache and yields
|
|
47
|
-
# them to the block.
|
|
48
|
-
#
|
|
49
|
-
# @yield [ key, value ] where key is the record's key and value is the record itself
|
|
50
|
-
def full_each(&block)
|
|
51
|
-
redis.scan_each(match: [ Documentrix::Documents, ?* ] * ?-) do |key|
|
|
52
|
-
value = redis.get(key) or next
|
|
53
|
-
value = JSON(value, object_class: Documentrix::Documents::Record)
|
|
54
|
-
block.(key, value)
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
module FindRecords
|
|
60
|
-
# The find_records method finds records that match the given needle and
|
|
61
|
-
# tags.
|
|
62
|
-
#
|
|
63
|
-
# @param needle [ Array ] an array containing the embedding vector
|
|
64
|
-
# @param tags [ String, Array ] a string or array of strings representing the tags to search for
|
|
65
|
-
# @param max_records [ Integer ] the maximum number of records to return
|
|
66
|
-
#
|
|
67
|
-
# @yield [ record ]
|
|
68
|
-
#
|
|
69
|
-
# @return [ Array<Documentrix::Documents::Records> ] an array containing the matching records
|
|
70
|
-
def find_records(needle, tags: nil, max_records: nil)
|
|
71
|
-
tags = Documentrix::Utils::Tags.new(Array(tags)).to_a
|
|
72
|
-
records = self
|
|
73
|
-
if tags.present?
|
|
74
|
-
records = records.select { |_key, record| (tags & record.tags).size >= 1 }
|
|
75
|
-
end
|
|
76
|
-
needle_norm = norm(needle)
|
|
77
|
-
records = records.sort_by { |key, record|
|
|
78
|
-
record.key = key
|
|
79
|
-
record.similarity = cosine_similarity(
|
|
80
|
-
a: needle,
|
|
81
|
-
b: record.embedding,
|
|
82
|
-
a_norm: needle_norm,
|
|
83
|
-
b_norm: record.norm,
|
|
84
|
-
)
|
|
85
|
-
}
|
|
86
|
-
records.transpose.last&.reverse.to_a
|
|
87
|
-
end
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
module Tags
|
|
91
|
-
# The clear method removes all records that match the given tags from the
|
|
92
|
-
# cache.
|
|
93
|
-
#
|
|
94
|
-
# @param tags [ Array<String> ] an array of tag names
|
|
95
|
-
#
|
|
96
|
-
# @example
|
|
97
|
-
# documents.clear(tags: %w[ foo bar ])
|
|
98
|
-
#
|
|
99
|
-
# @return [ self ]
|
|
100
|
-
def clear(tags: nil)
|
|
101
|
-
tags = Documentrix::Utils::Tags.new(tags).to_a
|
|
102
|
-
if tags.present?
|
|
103
|
-
if respond_to?(:clear_for_tags)
|
|
104
|
-
clear_for_tags(tags)
|
|
105
|
-
else
|
|
106
|
-
each do |key, record|
|
|
107
|
-
if (tags & record.tags.to_a).size >= 1
|
|
108
|
-
delete(unpre(key))
|
|
109
|
-
end
|
|
110
|
-
end
|
|
111
|
-
end
|
|
112
|
-
else
|
|
113
|
-
super()
|
|
114
|
-
end
|
|
115
|
-
self
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
# The tags method returns an array of unique tags from all records.
|
|
119
|
-
#
|
|
120
|
-
# @return [Documentrix::Utils::Tags] An instance of
|
|
121
|
-
# Documentrix::Utils::Tags containing the unique tags.
|
|
122
|
-
def tags
|
|
123
|
-
if defined? super
|
|
124
|
-
super
|
|
125
|
-
else
|
|
126
|
-
each_with_object(Documentrix::Utils::Tags.new) do |(_, record), t|
|
|
127
|
-
record.tags.each do |tag|
|
|
128
|
-
t.add(tag, source: record.source)
|
|
129
|
-
end
|
|
130
|
-
end
|
|
131
|
-
end
|
|
132
|
-
end
|
|
133
|
-
end
|
|
134
58
|
end
|