documentrix 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/Gemfile +5 -0
- data/README.md +71 -0
- data/Rakefile +48 -0
- data/documentrix.gemspec +41 -0
- data/lib/documentrix/documents/cache/common.rb +43 -0
- data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
- data/lib/documentrix/documents/cache/records.rb +145 -0
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
- data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
- data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
- data/lib/documentrix/documents/splitters/character.rb +72 -0
- data/lib/documentrix/documents/splitters/semantic.rb +91 -0
- data/lib/documentrix/documents.rb +328 -0
- data/lib/documentrix/utils/colorize_texts.rb +65 -0
- data/lib/documentrix/utils/math.rb +48 -0
- data/lib/documentrix/utils/tags.rb +112 -0
- data/lib/documentrix/utils.rb +5 -0
- data/lib/documentrix/version.rb +8 -0
- data/lib/documentrix.rb +11 -0
- data/spec/assets/embeddings.json +1 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
- data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
- data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
- data/spec/documents_spec.rb +174 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/utils/colorize_texts_spec.rb +13 -0
- data/spec/utils/tags_spec.rb +53 -0
- metadata +329 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7987ac709860e4747aec91c9d99b766e1b959b40142ebb17a3516aa80a0f6b85
|
4
|
+
data.tar.gz: a9ea8a8f360bef62ad687d1ecbcf04e6a978dc7450a76f26d97c6f04ee1bdf62
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ba03da2c50fffb014489ee3281765e1ad0712da263dae1a6f45d5f449aba345113f28c093c7d2df14986ad283aee6cbd88c8545c6a7dceaacadff405ad27f457
|
7
|
+
data.tar.gz: 3a10470a23349509a575e79ab1ca18b0969827ef7f720f745537de83924348109cb3fcbd4b7a4a9703daebb471489948ec26ba704362838ad41fa546bb0040f2
|
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--markup markdown
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Documentrix - Ruby library for embedding vector database
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
The Ruby library, Documentrix, is designed to provide a way to build and
|
6
|
+
query vector databases for applications in natural language processing
|
7
|
+
(NLP) and large language models (LLMs). It allows users to store and
|
8
|
+
retrieve dense vector embeddings for text strings.
|
9
|
+
|
10
|
+
## Installation (gem & bundler)
|
11
|
+
|
12
|
+
To install Documentrix, you can use the following methods:
|
13
|
+
|
14
|
+
### Using the gem command
|
15
|
+
|
16
|
+
Type `gem install documentrix` in your terminal.
|
17
|
+
|
18
|
+
### Using Bundler
|
19
|
+
|
20
|
+
Add the line `gem 'documentrix'` to your Gemfile and run `bundle install` in
|
21
|
+
your terminal.
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
In your own software the library can be used as shown in this example:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
# Require necessary libraries: ollama-ruby and documentrix
|
29
|
+
require 'ollama'
|
30
|
+
require 'documentrix'
|
31
|
+
|
32
|
+
# Initialize an Ollama client instance, pointing to a local server
|
33
|
+
ollama = Ollama::Client.new(base_url: 'http://localhost:11434')
|
34
|
+
|
35
|
+
# Create a new Documentrix documents instance
|
36
|
+
documents = Documentrix::Documents.new(
|
37
|
+
ollama: ollama,
|
38
|
+
model: 'mxbai-embed-large',
|
39
|
+
collection: 'my-collection',
|
40
|
+
cache: Documentrix::Documents::SQLiteCache
|
41
|
+
)
|
42
|
+
|
43
|
+
# Split sample text into individual chunks using recursive character splitting
|
44
|
+
splitter = Documentrix::Documents::Splitters::RecursiveCharacter.new
|
45
|
+
text = "hay hay hay…" # Sample text data
|
46
|
+
chunks = splitter.split(text)
|
47
|
+
documents.add(chunks)
|
48
|
+
|
49
|
+
# Search the document collection for matching records
|
50
|
+
query = "What needles can you find in a haystack" # Search query
|
51
|
+
records = documents.find_where(
|
52
|
+
query,
|
53
|
+
prompt: 'Represent this sentence for searching relevant passages: %s',
|
54
|
+
text_size: 4096,
|
55
|
+
text_count: 10
|
56
|
+
)
|
57
|
+
```
|
58
|
+
|
59
|
+
## Download
|
60
|
+
|
61
|
+
The homepage of this library is located at
|
62
|
+
|
63
|
+
* https://github.com/flori/documentrix
|
64
|
+
|
65
|
+
## Author
|
66
|
+
|
67
|
+
<b>Documentrix</b> was written by [Florian Frank](mailto:flori@ping.de)
|
68
|
+
|
69
|
+
## License
|
70
|
+
|
71
|
+
This software is licensed under the <i>MIT</i> license.
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# vim: set filetype=ruby et sw=2 ts=2:
|
2
|
+
|
3
|
+
require 'gem_hadar'
|
4
|
+
|
5
|
+
# Gem definition in the gem_hadar DSL: metadata, packaging rules and
# dependencies for the documentrix gem.
GemHadar do
  name 'documentrix'
  module_type :module
  author 'Florian Frank'
  email 'flori@ping.de'
  homepage "https://github.com/flori/#{name}"
  # Plain one-line summary. A stray leading double quote used to be part of
  # this literal and showed up verbatim in the gemspec summary and rdoc title.
  summary 'Ruby library for embedding vector database'
  description <<~EOT
    The Ruby library, Documentrix, is designed to provide a way to build and
    query vector databases for applications in natural language processing
    (NLP) and large language models (LLMs). It allows users to store and
    retrieve dense vector embeddings for text strings.
  EOT

  test_dir 'spec'
  ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.bundle',
    '.yardoc', 'doc', 'tags', 'errors.lst', 'cscope.out', 'coverage', 'tmp',
    'yard'
  package_ignore '.all_images.yml', '.tool-versions', '.gitignore', 'VERSION',
    '.rspec', *Dir.glob('.github/**/*', File::FNM_DOTMATCH)
  readme 'README.md'

  required_ruby_version '~> 3.1'

  # Runtime dependencies.
  dependency 'infobar', '~> 0.8'
  dependency 'json', '~> 2.0'
  dependency 'tins', '~> 1.34'
  dependency 'sqlite-vec', '~> 0.0'
  dependency 'sqlite3', '~> 2.0', '>= 2.0.1'
  dependency 'kramdown-ansi', '~> 0.0', '>= 0.0.1'
  dependency 'numo-narray', '~> 0.9'
  dependency 'redis', '~> 5.0'
  dependency 'more_math', '~> 1.1'

  # Development-only dependencies.
  development_dependency 'all_images', '~> 0.6'
  development_dependency 'rspec', '~> 3.2'
  development_dependency 'kramdown', '~> 2.0'
  development_dependency 'debug'
  development_dependency 'simplecov'

  licenses << 'MIT'

  clobber 'coverage'
end
|
data/documentrix.gemspec
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# stub: documentrix 0.0.0 ruby lib
|
3
|
+
|
4
|
+
# Gem specification for documentrix 0.0.0.
Gem::Specification.new do |s|
  s.name = "documentrix".freeze
  s.version = "0.0.0".freeze

  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
  s.require_paths = ["lib".freeze]
  s.authors = ["Florian Frank".freeze]
  s.date = "2024-12-06"
  s.description = "The Ruby library, Documentrix, is designed to provide a way to build and\nquery vector databases for applications in natural language processing\n(NLP) and large language models (LLMs). It allows users to store and\nretrieve dense vector embeddings for text strings.\n".freeze
  s.email = "flori@ping.de".freeze
  s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_backed_memory_cache.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze]
  s.files = [".yardopts".freeze, "Gemfile".freeze, "README.md".freeze, "Rakefile".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_backed_memory_cache.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze, "spec/assets/embeddings.json".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
  s.homepage = "https://github.com/flori/documentrix".freeze
  s.licenses = ["MIT".freeze]
  # The rdoc title and the summary below previously carried a stray escaped
  # double quote (\") in front of "Ruby"; both now hold the plain text.
  s.rdoc_options = ["--title".freeze, "Documentrix - Ruby library for embedding vector database".freeze, "--main".freeze, "README.md".freeze]
  s.required_ruby_version = Gem::Requirement.new("~> 3.1".freeze)
  s.rubygems_version = "3.5.23".freeze
  s.summary = "Ruby library for embedding vector database".freeze
  s.test_files = ["spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]

  s.specification_version = 4

  s.add_development_dependency(%q<gem_hadar>.freeze, ["~> 1.19".freeze])
  s.add_development_dependency(%q<all_images>.freeze, ["~> 0.6".freeze])
  s.add_development_dependency(%q<rspec>.freeze, ["~> 3.2".freeze])
  s.add_development_dependency(%q<kramdown>.freeze, ["~> 2.0".freeze])
  s.add_development_dependency(%q<debug>.freeze, [">= 0".freeze])
  s.add_development_dependency(%q<simplecov>.freeze, [">= 0".freeze])
  s.add_runtime_dependency(%q<infobar>.freeze, ["~> 0.8".freeze])
  s.add_runtime_dependency(%q<json>.freeze, ["~> 2.0".freeze])
  s.add_runtime_dependency(%q<tins>.freeze, ["~> 1.34".freeze])
  s.add_runtime_dependency(%q<sqlite-vec>.freeze, ["~> 0.0".freeze])
  s.add_runtime_dependency(%q<sqlite3>.freeze, ["~> 2.0".freeze, ">= 2.0.1".freeze])
  s.add_runtime_dependency(%q<kramdown-ansi>.freeze, ["~> 0.0".freeze, ">= 0.0.1".freeze])
  s.add_runtime_dependency(%q<numo-narray>.freeze, ["~> 0.9".freeze])
  s.add_runtime_dependency(%q<redis>.freeze, ["~> 5.0".freeze])
  s.add_runtime_dependency(%q<more_math>.freeze, ["~> 1.1".freeze])
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Documentrix::Documents::Cache::Common
|
2
|
+
include Documentrix::Utils::Math
|
3
|
+
|
4
|
+
# The initialize method sets up the Documentrix::Documents::Cache instance
|
5
|
+
# by setting its prefix attribute to the given value.
|
6
|
+
#
|
7
|
+
# @param [String] prefix the string to be used as the prefix for this cache
|
8
|
+
def initialize(prefix:)
  # Assign through the public writer rather than touching @prefix directly.
  self.prefix = prefix
end
|
11
|
+
|
12
|
+
attr_accessor :prefix # current prefix defined for the cache
|
13
|
+
|
14
|
+
# Returns an array of collection names that match the given prefix.
|
15
|
+
#
|
16
|
+
# @param prefix [String] a string to search for in collection names
|
17
|
+
# @return [Array<Symbol>] an array of matching collection names
|
18
|
+
def collections(prefix)
  # Treat a String prefix as literal text. Interpolating it unescaped let
  # regexp metacharacters in the prefix (".", "+", "(" ...) alter the match.
  # A Regexp argument is still accepted and used as given.
  literal = prefix.is_a?(Regexp) ? prefix : Regexp.escape(prefix.to_s)
  pattern = /\A#{literal}(.+)-/
  names   = Set.new
  full_each do |key, _|
    # The capture is greedy, so it runs up to the last dash of the key and
    # collection names may themselves contain dashes.
    match = pattern.match(key) or next
    names << match[1]
  end
  names.map(&:to_sym)
end
|
26
|
+
|
27
|
+
# Returns a string representing the given `key` prefixed with the defined
|
28
|
+
# prefix.
|
29
|
+
#
|
30
|
+
# @param key [String] the key to join with the prefix
|
31
|
+
# @return [String] the joined string of prefix and key
|
32
|
+
def pre(key)
  # Array#join stringifies both parts, so a nil prefix contributes nothing.
  parts = [ @prefix, key ]
  parts.join
end
|
35
|
+
|
36
|
+
# Returns a string with the prefix removed from the given `key`.
|
37
|
+
#
|
38
|
+
# @param key [String] the input string containing the prefix.
|
39
|
+
# @return [String] the input string without the prefix.
|
40
|
+
def unpre(key)
  # Remove the prefix as literal text. The previous regexp interpolation
  # treated characters such as "." or "+" in the prefix as metacharacters, so
  # such a prefix was not stripped (or the wrong text was stripped).
  # A nil prefix becomes "" and leaves the key unchanged.
  key.delete_prefix(@prefix.to_s)
end
|
43
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'documentrix/documents/cache/common'
|
2
|
+
|
3
|
+
class Documentrix::Documents::MemoryCache
|
4
|
+
include Documentrix::Documents::Cache::Common
|
5
|
+
|
6
|
+
# The initialize method sets up the Documentrix::Documents::Cache instance
|
7
|
+
# by setting its prefix attribute to the given value.
|
8
|
+
#
|
9
|
+
# @param [String] prefix the string to be used as the prefix for this cache
|
10
|
+
def initialize(prefix:)
  super(prefix: prefix)
  # Backing store, keyed by the full (prefixed) key.
  @data = Hash.new
end
|
14
|
+
|
15
|
+
# The [] method retrieves the value associated with the given key from the
|
16
|
+
# cache.
|
17
|
+
#
|
18
|
+
# @param [String] key the key to look up in the cache
|
19
|
+
#
|
20
|
+
# @return [Object] the cached value, or nil if not found
|
21
|
+
def [](key)
  # Entries are stored under the prefixed key.
  full_key = pre(key)
  @data[full_key]
end
|
24
|
+
|
25
|
+
# The []= method sets the value for a given key in the cache.
|
26
|
+
#
|
27
|
+
# @param [String] key the key to set
|
28
|
+
# @param [Hash] value the value to associate with the key
|
29
|
+
#
|
30
|
+
# @return [void]
|
31
|
+
def []=(key, value)
  # Store under the prefixed key.
  full_key = pre(key)
  @data[full_key] = value
end
|
34
|
+
|
35
|
+
# The key? method checks if the given key exists in the cache.
|
36
|
+
#
|
37
|
+
# @param [String] key the key to check for existence
|
38
|
+
#
|
39
|
+
# @return [TrueClass, FalseClass] true if the key exists, false otherwise
|
40
|
+
def key?(key)
  # Presence is checked on the prefixed key, so a stored nil still counts.
  full_key = pre(key)
  @data.key?(full_key)
end
|
43
|
+
|
44
|
+
# The delete method removes the key-value pair from the cache by deleting it
|
45
|
+
# from the underlying data structure.
|
46
|
+
#
|
47
|
+
# @param [String] key the key of the value to be deleted
|
48
|
+
#
|
49
|
+
# @return [TrueClass, FalseClass] true if the key was found and deleted, false otherwise.
|
50
|
+
def delete(key)
  full_key = pre(key)
  # Decide on key presence, not on the removed value: Hash#delete returns the
  # stored value, so an entry holding nil or false used to be reported as
  # "not deleted" even though it was removed.
  return false unless @data.key?(full_key)
  @data.delete(full_key)
  true
end
|
53
|
+
|
54
|
+
# The size method returns the number of elements in the cache, that is the
|
55
|
+
# ones prefixed with `prefix`.
|
56
|
+
#
|
57
|
+
# @return [ Integer ] The count of elements in the cache.
|
58
|
+
def size
  # Enumerable#count walks #each, which only yields entries under the prefix.
  count()
end
|
61
|
+
|
62
|
+
# The clear method removes all records from the cache that have keys starting
|
63
|
+
# with the prefix `prefix`.
|
64
|
+
#
|
65
|
+
# @return [ Documentrix::Documents::MemoryCache ] self
|
66
|
+
def clear
  # reject! removes in place like delete_if; its nil-when-unchanged result
  # does not matter because the cache itself is returned.
  @data.reject! { |full_key, _value| full_key.start_with?(@prefix) }
  self
end
|
70
|
+
|
71
|
+
# The each method iterates over the cache's keys and values under a given
|
72
|
+
# prefix `prefix`.
|
73
|
+
#
|
74
|
+
# @yield [key, value] Each key-value pair in the cache
|
75
|
+
#
|
76
|
+
# @return [void]
|
77
|
+
def each(&block)
  # Iterate over a filtered copy holding only the entries under the prefix.
  own_entries = @data.select { |full_key, _value| full_key.start_with?(@prefix) }
  own_entries.each(&block)
end
|
80
|
+
include Enumerable
|
81
|
+
|
82
|
+
# The full_each method iterates over the data hash and yields each key-value
|
83
|
+
# pair to the given block regardless of the prefix `prefix`.
|
84
|
+
#
|
85
|
+
# @yield [key, value] Each key-value pair in the data hash
|
86
|
+
#
|
87
|
+
# @return [void]
|
88
|
+
def full_each(&block)
  # No prefix filtering here: every stored pair is yielded.
  @data.each_pair(&block)
end
|
91
|
+
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
module Documentrix::Documents::Cache::Records
|
2
|
+
# A cached document record. Attributes are dynamic (JSON::GenericObject);
# this class itself reads text, norm, tags, source and similarity.
class Record < JSON::GenericObject
  # Builds the record from the given attributes and fills in defaults for
  # the text ('') and norm (0.0) attributes when they are missing.
  #
  # @param [Hash] options the initial attributes of the record
  def initialize(options = {})
    super
    self.text ||= ''
    self.norm ||= 0.0
  end

  # Returns a short human-readable representation of the record.
  #
  # @return [String] the class name, the inspected text, the tags (when
  #   there are any) and the similarity, or 'n/a' if no similarity is set
  def to_s
    tag_list = tags_set
    tag_list = " #{tag_list}" unless tag_list.empty?
    "#<#{self.class} #{text.inspect}#{tag_list} #{similarity || 'n/a'}>"
  end

  # Builds a Documentrix::Utils::Tags object from this record's tags and
  # source attributes.
  #
  # @return [ Documentrix::Utils::Tags ] a new tags object
  def tags_set
    Documentrix::Utils::Tags.new(tags, source:)
  end

  # Compares this record with another object by text.
  #
  # Objects that do not respond to #text (nil, strings, ...) are simply
  # unequal; previously such a comparison raised NoMethodError.
  #
  # @param other [ Object ] the object to compare with
  #
  # @return [ FalseClass, TrueClass ] true if other has the same text,
  #   false otherwise
  def ==(other)
    other.respond_to?(:text) && text == other.text
  end

  alias inspect to_s
end
|
44
|
+
|
45
|
+
module RedisFullEach
  # Yields every record stored in Redis under a key matching
  # "Documentrix::Documents-*", whatever the cache's own prefix is.
  #
  # Relies on the including or extended object providing #redis.
  #
  # @yield [ key, value ] the Redis key and the value parsed from JSON as a
  #   Documentrix::Documents::Record
  def full_each(&block)
    pattern = [ Documentrix::Documents, ?* ] * ?-
    redis.scan_each(match: pattern) do |key|
      raw = redis.get(key)
      # Keys without a value are skipped.
      next unless raw
      record = JSON(raw, object_class: Documentrix::Documents::Record)
      block.(key, record)
    end
  end
end
|
58
|
+
|
59
|
+
module FindRecords
  # Finds the records most similar to the given embedding vector, optionally
  # restricted to records carrying at least one of the given tags.
  #
  # Each returned record has its key and similarity attributes set as a side
  # effect. Similarity comes from #cosine_similarity and #norm, which the
  # including class has to provide.
  #
  # @param needle [ Array ] an array containing the embedding vector
  # @param tags [ String, Array ] a string or array of strings representing the tags to search for
  # @param max_records [ Integer ] the maximum number of records to return;
  #   nil (or a non-positive value) returns all matching records
  #
  # @return [ Array<Documentrix::Documents::Record> ] the matching records,
  #   most similar first
  def find_records(needle, tags: nil, max_records: nil)
    wanted = Documentrix::Utils::Tags.new(Array(tags)).to_a
    candidates = self
    if wanted.present?
      # to_a keeps records without any tags (nil) from raising in Array#&.
      candidates = candidates.select { |_key, record| !(wanted & record.tags.to_a).empty? }
    end
    needle_norm = norm(needle)
    ranked = candidates.sort_by { |key, record|
      record.key = key
      record.similarity = cosine_similarity(
        a: needle,
        b: record.embedding,
        a_norm: needle_norm,
        b_norm: record.norm,
      )
    }
    # sort_by is ascending; drop the keys and put the best match first.
    ranked = ranked.map(&:last).reverse
    # Honor the documented limit, which was previously accepted but ignored.
    limit = max_records.to_i
    limit.positive? ? ranked.first(limit) : ranked
  end
end
|
89
|
+
|
90
|
+
module Tags
  # Removes records from the cache. With tags given, only records carrying
  # at least one of those tags are removed; without tags the call is passed
  # on to the underlying cache's own clear.
  #
  # If the cache responds to clear_for_tags, the tag-based removal is
  # delegated to it; otherwise the records are scanned and deleted one by one.
  #
  # @param tags [ Array<String> ] an array of tag names
  #
  # @example
  #   documents.clear(tags: %w[ foo bar ])
  #
  # @return [ self ]
  def clear(tags: nil)
    tags = Documentrix::Utils::Tags.new(tags).to_a
    if tags.present?
      if respond_to?(:clear_for_tags)
        clear_for_tags(tags)
      else
        each do |key, record|
          next if (tags & record.tags.to_a).empty?
          # #each yields prefixed keys while #delete expects the bare key.
          delete(unpre(key))
        end
      end
    else
      super()
    end
    self
  end

  # Returns the unique tags of all records. If the underlying cache provides
  # its own tags method that one is used, otherwise the tags are collected by
  # iterating over every record.
  #
  # This method was previously defined twice with identical bodies; the
  # redundant second definition has been dropped.
  #
  # @return [Documentrix::Utils::Tags] An instance of
  #   Documentrix::Utils::Tags containing the unique tags.
  def tags
    if defined? super
      super
    else
      each_with_object(Documentrix::Utils::Tags.new) do |(_, record), t|
        # to_a tolerates records without tags, matching what #clear does.
        record.tags.to_a.each do |tag|
          t.add(tag, source: record.source)
        end
      end
    end
  end
end
|
145
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
class Documentrix::Documents
|
4
|
+
class RedisBackedMemoryCache < MemoryCache
|
5
|
+
|
6
|
+
# The initialize method sets up the RedisBackedMemoryCache cache by
|
7
|
+
# creating a new instance and populating it with data from the internally
|
8
|
+
# created RedisCache.
|
9
|
+
#
|
10
|
+
# @param prefix [String] The prefix for keys in the Redis cache
|
11
|
+
# @param url [String] The URL of the Redis server (default: ENV['REDIS_URL'])
|
12
|
+
# @param object_class [Class] The class to use for deserializing values from Redis (default: nil)
|
13
|
+
#
|
14
|
+
# @raise [ArgumentError] If no Redis URL is given (url is nil)
|
15
|
+
def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil)
  super(prefix:)
  raise ArgumentError, 'require redis url' unless url
  @url          = url
  @object_class = object_class
  @redis_cache  = Documentrix::Documents::RedisCache.new(prefix:, url:, object_class:)
  @redis_cache.extend(Documentrix::Documents::Cache::Records::RedisFullEach)
  # Copy everything full_each yields into memory under its full Redis key.
  @redis_cache.full_each { |full_key, record| @data[full_key] = record }
end
|
23
|
+
|
24
|
+
attr_reader :object_class # the class of objects stored in the cache
|
25
|
+
|
26
|
+
# The redis method returns the Redis client instance used by the cache.
|
27
|
+
#
|
28
|
+
# @return [Redis] The Redis client instance
|
29
|
+
def redis
  # The client is owned by the wrapped RedisCache.
  backing_cache = @redis_cache
  backing_cache.redis
end
|
32
|
+
|
33
|
+
# The []= method sets the value for a given key in memory and in Redis.
|
34
|
+
#
|
35
|
+
# @param [String] key the key to be set
|
36
|
+
# @param [Hash] value the hash containing the data to be stored
|
37
|
+
def []=(key, value)
  # Update the in-memory copy first (MemoryCache#[]=).
  super
  # Always serialise with JSON.generate, as RedisCache#set does. Kernel#JSON
  # parses its argument when it is a String, so a String value was being
  # parsed here instead of written as JSON.
  redis.set(pre(key), JSON.generate(value))
end
|
41
|
+
|
42
|
+
# The delete method removes a key from the cache by calling Redis's del
|
43
|
+
# method and then calling the superclass's delete method.
|
44
|
+
#
|
45
|
+
# @param [String] key the key to be deleted
|
46
|
+
#
|
47
|
+
# @return [FalseClass, TrueClass] true if the key was successfully deleted, false otherwise.
|
48
|
+
def delete(key)
  # Redis first, then the in-memory copy; true only if both removed an entry.
  removed_from_redis  = redis.del(pre(key)) == 1
  removed_from_memory = super
  removed_from_memory && removed_from_redis
end
|
52
|
+
|
53
|
+
# The clear method deletes all keys from the cache by scanning redis for
|
54
|
+
# keys that match the prefix `prefix` and then deleting them, then it does
|
55
|
+
# the same for the MemoryCache by calling its super.
|
56
|
+
#
|
57
|
+
# @return [self] self
|
58
|
+
def clear
  # Delete every Redis key under this cache's prefix, one key at a time ...
  pattern = "#{@prefix}*"
  redis.scan_each(match: pattern) { |full_key| redis.del(full_key) }
  # ... then drop the same entries from memory.
  super
  self
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require 'documentrix/documents/cache/common'
|
2
|
+
require 'redis'
|
3
|
+
|
4
|
+
class Documentrix::Documents::RedisCache
|
5
|
+
include Documentrix::Documents::Cache::Common
|
6
|
+
|
7
|
+
# The initialize method sets up the Documentrix::Documents::RedisCache
|
8
|
+
# instance by setting its prefix attribute to the given value and
|
9
|
+
# initializing the Redis client.
|
10
|
+
#
|
11
|
+
# @param [String] prefix the string to be used as the prefix for this cache
|
12
|
+
# @param [String] url the URL of the Redis server (default: ENV['REDIS_URL'])
|
13
|
+
# @param [Class] object_class the class of objects stored in Redis (default: nil)
|
14
|
+
# @param [Integer] ex the expiration time in seconds (default: nil)
|
15
|
+
def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil, ex: nil)
  super(prefix:)
  raise ArgumentError, 'require redis url' unless url
  # The Redis client itself is created on first use, see #redis.
  @url          = url
  @object_class = object_class
  @ex           = ex
end
|
20
|
+
|
21
|
+
attr_reader :object_class # the class of objects stored in the cache
|
22
|
+
|
23
|
+
# The redis method returns an instance of Redis client
|
24
|
+
#
|
25
|
+
# @return [Redis] An instance of Redis client
|
26
|
+
def redis
  # Created on first use and reused afterwards.
  return @redis if @redis
  @redis = Redis.new(url: @url)
end
|
29
|
+
|
30
|
+
# The [](key) method retrieves the value associated with the given key from Redis.
|
31
|
+
#
|
32
|
+
# @param [String] key the string representation of the key
|
33
|
+
#
|
34
|
+
# @return [Object, nil] the retrieved value if it exists in Redis, or nil otherwise
|
35
|
+
def [](key)
  raw = redis.get(pre(key))
  # Nothing stored under the key.
  return if raw.nil?
  # Parse into object_class instances when one was configured.
  if object_class
    JSON(raw, object_class:)
  else
    JSON(raw)
  end
end
|
41
|
+
|
42
|
+
# The []= method sets the value associated with the given key in this cache instance.
|
43
|
+
#
|
44
|
+
# @param [String] key the string representation of the key
|
45
|
+
# @param [Object] value the object to be stored under the given key
|
46
|
+
#
|
47
|
+
# @return [Object] the stored value
|
48
|
+
def []=(key, value)
  # Same as #set without an explicit expiry.
  set(key, value)
end
|
51
|
+
|
52
|
+
# The set method sets the value associated with the given key in this cache instance.
|
53
|
+
#
|
54
|
+
# @param [String] key the string representation of the key
|
55
|
+
# @param [Object] value the object to be stored under the given key
|
56
|
+
# @option ex [Integer] ex the expiration time in seconds (default: nil)
|
57
|
+
#
|
58
|
+
# @return [Object] the given value
|
59
|
+
def set(key, value, ex: nil)
  # Fall back to the expiry configured on the cache.
  ex ||= @ex
  full_key = pre(key)
  # An expiry below one second means the entry is removed, not written.
  expires_immediately = !ex.nil? && ex < 1
  if expires_immediately
    redis.del(full_key)
  else
    redis.set(full_key, JSON.generate(value), ex:)
  end
  value
end
|
68
|
+
|
69
|
+
# The ttl method returns the time-to-live (TTL) value for the given key
|
70
|
+
#
|
71
|
+
# @param [String] key the string representation of the key
|
72
|
+
#
|
73
|
+
# @return [Integer, nil] the TTL value if it exists in Redis, or nil otherwise
|
74
|
+
def ttl(key)
  # Passes through whatever Redis reports for the prefixed key.
  full_key = pre(key)
  redis.ttl(full_key)
end
|
77
|
+
|
78
|
+
# The key? method checks if the given key exists in Redis by calling the
|
79
|
+
# redis.exists? method
|
80
|
+
#
|
81
|
+
# @param [String] key the string representation of the key
|
82
|
+
#
|
83
|
+
# @return [FalseClass, TrueClass] true if the key exists, false otherwise
|
84
|
+
def key?(key)
  # Coerce the client's answer to a strict true/false.
  found = redis.exists?(pre(key))
  !!found
end
|
87
|
+
|
88
|
+
# The delete method removes the key-value pair associated with the given key
|
89
|
+
# from this cache instance.
|
90
|
+
#
|
91
|
+
# @param [String] key the string representation of the key
|
92
|
+
#
|
93
|
+
# @return [FalseClass, TrueClass] true if the key was deleted successfully, false otherwise
|
94
|
+
def delete(key)
  # del returns the number of removed keys; exactly one means success.
  removed = redis.del(pre(key))
  removed == 1
end
|
97
|
+
|
98
|
+
# The size method returns the total number of keys stored in this cache
|
99
|
+
# instance, that is the ones with the prefix `prefix`.
|
100
|
+
#
|
101
|
+
# @return [Integer] The total count of keys
|
102
|
+
def size
  # Count by scanning the keys under the prefix.
  total = 0
  redis.scan_each(match: "#{@prefix}*") { |_key| total += 1 }
  total
end
|
107
|
+
|
108
|
+
# The clear method removes all key-value pairs associated with the given
|
109
|
+
# prefix from this cache instance.
|
110
|
+
#
|
111
|
+
# @return [Documentrix::Documents::RedisCache] self
|
112
|
+
def clear
  # Delete every key under the prefix, one key at a time.
  pattern = "#{@prefix}*"
  redis.scan_each(match: pattern) { |full_key| redis.del(full_key) }
  self
end
|
116
|
+
|
117
|
+
# The each method iterates over the cache keys with prefix `prefix` and
|
118
|
+
# yields each key-value pair to the given block.
|
119
|
+
#
|
120
|
+
# @yield [key, value] Each key-value pair in the cache
|
121
|
+
#
|
122
|
+
# @return [self] self
|
123
|
+
def each(&block)
  pattern = "#{@prefix}*"
  redis.scan_each(match: pattern) do |full_key|
    # Yield the full Redis key; the value is looked up via #[] with the
    # prefix stripped again.
    block.call(full_key, self[unpre(full_key)])
  end
  self
end
|
127
|
+
include Enumerable
|
128
|
+
end
|