documentrix 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +1 -0
- data/Gemfile +5 -0
- data/README.md +71 -0
- data/Rakefile +48 -0
- data/documentrix.gemspec +41 -0
- data/lib/documentrix/documents/cache/common.rb +43 -0
- data/lib/documentrix/documents/cache/memory_cache.rb +91 -0
- data/lib/documentrix/documents/cache/records.rb +145 -0
- data/lib/documentrix/documents/cache/redis_backed_memory_cache.rb +64 -0
- data/lib/documentrix/documents/cache/redis_cache.rb +128 -0
- data/lib/documentrix/documents/cache/sqlite_cache.rb +335 -0
- data/lib/documentrix/documents/splitters/character.rb +72 -0
- data/lib/documentrix/documents/splitters/semantic.rb +91 -0
- data/lib/documentrix/documents.rb +328 -0
- data/lib/documentrix/utils/colorize_texts.rb +65 -0
- data/lib/documentrix/utils/math.rb +48 -0
- data/lib/documentrix/utils/tags.rb +112 -0
- data/lib/documentrix/utils.rb +5 -0
- data/lib/documentrix/version.rb +8 -0
- data/lib/documentrix.rb +11 -0
- data/spec/assets/embeddings.json +1 -0
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +98 -0
- data/spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb +121 -0
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +123 -0
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +141 -0
- data/spec/documentrix/documents/splitters/character_spec.rb +110 -0
- data/spec/documentrix/documents/splitters/semantic_spec.rb +56 -0
- data/spec/documents_spec.rb +174 -0
- data/spec/spec_helper.rb +23 -0
- data/spec/utils/colorize_texts_spec.rb +13 -0
- data/spec/utils/tags_spec.rb +53 -0
- metadata +329 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7987ac709860e4747aec91c9d99b766e1b959b40142ebb17a3516aa80a0f6b85
|
4
|
+
data.tar.gz: a9ea8a8f360bef62ad687d1ecbcf04e6a978dc7450a76f26d97c6f04ee1bdf62
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ba03da2c50fffb014489ee3281765e1ad0712da263dae1a6f45d5f449aba345113f28c093c7d2df14986ad283aee6cbd88c8545c6a7dceaacadff405ad27f457
|
7
|
+
data.tar.gz: 3a10470a23349509a575e79ab1ca18b0969827ef7f720f745537de83924348109cb3fcbd4b7a4a9703daebb471489948ec26ba704362838ad41fa546bb0040f2
|
data/.yardopts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--markup markdown
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Documentrix - Ruby library for embedding vector database
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
The Ruby library, Documentrix, is designed to provide a way to build and
|
6
|
+
query vector databases for applications in natural language processing
|
7
|
+
(NLP) and large language models (LLMs). It allows users to store and
|
8
|
+
retrieve dense vector embeddings for text strings.
|
9
|
+
|
10
|
+
## Installation (gem & bundler)
|
11
|
+
|
12
|
+
To install Documentrix, you can use the following methods:
|
13
|
+
|
14
|
+
### Using the gem command
|
15
|
+
|
16
|
+
Type `gem install documentrix` in your terminal.
|
17
|
+
|
18
|
+
### Using Bundler
|
19
|
+
|
20
|
+
Add the line `gem 'documentrix'` to your Gemfile and run `bundle install` in
|
21
|
+
your terminal.
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
In your own software the library can be used as shown in this example:
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
# Require necessary libraries: ollama-ruby and documentrix
|
29
|
+
require 'ollama'
|
30
|
+
require 'documentrix'
|
31
|
+
|
32
|
+
# Initialize an Ollama client instance, pointing to a local server
|
33
|
+
ollama = Ollama::Client.new(base_url: 'http://localhost:11434')
|
34
|
+
|
35
|
+
# Create a new Documentrix documents instance
|
36
|
+
documents = Documentrix::Documents.new(
|
37
|
+
ollama: ollama,
|
38
|
+
model: 'mxbai-embed-large',
|
39
|
+
collection: 'my-collection',
|
40
|
+
cache: Documentrix::Documents::SQLiteCache
|
41
|
+
)
|
42
|
+
|
43
|
+
# Split sample text into individual chunks using recursive character splitting
|
44
|
+
splitter = Documentrix::Documents::Splitters::RecursiveCharacter.new
|
45
|
+
text = "hay hay hay…" # Sample text data
|
46
|
+
chunks = splitter.split(text)
|
47
|
+
documents.add(chunks)
|
48
|
+
|
49
|
+
# Search the document collection for matching records
|
50
|
+
query = "What needles can you find in a haystack" # Search query
|
51
|
+
records = documents.find_where(
|
52
|
+
query,
|
53
|
+
prompt: 'Represent this sentence for searching relevant passages: %s',
|
54
|
+
text_size: 4096,
|
55
|
+
text_count: 10
|
56
|
+
)
|
57
|
+
```
|
58
|
+
|
59
|
+
## Download
|
60
|
+
|
61
|
+
The homepage of this library is located at
|
62
|
+
|
63
|
+
* https://github.com/flori/documentrix
|
64
|
+
|
65
|
+
## Author
|
66
|
+
|
67
|
+
<b>Documentrix</b> was written by [Florian Frank](mailto:flori@ping.de)
|
68
|
+
|
69
|
+
## License
|
70
|
+
|
71
|
+
This software is licensed under the <i>MIT</i> license.
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# vim: set filetype=ruby et sw=2 ts=2:
|
2
|
+
|
3
|
+
require 'gem_hadar'
|
4
|
+
|
5
|
+
# Gem definition for documentrix: metadata, packaging rules and dependencies,
# declared through the GemHadar DSL.
GemHadar do
  name        'documentrix'
  module_type :module
  author      'Florian Frank'
  email       'flori@ping.de'
  homepage    "https://github.com/flori/#{name}"
  # One-line summary of the gem; kept free of stray quote characters.
  summary     'Ruby library for embedding vector database'
  description <<~EOT
    The Ruby library, Documentrix, is designed to provide a way to build and
    query vector databases for applications in natural language processing
    (NLP) and large language models (LLMs). It allows users to store and
    retrieve dense vector embeddings for text strings.
  EOT

  test_dir 'spec'
  ignore '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.bundle',
    '.yardoc', 'doc', 'tags', 'errors.lst', 'cscope.out', 'coverage', 'tmp',
    'yard'
  package_ignore '.all_images.yml', '.tool-versions', '.gitignore', 'VERSION',
    '.rspec', *Dir.glob('.github/**/*', File::FNM_DOTMATCH)
  readme 'README.md'

  required_ruby_version '~> 3.1'

  # Runtime dependencies
  dependency 'infobar',       '~> 0.8'
  dependency 'json',          '~> 2.0'
  dependency 'tins',          '~> 1.34'
  dependency 'sqlite-vec',    '~> 0.0'
  dependency 'sqlite3',       '~> 2.0', '>= 2.0.1'
  dependency 'kramdown-ansi', '~> 0.0', '>= 0.0.1'
  dependency 'numo-narray',   '~> 0.9'
  dependency 'redis',         '~> 5.0'
  dependency 'more_math',     '~> 1.1'

  # Development-only dependencies
  development_dependency 'all_images', '~> 0.6'
  development_dependency 'rspec',      '~> 3.2'
  development_dependency 'kramdown',   '~> 2.0'
  development_dependency 'debug'
  development_dependency 'simplecov'

  licenses << 'MIT'

  clobber 'coverage'
end
|
data/documentrix.gemspec
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
# stub: documentrix 0.0.0 ruby lib
|
3
|
+
|
4
|
+
# Gem specification for documentrix 0.0.0. The summary and the rdoc title no
# longer carry a stray leading double quote.
Gem::Specification.new do |s|
  s.name = "documentrix".freeze
  s.version = "0.0.0".freeze

  s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
  s.require_paths = ["lib".freeze]
  s.authors = ["Florian Frank".freeze]
  s.date = "2024-12-06"
  s.description = "The Ruby library, Documentrix, is designed to provide a way to build and\nquery vector databases for applications in natural language processing\n(NLP) and large language models (LLMs). It allows users to store and\nretrieve dense vector embeddings for text strings.\n".freeze
  s.email = "flori@ping.de".freeze
  s.extra_rdoc_files = ["README.md".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_backed_memory_cache.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze]
  s.files = [".yardopts".freeze, "Gemfile".freeze, "README.md".freeze, "Rakefile".freeze, "documentrix.gemspec".freeze, "lib/documentrix.rb".freeze, "lib/documentrix/documents.rb".freeze, "lib/documentrix/documents/cache/common.rb".freeze, "lib/documentrix/documents/cache/memory_cache.rb".freeze, "lib/documentrix/documents/cache/records.rb".freeze, "lib/documentrix/documents/cache/redis_backed_memory_cache.rb".freeze, "lib/documentrix/documents/cache/redis_cache.rb".freeze, "lib/documentrix/documents/cache/sqlite_cache.rb".freeze, "lib/documentrix/documents/splitters/character.rb".freeze, "lib/documentrix/documents/splitters/semantic.rb".freeze, "lib/documentrix/utils.rb".freeze, "lib/documentrix/utils/colorize_texts.rb".freeze, "lib/documentrix/utils/math.rb".freeze, "lib/documentrix/utils/tags.rb".freeze, "lib/documentrix/version.rb".freeze, "spec/assets/embeddings.json".freeze, "spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]
  s.homepage = "https://github.com/flori/documentrix".freeze
  s.licenses = ["MIT".freeze]
  s.rdoc_options = ["--title".freeze, "Documentrix - Ruby library for embedding vector database".freeze, "--main".freeze, "README.md".freeze]
  s.required_ruby_version = Gem::Requirement.new("~> 3.1".freeze)
  s.rubygems_version = "3.5.23".freeze
  s.summary = "Ruby library for embedding vector database".freeze
  s.test_files = ["spec/documentrix/documents/cache/memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_backed_memory_cache_spec.rb".freeze, "spec/documentrix/documents/cache/redis_cache_spec.rb".freeze, "spec/documentrix/documents/cache/sqlite_cache_spec.rb".freeze, "spec/documentrix/documents/splitters/character_spec.rb".freeze, "spec/documentrix/documents/splitters/semantic_spec.rb".freeze, "spec/documents_spec.rb".freeze, "spec/spec_helper.rb".freeze, "spec/utils/colorize_texts_spec.rb".freeze, "spec/utils/tags_spec.rb".freeze]

  s.specification_version = 4

  # Development-only dependencies
  s.add_development_dependency(%q<gem_hadar>.freeze, ["~> 1.19".freeze])
  s.add_development_dependency(%q<all_images>.freeze, ["~> 0.6".freeze])
  s.add_development_dependency(%q<rspec>.freeze, ["~> 3.2".freeze])
  s.add_development_dependency(%q<kramdown>.freeze, ["~> 2.0".freeze])
  s.add_development_dependency(%q<debug>.freeze, [">= 0".freeze])
  s.add_development_dependency(%q<simplecov>.freeze, [">= 0".freeze])
  # Runtime dependencies
  s.add_runtime_dependency(%q<infobar>.freeze, ["~> 0.8".freeze])
  s.add_runtime_dependency(%q<json>.freeze, ["~> 2.0".freeze])
  s.add_runtime_dependency(%q<tins>.freeze, ["~> 1.34".freeze])
  s.add_runtime_dependency(%q<sqlite-vec>.freeze, ["~> 0.0".freeze])
  s.add_runtime_dependency(%q<sqlite3>.freeze, ["~> 2.0".freeze, ">= 2.0.1".freeze])
  s.add_runtime_dependency(%q<kramdown-ansi>.freeze, ["~> 0.0".freeze, ">= 0.0.1".freeze])
  s.add_runtime_dependency(%q<numo-narray>.freeze, ["~> 0.9".freeze])
  s.add_runtime_dependency(%q<redis>.freeze, ["~> 5.0".freeze])
  s.add_runtime_dependency(%q<more_math>.freeze, ["~> 1.1".freeze])
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Shared behaviour for the cache implementations: prefix handling and
# collection discovery. Including classes are expected to provide `full_each`.
module Documentrix::Documents::Cache::Common
  include Documentrix::Utils::Math

  # The initialize method sets up the cache instance by setting its prefix
  # attribute to the given value.
  #
  # @param [String] prefix the string to be used as the prefix for this cache
  def initialize(prefix:)
    self.prefix = prefix
  end

  attr_accessor :prefix # current prefix defined for the cache

  # Returns an array of collection names that match the given prefix.
  #
  # A key is considered to belong to a collection if it has the form
  # `<prefix><collection>-<rest>`; the collection part is matched greedily, so
  # collection names may themselves contain `-`.
  #
  # @param prefix [String] a string to search for in collection names
  # @return [Array<Symbol>] an array of matching collection names, each one
  #   listed once, in order of first appearance
  def collections(prefix)
    # The prefix is matched literally, so regexp metacharacters contained in
    # it cannot alter the match.
    pattern = /\A#{Regexp.quote(prefix.to_s)}(.+)-/
    names   = []
    full_each do |key, _|
      match = pattern.match(key) or next
      names << match[1]
    end
    names.uniq.map(&:to_sym)
  end

  # Returns a string representing the given `key` prefixed with the defined
  # prefix.
  #
  # @param key [String] the key to join with the prefix
  # @return [String] the joined string of prefix and key
  def pre(key)
    [ @prefix, key ].join
  end

  # Returns a string with the prefix removed from the given `key`. The prefix
  # is treated as a literal string, not as a pattern.
  #
  # @param key [String] the input string containing the prefix.
  # @return [String] the input string without the prefix.
  def unpre(key)
    key.delete_prefix(@prefix.to_s)
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'documentrix/documents/cache/common'
|
2
|
+
|
3
|
+
# An in-memory cache that stores its entries in a Hash, keyed by the
# prefixed key.
class Documentrix::Documents::MemoryCache
  include Documentrix::Documents::Cache::Common

  # The initialize method sets up the cache instance by setting its prefix
  # attribute to the given value and creating the empty backing hash.
  #
  # @param [String] prefix the string to be used as the prefix for this cache
  def initialize(prefix:)
    super(prefix:)
    @data = {}
  end

  # The [] method retrieves the value associated with the given key from the
  # cache.
  #
  # @param [String] key the key to look up in the cache
  #
  # @return [Object] the cached value, or nil if not found
  def [](key)
    @data[pre(key)]
  end

  # The []= method sets the value for a given key in the cache.
  #
  # @param [String] key the key to set
  # @param [Hash] value the value to associate with the key
  #
  # @return [void]
  def []=(key, value)
    @data[pre(key)] = value
  end

  # The key? method checks if the given key exists in the cache.
  #
  # @param [String] key the key to check for existence
  #
  # @return [TrueClass, FalseClass] true if the key exists, false otherwise
  def key?(key)
    @data.key?(pre(key))
  end

  # The delete method removes the key-value pair from the cache by deleting it
  # from the underlying data structure.
  #
  # @param [String] key the key of the value to be deleted
  #
  # @return [TrueClass, FalseClass] true if the key was found and deleted, false otherwise.
  def delete(key)
    full_key = pre(key)
    # Decide on key presence rather than on the deleted value, so entries
    # holding nil or false are still reported as deleted.
    return false unless @data.key?(full_key)
    @data.delete(full_key)
    true
  end

  # The size method returns the number of elements in the cache, that is the
  # ones prefixed with `prefix`.
  #
  # @return [ Integer ] The count of elements in the cache.
  def size
    count
  end

  # The clear method removes all records from the cache that have keys starting
  # with the prefix `prefix`.
  #
  # @return [ Documentrix::Documents::MemoryCache ] self
  def clear
    @data.delete_if { |key, _| key.start_with?(@prefix) }
    self
  end

  # The each method iterates over the cache's keys and values under a given
  # prefix `prefix`.
  #
  # Iteration runs over a filtered copy of the data, so the cache itself may
  # be modified (e.g. entries deleted) from within the block.
  #
  # @yield [key, value] Each key-value pair in the cache
  #
  # @return [void]
  def each(&block)
    @data.select { |key,| key.start_with?(@prefix) }.each(&block)
  end
  include Enumerable

  # The full_each method iterates over the data hash and yields each key-value
  # pair to the given block regardless of the prefix `prefix`.
  #
  # @yield [key, value] Each key-value pair in the data hash
  #
  # @return [void]
  def full_each(&block)
    @data.each(&block)
  end
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
# Record type and mixins shared by the cache implementations.
module Documentrix::Documents::Cache::Records
  # A single cached record. Attributes used in this module: text, norm, tags,
  # source, embedding, key and similarity.
  class Record < JSON::GenericObject
    # The initialize method sets default values for the text and norm
    # attributes.
    #
    # @param [Hash] options A hash containing optional parameters.
    def initialize(options = {})
      super
      self.text ||= ''
      self.norm ||= 0.0
    end

    # The to_s method returns a string representation of the object.
    #
    # @return [String] A string containing the text and tags of the record,
    #   along with its similarity score ('n/a' if none was computed).
    def to_s
      my_tags = tags_set
      my_tags.empty? or my_tags = " #{my_tags}"
      "#<#{self.class} #{text.inspect}#{my_tags} #{similarity || 'n/a'}>"
    end

    # The tags_set method creates a new Documentrix::Utils::Tags object from
    # the tags and source of this instance.
    #
    # @return [ Documentrix::Utils::Tags ] a new Documentrix::Utils::Tags object
    def tags_set
      Documentrix::Utils::Tags.new(tags, source:)
    end

    # The == method compares this record with another one by comparing their
    # text fields.
    #
    # @param other [ Documentrix::Documents::Record ] the other record to compare with
    #
    # @return [ FalseClass, TrueClass ] true if both records have the same
    #   text, false otherwise (including when `other` has no text at all).
    def ==(other)
      other.respond_to?(:text) && text == other.text
    end

    alias inspect to_s
  end

  # Mixin for redis based caches; the including object has to provide `redis`.
  module RedisFullEach
    # The full_each method iterates over all records in the cache and yields
    # them to the block.
    #
    # @yield [ key, value ] where key is the record's key and value is the record itself
    def full_each(&block)
      redis.scan_each(match: [ Documentrix::Documents, ?* ] * ?-) do |key|
        # The key may have vanished between the scan and the get.
        value = redis.get(key) or next
        value = JSON(value, object_class: Documentrix::Documents::Record)
        block.(key, value)
      end
    end
  end

  # Mixin providing similarity search over an enumerable cache of records.
  module FindRecords
    # The find_records method finds records that match the given needle and
    # tags.
    #
    # @param needle [ Array ] an array containing the embedding vector
    # @param tags [ String, Array ] a string or array of strings representing the tags to search for
    # @param max_records [ Integer ] the maximum number of records to return,
    #   nil for no limit
    #
    # @return [ Array<Documentrix::Documents::Records> ] an array containing
    #   the matching records, most similar first
    def find_records(needle, tags: nil, max_records: nil)
      tags = Documentrix::Utils::Tags.new(Array(tags)).to_a
      records = self
      if tags.present?
        # record.tags may be nil for untagged records, hence the to_a.
        records = records.select { |_key, record| !(tags & record.tags.to_a).empty? }
      end
      needle_norm = norm(needle)
      ranked = records.sort_by { |key, record|
        record.key        = key
        record.similarity = cosine_similarity(
          a:      needle,
          b:      record.embedding,
          a_norm: needle_norm,
          b_norm: record.norm,
        )
      }
      # sort_by orders ascending, reverse to get the best matches first.
      ranked = ranked.map { |_key, record| record }.reverse
      max_records ? ranked.take(max_records) : ranked
    end
  end

  # Mixin adding tag aware clearing and tag listing to a cache.
  module Tags
    # The clear method removes all records that match the given tags from the
    # cache. Without tags, clearing is delegated to the cache's own clear
    # method.
    #
    # @param tags [ Array<String> ] an array of tag names
    #
    # @example
    #   documents.clear(tags: %w[ foo bar ])
    #
    # @return [ self ]
    def clear(tags: nil)
      tags = Documentrix::Utils::Tags.new(tags).to_a
      if tags.present?
        if respond_to?(:clear_for_tags)
          clear_for_tags(tags)
        else
          each do |key, record|
            if (tags & record.tags.to_a).size >= 1
              delete(unpre(key))
            end
          end
        end
      else
        super()
      end
      self
    end

    # The tags method returns the unique tags from all records. If the cache
    # implements its own tags method, that one is used instead.
    #
    # @return [Documentrix::Utils::Tags] An instance of
    #   Documentrix::Utils::Tags containing the unique tags.
    def tags
      if defined? super
        super
      else
        each_with_object(Documentrix::Utils::Tags.new) do |(_, record), t|
          # record.tags may be nil for untagged records, hence the to_a.
          record.tags.to_a.each do |tag|
            t.add(tag, source: record.source)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
class Documentrix::Documents
  # A MemoryCache that mirrors every write and delete to Redis and loads the
  # existing Redis entries into memory on creation.
  class RedisBackedMemoryCache < MemoryCache

    # The initialize method sets up the RedisBackedMemoryCache cache by
    # creating a new instance and populating it with data from the internally
    # created RedisCache.
    #
    # @param prefix [String] The prefix for keys in the Redis cache
    # @param url [String] The URL of the Redis server (default: ENV['REDIS_URL'])
    # @param object_class [Class] The class to use for deserializing values from Redis (default: nil)
    #
    # @raise [ArgumentError] If no url was given and ENV['REDIS_URL'] is not set
    def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil)
      super(prefix:)
      url or raise ArgumentError, 'require redis url'
      @url, @object_class = url, object_class
      @redis_cache = Documentrix::Documents::RedisCache.new(prefix:, url:, object_class:)
      @redis_cache.extend(Documentrix::Documents::Cache::Records::RedisFullEach)
      # Keys yielded by full_each are stored as they are in Redis, i.e.
      # including their prefix, so they go into @data unchanged.
      @redis_cache.full_each { |key, value| @data[key] = value }
    end

    attr_reader :object_class # the class of objects stored in the cache

    # The redis method returns the Redis client instance used by the cache.
    #
    # @return [Redis] The Redis client instance
    def redis
      @redis_cache.redis
    end

    # The []= method sets the value for a given key in memory and in Redis.
    #
    # @param [String] key the key to be set
    # @param [Hash] value the hash containing the data to be stored
    def []=(key, value)
      super
      # Always serialize: JSON.generate (unlike JSON()) never tries to parse
      # String values, and matches what RedisCache#set writes.
      redis.set(pre(key), JSON.generate(value))
    end

    # The delete method removes a key from the cache by calling Redis's del
    # method and then calling the superclass's delete method.
    #
    # @param [String] key the key to be deleted
    #
    # @return [FalseClass, TrueClass] true if the key was deleted both from
    #   memory and from Redis, false otherwise.
    def delete(key)
      result = redis.del(pre(key))
      super && result == 1
    end

    # The clear method deletes all keys from the cache by scanning redis for
    # keys that match the prefix `prefix` and then deleting them, then it does
    # the same for the MemoryCache by calling its super.
    #
    # @return [self] self
    def clear
      redis.scan_each(match: "#@prefix*") { |key| redis.del(key) }
      super
      self
    end
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require 'documentrix/documents/cache/common'
|
2
|
+
require 'redis'
|
3
|
+
|
4
|
+
# A cache that stores its entries as JSON strings in Redis under prefixed
# keys.
class Documentrix::Documents::RedisCache
  include Documentrix::Documents::Cache::Common

  # The initialize method sets up the Documentrix::Documents::RedisCache
  # instance by setting its prefix attribute to the given value and
  # remembering the connection settings; the Redis client itself is created
  # lazily by #redis.
  #
  # @param [String] prefix the string to be used as the prefix for this cache
  # @param [String] url the URL of the Redis server (default: ENV['REDIS_URL'])
  # @param [Class] object_class the class of objects stored in Redis (default: nil)
  # @param [Integer] ex the expiration time in seconds (default: nil)
  #
  # @raise [ArgumentError] if no url was given and ENV['REDIS_URL'] is not set
  def initialize(prefix:, url: ENV['REDIS_URL'], object_class: nil, ex: nil)
    super(prefix:)
    url or raise ArgumentError, 'require redis url'
    @url, @object_class, @ex = url, object_class, ex
  end

  attr_reader :object_class # the class of objects stored in the cache

  # The redis method returns the (memoized) Redis client instance.
  #
  # @return [Redis] An instance of Redis client
  def redis
    @redis ||= Redis.new(url: @url)
  end

  # The [](key) method retrieves the value associated with the given key from Redis.
  #
  # @param [String] key the string representation of the key
  #
  # @return [Object, nil] the retrieved value if it exists in Redis, or nil otherwise
  def [](key)
    value = redis.get(pre(key))
    unless value.nil?
      object_class ? JSON(value, object_class:) : JSON(value)
    end
  end

  # The []= method sets the value associated with the given key in this cache instance.
  #
  # @param [String] key the string representation of the key
  # @param [Object] value the object to be stored under the given key
  #
  # @return [Object] the stored value
  def []=(key, value)
    set(key, value)
  end

  # The set method sets the value associated with the given key in this cache
  # instance. An expiration time below 1 deletes the key instead of storing
  # the value.
  #
  # @param [String] key the string representation of the key
  # @param [Object] value the object to be stored under the given key
  # @param [Integer] ex the expiration time in seconds, falls back to the
  #   cache's default expiration (default: nil)
  #
  # @return [Object] the given value
  def set(key, value, ex: nil)
    ex ||= @ex
    if !ex.nil? && ex < 1
      redis.del(pre(key))
    else
      redis.set(pre(key), JSON.generate(value), ex:)
    end
    value
  end

  # The ttl method returns the time-to-live (TTL) value for the given key
  #
  # @param [String] key the string representation of the key
  #
  # @return [Integer] the result of Redis' TTL command for the prefixed key
  #   (NOTE(review): Redis documents -2 for a missing key and -1 for a key
  #   without expiration rather than nil; confirm against the client version
  #   in use)
  def ttl(key)
    redis.ttl(pre(key))
  end

  # The key? method checks if the given key exists in Redis by calling the
  # redis.exists? method
  #
  # @param [String] key the string representation of the key
  #
  # @return [FalseClass, TrueClass] true if the key exists, false otherwise
  def key?(key)
    !!redis.exists?(pre(key))
  end

  # The delete method removes the key-value pair associated with the given key
  # from this cache instance.
  #
  # @param [String] key the string representation of the key
  #
  # @return [FalseClass, TrueClass] true if the key was deleted successfully, false otherwise
  def delete(key)
    redis.del(pre(key)) == 1
  end

  # The size method returns the total number of keys stored in this cache
  # instance, that is the ones with the prefix `prefix`.
  #
  # @return [Integer] The total count of keys
  def size
    s = 0
    redis.scan_each(match: "#@prefix*") { |_key| s += 1 }
    s
  end

  # The clear method removes all key-value pairs associated with the given
  # prefix from this cache instance.
  #
  # @return [Documentrix::Documents::RedisCache] self
  def clear
    redis.scan_each(match: "#@prefix*") { |key| redis.del(key) }
    self
  end

  # The each method iterates over the cache keys with prefix `prefix` and
  # yields each key-value pair to the given block. The yielded key still
  # carries the prefix.
  #
  # @yield [key, value] Each key-value pair in the cache
  #
  # @return [self, Enumerator] self, or an Enumerator if no block was given
  def each(&block)
    return enum_for(:each) unless block
    redis.scan_each(match: "#@prefix*") { |key| block.(key, self[unpre(key)]) }
    self
  end
  include Enumerable
end
|