documentrix 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +80 -0
- data/documentrix.gemspec +5 -5
- data/lib/documentrix/documents/cache/common.rb +63 -11
- data/lib/documentrix/documents/cache/records.rb +1 -1
- data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
- data/lib/documentrix/documents/cache/sqlite_cache.rb +132 -33
- data/lib/documentrix/documents/splitters/character.rb +56 -4
- data/lib/documentrix/documents/splitters/common.rb +38 -0
- data/lib/documentrix/documents/splitters/semantic.rb +67 -8
- data/lib/documentrix/documents.rb +133 -29
- data/lib/documentrix/utils/colorize_texts.rb +25 -21
- data/lib/documentrix/utils/digests.rb +78 -0
- data/lib/documentrix/utils.rb +1 -0
- data/lib/documentrix/version.rb +1 -1
- data/spec/documentrix/documents/cache/interface_spec.rb +16 -3
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +64 -2
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +68 -19
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +169 -2
- data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
- data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
- data/spec/documents_spec.rb +59 -3
- data/spec/utils/colorize_texts_spec.rb +0 -2
- data/spec/utils/digests_spec.rb +97 -0
- data/spec/utils/tags_spec.rb +0 -2
- metadata +7 -1
|
@@ -1,15 +1,38 @@
|
|
|
1
1
|
module Documentrix::Documents::Splitters
|
|
2
|
+
# The Character class provides basic text splitting based on a single
|
|
3
|
+
# separator and bundles the resulting segments into chunks of a maximum size.
|
|
4
|
+
#
|
|
5
|
+
# It allows for the preservation of separators and uses a combining string
|
|
6
|
+
# to join segments back together into chunks.
|
|
2
7
|
class Character
|
|
8
|
+
include Documentrix::Documents::Splitters::Common
|
|
9
|
+
|
|
10
|
+
# The default regex used to identify paragraph boundaries.
|
|
11
|
+
# It matches two or more consecutive newline characters (CRLF or LF).
|
|
12
|
+
#
|
|
13
|
+
# @return [Regexp]
|
|
3
14
|
DEFAULT_SEPARATOR = /(?:\r?\n){2,}/
|
|
4
15
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
16
|
+
# Initializes a new Character splitter.
|
|
17
|
+
#
|
|
18
|
+
# @param separator [Regexp] the regex used to split the text (defaults to DEFAULT_SEPARATOR)
|
|
19
|
+
# @param include_separator [Boolean] whether to include the separator in the resulting chunks (defaults to false)
|
|
20
|
+
# @param combining_string [String] the string used to join segments into chunks (defaults to "\n\n")
|
|
21
|
+
# @param chunk_size [Integer] the maximum size of each resulting chunk (defaults to 4096)
|
|
22
|
+
# @param force [Boolean] whether to force-split the final chunk if it exceeds `chunk_size` (defaults to false)
|
|
23
|
+
def initialize(separator: DEFAULT_SEPARATOR, include_separator: false, combining_string: "\n\n", chunk_size: 4096, force: false)
|
|
24
|
+
@separator, @include_separator, @combining_string, @chunk_size, @force =
|
|
25
|
+
separator, include_separator, combining_string, chunk_size, force
|
|
8
26
|
if include_separator
|
|
9
27
|
@separator = Regexp.new("(#@separator)")
|
|
10
28
|
end
|
|
11
29
|
end
|
|
12
30
|
|
|
31
|
+
# Splits the given text into chunks based on the configured separator and
|
|
32
|
+
# size limit.
|
|
33
|
+
#
|
|
34
|
+
# @param text [String] the text to be split
|
|
35
|
+
# @return [Array<String>] an array of text chunks
|
|
13
36
|
def split(text)
|
|
14
37
|
texts = []
|
|
15
38
|
text.split(@separator) do |t|
|
|
@@ -29,12 +52,27 @@ module Documentrix::Documents::Splitters
|
|
|
29
52
|
current_text = t
|
|
30
53
|
end
|
|
31
54
|
end
|
|
32
|
-
|
|
55
|
+
result.concat force_split(current_text)
|
|
33
56
|
result
|
|
34
57
|
end
|
|
35
58
|
end
|
|
36
59
|
|
|
60
|
+
# The RecursiveCharacter class implements a hierarchical splitting strategy.
|
|
61
|
+
#
|
|
62
|
+
# It attempts to split text using a priority list of separators. If a
|
|
63
|
+
# resulting chunk is still larger than the specified chunk_size, it
|
|
64
|
+
# recursively applies the next separator in the list until the size limit is
|
|
65
|
+
# met or all separators have been exhausted.
|
|
37
66
|
class RecursiveCharacter
|
|
67
|
+
include Documentrix::Documents::Splitters::Common
|
|
68
|
+
|
|
69
|
+
# The default priority list of regexes used for recursive splitting.
|
|
70
|
+
# The strategy is to split by the coarsest grain first (paragraphs)
|
|
71
|
+
# and move toward the finest grain (individual characters) as needed.
|
|
72
|
+
#
|
|
73
|
+
# Order: Paragraphs -> Newlines -> Word Boundaries -> Characters
|
|
74
|
+
#
|
|
75
|
+
# @return [Array<Regexp>]
|
|
38
76
|
DEFAULT_SEPARATORS = [
|
|
39
77
|
/(?:\r?\n){2,}/,
|
|
40
78
|
/\r?\n/,
|
|
@@ -42,13 +80,27 @@ module Documentrix::Documents::Splitters
|
|
|
42
80
|
//,
|
|
43
81
|
].freeze
|
|
44
82
|
|
|
83
|
+
# Initializes a new RecursiveCharacter splitter.
|
|
84
|
+
#
|
|
85
|
+
# @param separators [Array<Regexp>] a priority list of regexes to use for splitting (defaults to DEFAULT_SEPARATORS)
|
|
86
|
+
# @param include_separator [Boolean] whether to include the separator in the resulting chunks (defaults to false)
|
|
87
|
+
# @param combining_string [String] the string used to join segments into chunks (defaults to "\n\n")
|
|
88
|
+
# @param chunk_size [Integer] the maximum size of each resulting chunk (defaults to 4096)
|
|
89
|
+
# @raise [ArgumentError] if the separators array is empty
|
|
45
90
|
def initialize(separators: DEFAULT_SEPARATORS, include_separator: false, combining_string: "\n\n", chunk_size: 4096)
|
|
46
91
|
separators.empty? and
|
|
47
92
|
raise ArgumentError, "non-empty array of separators required"
|
|
48
93
|
@separators, @include_separator, @combining_string, @chunk_size =
|
|
49
94
|
separators, include_separator, combining_string, chunk_size
|
|
95
|
+
@force = separators.last == //
|
|
50
96
|
end
|
|
51
97
|
|
|
98
|
+
# Recursively splits the given text into chunks using the list of
|
|
99
|
+
# separators.
|
|
100
|
+
#
|
|
101
|
+
# @param text [String] the text to be split
|
|
102
|
+
# @param separators [Array<Regexp>] the list of separators to use (defaults to @separators)
|
|
103
|
+
# @return [Array<String>] an array of text chunks
|
|
52
104
|
def split(text, separators: @separators)
|
|
53
105
|
separators.empty? and return [ text ]
|
|
54
106
|
separators = separators.dup
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# A shared utility module for text splitters that provides consistent
|
|
2
|
+
# handling of chunk size constraints.
|
|
3
|
+
#
|
|
4
|
+
# This module is intended to be included in splitter classes that
|
|
5
|
+
# implement a maximum chunk size limit. It expects the including class
|
|
6
|
+
# to provide the following attributes:
|
|
7
|
+
# - `force` [Boolean]: Whether to hard-split chunks that exceed the limit.
|
|
8
|
+
# - `chunk_size` [Integer]: The maximum allowed size for a single chunk.
|
|
9
|
+
module Documentrix::Documents::Splitters::Common
|
|
10
|
+
private
|
|
11
|
+
|
|
12
|
+
# Whether to force-split chunks that exceed the chunk size limit.
|
|
13
|
+
# @return [Boolean]
|
|
14
|
+
attr_reader :force
|
|
15
|
+
|
|
16
|
+
# The maximum allowed size for a single chunk.
|
|
17
|
+
# @return [Integer]
|
|
18
|
+
attr_reader :chunk_size
|
|
19
|
+
|
|
20
|
+
# Ensures text respects the chunk size limit if force splitting is enabled.
|
|
21
|
+
#
|
|
22
|
+
# If the `force` attribute is true and the provided text exceeds the
|
|
23
|
+
# `chunk_size`, the text is hard-split into chunks of at most that size using a
|
|
24
|
+
# regular expression. If `force` is false or the text is within the
|
|
25
|
+
# limit, the text is returned wrapped in a single-element array to
|
|
26
|
+
# maintain return-type consistency (Array<String>).
|
|
27
|
+
#
|
|
28
|
+
# @param text [String, nil] the text to potentially split
|
|
29
|
+
# @return [Array<String>] the resulting chunk(s), or an empty array if text is empty (or nil while `force` is false)
|
|
30
|
+
def force_split(text)
|
|
31
|
+
text&.empty? and return []
|
|
32
|
+
if force && text.size > chunk_size
|
|
33
|
+
text.scan(/.{1,#{chunk_size}}/)
|
|
34
|
+
else
|
|
35
|
+
Array(text)
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
end
|
|
@@ -1,14 +1,60 @@
|
|
|
1
1
|
module Documentrix::Documents::Splitters
|
|
2
|
+
# Semantic splitter that divides text based on thematic changes in meaning.
|
|
3
|
+
#
|
|
4
|
+
# It works by splitting text into sentences, computing embeddings for each,
|
|
5
|
+
# and then calculating the cosine distance between adjacent sentences.
|
|
6
|
+
# Where the distance exceeds a calculated threshold (the "breakpoint"),
|
|
7
|
+
# a semantic boundary is identified.
|
|
8
|
+
#
|
|
9
|
+
# @example
|
|
10
|
+
# splitter = Documentrix::Documents::Splitters::Semantic.new(
|
|
11
|
+
# ollama: ollama_client,
|
|
12
|
+
# model: 'mxbai-embed-large'
|
|
13
|
+
# )
|
|
14
|
+
# chunks = splitter.split(text, breakpoint: :percentile, percentile: 90)
|
|
2
15
|
class Semantic
|
|
16
|
+
include Documentrix::Documents::Splitters::Common
|
|
3
17
|
include Documentrix::Utils::Math
|
|
4
18
|
|
|
5
|
-
|
|
19
|
+
# The default regex used to identify sentence boundaries for semantic
|
|
20
|
+
# splitting. It matches a punctuation mark (one of . ! ? , ;)
|
|
21
|
+
# followed by optional whitespace at a word boundary or the end of the
|
|
22
|
+
# string.
|
|
23
|
+
#
|
|
24
|
+
# @return [Regexp]
|
|
25
|
+
DEFAULT_SEPARATOR = /[.!?,;]\s*(?:\b|\z)/
|
|
6
26
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
27
|
+
# Initializes a new Semantic splitter.
|
|
28
|
+
#
|
|
29
|
+
# @param ollama [Ollama::Client] the client used for generating embeddings
|
|
30
|
+
# @param model [String] the embedding model name
|
|
31
|
+
# @param model_options [Hash, nil] optional parameters passed to the embedding model
|
|
32
|
+
# @param separator [Regexp] the regex used to identify sentence boundaries
|
|
33
|
+
# @param chunk_size [Integer] the maximum character length of a resulting chunk
|
|
34
|
+
# @param force [Boolean] whether to force split chunks that exceed chunk_size (defaults to false)
|
|
35
|
+
def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096, force: false)
|
|
36
|
+
@ollama, @model, @model_options, @separator, @chunk_size, @force =
|
|
37
|
+
ollama, model, model_options, separator, chunk_size, force
|
|
10
38
|
end
|
|
11
39
|
|
|
40
|
+
# Splits the given text into semantic chunks.
|
|
41
|
+
#
|
|
42
|
+
# The method first decomposes the text into sentences, then identifies gaps
|
|
43
|
+
# in semantic similarity. It then groups these sentences into chunks that
|
|
44
|
+
# respect both the semantic boundaries and the maximum chunk size.
|
|
45
|
+
#
|
|
46
|
+
# @param text [String] the text to be split
|
|
47
|
+
# @param batch_size [Integer] the number of sentences to embed in a single API call
|
|
48
|
+
# @param breakpoint [Symbol] the method used to determine the distance threshold
|
|
49
|
+
# * :percentile (default) - uses the N-th percentile of distances
|
|
50
|
+
# * :standard_deviation - uses mean + (std_dev * multiplier)
|
|
51
|
+
# * :interquartile - uses mean + (iqr * multiplier)
|
|
52
|
+
# @param opts [Hash] additional options for the splitting process:
|
|
53
|
+
# * :include_separator [Boolean] whether to keep the sentence separator in the result
|
|
54
|
+
# * :percentile [Integer] the percentile to use if breakpoint is :percentile (default: 95)
|
|
55
|
+
# * :percentage [Integer] the multiplier percentage for :standard_deviation or :interquartile (default: 100)
|
|
56
|
+
#
|
|
57
|
+
# @return [Array<String>] an array of semantically grouped text chunks
|
|
12
58
|
def split(text, batch_size: 100, breakpoint: :percentile, **opts)
|
|
13
59
|
sentences = Documentrix::Documents::Splitters::Character.new(
|
|
14
60
|
separator: @separator,
|
|
@@ -45,22 +91,30 @@ module Documentrix::Documents::Splitters
|
|
|
45
91
|
if current_text.size + sentence.size < @chunk_size
|
|
46
92
|
current_text += sentence
|
|
47
93
|
else
|
|
48
|
-
|
|
94
|
+
result.concat(force_split(current_text))
|
|
49
95
|
current_text = sentence
|
|
50
96
|
end
|
|
51
97
|
end
|
|
52
|
-
|
|
53
|
-
result
|
|
98
|
+
if current_text.present?
|
|
99
|
+
result.concat(force_split(current_text))
|
|
54
100
|
current_text = +''
|
|
55
101
|
end
|
|
56
102
|
sg = g.succ
|
|
57
103
|
end
|
|
58
|
-
|
|
104
|
+
result.concat(force_split(current_text))
|
|
59
105
|
result
|
|
60
106
|
end
|
|
61
107
|
|
|
62
108
|
private
|
|
63
109
|
|
|
110
|
+
# Calculates the distance threshold used to identify semantic boundaries.
|
|
111
|
+
#
|
|
112
|
+
# @param breakpoint_method [Symbol] the method to use (:percentile, :standard_deviation, :interquartile)
|
|
113
|
+
# @param distances [Array<Float>] the cosine distances between adjacent sentences
|
|
114
|
+
# @param opts [Hash] options specific to the chosen method (e.g., :percentile, :percentage)
|
|
115
|
+
#
|
|
116
|
+
# @return [Float] the distance threshold
|
|
117
|
+
# @raise [ArgumentError] if an unsupported breakpoint_method is provided
|
|
64
118
|
def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
|
|
65
119
|
sequence = MoreMath::Sequence.new(distances)
|
|
66
120
|
case breakpoint_method
|
|
@@ -82,6 +136,11 @@ module Documentrix::Documents::Splitters
|
|
|
82
136
|
end
|
|
83
137
|
end
|
|
84
138
|
|
|
139
|
+
# Fetches embeddings for a batch of sentences and converts them to
|
|
140
|
+
# Numo::NArray.
|
|
141
|
+
#
|
|
142
|
+
# @param input [Array<String>] the batch of sentences to embed
|
|
143
|
+
# @return [Array<Numo::NArray>] an array of embeddings as Numo arrays
|
|
85
144
|
def sentence_embeddings(input)
|
|
86
145
|
@ollama.embed(model: @model, input:, options: @model_options).embeddings.map! {
|
|
87
146
|
Numo::NArray[*_1]
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
require 'numo/narray'
|
|
2
|
-
require 'digest'
|
|
3
2
|
require 'kramdown/ansi'
|
|
4
3
|
|
|
5
4
|
class Documentrix::Documents
|
|
@@ -33,6 +32,7 @@ require 'documentrix/documents/cache/sqlite_cache'
|
|
|
33
32
|
# to prepare text data for embedding and storage in vector databases.
|
|
34
33
|
module Documentrix::Documents::Splitters
|
|
35
34
|
end
|
|
35
|
+
require 'documentrix/documents/splitters/common'
|
|
36
36
|
require 'documentrix/documents/splitters/character'
|
|
37
37
|
require 'documentrix/documents/splitters/semantic'
|
|
38
38
|
|
|
@@ -59,6 +59,7 @@ require 'documentrix/documents/splitters/semantic'
|
|
|
59
59
|
class Documentrix::Documents
|
|
60
60
|
include Kramdown::ANSI::Width
|
|
61
61
|
include Documentrix::Documents::Cache
|
|
62
|
+
include Documentrix::Utils::Digests
|
|
62
63
|
|
|
63
64
|
# Shortcut for Documentrix::Documents::Cache::Records::Record
|
|
64
65
|
Record = Class.new Documentrix::Documents::Cache::Records::Record
|
|
@@ -116,16 +117,16 @@ class Documentrix::Documents
|
|
|
116
117
|
texts
|
|
117
118
|
end
|
|
118
119
|
|
|
119
|
-
|
|
120
|
-
# The method adds new texts `texts` to the documents collection by
|
|
120
|
+
# The add method adds new texts `texts` to the documents collection by
|
|
121
121
|
# processing them through various stages. It first filters out existing texts
|
|
122
122
|
# from the input array using the `prepare_texts` method, then fetches
|
|
123
123
|
# embeddings for each text using the specified model and options. The fetched
|
|
124
124
|
# embeddings are used to create a new record in the cache, which is
|
|
125
|
-
# associated with the original text and
|
|
126
|
-
# the texts in batches of size
|
|
127
|
-
# in the console. It also accepts an optional
|
|
128
|
-
# with the added texts
|
|
125
|
+
# associated with the original text, tags, and version digest (if any). The
|
|
126
|
+
# method processes the texts in batches of size `batch_size`, displaying
|
|
127
|
+
# progress information in the console. It also accepts an optional `source`
|
|
128
|
+
# string to associate with the added texts, an array of `tags` to attach to
|
|
129
|
+
# each record, and an optional `digest` string for version tracking. Once
|
|
129
130
|
# all texts have been processed, it returns the `Documentrix::Documents`
|
|
130
131
|
# instance itself, allowing for method chaining.
|
|
131
132
|
#
|
|
@@ -133,14 +134,17 @@ class Documentrix::Documents
|
|
|
133
134
|
# @param batch_size [Integer] the number of texts to process in one batch
|
|
134
135
|
# @param source [String] the source URL for the added texts
|
|
135
136
|
# @param tags [Array] an array of tags associated with the added texts
|
|
137
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest of the source
|
|
136
138
|
#
|
|
137
139
|
# @example
|
|
138
140
|
# documents.add(%w[ foo bar ], batch_size: 23, source: 'https://example.com', tags: %w[tag1 tag2])
|
|
139
141
|
#
|
|
140
142
|
# @return [Documentrix::Documents] self
|
|
141
|
-
def add(texts, batch_size: nil, source: nil, tags: [])
|
|
142
|
-
texts
|
|
143
|
-
|
|
143
|
+
def add(texts, batch_size: nil, source: nil, tags: [], digest: nil)
|
|
144
|
+
texts = prepare_texts(texts) or return self
|
|
145
|
+
source = normalize_source(source)
|
|
146
|
+
tags = Documentrix::Utils::Tags.new(tags, source:)
|
|
147
|
+
digest ||= compute_file_digest(source)
|
|
144
148
|
if source
|
|
145
149
|
tags.add(File.basename(source).gsub(/\?.*/, ''), source:)
|
|
146
150
|
end
|
|
@@ -153,7 +157,7 @@ class Documentrix::Documents
|
|
|
153
157
|
embeddings = fetch_embeddings(model:, options: @model_options, input: batch)
|
|
154
158
|
batch.zip(embeddings) do |text, embedding|
|
|
155
159
|
norm = @cache.norm(embedding)
|
|
156
|
-
self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a]
|
|
160
|
+
self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a, digest:]
|
|
157
161
|
end
|
|
158
162
|
infobar.progress by: batch.size
|
|
159
163
|
end
|
|
@@ -219,13 +223,101 @@ class Documentrix::Documents
|
|
|
219
223
|
self
|
|
220
224
|
end
|
|
221
225
|
|
|
222
|
-
#
|
|
226
|
+
# Normalizes the source identifier to a canonical form.
|
|
227
|
+
#
|
|
228
|
+
# If the source is blank, returns nil.
|
|
229
|
+
# If the source is an absolute URL, it is returned as-is.
|
|
230
|
+
# If the source is a local file path that exists, it is expanded to its real
|
|
231
|
+
# path, which is absolute and has all symlinks resolved.
|
|
232
|
+
# Otherwise, the original source is returned.
|
|
233
|
+
#
|
|
234
|
+
# @param source [String, #to_s] the source identifier to normalize
|
|
235
|
+
# @return [String, nil] the normalized canonical path, the original source,
|
|
236
|
+
# or nil if blank
|
|
237
|
+
def normalize_source(source)
|
|
238
|
+
source.blank? and return
|
|
239
|
+
begin
|
|
240
|
+
URI::PARSER.parse(source).absolute? and return source
|
|
241
|
+
rescue
|
|
242
|
+
end
|
|
243
|
+
Pathname.new(source).realpath.to_path
|
|
244
|
+
rescue Errno::ENOENT
|
|
245
|
+
source
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
# The source_exist? method checks if any records associated with the given
|
|
249
|
+
# source exist in the cache. If a digest is provided, it verifies if the
|
|
250
|
+
# source exists and satisfies the comparison with the specified digest.
|
|
251
|
+
#
|
|
252
|
+
# @param source [#to_s] the source to check for existence
|
|
253
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest to compare
|
|
254
|
+
# against the stored source digest (optional)
|
|
255
|
+
# @param operator [Symbol, String] the operator to compare the digest with
|
|
256
|
+
# (defaults to '=')
|
|
257
|
+
#
|
|
258
|
+
# @return [Boolean] true if the source exists (and satisfies the digest
|
|
259
|
+
# comparison if provided), false otherwise.
|
|
260
|
+
def source_exist?(source, digest: nil, operator: ?=)
|
|
261
|
+
source = normalize_source(source)
|
|
262
|
+
@cache.source_exist?(source, digest:, operator:)
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
# Checks if the content of the given source has been modified compared to
|
|
266
|
+
# the version stored in the cache, or if it is missing from the cache.
|
|
267
|
+
#
|
|
268
|
+
# The source is considered modified (the method returns true) if:
|
|
269
|
+
# 1. The source is blank (normalization returns nil).
|
|
270
|
+
# 2. The source is not a valid local file or its digest cannot be computed.
|
|
271
|
+
# 3. No records exist in the cache for this source.
|
|
272
|
+
# 4. Records exist in the cache for this source, but they have a different
|
|
273
|
+
# digest than the current version on disk.
|
|
274
|
+
#
|
|
275
|
+
# @param source [String, #to_s] the source identifier to check
|
|
276
|
+
# @return [Boolean] true if the source is modified, missing, or cannot be
|
|
277
|
+
# verified, false if it is up-to-date.
|
|
278
|
+
def source_modified?(source)
|
|
279
|
+
source = normalize_source(source) or return true
|
|
280
|
+
digest = compute_file_digest(source) or return true
|
|
281
|
+
!source_exist?(source) || source_exist?(source, digest:, operator: '!=')
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# Updates the records associated with a given source.
|
|
285
|
+
#
|
|
286
|
+
# If the source already exists in the cache, this method computes its current
|
|
287
|
+
# digest and removes only the stale records that do not match this digest. If
|
|
288
|
+
# the source is new or has been modified, it adds the provided texts to the
|
|
289
|
+
# cache.
|
|
290
|
+
#
|
|
291
|
+
# @param texts [Array] the text strings to add if the source is new or modified
|
|
292
|
+
# @param opts [Hash] additional options passed to #add (e.g., :batch_size, :tags)
|
|
293
|
+
# * :source [#to_s] the source to update
|
|
294
|
+
#
|
|
295
|
+
# @return [Documentrix::Documents, nil] the instance itself if the source
|
|
296
|
+
# was added/updated, or nil if the source was already up-to-date.
|
|
297
|
+
def source_update(texts, **opts)
|
|
298
|
+
if source = normalize_source(opts[:source]) and source_exist?(source)
|
|
299
|
+
digest = compute_file_digest(source)
|
|
300
|
+
source_remove(source, digest:)
|
|
301
|
+
unless source_exist?(source, digest:, operator: ?=)
|
|
302
|
+
opts[:digest] = digest
|
|
303
|
+
add(texts, **opts)
|
|
304
|
+
end
|
|
305
|
+
else
|
|
306
|
+
add(texts, **opts)
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# The source_remove method removes all documents associated with the given
|
|
311
|
+
# source.
|
|
223
312
|
#
|
|
224
|
-
# @param source [
|
|
313
|
+
# @param source [#to_s] the source of the documents to remove
|
|
314
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest for which records
|
|
315
|
+
# with this source are **not** to be removed if given.
|
|
225
316
|
#
|
|
226
317
|
# @return [Documentrix::Documents] self
|
|
227
|
-
def
|
|
228
|
-
|
|
318
|
+
def source_remove(source, digest: nil)
|
|
319
|
+
source = normalize_source(source)
|
|
320
|
+
@cache.clear_by_source(source, digest:, operator: '!=')
|
|
229
321
|
self
|
|
230
322
|
end
|
|
231
323
|
|
|
@@ -236,14 +328,16 @@ class Documentrix::Documents
|
|
|
236
328
|
# @param tags [Array<String>] an array of tags to filter results by (optional)
|
|
237
329
|
# @param prompt [String] a prompt to use when searching for similar strings (optional)
|
|
238
330
|
# @param max_records [Integer] the maximum number of records to return (optional)
|
|
331
|
+
# @param min_similarity [Numeric] the minimum similarity score to include in results (defaults to -1)
|
|
239
332
|
#
|
|
240
333
|
# @example
|
|
241
334
|
# documents.find("foo")
|
|
242
335
|
#
|
|
243
336
|
# @return [Array<Documentrix::Documents::Record>]
|
|
244
|
-
def find(string, tags: nil, prompt: nil, max_records: nil)
|
|
337
|
+
def find(string, tags: nil, prompt: nil, max_records: nil, min_similarity: nil)
|
|
338
|
+
min_similarity ||= -1
|
|
245
339
|
needle = convert_to_vector(string, prompt:)
|
|
246
|
-
@cache.find_records(needle, tags:, max_records:
|
|
340
|
+
@cache.find_records(needle, tags:, max_records:, min_similarity:)
|
|
247
341
|
end
|
|
248
342
|
|
|
249
343
|
# The method filters the records returned by find based on text
|
|
@@ -256,20 +350,28 @@ class Documentrix::Documents
|
|
|
256
350
|
# @example
|
|
257
351
|
# documents.find_where('foo', text_size: 3, text_count: 1)
|
|
258
352
|
# @return [Array<Documentrix::Documents::Record>] the filtered records
|
|
353
|
+
|
|
354
|
+
# The find_where method filters the records returned by find based on text
|
|
355
|
+
# size and count.
|
|
356
|
+
#
|
|
357
|
+
# @param string [String] the search query
|
|
358
|
+
# @param text_size [Integer] the maximum allowed total text size to return
|
|
359
|
+
# @param text_count [Integer] the maximum number of records to return
|
|
360
|
+
# @param opts [Hash] additional options passed to #find, such as:
|
|
361
|
+
# * :tags [Array<String>] filter results by tags
|
|
362
|
+
# * :prompt [String] a prompt to use for the search
|
|
363
|
+
# * :min_similarity [Numeric] minimum similarity score
|
|
364
|
+
#
|
|
365
|
+
# @example
|
|
366
|
+
# documents.find_where('foo', text_size: 1000, text_count: 5, tags: ['ruby'])
|
|
367
|
+
#
|
|
368
|
+
# @return [Array<Documentrix::Documents::Record>] the filtered records
|
|
259
369
|
def find_where(string, text_size: nil, text_count: nil, **opts)
|
|
260
|
-
|
|
261
|
-
opts[:max_records] = text_count
|
|
262
|
-
end
|
|
370
|
+
text_count and opts[:max_records] = text_count
|
|
263
371
|
records = find(string, **opts)
|
|
264
|
-
size
|
|
372
|
+
size = 0
|
|
265
373
|
records.take_while do |record|
|
|
266
|
-
|
|
267
|
-
next false
|
|
268
|
-
end
|
|
269
|
-
if text_count and (count += 1) > text_count
|
|
270
|
-
next false
|
|
271
|
-
end
|
|
272
|
-
true
|
|
374
|
+
!text_size || (size += record.text.size) <= text_size
|
|
273
375
|
end
|
|
274
376
|
end
|
|
275
377
|
|
|
@@ -333,6 +435,8 @@ class Documentrix::Documents
|
|
|
333
435
|
debug: @debug
|
|
334
436
|
)
|
|
335
437
|
end
|
|
438
|
+
rescue => e
|
|
439
|
+
warn "Caught #{e.class}: #{e}"
|
|
336
440
|
ensure
|
|
337
441
|
cache ||= MemoryCache.new(prefix:,)
|
|
338
442
|
return cache
|
|
@@ -389,6 +493,6 @@ class Documentrix::Documents
|
|
|
389
493
|
#
|
|
390
494
|
# @return [String] the SHA256 hash of the input string
|
|
391
495
|
def key(input)
|
|
392
|
-
|
|
496
|
+
compute_digest(input)
|
|
393
497
|
end
|
|
394
498
|
end
|
|
@@ -4,33 +4,37 @@ require 'kramdown/ansi'
|
|
|
4
4
|
# A utility class for colorizing and formatting text output with ANSI color
|
|
5
5
|
# codes and size information.
|
|
6
6
|
#
|
|
7
|
-
# The ColorizeTexts class takes
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
7
|
+
# The ColorizeTexts class takes a collection of text strings and formats them
|
|
8
|
+
# with dynamically generated ANSI colors for visual distinction. Each text
|
|
9
|
+
# block is wrapped to fit the terminal width and appended with its size in
|
|
10
|
+
# bytes, making it ideal for debugging text-splitting pipelines.
|
|
11
11
|
#
|
|
12
12
|
# @example
|
|
13
|
-
# colorizer = Documentrix::Utils::ColorizeTexts.new('
|
|
13
|
+
# colorizer = Documentrix::Utils::ColorizeTexts.new('First chunk', 'Second chunk')
|
|
14
14
|
# puts colorizer.to_s
|
|
15
15
|
class Documentrix::Utils::ColorizeTexts
|
|
16
16
|
include Math
|
|
17
17
|
include Term::ANSIColor
|
|
18
18
|
include Kramdown::ANSI::Width
|
|
19
19
|
|
|
20
|
-
# Initializes a new instance of
|
|
20
|
+
# Initializes a new instance of ColorizeTexts.
|
|
21
21
|
#
|
|
22
|
-
# @param [Array<String>]
|
|
22
|
+
# @param texts [String, Array<String>] a variable list of strings or an array
|
|
23
|
+
# of strings to be colorized.
|
|
23
24
|
#
|
|
24
|
-
# @return [Documentrix
|
|
25
|
+
# @return [Documentrix::Utils::ColorizeTexts] a new instance of ColorizeTexts
|
|
25
26
|
def initialize(*texts)
|
|
26
|
-
texts
|
|
27
|
-
@texts = Array(texts.flatten)
|
|
27
|
+
@texts = texts.flatten
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
# Returns a string representation of the
|
|
31
|
-
# colored differently and their sizes.
|
|
30
|
+
# Returns a formatted string representation of the texts.
|
|
32
31
|
#
|
|
33
|
-
#
|
|
32
|
+
# Each text block is:
|
|
33
|
+
# 1. Assigned a color from a trigonometric RGB gradient.
|
|
34
|
+
# 2. Wrapped to 90% of the terminal width.
|
|
35
|
+
# 3. Appended with its size in bold text.
|
|
36
|
+
#
|
|
37
|
+
# @return [String] the colorized and formatted output string.
|
|
34
38
|
def to_s
|
|
35
39
|
result = +''
|
|
36
40
|
@texts.each_with_index do |t, i|
|
|
@@ -45,14 +49,13 @@ class Documentrix::Utils::ColorizeTexts
|
|
|
45
49
|
|
|
46
50
|
private
|
|
47
51
|
|
|
48
|
-
#
|
|
52
|
+
# Determines the optimal text color (black or white) for a given background
|
|
53
|
+
# color to ensure maximum readability based on contrast.
|
|
49
54
|
#
|
|
50
|
-
# @param [
|
|
55
|
+
# @param color [Symbol, Term::ANSIColor::Attribute] the ANSI color attribute
|
|
51
56
|
#
|
|
52
|
-
# @return [Array<
|
|
53
|
-
#
|
|
54
|
-
# when printed on a black background, and the second is the closest match
|
|
55
|
-
# when printed on a white background.
|
|
57
|
+
# @return [Object] the single candidate color (not an array) that `max_by`
|
|
58
|
+
# selects as having the greatest `distance_to` the given color.
|
|
56
59
|
def text_color(color)
|
|
57
60
|
color = Term::ANSIColor::Attribute[color]
|
|
58
61
|
[
|
|
@@ -61,9 +64,10 @@ class Documentrix::Utils::ColorizeTexts
|
|
|
61
64
|
].max_by { |t| t.distance_to(color) }
|
|
62
65
|
end
|
|
63
66
|
|
|
64
|
-
#
|
|
67
|
+
# Generates a 256-color RGB gradient using sine wave oscillations.
|
|
65
68
|
#
|
|
66
|
-
# @return [Array<Array<Integer>>]
|
|
69
|
+
# @return [Array<Array<Integer>>] an array of 256 RGB color arrays,
|
|
70
|
+
# where each inner array contains [R, G, B] values from 0 to 255.
|
|
67
71
|
def colors
|
|
68
72
|
@colors ||= (0..255).map { |i|
|
|
69
73
|
[
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
require 'digest'
|
|
2
|
+
require 'uri'
|
|
3
|
+
|
|
4
|
+
# Module for computing cryptographic digests used for tracking content changes.
|
|
5
|
+
module Documentrix::Utils::Digests
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
@@file_digest_cache = {}
|
|
9
|
+
|
|
10
|
+
# Computes the SHA256 hexadecimal digest of the given text.
|
|
11
|
+
#
|
|
12
|
+
# @param text [String] the text to be hashed
|
|
13
|
+
# @return [String] the SHA256 hexadecimal digest
|
|
14
|
+
def compute_digest(text)
|
|
15
|
+
Digest::SHA256.hexdigest(text)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Stores a computed digest in the internal cache, keyed by the filename
|
|
19
|
+
# and the file's modification time.
|
|
20
|
+
#
|
|
21
|
+
# @param filename [String] the path to the file
|
|
22
|
+
# @param stat [File::Stat] the status information of the file
|
|
23
|
+
# @param digest [String] the SHA256 digest to store
|
|
24
|
+
# @return [String] the digest that was stored (#compute_file_digest returns this value)
|
|
25
|
+
def file_digest_store(filename, stat, digest)
|
|
26
|
+
@@file_digest_cache[[filename, stat.mtime]] = digest
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Checks if a valid digest exists in the internal cache for the given
|
|
30
|
+
# filename and modification time.
|
|
31
|
+
#
|
|
32
|
+
# @param filename [String] the path to the file
|
|
33
|
+
# @param stat [File::Stat] the status information of the file
|
|
34
|
+
# @return [String, nil] the cached digest if found, nil otherwise
|
|
35
|
+
def file_digest_cached?(filename, stat)
|
|
36
|
+
@@file_digest_cache.fetch([filename, stat.mtime], nil)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Clears the internal file digest cache.
|
|
40
|
+
#
|
|
41
|
+
# This removes all stored digests and their associated modification times,
|
|
42
|
+
# forcing subsequent calls to #compute_file_digest to re-read files from
|
|
43
|
+
# disk.
|
|
44
|
+
def file_digest_cache_clear
|
|
45
|
+
@@file_digest_cache&.clear
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Computes the SHA256 hexadecimal digest of a local file's content.
|
|
49
|
+
#
|
|
50
|
+
# This method first verifies that the provided filename is not an absolute
|
|
51
|
+
# URL and that the file actually exists on the filesystem before reading
|
|
52
|
+
# and hashing its content. It uses an internal cache to avoid re-reading
|
|
53
|
+
# the file if the modification time has not changed.
|
|
54
|
+
#
|
|
55
|
+
# @param filename [String, #to_s] the path to the local file
|
|
56
|
+
# @return [String, nil] the SHA256 hexadecimal digest if the file is a
|
|
57
|
+
# valid local file and exists, nil otherwise.
|
|
58
|
+
def compute_file_digest(filename)
|
|
59
|
+
filename = filename.to_s
|
|
60
|
+
case
|
|
61
|
+
when !filename.present?
|
|
62
|
+
nil
|
|
63
|
+
when (URI::PARSER.parse(filename).absolute? rescue nil)
|
|
64
|
+
nil
|
|
65
|
+
else
|
|
66
|
+
stat = begin
|
|
67
|
+
File.stat(filename)
|
|
68
|
+
rescue Errno::ENOENT
|
|
69
|
+
end
|
|
70
|
+
stat or return
|
|
71
|
+
if digest = file_digest_cached?(filename, stat)
|
|
72
|
+
digest
|
|
73
|
+
else
|
|
74
|
+
file_digest_store(filename, stat, compute_digest(File.read(filename)))
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
data/lib/documentrix/utils.rb
CHANGED