documentrix 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +69 -0
- data/documentrix.gemspec +5 -5
- data/lib/documentrix/documents/cache/common.rb +63 -11
- data/lib/documentrix/documents/cache/records.rb +1 -1
- data/lib/documentrix/documents/cache/redis_cache.rb +3 -3
- data/lib/documentrix/documents/cache/sqlite_cache.rb +95 -27
- data/lib/documentrix/documents/splitters/character.rb +56 -4
- data/lib/documentrix/documents/splitters/common.rb +38 -0
- data/lib/documentrix/documents/splitters/semantic.rb +67 -8
- data/lib/documentrix/documents.rb +133 -29
- data/lib/documentrix/utils/colorize_texts.rb +25 -21
- data/lib/documentrix/utils/digests.rb +78 -0
- data/lib/documentrix/utils.rb +1 -0
- data/lib/documentrix/version.rb +1 -1
- data/spec/documentrix/documents/cache/interface_spec.rb +16 -3
- data/spec/documentrix/documents/cache/memory_cache_spec.rb +64 -2
- data/spec/documentrix/documents/cache/redis_cache_spec.rb +68 -19
- data/spec/documentrix/documents/cache/sqlite_cache_spec.rb +128 -2
- data/spec/documentrix/documents/splitters/character_spec.rb +20 -2
- data/spec/documentrix/documents/splitters/semantic_spec.rb +17 -5
- data/spec/documents_spec.rb +59 -3
- data/spec/utils/colorize_texts_spec.rb +0 -2
- data/spec/utils/digests_spec.rb +97 -0
- data/spec/utils/tags_spec.rb +0 -2
- metadata +7 -1
|
@@ -1,14 +1,60 @@
|
|
|
1
1
|
module Documentrix::Documents::Splitters
|
|
2
|
+
# Semantic splitter that divides text based on thematic changes in meaning.
|
|
3
|
+
#
|
|
4
|
+
# It works by splitting text into sentences, computing embeddings for each,
|
|
5
|
+
# and then calculating the cosine distance between adjacent sentences.
|
|
6
|
+
# Where the distance exceeds a calculated threshold (the "breakpoint"),
|
|
7
|
+
# a semantic boundary is identified.
|
|
8
|
+
#
|
|
9
|
+
# @example
|
|
10
|
+
# splitter = Documentrix::Documents::Splitters::Semantic.new(
|
|
11
|
+
# ollama: ollama_client,
|
|
12
|
+
# model: 'mxbai-embed-large'
|
|
13
|
+
# )
|
|
14
|
+
# chunks = splitter.split(text, breakpoint: :percentile, percentile: 90)
|
|
2
15
|
class Semantic
|
|
16
|
+
include Documentrix::Documents::Splitters::Common
|
|
3
17
|
include Documentrix::Utils::Math
|
|
4
18
|
|
|
5
|
-
|
|
19
|
+
# The default regex used to identify sentence boundaries for semantic
|
|
20
|
+
# splitting. It matches a sentence- or clause-ending punctuation mark (. ! ? , ;)
|
|
21
|
+
# followed by optional whitespace at a word boundary or the end of the
|
|
22
|
+
# string.
|
|
23
|
+
#
|
|
24
|
+
# @return [Regexp]
|
|
25
|
+
DEFAULT_SEPARATOR = /[.!?,;]\s*(?:\b|\z)/
|
|
6
26
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
27
|
+
# Initializes a new Semantic splitter.
|
|
28
|
+
#
|
|
29
|
+
# @param ollama [Ollama::Client] the client used for generating embeddings
|
|
30
|
+
# @param model [String] the embedding model name
|
|
31
|
+
# @param model_options [Hash, nil] optional parameters passed to the embedding model
|
|
32
|
+
# @param separator [Regexp] the regex used to identify sentence boundaries
|
|
33
|
+
# @param chunk_size [Integer] the maximum character length of a resulting chunk
|
|
34
|
+
# @param force [Boolean] whether to force split chunks that exceed chunk_size (defaults to false)
|
|
35
|
+
def initialize(ollama:, model:, model_options: nil, separator: DEFAULT_SEPARATOR, chunk_size: 4096, force: false)
|
|
36
|
+
@ollama, @model, @model_options, @separator, @chunk_size, @force =
|
|
37
|
+
ollama, model, model_options, separator, chunk_size, force
|
|
10
38
|
end
|
|
11
39
|
|
|
40
|
+
# Splits the given text into semantic chunks.
|
|
41
|
+
#
|
|
42
|
+
# The method first decomposes the text into sentences, then identifies gaps
|
|
43
|
+
# in semantic similarity. It then groups these sentences into chunks that
|
|
44
|
+
# respect both the semantic boundaries and the maximum chunk size.
|
|
45
|
+
#
|
|
46
|
+
# @param text [String] the text to be split
|
|
47
|
+
# @param batch_size [Integer] the number of sentences to embed in a single API call
|
|
48
|
+
# @param breakpoint [Symbol] the method used to determine the distance threshold
|
|
49
|
+
# * :percentile (default) - uses the N-th percentile of distances
|
|
50
|
+
# * :standard_deviation - uses mean + (std_dev * multiplier)
|
|
51
|
+
# * :interquartile - uses mean + (iqr * multiplier)
|
|
52
|
+
# @param opts [Hash] additional options for the splitting process:
|
|
53
|
+
# * :include_separator [Boolean] whether to keep the sentence separator in the result
|
|
54
|
+
# * :percentile [Integer] the percentile to use if breakpoint is :percentile (default: 95)
|
|
55
|
+
# * :percentage [Integer] the multiplier percentage for :standard_deviation or :interquartile (default: 100)
|
|
56
|
+
#
|
|
57
|
+
# @return [Array<String>] an array of semantically grouped text chunks
|
|
12
58
|
def split(text, batch_size: 100, breakpoint: :percentile, **opts)
|
|
13
59
|
sentences = Documentrix::Documents::Splitters::Character.new(
|
|
14
60
|
separator: @separator,
|
|
@@ -45,22 +91,30 @@ module Documentrix::Documents::Splitters
|
|
|
45
91
|
if current_text.size + sentence.size < @chunk_size
|
|
46
92
|
current_text += sentence
|
|
47
93
|
else
|
|
48
|
-
|
|
94
|
+
result.concat(force_split(current_text))
|
|
49
95
|
current_text = sentence
|
|
50
96
|
end
|
|
51
97
|
end
|
|
52
|
-
|
|
53
|
-
result
|
|
98
|
+
if current_text.present?
|
|
99
|
+
result.concat(force_split(current_text))
|
|
54
100
|
current_text = +''
|
|
55
101
|
end
|
|
56
102
|
sg = g.succ
|
|
57
103
|
end
|
|
58
|
-
|
|
104
|
+
result.concat(force_split(current_text))
|
|
59
105
|
result
|
|
60
106
|
end
|
|
61
107
|
|
|
62
108
|
private
|
|
63
109
|
|
|
110
|
+
# Calculates the distance threshold used to identify semantic boundaries.
|
|
111
|
+
#
|
|
112
|
+
# @param breakpoint_method [Symbol] the method to use (:percentile, :standard_deviation, :interquartile)
|
|
113
|
+
# @param distances [Array<Float>] the cosine distances between adjacent sentences
|
|
114
|
+
# @param opts [Hash] options specific to the chosen method (e.g., :percentile, :percentage)
|
|
115
|
+
#
|
|
116
|
+
# @return [Float] the distance threshold
|
|
117
|
+
# @raise [ArgumentError] if an unsupported breakpoint_method is provided
|
|
64
118
|
def calculate_breakpoint_threshold(breakpoint_method, distances, **opts)
|
|
65
119
|
sequence = MoreMath::Sequence.new(distances)
|
|
66
120
|
case breakpoint_method
|
|
@@ -82,6 +136,11 @@ module Documentrix::Documents::Splitters
|
|
|
82
136
|
end
|
|
83
137
|
end
|
|
84
138
|
|
|
139
|
+
# Fetches embeddings for a batch of sentences and converts them to
|
|
140
|
+
# Numo::NArray.
|
|
141
|
+
#
|
|
142
|
+
# @param input [Array<String>] the batch of sentences to embed
|
|
143
|
+
# @return [Array<Numo::NArray>] an array of embeddings as Numo arrays
|
|
85
144
|
def sentence_embeddings(input)
|
|
86
145
|
@ollama.embed(model: @model, input:, options: @model_options).embeddings.map! {
|
|
87
146
|
Numo::NArray[*_1]
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
require 'numo/narray'
|
|
2
|
-
require 'digest'
|
|
3
2
|
require 'kramdown/ansi'
|
|
4
3
|
|
|
5
4
|
class Documentrix::Documents
|
|
@@ -33,6 +32,7 @@ require 'documentrix/documents/cache/sqlite_cache'
|
|
|
33
32
|
# to prepare text data for embedding and storage in vector databases.
|
|
34
33
|
module Documentrix::Documents::Splitters
|
|
35
34
|
end
|
|
35
|
+
require 'documentrix/documents/splitters/common'
|
|
36
36
|
require 'documentrix/documents/splitters/character'
|
|
37
37
|
require 'documentrix/documents/splitters/semantic'
|
|
38
38
|
|
|
@@ -59,6 +59,7 @@ require 'documentrix/documents/splitters/semantic'
|
|
|
59
59
|
class Documentrix::Documents
|
|
60
60
|
include Kramdown::ANSI::Width
|
|
61
61
|
include Documentrix::Documents::Cache
|
|
62
|
+
include Documentrix::Utils::Digests
|
|
62
63
|
|
|
63
64
|
# Shortcut for Documentrix::Documents::Cache::Records::Record
|
|
64
65
|
Record = Class.new Documentrix::Documents::Cache::Records::Record
|
|
@@ -116,16 +117,16 @@ class Documentrix::Documents
|
|
|
116
117
|
texts
|
|
117
118
|
end
|
|
118
119
|
|
|
119
|
-
|
|
120
|
-
# The method adds new texts `texts` to the documents collection by
|
|
120
|
+
# The add method adds new texts `texts` to the documents collection by
|
|
121
121
|
# processing them through various stages. It first filters out existing texts
|
|
122
122
|
# from the input array using the `prepare_texts` method, then fetches
|
|
123
123
|
# embeddings for each text using the specified model and options. The fetched
|
|
124
124
|
# embeddings are used to create a new record in the cache, which is
|
|
125
|
-
# associated with the original text and
|
|
126
|
-
# the texts in batches of size
|
|
127
|
-
# in the console. It also accepts an optional
|
|
128
|
-
# with the added texts
|
|
125
|
+
# associated with the original text, tags, and version digest (if any). The
|
|
126
|
+
# method processes the texts in batches of size `batch_size`, displaying
|
|
127
|
+
# progress information in the console. It also accepts an optional `source`
|
|
128
|
+
# string to associate with the added texts, an array of `tags` to attach to
|
|
129
|
+
# each record, and an optional `digest` string for version tracking. Once
|
|
129
130
|
# all texts have been processed, it returns the `Documentrix::Documents`
|
|
130
131
|
# instance itself, allowing for method chaining.
|
|
131
132
|
#
|
|
@@ -133,14 +134,17 @@ class Documentrix::Documents
|
|
|
133
134
|
# @param batch_size [Integer] the number of texts to process in one batch
|
|
134
135
|
# @param source [String] the source URL for the added texts
|
|
135
136
|
# @param tags [Array] an array of tags associated with the added texts
|
|
137
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest of the source; when nil, it is computed from the source file if that is a local file
|
|
136
138
|
#
|
|
137
139
|
# @example
|
|
138
140
|
# documents.add(%w[ foo bar ], batch_size: 23, source: 'https://example.com', tags: %w[tag1 tag2])
|
|
139
141
|
#
|
|
140
142
|
# @return [Documentrix::Documents] self
|
|
141
|
-
def add(texts, batch_size: nil, source: nil, tags: [])
|
|
142
|
-
texts
|
|
143
|
-
|
|
143
|
+
def add(texts, batch_size: nil, source: nil, tags: [], digest: nil)
|
|
144
|
+
texts = prepare_texts(texts) or return self
|
|
145
|
+
source = normalize_source(source)
|
|
146
|
+
tags = Documentrix::Utils::Tags.new(tags, source:)
|
|
147
|
+
digest ||= compute_file_digest(source)
|
|
144
148
|
if source
|
|
145
149
|
tags.add(File.basename(source).gsub(/\?.*/, ''), source:)
|
|
146
150
|
end
|
|
@@ -153,7 +157,7 @@ class Documentrix::Documents
|
|
|
153
157
|
embeddings = fetch_embeddings(model:, options: @model_options, input: batch)
|
|
154
158
|
batch.zip(embeddings) do |text, embedding|
|
|
155
159
|
norm = @cache.norm(embedding)
|
|
156
|
-
self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a]
|
|
160
|
+
self[text] = Record[text:, embedding:, norm:, source:, tags: tags.to_a, digest:]
|
|
157
161
|
end
|
|
158
162
|
infobar.progress by: batch.size
|
|
159
163
|
end
|
|
@@ -219,13 +223,101 @@ class Documentrix::Documents
|
|
|
219
223
|
self
|
|
220
224
|
end
|
|
221
225
|
|
|
222
|
-
#
|
|
226
|
+
# Normalizes the source identifier to a canonical form.
|
|
227
|
+
#
|
|
228
|
+
# If the source is blank, returns nil.
|
|
229
|
+
# If the source is an absolute URL, it is returned as-is.
|
|
230
|
+
# If the source is a local file path that exists, it is expanded to its real
|
|
231
|
+
# absolute path with all symlinks resolved.
|
|
232
|
+
# Otherwise, the original source is returned.
|
|
233
|
+
#
|
|
234
|
+
# @param source [String, #to_s] the source identifier to normalize
|
|
235
|
+
# @return [String, nil] the normalized canonical path, the original source,
|
|
236
|
+
# or nil if blank
|
|
237
|
+
def normalize_source(source)
|
|
238
|
+
source.blank? and return
|
|
239
|
+
begin
|
|
240
|
+
URI::PARSER.parse(source).absolute? and return source
|
|
241
|
+
rescue
|
|
242
|
+
end
|
|
243
|
+
Pathname.new(source).realpath.to_path
|
|
244
|
+
rescue Errno::ENOENT
|
|
245
|
+
source
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
# The source_exist? method checks if any records associated with the given
|
|
249
|
+
# source exist in the cache. If a digest is provided, it verifies if the
|
|
250
|
+
# source exists and satisfies the comparison with the specified digest.
|
|
251
|
+
#
|
|
252
|
+
# @param source [#to_s] the source to check for existence
|
|
253
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest to compare
|
|
254
|
+
# against the stored source digest (optional)
|
|
255
|
+
# @param operator [Symbol, String] the operator to compare the digest with
|
|
256
|
+
# (defaults to '=')
|
|
257
|
+
#
|
|
258
|
+
# @return [Boolean] true if the source exists (and satisfies the digest
|
|
259
|
+
# comparison if provided), false otherwise.
|
|
260
|
+
def source_exist?(source, digest: nil, operator: ?=)
|
|
261
|
+
source = normalize_source(source)
|
|
262
|
+
@cache.source_exist?(source, digest:, operator:)
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
# Checks if the content of the given source has been modified compared to
|
|
266
|
+
# the version stored in the cache, or if it is missing from the cache.
|
|
267
|
+
#
|
|
268
|
+
# The source is considered modified (the method returns true) if:
|
|
269
|
+
# 1. The source is blank or cannot be normalized.
|
|
270
|
+
# 2. The source is not a valid local file or its digest cannot be computed.
|
|
271
|
+
# 3. No records exist in the cache for this source.
|
|
272
|
+
# 4. Records exist in the cache for this source, but they have a different
|
|
273
|
+
# digest than the current version on disk.
|
|
274
|
+
#
|
|
275
|
+
# @param source [String, #to_s] the source identifier to check
|
|
276
|
+
# @return [Boolean] true if the source is modified, missing, or cannot be
|
|
277
|
+
# verified, false if it is up-to-date.
|
|
278
|
+
def source_modified?(source)
|
|
279
|
+
source = normalize_source(source) or return true
|
|
280
|
+
digest = compute_file_digest(source) or return true
|
|
281
|
+
!source_exist?(source) || source_exist?(source, digest:, operator: '!=')
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
# Updates the records associated with a given source.
|
|
285
|
+
#
|
|
286
|
+
# If the source already exists in the cache, this method computes its current
|
|
287
|
+
# digest and removes only the stale records that do not match this digest. If
|
|
288
|
+
# the source is new or has been modified, it adds the provided texts to the
|
|
289
|
+
# cache.
|
|
290
|
+
#
|
|
291
|
+
# @param texts [Array] the text strings to add if the source is new or modified
|
|
292
|
+
# @param opts [Hash] additional options passed to #add (e.g., :batch_size, :tags)
|
|
293
|
+
# * :source [#to_s] the source to update
|
|
294
|
+
#
|
|
295
|
+
# @return [Documentrix::Documents, nil] the instance itself if the source
|
|
296
|
+
# was added/updated, or nil if the source was already up-to-date.
|
|
297
|
+
def source_update(texts, **opts)
|
|
298
|
+
if source = normalize_source(opts[:source]) and source_exist?(source)
|
|
299
|
+
digest = compute_file_digest(source)
|
|
300
|
+
source_remove(source, digest:)
|
|
301
|
+
unless source_exist?(source, digest:, operator: ?=)
|
|
302
|
+
opts[:digest] = digest
|
|
303
|
+
add(texts, **opts)
|
|
304
|
+
end
|
|
305
|
+
else
|
|
306
|
+
add(texts, **opts)
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
# The source_remove method removes all documents associated with the given
|
|
311
|
+
# source.
|
|
223
312
|
#
|
|
224
|
-
# @param source [
|
|
313
|
+
# @param source [#to_s] the source of the documents to remove
|
|
314
|
+
# @param digest [String, nil] the SHA256 hexadecimal digest for which records
|
|
315
|
+
# with this source are **not** to be removed if given.
|
|
225
316
|
#
|
|
226
317
|
# @return [Documentrix::Documents] self
|
|
227
|
-
def
|
|
228
|
-
|
|
318
|
+
def source_remove(source, digest: nil)
|
|
319
|
+
source = normalize_source(source)
|
|
320
|
+
@cache.clear_by_source(source, digest:, operator: '!=')
|
|
229
321
|
self
|
|
230
322
|
end
|
|
231
323
|
|
|
@@ -236,14 +328,16 @@ class Documentrix::Documents
|
|
|
236
328
|
# @param tags [Array<String>] an array of tags to filter results by (optional)
|
|
237
329
|
# @param prompt [String] a prompt to use when searching for similar strings (optional)
|
|
238
330
|
# @param max_records [Integer] the maximum number of records to return (optional)
|
|
331
|
+
# @param min_similarity [Numeric] the minimum similarity score to include in results (defaults to -1)
|
|
239
332
|
#
|
|
240
333
|
# @example
|
|
241
334
|
# documents.find("foo")
|
|
242
335
|
#
|
|
243
336
|
# @return [Array<Documentrix::Documents::Record>]
|
|
244
|
-
def find(string, tags: nil, prompt: nil, max_records: nil)
|
|
337
|
+
def find(string, tags: nil, prompt: nil, max_records: nil, min_similarity: nil)
|
|
338
|
+
min_similarity ||= -1
|
|
245
339
|
needle = convert_to_vector(string, prompt:)
|
|
246
|
-
@cache.find_records(needle, tags:, max_records:
|
|
340
|
+
@cache.find_records(needle, tags:, max_records:, min_similarity:)
|
|
247
341
|
end
|
|
248
342
|
|
|
249
343
|
# The method filters the records returned by find based on text
|
|
@@ -256,20 +350,28 @@ class Documentrix::Documents
|
|
|
256
350
|
# @example
|
|
257
351
|
# documents.find_where('foo', text_size: 3, text_count: 1)
|
|
258
352
|
# @return [Array<Documentrix::Documents::Record>] the filtered records
|
|
353
|
+
|
|
354
|
+
# The find_where method filters the records returned by find based on text
|
|
355
|
+
# size and count.
|
|
356
|
+
#
|
|
357
|
+
# @param string [String] the search query
|
|
358
|
+
# @param text_size [Integer] the maximum allowed total text size to return
|
|
359
|
+
# @param text_count [Integer] the maximum number of records to return
|
|
360
|
+
# @param opts [Hash] additional options passed to #find, such as:
|
|
361
|
+
# * :tags [Array<String>] filter results by tags
|
|
362
|
+
# * :prompt [String] a prompt to use for the search
|
|
363
|
+
# * :min_similarity [Numeric] minimum similarity score
|
|
364
|
+
#
|
|
365
|
+
# @example
|
|
366
|
+
# documents.find_where('foo', text_size: 1000, text_count: 5, tags: ['ruby'])
|
|
367
|
+
#
|
|
368
|
+
# @return [Array<Documentrix::Documents::Record>] the filtered records
|
|
259
369
|
def find_where(string, text_size: nil, text_count: nil, **opts)
|
|
260
|
-
|
|
261
|
-
opts[:max_records] = text_count
|
|
262
|
-
end
|
|
370
|
+
text_count and opts[:max_records] = text_count
|
|
263
371
|
records = find(string, **opts)
|
|
264
|
-
size
|
|
372
|
+
size = 0
|
|
265
373
|
records.take_while do |record|
|
|
266
|
-
|
|
267
|
-
next false
|
|
268
|
-
end
|
|
269
|
-
if text_count and (count += 1) > text_count
|
|
270
|
-
next false
|
|
271
|
-
end
|
|
272
|
-
true
|
|
374
|
+
!text_size || (size += record.text.size) <= text_size
|
|
273
375
|
end
|
|
274
376
|
end
|
|
275
377
|
|
|
@@ -333,6 +435,8 @@ class Documentrix::Documents
|
|
|
333
435
|
debug: @debug
|
|
334
436
|
)
|
|
335
437
|
end
|
|
438
|
+
rescue => e
|
|
439
|
+
warn "Caught #{e.class}: #{e}"
|
|
336
440
|
ensure
|
|
337
441
|
cache ||= MemoryCache.new(prefix:,)
|
|
338
442
|
return cache
|
|
@@ -389,6 +493,6 @@ class Documentrix::Documents
|
|
|
389
493
|
#
|
|
390
494
|
# @return [String] the SHA256 hash of the input string
|
|
391
495
|
def key(input)
|
|
392
|
-
|
|
496
|
+
compute_digest(input)
|
|
393
497
|
end
|
|
394
498
|
end
|
|
@@ -4,33 +4,37 @@ require 'kramdown/ansi'
|
|
|
4
4
|
# A utility class for colorizing and formatting text output with ANSI color
|
|
5
5
|
# codes and size information.
|
|
6
6
|
#
|
|
7
|
-
# The ColorizeTexts class takes
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
7
|
+
# The ColorizeTexts class takes a collection of text strings and formats them
|
|
8
|
+
# with dynamically generated ANSI colors for visual distinction. Each text
|
|
9
|
+
# block is wrapped to fit the terminal width and appended with its size in
|
|
10
|
+
# bytes, making it ideal for debugging text-splitting pipelines.
|
|
11
11
|
#
|
|
12
12
|
# @example
|
|
13
|
-
# colorizer = Documentrix::Utils::ColorizeTexts.new('
|
|
13
|
+
# colorizer = Documentrix::Utils::ColorizeTexts.new('First chunk', 'Second chunk')
|
|
14
14
|
# puts colorizer.to_s
|
|
15
15
|
class Documentrix::Utils::ColorizeTexts
|
|
16
16
|
include Math
|
|
17
17
|
include Term::ANSIColor
|
|
18
18
|
include Kramdown::ANSI::Width
|
|
19
19
|
|
|
20
|
-
# Initializes a new instance of
|
|
20
|
+
# Initializes a new instance of ColorizeTexts.
|
|
21
21
|
#
|
|
22
|
-
# @param [Array<String>]
|
|
22
|
+
# @param texts [String, Array<String>] a variable list of strings or an array
|
|
23
|
+
# of strings to be colorized.
|
|
23
24
|
#
|
|
24
|
-
# @return [Documentrix
|
|
25
|
+
# @return [Documentrix::Utils::ColorizeTexts] a new instance of ColorizeTexts
|
|
25
26
|
def initialize(*texts)
|
|
26
|
-
texts
|
|
27
|
-
@texts = Array(texts.flatten)
|
|
27
|
+
@texts = texts.flatten
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
# Returns a string representation of the
|
|
31
|
-
# colored differently and their sizes.
|
|
30
|
+
# Returns a formatted string representation of the texts.
|
|
32
31
|
#
|
|
33
|
-
#
|
|
32
|
+
# Each text block is:
|
|
33
|
+
# 1. Assigned a color from a trigonometric RGB gradient.
|
|
34
|
+
# 2. Wrapped to 90% of the terminal width.
|
|
35
|
+
# 3. Appended with its size in bold text.
|
|
36
|
+
#
|
|
37
|
+
# @return [String] the colorized and formatted output string.
|
|
34
38
|
def to_s
|
|
35
39
|
result = +''
|
|
36
40
|
@texts.each_with_index do |t, i|
|
|
@@ -45,14 +49,13 @@ class Documentrix::Utils::ColorizeTexts
|
|
|
45
49
|
|
|
46
50
|
private
|
|
47
51
|
|
|
48
|
-
#
|
|
52
|
+
# Determines the optimal text color (black or white) for a given background
|
|
53
|
+
# color to ensure maximum readability based on contrast.
|
|
49
54
|
#
|
|
50
|
-
# @param [
|
|
55
|
+
# @param color [Symbol, Term::ANSIColor::Attribute] the ANSI color attribute
|
|
51
56
|
#
|
|
52
|
-
# @return [Array<
|
|
53
|
-
#
|
|
54
|
-
# when printed on a black background, and the second is the closest match
|
|
55
|
-
# when printed on a white background.
|
|
57
|
+
# @return [Term::ANSIColor::Attribute] the single candidate color that is
|
|
58
|
+
# farthest from the given color, i.e. the one giving the best contrast.
|
|
56
59
|
def text_color(color)
|
|
57
60
|
color = Term::ANSIColor::Attribute[color]
|
|
58
61
|
[
|
|
@@ -61,9 +64,10 @@ class Documentrix::Utils::ColorizeTexts
|
|
|
61
64
|
].max_by { |t| t.distance_to(color) }
|
|
62
65
|
end
|
|
63
66
|
|
|
64
|
-
#
|
|
67
|
+
# Generates a 256-color RGB gradient using sine wave oscillations.
|
|
65
68
|
#
|
|
66
|
-
# @return [Array<Array<Integer>>]
|
|
69
|
+
# @return [Array<Array<Integer>>] an array of 256 RGB color arrays,
|
|
70
|
+
# where each inner array contains [R, G, B] values from 0 to 255.
|
|
67
71
|
def colors
|
|
68
72
|
@colors ||= (0..255).map { |i|
|
|
69
73
|
[
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
require 'digest'
|
|
2
|
+
require 'uri'
|
|
3
|
+
|
|
4
|
+
# Module for computing cryptographic digests used for tracking content changes.
|
|
5
|
+
module Documentrix::Utils::Digests
|
|
6
|
+
private
|
|
7
|
+
|
|
8
|
+
@@file_digest_cache = {}
|
|
9
|
+
|
|
10
|
+
# Computes the SHA256 hexadecimal digest of the given text.
|
|
11
|
+
#
|
|
12
|
+
# @param text [String] the text to be hashed
|
|
13
|
+
# @return [String] the SHA256 hexadecimal digest
|
|
14
|
+
def compute_digest(text)
|
|
15
|
+
Digest::SHA256.hexdigest(text)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Stores a computed digest in the internal cache, keyed by the filename
|
|
19
|
+
# and the file's modification time.
|
|
20
|
+
#
|
|
21
|
+
# @param filename [String] the path to the file
|
|
22
|
+
# @param stat [File::Stat] the status information of the file
|
|
23
|
+
# @param digest [String] the SHA256 digest to store
|
|
24
|
+
# @return [String] the digest that was stored (#compute_file_digest returns this value)
|
|
25
|
+
def file_digest_store(filename, stat, digest)
|
|
26
|
+
@@file_digest_cache[[filename, stat.mtime]] = digest
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Checks if a valid digest exists in the internal cache for the given
|
|
30
|
+
# filename and modification time.
|
|
31
|
+
#
|
|
32
|
+
# @param filename [String] the path to the file
|
|
33
|
+
# @param stat [File::Stat] the status information of the file
|
|
34
|
+
# @return [String, nil] the cached digest if found, nil otherwise
|
|
35
|
+
def file_digest_cached?(filename, stat)
|
|
36
|
+
@@file_digest_cache.fetch([filename, stat.mtime], nil)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Clears the internal file digest cache.
|
|
40
|
+
#
|
|
41
|
+
# This removes all stored digests and their associated modification times,
|
|
42
|
+
# forcing subsequent calls to #compute_file_digest to re-read files from
|
|
43
|
+
# disk.
|
|
44
|
+
def file_digest_cache_clear
|
|
45
|
+
@@file_digest_cache&.clear
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Computes the SHA256 hexadecimal digest of a local file's content.
|
|
49
|
+
#
|
|
50
|
+
# This method first verifies that the provided filename is not an absolute
|
|
51
|
+
# URL and that the file actually exists on the filesystem before reading
|
|
52
|
+
# and hashing its content. It uses an internal cache to avoid re-reading
|
|
53
|
+
# the file if the modification time has not changed.
|
|
54
|
+
#
|
|
55
|
+
# @param filename [String, #to_s] the path to the local file
|
|
56
|
+
# @return [String, nil] the SHA256 hexadecimal digest if the file is a
|
|
57
|
+
# valid local file and exists, nil otherwise.
|
|
58
|
+
def compute_file_digest(filename)
|
|
59
|
+
filename = filename.to_s
|
|
60
|
+
case
|
|
61
|
+
when !filename.present?
|
|
62
|
+
nil
|
|
63
|
+
when (URI::PARSER.parse(filename).absolute? rescue nil)
|
|
64
|
+
nil
|
|
65
|
+
else
|
|
66
|
+
stat = begin
|
|
67
|
+
File.stat(filename)
|
|
68
|
+
rescue Errno::ENOENT
|
|
69
|
+
end
|
|
70
|
+
stat or return
|
|
71
|
+
if digest = file_digest_cached?(filename, stat)
|
|
72
|
+
digest
|
|
73
|
+
else
|
|
74
|
+
file_digest_store(filename, stat, compute_digest(File.read(filename)))
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
data/lib/documentrix/utils.rb
CHANGED
data/lib/documentrix/version.rb
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe 'Documentrix::Documents::Cache Interface' do
|
|
4
2
|
describe 'MemoryCache Interface' do
|
|
5
3
|
let(:cache) { Documentrix::Documents::MemoryCache.new(prefix: 'test-') }
|
|
@@ -55,13 +53,22 @@ describe 'Documentrix::Documents::Cache Interface' do
|
|
|
55
53
|
expect(cache).to respond_to(:clear_by_source)
|
|
56
54
|
expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
|
|
57
55
|
|
|
56
|
+
expect(cache).to respond_to(:source_exist?)
|
|
57
|
+
expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
|
|
58
|
+
|
|
58
59
|
expect(cache).to respond_to(:clear)
|
|
59
60
|
expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
|
|
60
61
|
end
|
|
61
62
|
end
|
|
62
63
|
|
|
63
64
|
describe 'RedisCache Interface' do
|
|
64
|
-
let
|
|
65
|
+
let :object_class do
|
|
66
|
+
Documentrix::Documents::Cache::Records::Record
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
let(:cache) do
|
|
70
|
+
Documentrix::Documents::RedisCache.new(prefix: 'test-', url: 'redis://localhost:6379', object_class:)
|
|
71
|
+
end
|
|
65
72
|
|
|
66
73
|
it 'has proper method resolution' do
|
|
67
74
|
# Basic cache operations
|
|
@@ -114,6 +121,9 @@ describe 'Documentrix::Documents::Cache Interface' do
|
|
|
114
121
|
expect(cache).to respond_to(:clear_by_source)
|
|
115
122
|
expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::Common
|
|
116
123
|
|
|
124
|
+
expect(cache).to respond_to(:source_exist?)
|
|
125
|
+
expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::Common
|
|
126
|
+
|
|
117
127
|
expect(cache).to respond_to(:clear)
|
|
118
128
|
expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
|
|
119
129
|
|
|
@@ -177,6 +187,9 @@ describe 'Documentrix::Documents::Cache Interface' do
|
|
|
177
187
|
expect(cache).to respond_to(:clear_by_source)
|
|
178
188
|
expect(cache.method(:clear_by_source).owner).to eq Documentrix::Documents::Cache::SQLiteCache
|
|
179
189
|
|
|
190
|
+
expect(cache).to respond_to(:source_exist?)
|
|
191
|
+
expect(cache.method(:source_exist?).owner).to eq Documentrix::Documents::Cache::SQLiteCache
|
|
192
|
+
|
|
180
193
|
expect(cache).to respond_to(:clear)
|
|
181
194
|
expect(cache.method(:clear).owner).to eq Documentrix::Documents::Cache::Common
|
|
182
195
|
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
require 'spec_helper'
|
|
2
|
-
|
|
3
1
|
describe Documentrix::Documents::MemoryCache do
|
|
4
2
|
let :prefix do
|
|
5
3
|
'test-'
|
|
@@ -135,4 +133,68 @@ describe Documentrix::Documents::MemoryCache do
|
|
|
135
133
|
cache['foo'] = 'bar'
|
|
136
134
|
expect(cache.to_a).to eq [ %W[ #{prefix}foo bar ] ]
|
|
137
135
|
end
|
|
136
|
+
|
|
137
|
+
it 'can iterate over unique sources' do
|
|
138
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
139
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', embedding: [0.1]]
|
|
140
|
+
cache['baz'] = Documentrix::Documents::Record[text: 'baz', source: 's2', embedding: [0.1]]
|
|
141
|
+
|
|
142
|
+
expect(cache.each_source.to_a).to match_array(['s1', 's2'])
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
it 'can retrieve all unique tags' do
|
|
146
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['a', 'b'], embedding: [0.1]]
|
|
147
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's2', tags: ['b', 'c'], embedding: [0.1]]
|
|
148
|
+
|
|
149
|
+
expect(cache.tags.to_a).to match_array(['a', 'b', 'c'])
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
it 'can clear records by tags' do
|
|
153
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', tags: ['keep'], embedding: [0.1]]
|
|
154
|
+
cache['bar'] = Documentrix::Documents::Record[text: 'bar', source: 's1', tags: ['trash'], embedding: [0.1]]
|
|
155
|
+
|
|
156
|
+
expect {
|
|
157
|
+
cache.clear_for_tags(['trash'])
|
|
158
|
+
}.to change { cache.size }.from(2).to(1)
|
|
159
|
+
expect(cache.key?('foo')).to be true
|
|
160
|
+
expect(cache.key?('bar')).to be false
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
it 'can check if a source exists' do
|
|
164
|
+
cache['foo'] = Documentrix::Documents::Record[text: 'foo', source: 's1', embedding: [0.1]]
|
|
165
|
+
|
|
166
|
+
expect(cache.source_exist?('s1')).to be true
|
|
167
|
+
expect(cache.source_exist?('s2')).to be false
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
it 'can clear by source with a specific digest' do
|
|
171
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
172
|
+
cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
|
|
173
|
+
|
|
174
|
+
expect {
|
|
175
|
+
cache.clear_by_source('s1', digest: 'd1')
|
|
176
|
+
}.to change { cache.size }.from(2).to(1)
|
|
177
|
+
expect(cache.key?('f2')).to be true
|
|
178
|
+
expect(cache.key?('f1')).to be false
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
it 'can clear outdated versions of a source' do
|
|
182
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
183
|
+
cache['f2'] = Documentrix::Documents::Record[text: 'v2', source: 's1', digest: 'd2', embedding: [0.1]]
|
|
184
|
+
|
|
185
|
+
expect {
|
|
186
|
+
cache.clear_by_source('s1', digest: 'd2', operator: '!=')
|
|
187
|
+
}.to change { cache.size }.from(2).to(1)
|
|
188
|
+
expect(cache.key?('f2')).to be true
|
|
189
|
+
expect(cache.key?('f1')).to be false
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it 'can check if a source exists with a specific digest' do
|
|
193
|
+
cache['f1'] = Documentrix::Documents::Record[text: 'v1', source: 's1', digest: 'd1', embedding: [0.1]]
|
|
194
|
+
|
|
195
|
+
expect(cache.source_exist?('s1', digest: 'd1')).to be true
|
|
196
|
+
expect(cache.source_exist?('s1', digest: 'd2')).to be false
|
|
197
|
+
expect(cache.source_exist?('s1', digest: 'd1', operator: '!=')).to be false
|
|
198
|
+
expect(cache.source_exist?('s1', digest: 'd2', operator: '!=')).to be true
|
|
199
|
+
end
|
|
138
200
|
end
|