lex-knowledge 0.5.0 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +15 -10
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +11 -4
- data/lib/legion/extensions/knowledge/client.rb +1 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +78 -6
- data/lib/legion/extensions/knowledge/runners/monitor.rb +118 -0
- data/lib/legion/extensions/knowledge/transport/messages/monitor_reload.rb +16 -0
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +6 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 40412f676840f9927c0e66d7603678bbde1489dcb9a2faba958b1a684fd47b3d
|
|
4
|
+
data.tar.gz: f9247b163289a8e3e07deaf63efbab11703e153150e91d77745eca010cb49b21
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 83cdc62303978876ae5f04c50d3bcf5fcd08faddaa3ee49dc8ac092c49df9665419fb2c52452d7247357b5e424d6122111d82b9e772879bd2afd0068dc780918
|
|
7
|
+
data.tar.gz: 24f0a1208fc396058ae014c497ef5bc49aa96dfe185ad32d0628a19b8fee4ab1c8754048d77d06a73a4a3d42f7c2c1537e6653fecae2b2b280c4b38e6f169745
|
|
@@ -10,34 +10,39 @@ module Legion
|
|
|
10
10
|
def check_subtask? = false
|
|
11
11
|
def generate_task? = false
|
|
12
12
|
|
|
13
|
-
def
|
|
13
|
+
def time
|
|
14
14
|
if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
15
15
|
Legion::Settings.dig(:knowledge, :actors, :watcher_interval) || 300
|
|
16
16
|
else
|
|
17
17
|
300
|
|
18
18
|
end
|
|
19
|
-
rescue StandardError
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
log.warn(e.message)
|
|
20
21
|
300
|
|
21
22
|
end
|
|
22
23
|
|
|
23
24
|
def enabled?
|
|
24
|
-
|
|
25
|
-
rescue StandardError
|
|
25
|
+
resolve_monitors.any?
|
|
26
|
+
rescue StandardError => e
|
|
27
|
+
log.warn(e.message)
|
|
26
28
|
false
|
|
27
29
|
end
|
|
28
30
|
|
|
29
31
|
def args
|
|
30
|
-
{
|
|
32
|
+
{ monitors: resolve_monitors }
|
|
31
33
|
end
|
|
32
34
|
|
|
33
35
|
private
|
|
34
36
|
|
|
35
|
-
def
|
|
36
|
-
|
|
37
|
+
def log
|
|
38
|
+
Legion::Logging
|
|
39
|
+
end
|
|
37
40
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
+
def resolve_monitors
|
|
42
|
+
Runners::Monitor.resolve_monitors
|
|
43
|
+
rescue StandardError => e
|
|
44
|
+
log.warn(e.message)
|
|
45
|
+
[]
|
|
41
46
|
end
|
|
42
47
|
end
|
|
43
48
|
end
|
|
@@ -10,13 +10,14 @@ module Legion
|
|
|
10
10
|
def check_subtask? = false
|
|
11
11
|
def generate_task? = false
|
|
12
12
|
|
|
13
|
-
def
|
|
13
|
+
def time
|
|
14
14
|
if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
15
15
|
Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || 21_600
|
|
16
16
|
else
|
|
17
17
|
21_600
|
|
18
18
|
end
|
|
19
|
-
rescue StandardError
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
log.warn(e.message)
|
|
20
21
|
21_600
|
|
21
22
|
end
|
|
22
23
|
|
|
@@ -24,7 +25,8 @@ module Legion
|
|
|
24
25
|
return false unless corpus_path && !corpus_path.empty?
|
|
25
26
|
|
|
26
27
|
true
|
|
27
|
-
rescue StandardError
|
|
28
|
+
rescue StandardError => e
|
|
29
|
+
log.warn(e.message)
|
|
28
30
|
false
|
|
29
31
|
end
|
|
30
32
|
|
|
@@ -34,11 +36,16 @@ module Legion
|
|
|
34
36
|
|
|
35
37
|
private
|
|
36
38
|
|
|
39
|
+
def log
|
|
40
|
+
Legion::Logging
|
|
41
|
+
end
|
|
42
|
+
|
|
37
43
|
def corpus_path
|
|
38
44
|
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
39
45
|
|
|
40
46
|
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
41
|
-
rescue StandardError
|
|
47
|
+
rescue StandardError => e
|
|
48
|
+
log.warn(e.message)
|
|
42
49
|
nil
|
|
43
50
|
end
|
|
44
51
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'securerandom'
|
|
4
|
+
|
|
3
5
|
module Legion
|
|
4
6
|
module Extensions
|
|
5
7
|
module Knowledge
|
|
@@ -7,6 +9,11 @@ module Legion
|
|
|
7
9
|
module Ingest
|
|
8
10
|
module_function
|
|
9
11
|
|
|
12
|
+
def log
|
|
13
|
+
Legion::Logging
|
|
14
|
+
end
|
|
15
|
+
private_class_method :log
|
|
16
|
+
|
|
10
17
|
def scan_corpus(path:, extensions: nil)
|
|
11
18
|
opts = { path: path }
|
|
12
19
|
opts[:extensions] = extensions if extensions
|
|
@@ -22,7 +29,17 @@ module Legion
|
|
|
22
29
|
}
|
|
23
30
|
end
|
|
24
31
|
|
|
25
|
-
def ingest_corpus(path
|
|
32
|
+
def ingest_corpus(path: nil, monitors: nil, dry_run: false, force: false)
|
|
33
|
+
return ingest_monitors(monitors: monitors, dry_run: dry_run, force: force) if monitors&.any?
|
|
34
|
+
raise ArgumentError, 'path is required when monitors is not provided' if path.nil?
|
|
35
|
+
|
|
36
|
+
ingest_corpus_path(path: path, dry_run: dry_run, force: force)
|
|
37
|
+
rescue ArgumentError => e
|
|
38
|
+
log.warn(e.message)
|
|
39
|
+
{ success: false, error: e.message }
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def ingest_corpus_path(path:, dry_run: false, force: false)
|
|
26
43
|
current = Helpers::Manifest.scan(path: path)
|
|
27
44
|
previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
|
|
28
45
|
delta = Helpers::Manifest.diff(current: current, previous: previous)
|
|
@@ -54,8 +71,57 @@ module Legion
|
|
|
54
71
|
chunks_updated: chunks_updated
|
|
55
72
|
}
|
|
56
73
|
rescue StandardError => e
|
|
74
|
+
log.warn(e.message)
|
|
75
|
+
{ success: false, error: e.message }
|
|
76
|
+
end
|
|
77
|
+
private_class_method :ingest_corpus_path
|
|
78
|
+
|
|
79
|
+
def ingest_monitors(monitors:, dry_run: false, force: false)
|
|
80
|
+
results = monitors.map do |monitor|
|
|
81
|
+
ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force)
|
|
82
|
+
rescue StandardError => e
|
|
83
|
+
log.warn(e.message)
|
|
84
|
+
{ success: false, path: monitor[:path], error: e.message }
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
total = {
|
|
88
|
+
files_scanned: 0,
|
|
89
|
+
files_added: 0,
|
|
90
|
+
files_changed: 0,
|
|
91
|
+
files_removed: 0,
|
|
92
|
+
chunks_created: 0,
|
|
93
|
+
chunks_skipped: 0,
|
|
94
|
+
chunks_updated: 0
|
|
95
|
+
}
|
|
96
|
+
results.each do |r|
|
|
97
|
+
next unless r[:success]
|
|
98
|
+
|
|
99
|
+
total.each_key { |k| total[k] += r[k].to_i }
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
{ success: true, monitors_processed: results.size, **total }
|
|
103
|
+
rescue StandardError => e
|
|
104
|
+
log.warn(e.message)
|
|
57
105
|
{ success: false, error: e.message }
|
|
58
106
|
end
|
|
107
|
+
private_class_method :ingest_monitors
|
|
108
|
+
|
|
109
|
+
def ingest_content(content:, source_type: :text, metadata: {})
|
|
110
|
+
source_path = "content://#{source_type}/#{SecureRandom.uuid}"
|
|
111
|
+
section = {
|
|
112
|
+
content: content,
|
|
113
|
+
heading: source_type.to_s,
|
|
114
|
+
section_path: [source_type.to_s],
|
|
115
|
+
source_file: source_path
|
|
116
|
+
}
|
|
117
|
+
chunks = Helpers::Chunker.chunk(sections: [section])
|
|
118
|
+
paired = batch_embed_chunks(chunks, force: false)
|
|
119
|
+
paired.each { |p| upsert_chunk_with_embedding(p[:chunk], p[:embedding], force: false, exists: p[:exists] || false) }
|
|
120
|
+
{ status: :ingested, chunks: chunks.size, source_type: source_type, metadata: metadata }
|
|
121
|
+
rescue StandardError => e
|
|
122
|
+
log.warn(e.message)
|
|
123
|
+
{ status: :failed, error: e.message, source_type: source_type, metadata: metadata }
|
|
124
|
+
end
|
|
59
125
|
|
|
60
126
|
def ingest_file(file_path:, force: false)
|
|
61
127
|
result = process_file(file_path, dry_run: false, force: force)
|
|
@@ -68,6 +134,7 @@ module Legion
|
|
|
68
134
|
chunks_updated: result[:updated]
|
|
69
135
|
}
|
|
70
136
|
rescue StandardError => e
|
|
137
|
+
log.warn(e.message)
|
|
71
138
|
{ success: false, error: e.message }
|
|
72
139
|
end
|
|
73
140
|
|
|
@@ -107,7 +174,8 @@ module Legion
|
|
|
107
174
|
embed_map = needs_embed.empty? ? {} : build_embed_map(needs_embed)
|
|
108
175
|
|
|
109
176
|
chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } }
|
|
110
|
-
rescue StandardError
|
|
177
|
+
rescue StandardError => e
|
|
178
|
+
log.warn(e.message)
|
|
111
179
|
paired_without_embed(chunks, {})
|
|
112
180
|
end
|
|
113
181
|
private_class_method :batch_embed_chunks
|
|
@@ -132,7 +200,8 @@ module Legion
|
|
|
132
200
|
results.each_with_object({}) do |r, h|
|
|
133
201
|
h[needs_embed[r[:index]][:content_hash]] = r[:vector] unless r[:error]
|
|
134
202
|
end
|
|
135
|
-
rescue StandardError
|
|
203
|
+
rescue StandardError => e
|
|
204
|
+
log.warn(e.message)
|
|
136
205
|
{}
|
|
137
206
|
end
|
|
138
207
|
private_class_method :build_embed_map
|
|
@@ -144,7 +213,8 @@ module Legion
|
|
|
144
213
|
|
|
145
214
|
ingest_to_apollo(chunk, embedding)
|
|
146
215
|
force ? :updated : :created
|
|
147
|
-
rescue StandardError
|
|
216
|
+
rescue StandardError => e
|
|
217
|
+
log.warn(e.message)
|
|
148
218
|
:skipped
|
|
149
219
|
end
|
|
150
220
|
private_class_method :upsert_chunk_with_embedding
|
|
@@ -156,7 +226,8 @@ module Legion
|
|
|
156
226
|
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
157
227
|
.where(Sequel.like(:content, "%#{content_hash}%"))
|
|
158
228
|
.any?
|
|
159
|
-
rescue StandardError
|
|
229
|
+
rescue StandardError => e
|
|
230
|
+
log.warn(e.message)
|
|
160
231
|
false
|
|
161
232
|
end
|
|
162
233
|
private_class_method :chunk_exists?
|
|
@@ -193,7 +264,8 @@ module Legion
|
|
|
193
264
|
tags: [file_path, 'retired', 'document_chunk'].uniq,
|
|
194
265
|
metadata: { source_file: file_path, retired: true }
|
|
195
266
|
)
|
|
196
|
-
rescue StandardError
|
|
267
|
+
rescue StandardError => e
|
|
268
|
+
log.warn(e.message)
|
|
197
269
|
nil
|
|
198
270
|
end
|
|
199
271
|
private_class_method :retire_file
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Runners
|
|
7
|
+
module Monitor
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
DEFAULT_EXTENSIONS = %w[.md .txt].freeze
|
|
11
|
+
|
|
12
|
+
def resolve_monitors
|
|
13
|
+
monitors = Array(read_monitors_setting)
|
|
14
|
+
legacy = read_legacy_corpus_path
|
|
15
|
+
|
|
16
|
+
if legacy && !legacy.empty? && monitors.none? { |m| m[:path] == legacy }
|
|
17
|
+
monitors << { path: legacy, extensions: %w[.md .txt .docx .pdf], label: 'legacy' }
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
monitors
|
|
21
|
+
rescue StandardError
|
|
22
|
+
[]
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def add_monitor(path:, extensions: nil, label: nil)
|
|
26
|
+
abs_path = File.expand_path(path)
|
|
27
|
+
return { success: false, error: "Path #{abs_path} does not exist or is not a directory" } unless File.directory?(abs_path)
|
|
28
|
+
|
|
29
|
+
existing = Array(read_monitors_setting)
|
|
30
|
+
return { success: false, error: "Path #{abs_path} is already registered" } if existing.any? { |m| m[:path] == abs_path }
|
|
31
|
+
|
|
32
|
+
entry = {
|
|
33
|
+
path: abs_path,
|
|
34
|
+
extensions: extensions || DEFAULT_EXTENSIONS.dup,
|
|
35
|
+
label: label || File.basename(abs_path),
|
|
36
|
+
added_at: Time.now.utc.iso8601
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
existing << entry
|
|
40
|
+
persist_monitors(existing)
|
|
41
|
+
|
|
42
|
+
{ success: true, monitor: entry }
|
|
43
|
+
rescue StandardError => e
|
|
44
|
+
{ success: false, error: e.message }
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def remove_monitor(identifier:)
|
|
48
|
+
existing = Array(read_monitors_setting)
|
|
49
|
+
found = existing.find { |m| m[:path] == identifier || m[:label] == identifier }
|
|
50
|
+
return { success: false, error: "Monitor '#{identifier}' not found" } unless found
|
|
51
|
+
|
|
52
|
+
existing.delete(found)
|
|
53
|
+
persist_monitors(existing)
|
|
54
|
+
|
|
55
|
+
{ success: true, removed: found }
|
|
56
|
+
rescue StandardError => e
|
|
57
|
+
{ success: false, error: e.message }
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def list_monitors
|
|
61
|
+
{ success: true, monitors: resolve_monitors }
|
|
62
|
+
rescue StandardError => e
|
|
63
|
+
{ success: false, error: e.message }
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def monitor_status
|
|
67
|
+
monitors = resolve_monitors
|
|
68
|
+
total_files = 0
|
|
69
|
+
|
|
70
|
+
monitors.each do |m|
|
|
71
|
+
scan = Helpers::Manifest.scan(path: m[:path], extensions: m[:extensions])
|
|
72
|
+
total_files += scan.size
|
|
73
|
+
rescue StandardError
|
|
74
|
+
next
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
{ success: true, total_monitors: monitors.size, total_files: total_files }
|
|
78
|
+
rescue StandardError => e
|
|
79
|
+
{ success: false, error: e.message }
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# --- private helpers ---
|
|
83
|
+
|
|
84
|
+
def read_monitors_setting
|
|
85
|
+
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
86
|
+
|
|
87
|
+
Legion::Settings.dig(:knowledge, :monitors)
|
|
88
|
+
rescue StandardError
|
|
89
|
+
nil
|
|
90
|
+
end
|
|
91
|
+
private_class_method :read_monitors_setting
|
|
92
|
+
|
|
93
|
+
def read_legacy_corpus_path
|
|
94
|
+
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
95
|
+
|
|
96
|
+
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
97
|
+
rescue StandardError
|
|
98
|
+
nil
|
|
99
|
+
end
|
|
100
|
+
private_class_method :read_legacy_corpus_path
|
|
101
|
+
|
|
102
|
+
def persist_monitors(monitors)
|
|
103
|
+
return false unless defined?(Legion::Settings)
|
|
104
|
+
|
|
105
|
+
loader = Legion::Settings.loader
|
|
106
|
+
knowledge = loader.settings[:knowledge] || {}
|
|
107
|
+
knowledge[:monitors] = monitors
|
|
108
|
+
loader.settings[:knowledge] = knowledge
|
|
109
|
+
true
|
|
110
|
+
rescue StandardError
|
|
111
|
+
false
|
|
112
|
+
end
|
|
113
|
+
private_class_method :persist_monitors
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Transport
|
|
7
|
+
module Messages
|
|
8
|
+
class MonitorReload < Legion::Transport::Message
|
|
9
|
+
def exchange_name = 'knowledge'
|
|
10
|
+
def routing_key = 'knowledge.monitor.reload'
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -9,12 +9,14 @@ require_relative 'knowledge/runners/ingest'
|
|
|
9
9
|
require_relative 'knowledge/runners/query'
|
|
10
10
|
require_relative 'knowledge/runners/corpus'
|
|
11
11
|
require_relative 'knowledge/runners/maintenance'
|
|
12
|
+
require_relative 'knowledge/runners/monitor'
|
|
12
13
|
require_relative 'knowledge/client'
|
|
13
14
|
|
|
14
15
|
if defined?(Legion::Transport)
|
|
15
16
|
require_relative 'knowledge/transport/exchanges/knowledge'
|
|
16
17
|
require_relative 'knowledge/transport/queues/ingest'
|
|
17
18
|
require_relative 'knowledge/transport/messages/ingest_message'
|
|
19
|
+
require_relative 'knowledge/transport/messages/monitor_reload'
|
|
18
20
|
end
|
|
19
21
|
|
|
20
22
|
require_relative 'knowledge/actors/corpus_watcher' if defined?(Legion::Extensions::Actors::Every)
|
|
@@ -26,6 +28,10 @@ module Legion
|
|
|
26
28
|
module Extensions
|
|
27
29
|
module Knowledge
|
|
28
30
|
extend Legion::Extensions::Core if defined?(Legion::Extensions::Core)
|
|
31
|
+
|
|
32
|
+
def self.remote_invocable?
|
|
33
|
+
false
|
|
34
|
+
end
|
|
29
35
|
end
|
|
30
36
|
end
|
|
31
37
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-knowledge
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.6.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -127,9 +127,11 @@ files:
|
|
|
127
127
|
- lib/legion/extensions/knowledge/runners/corpus.rb
|
|
128
128
|
- lib/legion/extensions/knowledge/runners/ingest.rb
|
|
129
129
|
- lib/legion/extensions/knowledge/runners/maintenance.rb
|
|
130
|
+
- lib/legion/extensions/knowledge/runners/monitor.rb
|
|
130
131
|
- lib/legion/extensions/knowledge/runners/query.rb
|
|
131
132
|
- lib/legion/extensions/knowledge/transport/exchanges/knowledge.rb
|
|
132
133
|
- lib/legion/extensions/knowledge/transport/messages/ingest_message.rb
|
|
134
|
+
- lib/legion/extensions/knowledge/transport/messages/monitor_reload.rb
|
|
133
135
|
- lib/legion/extensions/knowledge/transport/queues/ingest.rb
|
|
134
136
|
- lib/legion/extensions/knowledge/version.rb
|
|
135
137
|
homepage: https://github.com/LegionIO/lex-knowledge
|