lex-knowledge 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +5 -7
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +48 -0
- data/lib/legion/extensions/knowledge/client.rb +2 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +39 -1
- data/lib/legion/extensions/knowledge/runners/maintenance.rb +315 -0
- data/lib/legion/extensions/knowledge/runners/monitor.rb +118 -0
- data/lib/legion/extensions/knowledge/runners/query.rb +75 -8
- data/lib/legion/extensions/knowledge/transport/messages/monitor_reload.rb +16 -0
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +4 -0
- metadata +5 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 93e36e27a559476697a65f659e5c5a21db2e061b40877db6ea467875642d3232
|
|
4
|
+
data.tar.gz: dfae368bcc8db4eba0c6b41da9bb79537ed3a9b4fd898b189905fb1ca58a6bc7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c9d11bdbcc27f14431c6e900c703a02cea7389bf2220c6e8c2fb1c920a97d7a86993d11e207453ef79ab928da68629a98aa711d51d009dd04d1f5215337dc4a6
|
|
7
|
+
data.tar.gz: 9fe9dc96f91a19aca0d58b4ad49abbcf0876797bba5ddea6984cff0d51b7a577ce0f165fa8d54d9271b185da84d6a267d7c9883903e99937f7b084454b1affcc
|
|
@@ -21,23 +21,21 @@ module Legion
|
|
|
21
21
|
end
|
|
22
22
|
|
|
23
23
|
def enabled?
|
|
24
|
-
|
|
24
|
+
resolve_monitors.any?
|
|
25
25
|
rescue StandardError
|
|
26
26
|
false
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
def args
|
|
30
|
-
{
|
|
30
|
+
{ monitors: resolve_monitors }
|
|
31
31
|
end
|
|
32
32
|
|
|
33
33
|
private
|
|
34
34
|
|
|
35
|
-
def
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
35
|
+
def resolve_monitors
|
|
36
|
+
Runners::Monitor.resolve_monitors
|
|
39
37
|
rescue StandardError
|
|
40
|
-
|
|
38
|
+
[]
|
|
41
39
|
end
|
|
42
40
|
end
|
|
43
41
|
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Actor
|
|
7
|
+
class MaintenanceRunner < Legion::Extensions::Actors::Every
|
|
8
|
+
def runner_class = 'Legion::Extensions::Knowledge::Runners::Maintenance'
|
|
9
|
+
def runner_function = 'health'
|
|
10
|
+
def check_subtask? = false
|
|
11
|
+
def generate_task? = false
|
|
12
|
+
|
|
13
|
+
def every_interval
|
|
14
|
+
if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
15
|
+
Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || 21_600
|
|
16
|
+
else
|
|
17
|
+
21_600
|
|
18
|
+
end
|
|
19
|
+
rescue StandardError
|
|
20
|
+
21_600
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def enabled?
|
|
24
|
+
return false unless corpus_path && !corpus_path.empty?
|
|
25
|
+
|
|
26
|
+
true
|
|
27
|
+
rescue StandardError
|
|
28
|
+
false
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def args
|
|
32
|
+
{ path: corpus_path }
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
private
|
|
36
|
+
|
|
37
|
+
def corpus_path
|
|
38
|
+
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
39
|
+
|
|
40
|
+
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
41
|
+
rescue StandardError
|
|
42
|
+
nil
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -22,7 +22,16 @@ module Legion
|
|
|
22
22
|
}
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
-
def ingest_corpus(path
|
|
25
|
+
def ingest_corpus(path: nil, monitors: nil, dry_run: false, force: false)
|
|
26
|
+
return ingest_monitors(monitors: monitors, dry_run: dry_run, force: force) if monitors&.any?
|
|
27
|
+
raise ArgumentError, 'path is required when monitors is not provided' if path.nil?
|
|
28
|
+
|
|
29
|
+
ingest_corpus_path(path: path, dry_run: dry_run, force: force)
|
|
30
|
+
rescue ArgumentError => e
|
|
31
|
+
{ success: false, error: e.message }
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def ingest_corpus_path(path:, dry_run: false, force: false)
|
|
26
35
|
current = Helpers::Manifest.scan(path: path)
|
|
27
36
|
previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
|
|
28
37
|
delta = Helpers::Manifest.diff(current: current, previous: previous)
|
|
@@ -56,6 +65,35 @@ module Legion
|
|
|
56
65
|
rescue StandardError => e
|
|
57
66
|
{ success: false, error: e.message }
|
|
58
67
|
end
|
|
68
|
+
private_class_method :ingest_corpus_path
|
|
69
|
+
|
|
70
|
+
def ingest_monitors(monitors:, dry_run: false, force: false)
|
|
71
|
+
results = monitors.map do |monitor|
|
|
72
|
+
ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force)
|
|
73
|
+
rescue StandardError => e
|
|
74
|
+
{ success: false, path: monitor[:path], error: e.message }
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
total = {
|
|
78
|
+
files_scanned: 0,
|
|
79
|
+
files_added: 0,
|
|
80
|
+
files_changed: 0,
|
|
81
|
+
files_removed: 0,
|
|
82
|
+
chunks_created: 0,
|
|
83
|
+
chunks_skipped: 0,
|
|
84
|
+
chunks_updated: 0
|
|
85
|
+
}
|
|
86
|
+
results.each do |r|
|
|
87
|
+
next unless r[:success]
|
|
88
|
+
|
|
89
|
+
total.each_key { |k| total[k] += r[k].to_i }
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
{ success: true, monitors_processed: results.size, **total }
|
|
93
|
+
rescue StandardError => e
|
|
94
|
+
{ success: false, error: e.message }
|
|
95
|
+
end
|
|
96
|
+
private_class_method :ingest_monitors
|
|
59
97
|
|
|
60
98
|
def ingest_file(file_path:, force: false)
|
|
61
99
|
result = process_file(file_path, dry_run: false, force: force)
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Runners
|
|
7
|
+
module Maintenance
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
def detect_orphans(path:)
|
|
11
|
+
manifest_files = load_manifest_files(path)
|
|
12
|
+
apollo_files = load_apollo_source_files
|
|
13
|
+
|
|
14
|
+
orphan_files = apollo_files - manifest_files
|
|
15
|
+
|
|
16
|
+
{
|
|
17
|
+
success: true,
|
|
18
|
+
orphan_count: orphan_files.size,
|
|
19
|
+
orphan_files: orphan_files,
|
|
20
|
+
total_apollo_chunks: count_apollo_chunks,
|
|
21
|
+
total_manifest_files: manifest_files.size
|
|
22
|
+
}
|
|
23
|
+
rescue StandardError => e
|
|
24
|
+
{ success: false, error: e.message }
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
def cleanup_orphans(path:, dry_run: true)
|
|
28
|
+
detection = detect_orphans(path: path)
|
|
29
|
+
return detection unless detection[:success]
|
|
30
|
+
return detection.merge(archived: 0, files_cleaned: 0, dry_run: dry_run) if detection[:orphan_count].zero?
|
|
31
|
+
return detection.merge(archived: detection[:orphan_count], files_cleaned: detection[:orphan_files].size, dry_run: true) if dry_run
|
|
32
|
+
|
|
33
|
+
archived = archive_orphan_entries(detection[:orphan_files])
|
|
34
|
+
|
|
35
|
+
{ success: true, archived: archived, files_cleaned: detection[:orphan_files].size, dry_run: false }
|
|
36
|
+
rescue StandardError => e
|
|
37
|
+
{ success: false, error: e.message }
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def reindex(path:)
|
|
41
|
+
store_path = Helpers::ManifestStore.store_path(corpus_path: path)
|
|
42
|
+
::FileUtils.rm_f(store_path)
|
|
43
|
+
|
|
44
|
+
Runners::Ingest.ingest_corpus(path: path, force: true)
|
|
45
|
+
rescue StandardError => e
|
|
46
|
+
{ success: false, error: e.message }
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def health(path:)
|
|
50
|
+
scan_entries = Helpers::Manifest.scan(path: path)
|
|
51
|
+
store_path = Helpers::ManifestStore.store_path(corpus_path: path)
|
|
52
|
+
manifest_file = ::File.exist?(store_path)
|
|
53
|
+
last_ingest = manifest_file ? ::File.mtime(store_path).iso8601 : nil
|
|
54
|
+
|
|
55
|
+
{
|
|
56
|
+
success: true,
|
|
57
|
+
local: build_local_stats(path, scan_entries, manifest_file, last_ingest),
|
|
58
|
+
apollo: build_apollo_stats,
|
|
59
|
+
sync: build_sync_stats(path, scan_entries)
|
|
60
|
+
}
|
|
61
|
+
rescue StandardError => e
|
|
62
|
+
{ success: false, error: e.message }
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
def quality_report(limit: nil)
|
|
66
|
+
resolved_limit = limit || settings_quality_limit
|
|
67
|
+
|
|
68
|
+
{
|
|
69
|
+
success: true,
|
|
70
|
+
hot_chunks: hot_chunks(resolved_limit),
|
|
71
|
+
cold_chunks: cold_chunks(resolved_limit),
|
|
72
|
+
low_confidence: low_confidence_chunks(resolved_limit),
|
|
73
|
+
poor_retrieval: [],
|
|
74
|
+
summary: quality_summary
|
|
75
|
+
}
|
|
76
|
+
rescue StandardError => e
|
|
77
|
+
{ success: false, error: e.message }
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
def build_local_stats(path, scan_entries, manifest_file, last_ingest)
|
|
81
|
+
{
|
|
82
|
+
corpus_path: path,
|
|
83
|
+
file_count: scan_entries.size,
|
|
84
|
+
total_bytes: scan_entries.sum { |e| e[:size] },
|
|
85
|
+
manifest_exists: manifest_file,
|
|
86
|
+
last_ingest: last_ingest
|
|
87
|
+
}
|
|
88
|
+
end
|
|
89
|
+
private_class_method :build_local_stats
|
|
90
|
+
|
|
91
|
+
def build_apollo_stats
|
|
92
|
+
return apollo_defaults unless defined?(Legion::Data::Model::ApolloEntry)
|
|
93
|
+
|
|
94
|
+
base = Legion::Data::Model::ApolloEntry
|
|
95
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
96
|
+
.exclude(status: 'archived')
|
|
97
|
+
total = base.count
|
|
98
|
+
return apollo_defaults if total.zero?
|
|
99
|
+
|
|
100
|
+
rows = base.select(:confidence, :status, :access_count, :embedding, :created_at).all
|
|
101
|
+
apollo_stats_from_rows(base, rows, total)
|
|
102
|
+
rescue StandardError
|
|
103
|
+
apollo_defaults
|
|
104
|
+
end
|
|
105
|
+
private_class_method :build_apollo_stats
|
|
106
|
+
|
|
107
|
+
def apollo_stats_from_rows(base, rows, total)
|
|
108
|
+
confidences = rows.map { |r| r[:confidence].to_f }
|
|
109
|
+
with_embeddings = rows.count { |r| !r[:embedding].nil? }
|
|
110
|
+
stale_threshold = settings_stale_threshold
|
|
111
|
+
timestamps = rows.map { |r| r[:created_at] }
|
|
112
|
+
|
|
113
|
+
{
|
|
114
|
+
total_chunks: total,
|
|
115
|
+
by_status: base.group_and_count(:status).as_hash(:status, :count).transform_keys(&:to_sym),
|
|
116
|
+
embedding_coverage: (with_embeddings.to_f / total).round(4),
|
|
117
|
+
avg_confidence: confidences.sum / confidences.size.to_f,
|
|
118
|
+
confidence_range: confidences.minmax,
|
|
119
|
+
stale_count: confidences.count { |c| c < stale_threshold },
|
|
120
|
+
never_accessed: rows.count { |r| r[:access_count].to_i.zero? },
|
|
121
|
+
unique_source_files: load_apollo_source_files.size,
|
|
122
|
+
oldest_chunk: timestamps.min&.iso8601,
|
|
123
|
+
newest_chunk: timestamps.max&.iso8601
|
|
124
|
+
}
|
|
125
|
+
end
|
|
126
|
+
private_class_method :apollo_stats_from_rows
|
|
127
|
+
|
|
128
|
+
def apollo_defaults
|
|
129
|
+
{
|
|
130
|
+
total_chunks: 0,
|
|
131
|
+
by_status: {},
|
|
132
|
+
embedding_coverage: 0.0,
|
|
133
|
+
avg_confidence: 0.0,
|
|
134
|
+
confidence_range: [0.0, 0.0],
|
|
135
|
+
stale_count: 0,
|
|
136
|
+
never_accessed: 0,
|
|
137
|
+
unique_source_files: 0,
|
|
138
|
+
oldest_chunk: nil,
|
|
139
|
+
newest_chunk: nil
|
|
140
|
+
}
|
|
141
|
+
end
|
|
142
|
+
private_class_method :apollo_defaults
|
|
143
|
+
|
|
144
|
+
def build_sync_stats(path, scan_entries)
|
|
145
|
+
manifest_paths = load_manifest_files(path)
|
|
146
|
+
apollo_paths = load_apollo_source_files
|
|
147
|
+
scan_paths = scan_entries.map { |e| e[:path] }
|
|
148
|
+
|
|
149
|
+
{
|
|
150
|
+
orphan_count: (apollo_paths - manifest_paths).size,
|
|
151
|
+
missing_count: (scan_paths - apollo_paths).size
|
|
152
|
+
}
|
|
153
|
+
end
|
|
154
|
+
private_class_method :build_sync_stats
|
|
155
|
+
|
|
156
|
+
def load_manifest_files(path)
|
|
157
|
+
manifest = Helpers::ManifestStore.load(corpus_path: path)
|
|
158
|
+
manifest.map { |e| e[:path] }.compact.uniq
|
|
159
|
+
end
|
|
160
|
+
private_class_method :load_manifest_files
|
|
161
|
+
|
|
162
|
+
def load_apollo_source_files
|
|
163
|
+
return [] unless defined?(Legion::Data::Model::ApolloEntry)
|
|
164
|
+
|
|
165
|
+
Legion::Data::Model::ApolloEntry
|
|
166
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
167
|
+
.exclude(status: 'archived')
|
|
168
|
+
.select_map(Sequel.lit("source_context->>'source_file'"))
|
|
169
|
+
.compact
|
|
170
|
+
.uniq
|
|
171
|
+
rescue StandardError
|
|
172
|
+
[]
|
|
173
|
+
end
|
|
174
|
+
private_class_method :load_apollo_source_files
|
|
175
|
+
|
|
176
|
+
def count_apollo_chunks
|
|
177
|
+
return 0 unless defined?(Legion::Data::Model::ApolloEntry)
|
|
178
|
+
|
|
179
|
+
Legion::Data::Model::ApolloEntry
|
|
180
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
181
|
+
.exclude(status: 'archived')
|
|
182
|
+
.count
|
|
183
|
+
rescue StandardError
|
|
184
|
+
0
|
|
185
|
+
end
|
|
186
|
+
private_class_method :count_apollo_chunks
|
|
187
|
+
|
|
188
|
+
def archive_orphan_entries(orphan_files)
|
|
189
|
+
return 0 unless defined?(Legion::Data::Model::ApolloEntry)
|
|
190
|
+
|
|
191
|
+
Legion::Data::Model::ApolloEntry
|
|
192
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
193
|
+
.where(Sequel.lit("source_context->>'source_file' IN ?", orphan_files))
|
|
194
|
+
.exclude(status: 'archived')
|
|
195
|
+
.update(status: 'archived', updated_at: Time.now)
|
|
196
|
+
end
|
|
197
|
+
private_class_method :archive_orphan_entries
|
|
198
|
+
|
|
199
|
+
def hot_chunks(limit)
|
|
200
|
+
return [] unless defined?(Legion::Data::Model::ApolloEntry)
|
|
201
|
+
|
|
202
|
+
Legion::Data::Model::ApolloEntry
|
|
203
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
204
|
+
.exclude(status: 'archived')
|
|
205
|
+
.where { access_count.positive? }
|
|
206
|
+
.order(Sequel.desc(:access_count))
|
|
207
|
+
.limit(limit)
|
|
208
|
+
.select_map([:id, :access_count, :confidence,
|
|
209
|
+
Sequel.lit("source_context->>'source_file' AS source_file")])
|
|
210
|
+
.map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } }
|
|
211
|
+
rescue StandardError
|
|
212
|
+
[]
|
|
213
|
+
end
|
|
214
|
+
private_class_method :hot_chunks
|
|
215
|
+
|
|
216
|
+
def cold_chunks(limit)
|
|
217
|
+
return [] unless defined?(Legion::Data::Model::ApolloEntry)
|
|
218
|
+
|
|
219
|
+
days = settings_cold_chunk_days
|
|
220
|
+
cutoff = Time.now - (days * 86_400)
|
|
221
|
+
|
|
222
|
+
Legion::Data::Model::ApolloEntry
|
|
223
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
224
|
+
.exclude(status: 'archived')
|
|
225
|
+
.where(access_count: 0)
|
|
226
|
+
.where { created_at < cutoff }
|
|
227
|
+
.order(:created_at)
|
|
228
|
+
.limit(limit)
|
|
229
|
+
.select_map([:id, :confidence, :created_at,
|
|
230
|
+
Sequel.lit("source_context->>'source_file' AS source_file")])
|
|
231
|
+
.map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } }
|
|
232
|
+
rescue StandardError
|
|
233
|
+
[]
|
|
234
|
+
end
|
|
235
|
+
private_class_method :cold_chunks
|
|
236
|
+
|
|
237
|
+
def low_confidence_chunks(limit)
|
|
238
|
+
return [] unless defined?(Legion::Data::Model::ApolloEntry)
|
|
239
|
+
|
|
240
|
+
threshold = settings_stale_threshold
|
|
241
|
+
|
|
242
|
+
Legion::Data::Model::ApolloEntry
|
|
243
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
244
|
+
.exclude(status: 'archived')
|
|
245
|
+
.where { confidence < threshold }
|
|
246
|
+
.order(:confidence)
|
|
247
|
+
.limit(limit)
|
|
248
|
+
.select_map([:id, :confidence, :access_count,
|
|
249
|
+
Sequel.lit("source_context->>'source_file' AS source_file")])
|
|
250
|
+
.map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } }
|
|
251
|
+
rescue StandardError
|
|
252
|
+
[]
|
|
253
|
+
end
|
|
254
|
+
private_class_method :low_confidence_chunks
|
|
255
|
+
|
|
256
|
+
def quality_summary
|
|
257
|
+
defaults = { total_queries: 0, avg_retrieval_score: nil, chunks_never_accessed: 0,
|
|
258
|
+
chunks_below_threshold: 0 }
|
|
259
|
+
return defaults unless defined?(Legion::Data::Model::ApolloEntry)
|
|
260
|
+
|
|
261
|
+
base = Legion::Data::Model::ApolloEntry
|
|
262
|
+
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
263
|
+
.exclude(status: 'archived')
|
|
264
|
+
|
|
265
|
+
{
|
|
266
|
+
total_queries: query_count,
|
|
267
|
+
avg_retrieval_score: nil,
|
|
268
|
+
chunks_never_accessed: base.where(access_count: 0).count,
|
|
269
|
+
chunks_below_threshold: base.where { confidence < settings_stale_threshold }.count
|
|
270
|
+
}
|
|
271
|
+
rescue StandardError
|
|
272
|
+
defaults
|
|
273
|
+
end
|
|
274
|
+
private_class_method :quality_summary
|
|
275
|
+
|
|
276
|
+
def query_count
|
|
277
|
+
return 0 unless defined?(Legion::Data::Model::ApolloAccessLog)
|
|
278
|
+
|
|
279
|
+
Legion::Data::Model::ApolloAccessLog.where(action: 'knowledge_query').count
|
|
280
|
+
rescue StandardError
|
|
281
|
+
0
|
|
282
|
+
end
|
|
283
|
+
private_class_method :query_count
|
|
284
|
+
|
|
285
|
+
def settings_stale_threshold
|
|
286
|
+
return 0.3 unless defined?(Legion::Settings)
|
|
287
|
+
|
|
288
|
+
Legion::Settings.dig(:knowledge, :maintenance, :stale_threshold) || 0.3
|
|
289
|
+
rescue StandardError
|
|
290
|
+
0.3
|
|
291
|
+
end
|
|
292
|
+
private_class_method :settings_stale_threshold
|
|
293
|
+
|
|
294
|
+
def settings_cold_chunk_days
|
|
295
|
+
return 7 unless defined?(Legion::Settings)
|
|
296
|
+
|
|
297
|
+
Legion::Settings.dig(:knowledge, :maintenance, :cold_chunk_days) || 7
|
|
298
|
+
rescue StandardError
|
|
299
|
+
7
|
|
300
|
+
end
|
|
301
|
+
private_class_method :settings_cold_chunk_days
|
|
302
|
+
|
|
303
|
+
def settings_quality_limit
|
|
304
|
+
return 10 unless defined?(Legion::Settings)
|
|
305
|
+
|
|
306
|
+
Legion::Settings.dig(:knowledge, :maintenance, :quality_report_limit) || 10
|
|
307
|
+
rescue StandardError
|
|
308
|
+
10
|
|
309
|
+
end
|
|
310
|
+
private_class_method :settings_quality_limit
|
|
311
|
+
end
|
|
312
|
+
end
|
|
313
|
+
end
|
|
314
|
+
end
|
|
315
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Runners
|
|
7
|
+
module Monitor
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
DEFAULT_EXTENSIONS = %w[.md .txt].freeze
|
|
11
|
+
|
|
12
|
+
def resolve_monitors
|
|
13
|
+
monitors = Array(read_monitors_setting)
|
|
14
|
+
legacy = read_legacy_corpus_path
|
|
15
|
+
|
|
16
|
+
if legacy && !legacy.empty? && monitors.none? { |m| m[:path] == legacy }
|
|
17
|
+
monitors << { path: legacy, extensions: %w[.md .txt .docx .pdf], label: 'legacy' }
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
monitors
|
|
21
|
+
rescue StandardError
|
|
22
|
+
[]
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def add_monitor(path:, extensions: nil, label: nil)
|
|
26
|
+
abs_path = File.expand_path(path)
|
|
27
|
+
return { success: false, error: "Path #{abs_path} does not exist or is not a directory" } unless File.directory?(abs_path)
|
|
28
|
+
|
|
29
|
+
existing = Array(read_monitors_setting)
|
|
30
|
+
return { success: false, error: "Path #{abs_path} is already registered" } if existing.any? { |m| m[:path] == abs_path }
|
|
31
|
+
|
|
32
|
+
entry = {
|
|
33
|
+
path: abs_path,
|
|
34
|
+
extensions: extensions || DEFAULT_EXTENSIONS.dup,
|
|
35
|
+
label: label || File.basename(abs_path),
|
|
36
|
+
added_at: Time.now.utc.iso8601
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
existing << entry
|
|
40
|
+
persist_monitors(existing)
|
|
41
|
+
|
|
42
|
+
{ success: true, monitor: entry }
|
|
43
|
+
rescue StandardError => e
|
|
44
|
+
{ success: false, error: e.message }
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def remove_monitor(identifier:)
|
|
48
|
+
existing = Array(read_monitors_setting)
|
|
49
|
+
found = existing.find { |m| m[:path] == identifier || m[:label] == identifier }
|
|
50
|
+
return { success: false, error: "Monitor '#{identifier}' not found" } unless found
|
|
51
|
+
|
|
52
|
+
existing.delete(found)
|
|
53
|
+
persist_monitors(existing)
|
|
54
|
+
|
|
55
|
+
{ success: true, removed: found }
|
|
56
|
+
rescue StandardError => e
|
|
57
|
+
{ success: false, error: e.message }
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def list_monitors
|
|
61
|
+
{ success: true, monitors: resolve_monitors }
|
|
62
|
+
rescue StandardError => e
|
|
63
|
+
{ success: false, error: e.message }
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def monitor_status
|
|
67
|
+
monitors = resolve_monitors
|
|
68
|
+
total_files = 0
|
|
69
|
+
|
|
70
|
+
monitors.each do |m|
|
|
71
|
+
scan = Helpers::Manifest.scan(path: m[:path], extensions: m[:extensions])
|
|
72
|
+
total_files += scan.size
|
|
73
|
+
rescue StandardError
|
|
74
|
+
next
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
{ success: true, total_monitors: monitors.size, total_files: total_files }
|
|
78
|
+
rescue StandardError => e
|
|
79
|
+
{ success: false, error: e.message }
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
# --- private helpers ---
|
|
83
|
+
|
|
84
|
+
def read_monitors_setting
|
|
85
|
+
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
86
|
+
|
|
87
|
+
Legion::Settings.dig(:knowledge, :monitors)
|
|
88
|
+
rescue StandardError
|
|
89
|
+
nil
|
|
90
|
+
end
|
|
91
|
+
private_class_method :read_monitors_setting
|
|
92
|
+
|
|
93
|
+
def read_legacy_corpus_path
|
|
94
|
+
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
95
|
+
|
|
96
|
+
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
97
|
+
rescue StandardError
|
|
98
|
+
nil
|
|
99
|
+
end
|
|
100
|
+
private_class_method :read_legacy_corpus_path
|
|
101
|
+
|
|
102
|
+
def persist_monitors(monitors)
|
|
103
|
+
return false unless defined?(Legion::Settings)
|
|
104
|
+
|
|
105
|
+
loader = Legion::Settings.loader
|
|
106
|
+
knowledge = loader.settings[:knowledge] || {}
|
|
107
|
+
knowledge[:monitors] = monitors
|
|
108
|
+
loader.settings[:knowledge] = knowledge
|
|
109
|
+
true
|
|
110
|
+
rescue StandardError
|
|
111
|
+
false
|
|
112
|
+
end
|
|
113
|
+
private_class_method :persist_monitors
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'digest'
|
|
4
|
+
|
|
3
5
|
module Legion
|
|
4
6
|
module Extensions
|
|
5
7
|
module Knowledge
|
|
@@ -17,15 +19,21 @@ module Legion
|
|
|
17
19
|
|
|
18
20
|
latency_ms = ((::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - started) * 1000).round
|
|
19
21
|
|
|
22
|
+
score = average_score(chunks)
|
|
23
|
+
unless chunks.empty?
|
|
24
|
+
record_feedback(
|
|
25
|
+
question: question,
|
|
26
|
+
chunk_ids: chunks.filter_map { |c| c[:id] },
|
|
27
|
+
retrieval_score: score.to_f,
|
|
28
|
+
synthesized: synthesize && llm_available?
|
|
29
|
+
)
|
|
30
|
+
end
|
|
31
|
+
|
|
20
32
|
{
|
|
21
33
|
success: true,
|
|
22
34
|
answer: answer,
|
|
23
35
|
sources: chunks.map { |c| format_source(c) },
|
|
24
|
-
metadata:
|
|
25
|
-
retrieval_score: average_score(chunks),
|
|
26
|
-
chunk_count: chunks.size,
|
|
27
|
-
latency_ms: latency_ms
|
|
28
|
-
}
|
|
36
|
+
metadata: build_metadata(chunks, score, latency_ms)
|
|
29
37
|
}
|
|
30
38
|
rescue StandardError => e
|
|
31
39
|
{ success: false, error: e.message }
|
|
@@ -38,14 +46,26 @@ module Legion
|
|
|
38
46
|
{
|
|
39
47
|
success: true,
|
|
40
48
|
sources: chunks.map { |c| format_source(c) },
|
|
41
|
-
metadata:
|
|
42
|
-
chunk_count: chunks.size
|
|
43
|
-
}
|
|
49
|
+
metadata: build_metadata(chunks, average_score(chunks))
|
|
44
50
|
}
|
|
45
51
|
rescue StandardError => e
|
|
46
52
|
{ success: false, error: e.message }
|
|
47
53
|
end
|
|
48
54
|
|
|
55
|
+
def record_feedback(question:, chunk_ids:, retrieval_score:, synthesized: true, rating: nil)
|
|
56
|
+
question_hash = ::Digest::SHA256.hexdigest(question.to_s)[0, 16]
|
|
57
|
+
emit_feedback_event(
|
|
58
|
+
question_hash: question_hash,
|
|
59
|
+
chunk_ids: chunk_ids,
|
|
60
|
+
retrieval_score: retrieval_score,
|
|
61
|
+
synthesized: synthesized,
|
|
62
|
+
rating: rating
|
|
63
|
+
)
|
|
64
|
+
{ success: true, question_hash: question_hash, rating: rating }
|
|
65
|
+
rescue StandardError => e
|
|
66
|
+
{ success: false, error: e.message }
|
|
67
|
+
end
|
|
68
|
+
|
|
49
69
|
def retrieve_chunks(question, top_k)
|
|
50
70
|
return [] unless defined?(Legion::Extensions::Apollo)
|
|
51
71
|
|
|
@@ -97,6 +117,53 @@ module Legion
|
|
|
97
117
|
end
|
|
98
118
|
private_class_method :average_score
|
|
99
119
|
|
|
120
|
+
def build_metadata(chunks, score, latency_ms = nil)
|
|
121
|
+
confidences = chunks.filter_map { |c| c[:confidence] }
|
|
122
|
+
distances = chunks.filter_map { |c| c[:distance] }
|
|
123
|
+
source_names = chunks.filter_map do |c|
|
|
124
|
+
c.dig(:metadata, :source_file) || c[:source_file]
|
|
125
|
+
end.uniq
|
|
126
|
+
statuses = chunks.group_by { |c| c[:status] }.transform_values(&:size)
|
|
127
|
+
|
|
128
|
+
meta = {
|
|
129
|
+
retrieval_score: score,
|
|
130
|
+
chunk_count: chunks.size,
|
|
131
|
+
confidence_avg: confidences.empty? ? nil : (confidences.sum.to_f / confidences.size).round(4),
|
|
132
|
+
confidence_range: confidences.empty? ? nil : confidences.minmax,
|
|
133
|
+
distance_range: distances.empty? ? nil : distances.minmax,
|
|
134
|
+
source_files: source_names,
|
|
135
|
+
source_file_count: source_names.size,
|
|
136
|
+
all_embedded: chunks.none? { |c| zero_embedding?(c) },
|
|
137
|
+
statuses: statuses
|
|
138
|
+
}
|
|
139
|
+
meta[:latency_ms] = latency_ms unless latency_ms.nil?
|
|
140
|
+
meta
|
|
141
|
+
end
|
|
142
|
+
private_class_method :build_metadata
|
|
143
|
+
|
|
144
|
+
def zero_embedding?(chunk)
|
|
145
|
+
emb = chunk[:embedding]
|
|
146
|
+
return true if emb.nil?
|
|
147
|
+
|
|
148
|
+
emb.is_a?(Array) && (emb.empty? || emb.all?(&:zero?))
|
|
149
|
+
end
|
|
150
|
+
private_class_method :zero_embedding?
|
|
151
|
+
|
|
152
|
+
def emit_feedback_event(question_hash:, chunk_ids:, retrieval_score:, synthesized:, rating:)
|
|
153
|
+
return unless defined?(Legion::Events)
|
|
154
|
+
|
|
155
|
+
Legion::Events.emit('knowledge.query_feedback', {
|
|
156
|
+
question_hash: question_hash,
|
|
157
|
+
chunk_ids: chunk_ids,
|
|
158
|
+
retrieval_score: retrieval_score,
|
|
159
|
+
synthesized: synthesized,
|
|
160
|
+
rating: rating
|
|
161
|
+
})
|
|
162
|
+
rescue StandardError
|
|
163
|
+
nil
|
|
164
|
+
end
|
|
165
|
+
private_class_method :emit_feedback_event
|
|
166
|
+
|
|
100
167
|
def llm_available?
|
|
101
168
|
defined?(Legion::LLM)
|
|
102
169
|
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Transport
|
|
7
|
+
module Messages
|
|
8
|
+
class MonitorReload < Legion::Transport::Message
|
|
9
|
+
def exchange_name = 'knowledge'
|
|
10
|
+
def routing_key = 'knowledge.monitor.reload'
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -8,15 +8,19 @@ require_relative 'knowledge/helpers/chunker'
|
|
|
8
8
|
require_relative 'knowledge/runners/ingest'
|
|
9
9
|
require_relative 'knowledge/runners/query'
|
|
10
10
|
require_relative 'knowledge/runners/corpus'
|
|
11
|
+
require_relative 'knowledge/runners/maintenance'
|
|
12
|
+
require_relative 'knowledge/runners/monitor'
|
|
11
13
|
require_relative 'knowledge/client'
|
|
12
14
|
|
|
13
15
|
if defined?(Legion::Transport)
|
|
14
16
|
require_relative 'knowledge/transport/exchanges/knowledge'
|
|
15
17
|
require_relative 'knowledge/transport/queues/ingest'
|
|
16
18
|
require_relative 'knowledge/transport/messages/ingest_message'
|
|
19
|
+
require_relative 'knowledge/transport/messages/monitor_reload'
|
|
17
20
|
end
|
|
18
21
|
|
|
19
22
|
require_relative 'knowledge/actors/corpus_watcher' if defined?(Legion::Extensions::Actors::Every)
|
|
23
|
+
require_relative 'knowledge/actors/maintenance_runner' if defined?(Legion::Extensions::Actors::Every)
|
|
20
24
|
|
|
21
25
|
require_relative 'knowledge/actors/corpus_ingest' if defined?(Legion::Extensions::Actors::Subscription)
|
|
22
26
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-knowledge
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -118,6 +118,7 @@ files:
|
|
|
118
118
|
- lib/legion/extensions/knowledge.rb
|
|
119
119
|
- lib/legion/extensions/knowledge/actors/corpus_ingest.rb
|
|
120
120
|
- lib/legion/extensions/knowledge/actors/corpus_watcher.rb
|
|
121
|
+
- lib/legion/extensions/knowledge/actors/maintenance_runner.rb
|
|
121
122
|
- lib/legion/extensions/knowledge/client.rb
|
|
122
123
|
- lib/legion/extensions/knowledge/helpers/chunker.rb
|
|
123
124
|
- lib/legion/extensions/knowledge/helpers/manifest.rb
|
|
@@ -125,9 +126,12 @@ files:
|
|
|
125
126
|
- lib/legion/extensions/knowledge/helpers/parser.rb
|
|
126
127
|
- lib/legion/extensions/knowledge/runners/corpus.rb
|
|
127
128
|
- lib/legion/extensions/knowledge/runners/ingest.rb
|
|
129
|
+
- lib/legion/extensions/knowledge/runners/maintenance.rb
|
|
130
|
+
- lib/legion/extensions/knowledge/runners/monitor.rb
|
|
128
131
|
- lib/legion/extensions/knowledge/runners/query.rb
|
|
129
132
|
- lib/legion/extensions/knowledge/transport/exchanges/knowledge.rb
|
|
130
133
|
- lib/legion/extensions/knowledge/transport/messages/ingest_message.rb
|
|
134
|
+
- lib/legion/extensions/knowledge/transport/messages/monitor_reload.rb
|
|
131
135
|
- lib/legion/extensions/knowledge/transport/queues/ingest.rb
|
|
132
136
|
- lib/legion/extensions/knowledge/version.rb
|
|
133
137
|
homepage: https://github.com/LegionIO/lex-knowledge
|