lex-knowledge 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da332d52d481d4e45e8f7ee771a60fb13cebb74144a18ac54ff7b7ec4cb26f27
4
- data.tar.gz: 3a858fbdae4511ec6e34573fccd85f330b33c7be818a2a78271af33cb0dcbfbd
3
+ metadata.gz: 8d512db91b31e6d8a9747a3987aa90dd406d04f8987400787f6de27b13eb10c5
4
+ data.tar.gz: 2d9df5e1289bb80f603dbd882863d1802bf942e8ff12940d259ba44e25b728f3
5
5
  SHA512:
6
- metadata.gz: e7e15c2174b28ea518c3fee6ecc68723bc628ee3c6d9121a2292706ad32125d32f766da164aa581c56915cd178e54379014a67d8ad1d5cd765485a6eb88d0610
7
- data.tar.gz: eb7f343423ea41ff46b6a478b3c32a22b0dd3ab3939ad858d0e8d540b5faaf8f01496a5839c79b5ccb1847529367ad9c55ffdc72bebf5959891ff4a7c06b72e4
6
+ metadata.gz: 2022e4654a3a815e8c5433daaad7c2e9767b039c51513e12d9cd6c71a2de47e3ee883c6e0624aa9ac8f6108439c19d4552260694482db4e2da9d58851fa093bd
7
+ data.tar.gz: d1afc8cb8fdcd0317ee9fa416c91827ca3536e40fed404791a355b4b4662360211e3c064dc09e8504d0ed5eb359f1dc906edaa57277f205b1aa9eafedcf261da
@@ -0,0 +1,48 @@
1
# frozen_string_literal: true

module Legion
  module Extensions
    module Knowledge
      module Actor
        # Interval actor that invokes the Maintenance runner's +health+ function
        # for the configured corpus path.
        #
        # All settings lookups are defensive: when Legion::Settings is missing,
        # has no +:knowledge+ section, or raises, the fallback value is used.
        class MaintenanceRunner < Legion::Extensions::Actors::Every
          # @return [String] fully qualified name of the runner module to invoke
          def runner_class
            'Legion::Extensions::Knowledge::Runners::Maintenance'
          end

          # @return [String] name of the runner function to call
          def runner_function
            'health'
          end

          # @return [Boolean] always false
          def check_subtask?
            false
          end

          # @return [Boolean] always false
          def generate_task?
            false
          end

          # Interval between runs, read from
          # +knowledge.actors.maintenance_interval+ with a fallback of 21_600.
          # NOTE(review): presumably seconds (21_600 s = 6 h) — confirm against Actors::Every.
          #
          # @return [Integer, Object] configured value, or 21_600 when unset/unavailable
          def every_interval
            fallback = 21_600
            return fallback unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?

            Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || fallback
          rescue StandardError
            21_600
          end

          # The actor is enabled only when a non-empty corpus path is configured.
          #
          # @return [Boolean] false when the path is missing, empty, or the lookup raises
          def enabled?
            configured = corpus_path
            configured ? !configured.empty? : false
          rescue StandardError
            false
          end

          # @return [Hash] keyword arguments handed to the runner function
          def args
            { path: corpus_path }
          end

          private

          # Reads +knowledge.corpus_path+ from settings.
          #
          # @return [Object, nil] the configured path, or nil when settings are
          #   unavailable or the lookup raises
          def corpus_path
            settings_ready = defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
            settings_ready ? Legion::Settings.dig(:knowledge, :corpus_path) : nil
          rescue StandardError
            nil
          end
        end
      end
    end
  end
end
@@ -7,6 +7,7 @@ module Legion
7
7
  include Runners::Ingest
8
8
  include Runners::Query
9
9
  include Runners::Corpus
10
+ include Runners::Maintenance
10
11
  end
11
12
  end
12
13
  end
@@ -0,0 +1,315 @@
1
# frozen_string_literal: true

# FileUtils (reindex) and Time#iso8601 (health, stats, cold chunks) are used
# below; require them here instead of relying on another library loading them.
require 'fileutils'
require 'time'

module Legion
  module Extensions
    module Knowledge
      module Runners
        # Maintenance runners for a knowledge corpus: orphan detection and
        # cleanup, forced re-indexing, health statistics and a quality report.
        #
        # Every public runner returns a Hash with a +:success+ flag; errors are
        # reported as <tt>{ success: false, error: message }</tt> instead of
        # being raised. Apollo-backed lookups return empty/zero results when
        # +Legion::Data::Model::ApolloEntry+ is not loaded or the query raises.
        module Maintenance
          module_function

          # Finds source files that Apollo holds chunks for but that are absent
          # from the corpus manifest.
          #
          # @param path [String] corpus path used to load the manifest
          # @return [Hash] +:orphan_count+, +:orphan_files+, +:total_apollo_chunks+
          #   and +:total_manifest_files+ on success
          def detect_orphans(path:)
            manifest_files = load_manifest_files(path)
            apollo_files = load_apollo_source_files

            orphan_files = apollo_files - manifest_files

            {
              success: true,
              orphan_count: orphan_files.size,
              orphan_files: orphan_files,
              total_apollo_chunks: count_apollo_chunks,
              total_manifest_files: manifest_files.size
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Archives Apollo chunks that belong to orphaned source files.
          #
          # With +dry_run+ (the default) nothing is modified and the detection
          # result is returned with +:archived+ / +:files_cleaned+ both set to the
          # number of orphan files. On a real run +:archived+ is the value
          # returned by the dataset update.
          #
          # @param path [String] corpus path used to load the manifest
          # @param dry_run [Boolean] report only, do not update any rows
          # @return [Hash] result hash; the failed detection hash is passed through as-is
          def cleanup_orphans(path:, dry_run: true)
            detection = detect_orphans(path: path)
            return detection unless detection[:success]
            return detection.merge(archived: 0, files_cleaned: 0, dry_run: dry_run) if detection[:orphan_count].zero?

            orphan_files = detection[:orphan_files]
            if dry_run
              return detection.merge(archived: detection[:orphan_count], files_cleaned: orphan_files.size,
                                     dry_run: true)
            end

            archived = archive_orphan_entries(orphan_files)

            { success: true, archived: archived, files_cleaned: orphan_files.size, dry_run: false }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Deletes the stored manifest and re-ingests the corpus with +force: true+.
          #
          # @param path [String] corpus path
          # @return [Hash] whatever +Runners::Ingest.ingest_corpus+ returns, or a failure hash
          def reindex(path:)
            store_path = Helpers::ManifestStore.store_path(corpus_path: path)
            ::FileUtils.rm_f(store_path)

            Runners::Ingest.ingest_corpus(path: path, force: true)
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Collects local corpus, Apollo and sync statistics.
          #
          # @param path [String] corpus path
          # @return [Hash] +:local+, +:apollo+ and +:sync+ sections on success
          def health(path:)
            scan_entries = Helpers::Manifest.scan(path: path)
            store_path = Helpers::ManifestStore.store_path(corpus_path: path)
            manifest_file = ::File.exist?(store_path)
            # The manifest file's mtime stands in for the time of the last ingest.
            last_ingest = manifest_file ? ::File.mtime(store_path).iso8601 : nil

            {
              success: true,
              local: build_local_stats(path, scan_entries, manifest_file, last_ingest),
              apollo: build_apollo_stats,
              sync: build_sync_stats(path, scan_entries)
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Reports the most accessed, never accessed and low-confidence chunks.
          #
          # @param limit [Integer, nil] maximum rows per list; falls back to the
          #   +knowledge.maintenance.quality_report_limit+ setting (default 10)
          # @return [Hash] +:hot_chunks+, +:cold_chunks+, +:low_confidence+,
          #   +:poor_retrieval+ (always empty here) and +:summary+
          def quality_report(limit: nil)
            resolved_limit = limit || settings_quality_limit

            {
              success: true,
              hot_chunks: hot_chunks(resolved_limit),
              cold_chunks: cold_chunks(resolved_limit),
              low_confidence: low_confidence_chunks(resolved_limit),
              poor_retrieval: [],
              summary: quality_summary
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Non-archived Apollo entries tagged +document_chunk+; the base dataset
          # for every Apollo query in this module. Callers must check that
          # +Legion::Data::Model::ApolloEntry+ is defined first.
          def active_chunk_dataset
            Legion::Data::Model::ApolloEntry
              .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
              .exclude(status: 'archived')
          end
          private_class_method :active_chunk_dataset

          # Builds the +:local+ section of the health report from the corpus scan.
          def build_local_stats(path, scan_entries, manifest_file, last_ingest)
            {
              corpus_path: path,
              file_count: scan_entries.size,
              total_bytes: scan_entries.sum { |e| e[:size] },
              manifest_exists: manifest_file,
              last_ingest: last_ingest
            }
          end
          private_class_method :build_local_stats

          # Builds the +:apollo+ section of the health report. Returns
          # +apollo_defaults+ when Apollo is unavailable, holds no chunks, or the
          # query raises.
          def build_apollo_stats
            return apollo_defaults unless defined?(Legion::Data::Model::ApolloEntry)

            base = active_chunk_dataset
            total = base.count
            return apollo_defaults if total.zero?

            rows = base.select(:confidence, :status, :access_count, :embedding, :created_at).all
            apollo_stats_from_rows(base, rows, total)
          rescue StandardError
            apollo_defaults
          end
          private_class_method :build_apollo_stats

          # Aggregates per-row values into the Apollo statistics hash.
          def apollo_stats_from_rows(base, rows, total)
            confidences = rows.map { |r| r[:confidence].to_f }
            with_embeddings = rows.count { |r| !r[:embedding].nil? }
            stale_threshold = settings_stale_threshold
            # Drop missing timestamps: min/max over a list containing nil raises,
            # which would otherwise discard the whole stats hash.
            timestamps = rows.filter_map { |r| r[:created_at] }

            {
              total_chunks: total,
              by_status: base.group_and_count(:status).as_hash(:status, :count).transform_keys(&:to_sym),
              embedding_coverage: (with_embeddings.to_f / total).round(4),
              avg_confidence: confidences.sum / confidences.size.to_f,
              confidence_range: confidences.minmax,
              stale_count: confidences.count { |c| c < stale_threshold },
              never_accessed: rows.count { |r| r[:access_count].to_i.zero? },
              unique_source_files: load_apollo_source_files.size,
              oldest_chunk: timestamps.min&.iso8601,
              newest_chunk: timestamps.max&.iso8601
            }
          end
          private_class_method :apollo_stats_from_rows

          # Zeroed Apollo statistics with the same keys as +apollo_stats_from_rows+.
          def apollo_defaults
            {
              total_chunks: 0,
              by_status: {},
              embedding_coverage: 0.0,
              avg_confidence: 0.0,
              confidence_range: [0.0, 0.0],
              stale_count: 0,
              never_accessed: 0,
              unique_source_files: 0,
              oldest_chunk: nil,
              newest_chunk: nil
            }
          end
          private_class_method :apollo_defaults

          # Builds the +:sync+ section: files known to Apollo but not the manifest
          # (orphans) and scanned files with no Apollo chunks (missing).
          def build_sync_stats(path, scan_entries)
            manifest_paths = load_manifest_files(path)
            apollo_paths = load_apollo_source_files
            scan_paths = scan_entries.map { |e| e[:path] }

            {
              orphan_count: (apollo_paths - manifest_paths).size,
              missing_count: (scan_paths - apollo_paths).size
            }
          end
          private_class_method :build_sync_stats

          # Unique, non-nil +:path+ values from the stored manifest.
          def load_manifest_files(path)
            manifest = Helpers::ManifestStore.load(corpus_path: path)
            manifest.map { |e| e[:path] }.compact.uniq
          end
          private_class_method :load_manifest_files

          # Unique +source_file+ values recorded in the +source_context+ of active chunks.
          def load_apollo_source_files
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            active_chunk_dataset
              .select_map(Sequel.lit("source_context->>'source_file'"))
              .compact
              .uniq
          rescue StandardError
            []
          end
          private_class_method :load_apollo_source_files

          # Number of active document chunks in Apollo.
          def count_apollo_chunks
            return 0 unless defined?(Legion::Data::Model::ApolloEntry)

            active_chunk_dataset.count
          rescue StandardError
            0
          end
          private_class_method :count_apollo_chunks

          # Marks every active chunk of the given source files as archived.
          # Errors are not rescued here; +cleanup_orphans+ reports them.
          #
          # @return [Object] the dataset +update+ result (0 when Apollo is unavailable)
          def archive_orphan_entries(orphan_files)
            return 0 unless defined?(Legion::Data::Model::ApolloEntry)

            active_chunk_dataset
              .where(Sequel.lit("source_context->>'source_file' IN ?", orphan_files))
              .update(status: 'archived', updated_at: Time.now)
          end
          private_class_method :archive_orphan_entries

          # Chunks with at least one access, most accessed first.
          def hot_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            # Written as an SQL literal: inside a Sequel block `access_count` is a
            # column identifier, so `access_count.positive?` raised NoMethodError
            # and the rescue below silently turned every report into [].
            active_chunk_dataset
              .where(Sequel.lit('access_count > 0'))
              .order(Sequel.desc(:access_count))
              .limit(limit)
              .select_map([:id, :access_count, :confidence,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :hot_chunks

          # Never-accessed chunks created more than the configured number of days
          # ago, oldest first.
          def cold_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            days = settings_cold_chunk_days
            cutoff = Time.now - (days * 86_400)

            active_chunk_dataset
              .where(access_count: 0)
              .where { created_at < cutoff }
              .order(:created_at)
              .limit(limit)
              .select_map([:id, :confidence, :created_at,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :cold_chunks

          # Chunks whose confidence is below the stale threshold, lowest first.
          def low_confidence_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            threshold = settings_stale_threshold

            active_chunk_dataset
              .where { confidence < threshold }
              .order(:confidence)
              .limit(limit)
              .select_map([:id, :confidence, :access_count,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :low_confidence_chunks

          # Aggregate counters for the quality report. +:avg_retrieval_score+ is
          # always nil here.
          def quality_summary
            defaults = { total_queries: 0, avg_retrieval_score: nil, chunks_never_accessed: 0,
                         chunks_below_threshold: 0 }
            return defaults unless defined?(Legion::Data::Model::ApolloEntry)

            # Resolve the threshold before entering the Sequel block: a zero-arity
            # block is evaluated as a virtual row, where a bare method name becomes
            # a column identifier instead of a call to this module's method.
            threshold = settings_stale_threshold
            base = active_chunk_dataset

            {
              total_queries: query_count,
              avg_retrieval_score: nil,
              chunks_never_accessed: base.where(access_count: 0).count,
              chunks_below_threshold: base.where { confidence < threshold }.count
            }
          rescue StandardError
            defaults
          end
          private_class_method :quality_summary

          # Number of access-log rows with action +knowledge_query+ (0 when the
          # access-log model is not loaded or the query raises).
          def query_count
            return 0 unless defined?(Legion::Data::Model::ApolloAccessLog)

            Legion::Data::Model::ApolloAccessLog.where(action: 'knowledge_query').count
          rescue StandardError
            0
          end
          private_class_method :query_count

          # +knowledge.maintenance.stale_threshold+ setting, default 0.3.
          def settings_stale_threshold
            return 0.3 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :stale_threshold) || 0.3
          rescue StandardError
            0.3
          end
          private_class_method :settings_stale_threshold

          # +knowledge.maintenance.cold_chunk_days+ setting, default 7.
          def settings_cold_chunk_days
            return 7 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :cold_chunk_days) || 7
          rescue StandardError
            7
          end
          private_class_method :settings_cold_chunk_days

          # +knowledge.maintenance.quality_report_limit+ setting, default 10.
          def settings_quality_limit
            return 10 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :quality_report_limit) || 10
          rescue StandardError
            10
          end
          private_class_method :settings_quality_limit
        end
      end
    end
  end
end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'digest'
4
+
3
5
  module Legion
4
6
  module Extensions
5
7
  module Knowledge
@@ -17,15 +19,21 @@ module Legion
17
19
 
18
20
  latency_ms = ((::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - started) * 1000).round
19
21
 
22
+ score = average_score(chunks)
23
+ unless chunks.empty?
24
+ record_feedback(
25
+ question: question,
26
+ chunk_ids: chunks.filter_map { |c| c[:id] },
27
+ retrieval_score: score.to_f,
28
+ synthesized: synthesize && llm_available?
29
+ )
30
+ end
31
+
20
32
  {
21
33
  success: true,
22
34
  answer: answer,
23
35
  sources: chunks.map { |c| format_source(c) },
24
- metadata: {
25
- retrieval_score: average_score(chunks),
26
- chunk_count: chunks.size,
27
- latency_ms: latency_ms
28
- }
36
+ metadata: build_metadata(chunks, score, latency_ms)
29
37
  }
30
38
  rescue StandardError => e
31
39
  { success: false, error: e.message }
@@ -38,14 +46,26 @@ module Legion
38
46
  {
39
47
  success: true,
40
48
  sources: chunks.map { |c| format_source(c) },
41
- metadata: {
42
- chunk_count: chunks.size
43
- }
49
+ metadata: build_metadata(chunks, average_score(chunks))
44
50
  }
45
51
  rescue StandardError => e
46
52
  { success: false, error: e.message }
47
53
  end
48
54
 
55
# Emits a feedback event for an answered question.
#
# The question text itself is never forwarded: only the first 16 hex
# characters of its SHA-256 digest are emitted and returned.
#
# @param question [#to_s] the question that was asked
# @param chunk_ids [Array] ids of the chunks used for the answer
# @param retrieval_score [Numeric] score attached to the event
# @param synthesized [Object] forwarded as-is in the event payload
# @param rating [Object, nil] optional rating, forwarded as-is
# @return [Hash] +{ success: true, question_hash:, rating: }+, or
#   +{ success: false, error: }+ when anything raises
def record_feedback(question:, chunk_ids:, retrieval_score:, synthesized: true, rating: nil)
  digest = ::Digest::SHA256.hexdigest(question.to_s).slice(0, 16)
  payload = {
    question_hash: digest,
    chunk_ids: chunk_ids,
    retrieval_score: retrieval_score,
    synthesized: synthesized,
    rating: rating
  }
  emit_feedback_event(**payload)

  { success: true, question_hash: digest, rating: rating }
rescue StandardError => e
  { success: false, error: e.message }
end
68
+
49
69
  def retrieve_chunks(question, top_k)
50
70
  return [] unless defined?(Legion::Extensions::Apollo)
51
71
 
@@ -97,6 +117,53 @@ module Legion
97
117
  end
98
118
  private_class_method :average_score
99
119
 
120
# Summarises a set of retrieved chunks for the response metadata.
#
# @param chunks [Array<Hash>] retrieved chunks; reads +:confidence+,
#   +:distance+, +:status+, +:source_file+ and +metadata.source_file+
# @param score [Object] retrieval score, stored unchanged
# @param latency_ms [Integer, nil] added as +:latency_ms+ only when given
# @return [Hash] metadata hash; averages and ranges are nil when no chunk
#   carries the corresponding value
def build_metadata(chunks, score, latency_ms = nil)
  confidence_values = chunks.filter_map { |chunk| chunk[:confidence] }
  distance_values = chunks.filter_map { |chunk| chunk[:distance] }
  # Prefer the nested metadata entry, fall back to the top-level key.
  files = chunks.filter_map { |chunk| chunk.dig(:metadata, :source_file) || chunk[:source_file] }.uniq

  confidence_avg = (confidence_values.sum.to_f / confidence_values.size).round(4) unless confidence_values.empty?
  confidence_range = confidence_values.minmax unless confidence_values.empty?
  distance_range = distance_values.minmax unless distance_values.empty?

  summary = {
    retrieval_score: score,
    chunk_count: chunks.size,
    confidence_avg: confidence_avg,
    confidence_range: confidence_range,
    distance_range: distance_range,
    source_files: files,
    source_file_count: files.size,
    all_embedded: chunks.none? { |chunk| zero_embedding?(chunk) },
    statuses: chunks.map { |chunk| chunk[:status] }.tally
  }

  latency_ms.nil? ? summary : summary.merge(latency_ms: latency_ms)
end
142
+ private_class_method :build_metadata
143
+
144
# Tells whether a chunk lacks a usable embedding.
#
# @param chunk [Hash] chunk whose +:embedding+ entry is inspected
# @return [Boolean] true when the embedding is nil, an empty Array or an
#   Array of zeros; false for any other value (including non-Array values)
def zero_embedding?(chunk)
  vector = chunk[:embedding]

  case vector
  when nil then true
  # `all?` is true for an empty Array, so this also covers the empty case.
  when Array then vector.all?(&:zero?)
  else false
  end
end
150
+ private_class_method :zero_embedding?
151
+
152
# Publishes a +knowledge.query_feedback+ event on a best-effort basis.
#
# Does nothing when +Legion::Events+ is not loaded, and swallows any error
# raised while emitting.
#
# @return [Object, nil] the result of +Legion::Events.emit+, or nil when
#   events are unavailable or emitting raised
def emit_feedback_event(question_hash:, chunk_ids:, retrieval_score:, synthesized:, rating:)
  return nil unless defined?(Legion::Events)

  payload = {
    question_hash: question_hash,
    chunk_ids: chunk_ids,
    retrieval_score: retrieval_score,
    synthesized: synthesized,
    rating: rating
  }
  Legion::Events.emit('knowledge.query_feedback', payload)
rescue StandardError
  nil
end
165
+ private_class_method :emit_feedback_event
166
+
100
167
  def llm_available?
101
168
  defined?(Legion::LLM)
102
169
  end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Knowledge
6
- VERSION = '0.4.0'
6
+ VERSION = '0.5.0'
7
7
  end
8
8
  end
9
9
  end
@@ -8,6 +8,7 @@ require_relative 'knowledge/helpers/chunker'
8
8
  require_relative 'knowledge/runners/ingest'
9
9
  require_relative 'knowledge/runners/query'
10
10
  require_relative 'knowledge/runners/corpus'
11
+ require_relative 'knowledge/runners/maintenance'
11
12
  require_relative 'knowledge/client'
12
13
 
13
14
  if defined?(Legion::Transport)
@@ -17,6 +18,7 @@ if defined?(Legion::Transport)
17
18
  end
18
19
 
19
20
  require_relative 'knowledge/actors/corpus_watcher' if defined?(Legion::Extensions::Actors::Every)
21
+ require_relative 'knowledge/actors/maintenance_runner' if defined?(Legion::Extensions::Actors::Every)
20
22
 
21
23
  require_relative 'knowledge/actors/corpus_ingest' if defined?(Legion::Extensions::Actors::Subscription)
22
24
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-knowledge
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson
@@ -118,6 +118,7 @@ files:
118
118
  - lib/legion/extensions/knowledge.rb
119
119
  - lib/legion/extensions/knowledge/actors/corpus_ingest.rb
120
120
  - lib/legion/extensions/knowledge/actors/corpus_watcher.rb
121
+ - lib/legion/extensions/knowledge/actors/maintenance_runner.rb
121
122
  - lib/legion/extensions/knowledge/client.rb
122
123
  - lib/legion/extensions/knowledge/helpers/chunker.rb
123
124
  - lib/legion/extensions/knowledge/helpers/manifest.rb
@@ -125,6 +126,7 @@ files:
125
126
  - lib/legion/extensions/knowledge/helpers/parser.rb
126
127
  - lib/legion/extensions/knowledge/runners/corpus.rb
127
128
  - lib/legion/extensions/knowledge/runners/ingest.rb
129
+ - lib/legion/extensions/knowledge/runners/maintenance.rb
128
130
  - lib/legion/extensions/knowledge/runners/query.rb
129
131
  - lib/legion/extensions/knowledge/transport/exchanges/knowledge.rb
130
132
  - lib/legion/extensions/knowledge/transport/messages/ingest_message.rb