lex-knowledge 0.2.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a5307a5b8c19abaedd5f7d6be95d0fc3d068fa0b6d7366293c70da0edec7825
4
- data.tar.gz: 756114f38b345f356a826e09c50714d8b14ff2a4baa6ef0087ddab9f2d75ff78
3
+ metadata.gz: 8d512db91b31e6d8a9747a3987aa90dd406d04f8987400787f6de27b13eb10c5
4
+ data.tar.gz: 2d9df5e1289bb80f603dbd882863d1802bf942e8ff12940d259ba44e25b728f3
5
5
  SHA512:
6
- metadata.gz: 6d77ae8947c2ac53af380a35935c33cb075f4ae1a7b8c2b495389701fb15652a7baaefeeddd32606bc81d9e4c6f8f319562fe869906523b34ca4563ce0e245c4
7
- data.tar.gz: a30c57db2c8cb0da0d54ac73afc178ef55892123e4a6878aa1bf1ca62014663c508c699857970f5582a0e0361a5fd3c29349dbe913a6d1ae8a19aed78515fcc0
6
+ metadata.gz: 2022e4654a3a815e8c5433daaad7c2e9767b039c51513e12d9cd6c71a2de47e3ee883c6e0624aa9ac8f6108439c19d4552260694482db4e2da9d58851fa093bd
7
+ data.tar.gz: d1afc8cb8fdcd0317ee9fa416c91827ca3536e40fed404791a355b4b4662360211e3c064dc09e8504d0ed5eb359f1dc906edaa57277f205b1aa9eafedcf261da
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Knowledge
      module Actor
        # Recurring actor that targets the Maintenance runner's `health`
        # function, passing the configured corpus path as its argument.
        class MaintenanceRunner < Legion::Extensions::Actors::Every
          def runner_class
            'Legion::Extensions::Knowledge::Runners::Maintenance'
          end

          def runner_function
            'health'
          end

          def check_subtask?
            false
          end

          def generate_task?
            false
          end

          # Configured `knowledge.actors.maintenance_interval`, or 21_600 when
          # settings are unavailable, unset, or raise while being read.
          # NOTE(review): unit is whatever the Every base class expects
          # (presumably seconds) - confirm.
          def every_interval
            return 21_600 unless defined?(Legion::Settings)
            return 21_600 if Legion::Settings[:knowledge].nil?

            configured = Legion::Settings.dig(:knowledge, :actors, :maintenance_interval)
            configured || 21_600
          rescue StandardError
            21_600
          end

          # True only when a non-empty corpus path is configured; any error
          # while checking counts as disabled.
          def enabled?
            configured = corpus_path
            configured ? !configured.empty? : false
          rescue StandardError
            false
          end

          # Keyword arguments handed to the runner function.
          def args
            { path: corpus_path }
          end

          private

          # Reads `knowledge.corpus_path` from settings; nil when settings are
          # missing or reading them raises.
          def corpus_path
            return nil unless defined?(Legion::Settings)
            return nil if Legion::Settings[:knowledge].nil?

            Legion::Settings.dig(:knowledge, :corpus_path)
          rescue StandardError
            nil
          end
        end
      end
    end
  end
end
@@ -7,6 +7,7 @@ module Legion
7
7
  include Runners::Ingest
8
8
  include Runners::Query
9
9
  include Runners::Corpus
10
+ include Runners::Maintenance
10
11
  end
11
12
  end
12
13
  end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require 'fileutils'
5
+ require 'json'
6
+ require 'tempfile'
7
+
8
module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Persists the file manifest of a corpus as a JSON document so that
        # later ingest runs can be compared against the previous scan.
        #
        # One file is kept per corpus, named after the first 16 hex characters
        # of the SHA-256 digest of the corpus path.
        module ManifestStore
          module_function

          STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze

          # Loads the stored manifest for a corpus.
          #
          # @param corpus_path [String] corpus root the manifest belongs to
          # @param store_dir [String] directory holding manifest files
          # @return [Array<Hash>] entries with symbolized keys; an empty array
          #   when no manifest exists, it cannot be read or parsed, or its
          #   JSON root is not a list
          def load(corpus_path:, store_dir: STORE_DIR)
            path = store_path(corpus_path: corpus_path, store_dir: store_dir)
            return [] unless ::File.exist?(path)

            parsed = ::JSON.parse(::File.read(path, encoding: 'utf-8'), symbolize_names: true)
            # Callers map over the result, so never hand back a non-list root.
            parsed.is_a?(::Array) ? parsed : []
          rescue StandardError
            []
          end

          # Writes the manifest via a temporary file followed by a rename, so
          # readers never observe a partially written document.
          #
          # @param corpus_path [String] corpus root the manifest belongs to
          # @param manifest [Array<Hash>] entries to persist
          # @param store_dir [String] directory holding manifest files
          # @return [Boolean] true on success, false on any error
          def save(corpus_path:, manifest:, store_dir: STORE_DIR)
            ::FileUtils.mkdir_p(store_dir)
            path = store_path(corpus_path: corpus_path, store_dir: store_dir)
            # The pid keeps two processes saving the same corpus from
            # clobbering each other's temporary file.
            tmp = "#{path}.#{::Process.pid}.tmp"
            ::File.write(tmp, ::JSON.generate(manifest.map { |e| serialize_entry(e) }))
            ::File.rename(tmp, path)
            true
          rescue StandardError
            # Do not leave a stale temporary file behind after a failed save.
            ::FileUtils.rm_f(tmp) if tmp
            false
          end

          # @param corpus_path [#to_s] corpus root
          # @param store_dir [String] directory holding manifest files
          # @return [String] absolute path of the manifest file for the corpus
          def store_path(corpus_path:, store_dir: STORE_DIR)
            hash = ::Digest::SHA256.hexdigest(corpus_path.to_s)[0, 16]
            ::File.join(store_dir, "#{hash}.manifest.json")
          end

          # Stringifies :mtime so the entry is stored as plain JSON text.
          def serialize_entry(entry)
            entry.merge(mtime: entry[:mtime].to_s)
          end
          private_class_method :serialize_entry
        end
      end
    end
  end
end
@@ -15,6 +15,8 @@ module Legion
15
15
  parse_markdown(file_path: file_path)
16
16
  when '.txt'
17
17
  parse_text(file_path: file_path)
18
+ when '.pdf', '.docx'
19
+ extract_via_data(file_path: file_path)
18
20
  else
19
21
  [{ error: 'unsupported format', source_file: file_path }]
20
22
  end
@@ -22,32 +24,42 @@ module Legion
22
24
 
23
25
  def parse_markdown(file_path:)
24
26
  content = ::File.read(file_path, encoding: 'utf-8')
25
- sections = []
27
+ sections = []
26
28
  current_heading = ::File.basename(file_path, '.*')
27
29
  current_lines = []
28
- section_path = []
30
+ heading_stack = {}
29
31
 
30
32
  content.each_line do |line|
31
- if line.start_with?('# ')
32
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
33
- current_heading = line.sub(/^#+\s*/, '').chomp
34
- section_path = [current_heading]
35
- current_lines = []
36
- elsif line.start_with?('## ')
37
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
38
- current_heading = line.sub(/^#+\s*/, '').chomp
39
- section_path = section_path.first(1) + [current_heading]
33
+ level = heading_level(line)
34
+ if level
35
+ flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
36
+ title = line.sub(/^#+\s*/, '').chomp
37
+ heading_stack.delete_if { |d, _| d >= level }
38
+ heading_stack[level] = title
39
+ current_heading = title
40
40
  current_lines = []
41
41
  else
42
42
  current_lines << line
43
43
  end
44
44
  end
45
45
 
46
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
46
+ flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
47
47
 
48
48
  sections.empty? ? [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: content.strip, source_file: file_path }] : sections
49
49
  end
50
50
 
51
# Extracts text from a binary document (.pdf/.docx) through
# Legion::Data::Extract when that constant is loaded.
#
# Returns a one-element array: a single section covering the whole file
# on success, otherwise an error hash ('unsupported format' when the
# extractor is not loaded, 'extraction_failed' when it yields no text or
# raises).
def extract_via_data(file_path:)
  if defined?(::Legion::Data::Extract)
    extracted = ::Legion::Data::Extract.extract(file_path, type: :auto)
    text = extracted.is_a?(Hash) ? extracted[:text] : nil

    if text
      title = ::File.basename(file_path, '.*')
      [{ heading: title, section_path: [], content: text.strip, source_file: file_path }]
    else
      [{ error: 'extraction_failed', source_file: file_path, detail: extracted }]
    end
  else
    [{ error: 'unsupported format', source_file: file_path }]
  end
rescue StandardError => e
  [{ error: 'extraction_failed', source_file: file_path, detail: e.message }]
end
62
+
51
63
  def parse_text(file_path:)
52
64
  content = ::File.read(file_path, encoding: 'utf-8')
53
65
  heading = ::File.basename(file_path, '.*')
@@ -67,6 +79,17 @@ module Legion
67
79
  }
68
80
  end
69
81
  private_class_method :flush_section
82
+
83
# Depth (1-6) of a markdown ATX heading line, or nil when the line does
# not start with one to six '#' characters followed by whitespace.
def heading_level(line)
  hashes = line[/^(\#{1,6})\s/, 1]
  hashes&.length
end
87
+ private_class_method :heading_level
88
+
89
# Turns the {level => title} heading stack into the list of titles
# ordered from the shallowest level to the deepest.
def build_section_path(stack)
  stack.sort_by { |depth, _title| depth }.map(&:last)
end
92
+ private_class_method :build_section_path
70
93
  end
71
94
  end
72
95
  end
@@ -7,6 +7,10 @@ module Legion
7
7
  module Corpus
8
8
  module_function
9
9
 
10
# Location of the stored manifest file for the corpus rooted at `path`,
# as computed by Helpers::ManifestStore.store_path.
def manifest_path(path:) = Helpers::ManifestStore.store_path(corpus_path: path)
13
+
10
14
  def corpus_stats(path:, extensions: nil)
11
15
  return { success: false, error: 'path does not exist' } unless ::File.exist?(path)
12
16
 
@@ -23,23 +23,32 @@ module Legion
23
23
  end
24
24
 
25
25
  def ingest_corpus(path:, dry_run: false, force: false)
26
- entries = Helpers::Manifest.scan(path: path)
27
-
28
- files_scanned = entries.size
29
- chunks_created = 0
30
- chunks_skipped = 0
31
- chunks_updated = 0
32
-
33
- entries.each do |entry|
34
- result = process_file(entry[:path], dry_run: dry_run, force: force)
35
- chunks_created += result[:created]
36
- chunks_skipped += result[:skipped]
37
- chunks_updated += result[:updated]
26
+ current = Helpers::Manifest.scan(path: path)
27
+ previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
28
+ delta = Helpers::Manifest.diff(current: current, previous: previous)
29
+
30
+ to_process = delta[:added] + delta[:changed]
31
+ chunks_created = 0
32
+ chunks_skipped = 0
33
+ chunks_updated = 0
34
+
35
+ to_process.each do |file_path|
36
+ result = process_file(file_path, dry_run: dry_run, force: force)
37
+ chunks_created += result[:created]
38
+ chunks_skipped += result[:skipped]
39
+ chunks_updated += result[:updated]
38
40
  end
39
41
 
42
+ delta[:removed].each { |file_path| retire_file(file_path: file_path) } unless dry_run
43
+
44
+ Helpers::ManifestStore.save(corpus_path: path, manifest: current) unless dry_run
45
+
40
46
  {
41
47
  success: true,
42
- files_scanned: files_scanned,
48
+ files_scanned: current.size,
49
+ files_added: delta[:added].size,
50
+ files_changed: delta[:changed].size,
51
+ files_removed: delta[:removed].size,
43
52
  chunks_created: chunks_created,
44
53
  chunks_skipped: chunks_skipped,
45
54
  chunks_updated: chunks_updated
@@ -67,12 +76,18 @@ module Legion
67
76
  return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)
68
77
 
69
78
  chunks = Helpers::Chunker.chunk(sections: sections)
79
+ paired = if dry_run
80
+ chunks.map { |c| { chunk: c, embedding: nil } }
81
+ else
82
+ batch_embed_chunks(chunks, force: force)
83
+ end
84
+
70
85
  created = 0
71
86
  skipped = 0
72
87
  updated = 0
73
88
 
74
- chunks.each do |chunk|
75
- outcome = upsert_chunk(chunk, dry_run: dry_run, force: force)
89
+ paired.each do |p|
90
+ outcome = upsert_chunk_with_embedding(p[:chunk], p[:embedding], dry_run: dry_run, force: force, exists: p[:exists] || false)
76
91
  case outcome
77
92
  when :created then created += 1
78
93
  when :skipped then skipped += 1
@@ -84,21 +99,55 @@ module Legion
84
99
  end
85
100
  private_class_method :process_file
86
101
 
87
- def upsert_chunk(chunk, dry_run: false, force: false)
88
- return :created if dry_run
102
+ def batch_embed_chunks(chunks, force:)
103
+ exists_map = force ? {} : build_exists_map(chunks)
104
+ return paired_without_embed(chunks, exists_map) unless llm_embed_available?
89
105
 
90
- return :created unless defined?(Legion::Extensions::Apollo)
106
+ needs_embed = force ? chunks : chunks.reject { |c| exists_map[c[:content_hash]] }
107
+ embed_map = needs_embed.empty? ? {} : build_embed_map(needs_embed)
91
108
 
92
- return :skipped if !force && chunk_exists?(chunk[:content_hash])
109
+ chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } }
110
+ rescue StandardError
111
+ paired_without_embed(chunks, {})
112
+ end
113
+ private_class_method :batch_embed_chunks
93
114
 
94
- embedding = generate_embedding(chunk[:content])
95
- ingest_to_apollo(chunk, embedding)
115
# Maps each chunk's content hash to the result of chunk_exists? for it.
def build_exists_map(chunks)
  chunks.each_with_object({}) do |chunk, known|
    digest = chunk[:content_hash]
    known[digest] = chunk_exists?(digest)
  end
end
118
+ private_class_method :build_exists_map
119
+
120
# Truthy only when Legion::LLM is loaded and responds to embed_batch.
def llm_embed_available?
  return unless defined?(Legion::LLM)

  Legion::LLM.respond_to?(:embed_batch)
end
123
+ private_class_method :llm_embed_available?
124
+
125
# Pairs every chunk with a nil embedding plus its known existence flag
# (false when the hash is absent from exists_map).
def paired_without_embed(chunks, exists_map)
  chunks.map do |chunk|
    already_stored = exists_map.fetch(chunk[:content_hash], false)
    { chunk: chunk, embedding: nil, exists: already_stored }
  end
end
128
+ private_class_method :paired_without_embed
96
129
 
130
# Embeds the given chunks in one Legion::LLM.embed_batch call and maps
# each chunk's content hash to its vector.
#
# Result entries are matched back to chunks through their :index value.
# Entries that carry an :error, are not hashes, or whose :index does not
# point at one of the submitted chunks are skipped individually, so a
# single malformed entry no longer discards the whole batch.
#
# @param needs_embed [Array<Hash>] chunks with :content and :content_hash
# @return [Hash] content_hash => vector; empty when the batch call raises
def build_embed_map(needs_embed)
  results = Legion::LLM.embed_batch(needs_embed.map { |c| c[:content] })
  results.each_with_object({}) do |r, h|
    next unless r.is_a?(Hash)
    next if r[:error]

    idx = r[:index]
    # A negative index would silently wrap around to the wrong chunk.
    next unless idx.is_a?(Integer) && idx >= 0

    chunk = needs_embed[idx]
    next if chunk.nil?

    h[chunk[:content_hash]] = r[:vector]
  end
rescue StandardError
  {}
end
138
+ private_class_method :build_embed_map
139
+
140
# Stores one chunk (with its precomputed embedding) through
# ingest_to_apollo and reports the outcome as a symbol.
#
# :created - dry run, Apollo extension not loaded, or a fresh insert
# :updated - stored while force was set
# :skipped - chunk already exists and force is off, or storing raised
def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, exists: false)
  if dry_run
    :created
  elsif defined?(Legion::Extensions::Apollo).nil?
    :created
  elsif exists && !force
    :skipped
  else
    ingest_to_apollo(chunk, embedding)
    force ? :updated : :created
  end
rescue StandardError
  :skipped
end
101
- private_class_method :upsert_chunk
150
+ private_class_method :upsert_chunk_with_embedding
102
151
 
103
152
  def chunk_exists?(content_hash)
104
153
  return false unless defined?(Legion::Data::Model::ApolloEntry)
@@ -112,16 +161,6 @@ module Legion
112
161
  end
113
162
  private_class_method :chunk_exists?
114
163
 
115
- def generate_embedding(content)
116
- return nil unless defined?(Legion::LLM) && Legion::LLM.respond_to?(:embed)
117
-
118
- result = Legion::LLM.embed(content)
119
- result.is_a?(Hash) ? result[:vector] : nil
120
- rescue StandardError
121
- nil
122
- end
123
- private_class_method :generate_embedding
124
-
125
164
  def ingest_to_apollo(chunk, embedding)
126
165
  return unless defined?(Legion::Extensions::Apollo)
127
166
 
@@ -143,6 +182,21 @@ module Legion
143
182
  Legion::Extensions::Apollo::Runners::Knowledge.handle_ingest(**payload)
144
183
  end
145
184
  private_class_method :ingest_to_apollo
185
+
186
# Records that a source file disappeared from the corpus by ingesting a
# 'document_retired' marker through Legion::Apollo.
#
# Does nothing (returns nil) when Legion::Apollo is not loaded, lacks
# `ingest`, is not started, or the call raises.
def retire_file(file_path:)
  return unless defined?(Legion::Apollo)

  apollo = Legion::Apollo
  return unless apollo.respond_to?(:ingest) && apollo.started?

  marker = {
    content: file_path,
    content_type: 'document_retired',
    tags: [file_path, 'retired', 'document_chunk'].uniq,
    metadata: { source_file: file_path, retired: true }
  }
  apollo.ingest(**marker)
rescue StandardError
  nil
end
199
+ private_class_method :retire_file
146
200
  end
147
201
  end
148
202
  end
@@ -0,0 +1,315 @@
1
# frozen_string_literal: true

# Time#iso8601 is used below; on Rubies older than 3.4 it is only available
# after requiring the 'time' standard library.
require 'time'

module Legion
  module Extensions
    module Knowledge
      module Runners
        # Maintenance operations for an ingested corpus: orphan detection and
        # archival, forced re-indexing, health statistics and a quality report.
        #
        # "Document chunks" are ApolloEntry rows tagged 'document_chunk' whose
        # status is not 'archived'; their originating file is read from the
        # `source_context->>'source_file'` JSON field.
        module Maintenance
          module_function

          # Lists source files that still have chunks stored but are absent
          # from the saved manifest of the corpus.
          #
          # NOTE(review): a missing or unreadable manifest loads as an empty
          # list, in which case every stored source file is reported here.
          #
          # @param path [String] corpus root
          # @return [Hash] success flag with orphan_count, orphan_files,
          #   total_apollo_chunks and total_manifest_files, or an error message
          def detect_orphans(path:)
            manifest_files = load_manifest_files(path)
            apollo_files = load_apollo_source_files

            orphan_files = apollo_files - manifest_files

            {
              success: true,
              orphan_count: orphan_files.size,
              orphan_files: orphan_files,
              total_apollo_chunks: count_apollo_chunks,
              total_manifest_files: manifest_files.size
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Archives the chunks of orphaned files. With dry_run (the default)
          # nothing is written and the detection result is returned with the
          # would-be counts merged in.
          #
          # @param path [String] corpus root
          # @param dry_run [Boolean] report only when true
          # @return [Hash] result hash; `archived` is the number of updated
          #   rows on a real run
          def cleanup_orphans(path:, dry_run: true)
            detection = detect_orphans(path: path)
            return detection unless detection[:success]
            return detection.merge(archived: 0, files_cleaned: 0, dry_run: dry_run) if detection[:orphan_count].zero?
            return detection.merge(archived: detection[:orphan_count], files_cleaned: detection[:orphan_files].size, dry_run: true) if dry_run

            archived = archive_orphan_entries(detection[:orphan_files])

            { success: true, archived: archived, files_cleaned: detection[:orphan_files].size, dry_run: false }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Deletes the stored manifest and re-ingests the whole corpus with
          # force enabled.
          #
          # @param path [String] corpus root
          # @return [Hash] the ingest result, or an error hash
          def reindex(path:)
            store_path = Helpers::ManifestStore.store_path(corpus_path: path)
            ::FileUtils.rm_f(store_path)

            Runners::Ingest.ingest_corpus(path: path, force: true)
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Combined local, stored-chunk and sync statistics for a corpus.
          # `last_ingest` is the modification time of the manifest file.
          #
          # @param path [String] corpus root
          # @return [Hash] success flag with :local, :apollo and :sync sections
          def health(path:)
            scan_entries = Helpers::Manifest.scan(path: path)
            store_path = Helpers::ManifestStore.store_path(corpus_path: path)
            manifest_file = ::File.exist?(store_path)
            last_ingest = manifest_file ? ::File.mtime(store_path).iso8601 : nil

            {
              success: true,
              local: build_local_stats(path, scan_entries, manifest_file, last_ingest),
              apollo: build_apollo_stats,
              sync: build_sync_stats(path, scan_entries)
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Most-accessed, never-accessed-and-old, and low-confidence chunks
          # plus summary counters. `poor_retrieval` is always empty here.
          #
          # @param limit [Integer, nil] rows per list; falls back to the
          #   `knowledge.maintenance.quality_report_limit` setting or 10
          # @return [Hash]
          def quality_report(limit: nil)
            resolved_limit = limit || settings_quality_limit

            {
              success: true,
              hot_chunks: hot_chunks(resolved_limit),
              cold_chunks: cold_chunks(resolved_limit),
              low_confidence: low_confidence_chunks(resolved_limit),
              poor_retrieval: [],
              summary: quality_summary
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Dataset of non-archived entries tagged 'document_chunk'. Callers
          # must check that the ApolloEntry model is defined first.
          def document_chunks
            Legion::Data::Model::ApolloEntry
              .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
              .exclude(status: 'archived')
          end
          private_class_method :document_chunks

          def build_local_stats(path, scan_entries, manifest_file, last_ingest)
            {
              corpus_path: path,
              file_count: scan_entries.size,
              total_bytes: scan_entries.sum { |e| e[:size] },
              manifest_exists: manifest_file,
              last_ingest: last_ingest
            }
          end
          private_class_method :build_local_stats

          # Aggregate statistics over all document chunks; zeroed defaults
          # when the model is unavailable, nothing is stored, or a query fails.
          def build_apollo_stats
            return apollo_defaults unless defined?(Legion::Data::Model::ApolloEntry)

            base = document_chunks
            total = base.count
            return apollo_defaults if total.zero?

            rows = base.select(:confidence, :status, :access_count, :embedding, :created_at).all
            apollo_stats_from_rows(base, rows, total)
          rescue StandardError
            apollo_defaults
          end
          private_class_method :build_apollo_stats

          def apollo_stats_from_rows(base, rows, total)
            confidences = rows.map { |r| r[:confidence].to_f }
            with_embeddings = rows.count { |r| !r[:embedding].nil? }
            stale_threshold = settings_stale_threshold
            timestamps = rows.map { |r| r[:created_at] }

            {
              total_chunks: total,
              by_status: base.group_and_count(:status).as_hash(:status, :count).transform_keys(&:to_sym),
              embedding_coverage: (with_embeddings.to_f / total).round(4),
              avg_confidence: confidences.sum / confidences.size.to_f,
              confidence_range: confidences.minmax,
              stale_count: confidences.count { |c| c < stale_threshold },
              never_accessed: rows.count { |r| r[:access_count].to_i.zero? },
              unique_source_files: load_apollo_source_files.size,
              oldest_chunk: timestamps.min&.iso8601,
              newest_chunk: timestamps.max&.iso8601
            }
          end
          private_class_method :apollo_stats_from_rows

          def apollo_defaults
            {
              total_chunks: 0,
              by_status: {},
              embedding_coverage: 0.0,
              avg_confidence: 0.0,
              confidence_range: [0.0, 0.0],
              stale_count: 0,
              never_accessed: 0,
              unique_source_files: 0,
              oldest_chunk: nil,
              newest_chunk: nil
            }
          end
          private_class_method :apollo_defaults

          # orphan_count: stored source files missing from the manifest;
          # missing_count: scanned files that have no stored chunks.
          def build_sync_stats(path, scan_entries)
            manifest_paths = load_manifest_files(path)
            apollo_paths = load_apollo_source_files
            scan_paths = scan_entries.map { |e| e[:path] }

            {
              orphan_count: (apollo_paths - manifest_paths).size,
              missing_count: (scan_paths - apollo_paths).size
            }
          end
          private_class_method :build_sync_stats

          def load_manifest_files(path)
            manifest = Helpers::ManifestStore.load(corpus_path: path)
            manifest.map { |e| e[:path] }.compact.uniq
          end
          private_class_method :load_manifest_files

          # Distinct source files that currently have document chunks stored.
          def load_apollo_source_files
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            document_chunks
              .select_map(Sequel.lit("source_context->>'source_file'"))
              .compact
              .uniq
          rescue StandardError
            []
          end
          private_class_method :load_apollo_source_files

          def count_apollo_chunks
            return 0 unless defined?(Legion::Data::Model::ApolloEntry)

            document_chunks.count
          rescue StandardError
            0
          end
          private_class_method :count_apollo_chunks

          # Marks every chunk of the given files as archived and returns the
          # value of the dataset update (the number of affected rows).
          def archive_orphan_entries(orphan_files)
            return 0 unless defined?(Legion::Data::Model::ApolloEntry)

            document_chunks
              .where(Sequel.lit("source_context->>'source_file' IN ?", orphan_files))
              .update(status: 'archived', updated_at: Time.now)
          end
          private_class_method :archive_orphan_entries

          # Chunks with at least one access, most accessed first.
          def hot_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            # Inside a virtual-row block `access_count` is an SQL identifier,
            # not an Integer, so the comparison operator must be used;
            # `.positive?` would raise and be swallowed by the rescue below.
            document_chunks
              .where { access_count > 0 } # rubocop:disable Style/NumericPredicate
              .order(Sequel.desc(:access_count))
              .limit(limit)
              .select_map([:id, :access_count, :confidence,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :hot_chunks

          # Never-accessed chunks older than the cold-chunk age setting,
          # oldest first.
          def cold_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            days = settings_cold_chunk_days
            cutoff = Time.now - (days * 86_400)

            document_chunks
              .where(access_count: 0)
              .where { created_at < cutoff }
              .order(:created_at)
              .limit(limit)
              .select_map([:id, :confidence, :created_at,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :cold_chunks

          # Chunks whose confidence is below the stale threshold, lowest first.
          def low_confidence_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            threshold = settings_stale_threshold

            document_chunks
              .where { confidence < threshold }
              .order(:confidence)
              .limit(limit)
              .select_map([:id, :confidence, :access_count,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :low_confidence_chunks

          def quality_summary
            defaults = { total_queries: 0, avg_retrieval_score: nil, chunks_never_accessed: 0,
                         chunks_below_threshold: 0 }
            return defaults unless defined?(Legion::Data::Model::ApolloEntry)

            base = document_chunks
            # Resolve the setting before entering the virtual-row block: a
            # bare method name inside the block would be turned into an SQL
            # identifier instead of calling this module's method.
            threshold = settings_stale_threshold

            {
              total_queries: query_count,
              avg_retrieval_score: nil,
              chunks_never_accessed: base.where(access_count: 0).count,
              chunks_below_threshold: base.where { confidence < threshold }.count
            }
          rescue StandardError
            defaults
          end
          private_class_method :quality_summary

          def query_count
            return 0 unless defined?(Legion::Data::Model::ApolloAccessLog)

            Legion::Data::Model::ApolloAccessLog.where(action: 'knowledge_query').count
          rescue StandardError
            0
          end
          private_class_method :query_count

          def settings_stale_threshold
            return 0.3 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :stale_threshold) || 0.3
          rescue StandardError
            0.3
          end
          private_class_method :settings_stale_threshold

          def settings_cold_chunk_days
            return 7 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :cold_chunk_days) || 7
          rescue StandardError
            7
          end
          private_class_method :settings_cold_chunk_days

          def settings_quality_limit
            return 10 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :quality_report_limit) || 10
          rescue StandardError
            10
          end
          private_class_method :settings_quality_limit
        end
      end
    end
  end
end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'digest'
4
+
3
5
  module Legion
4
6
  module Extensions
5
7
  module Knowledge
@@ -17,15 +19,21 @@ module Legion
17
19
 
18
20
  latency_ms = ((::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - started) * 1000).round
19
21
 
22
+ score = average_score(chunks)
23
+ unless chunks.empty?
24
+ record_feedback(
25
+ question: question,
26
+ chunk_ids: chunks.filter_map { |c| c[:id] },
27
+ retrieval_score: score.to_f,
28
+ synthesized: synthesize && llm_available?
29
+ )
30
+ end
31
+
20
32
  {
21
33
  success: true,
22
34
  answer: answer,
23
35
  sources: chunks.map { |c| format_source(c) },
24
- metadata: {
25
- retrieval_score: average_score(chunks),
26
- chunk_count: chunks.size,
27
- latency_ms: latency_ms
28
- }
36
+ metadata: build_metadata(chunks, score, latency_ms)
29
37
  }
30
38
  rescue StandardError => e
31
39
  { success: false, error: e.message }
@@ -38,14 +46,26 @@ module Legion
38
46
  {
39
47
  success: true,
40
48
  sources: chunks.map { |c| format_source(c) },
41
- metadata: {
42
- chunk_count: chunks.size
43
- }
49
+ metadata: build_metadata(chunks, average_score(chunks))
44
50
  }
45
51
  rescue StandardError => e
46
52
  { success: false, error: e.message }
47
53
  end
48
54
 
55
# Emits a feedback event for a query. The question itself is not sent;
# it is identified by the first 16 hex characters of its SHA-256 digest.
#
# Returns { success: true, question_hash:, rating: } or, when anything
# raises, { success: false, error: }.
def record_feedback(question:, chunk_ids:, retrieval_score:, synthesized: true, rating: nil)
  fingerprint = ::Digest::SHA256.hexdigest(question.to_s)[0, 16]
  event = {
    question_hash: fingerprint,
    chunk_ids: chunk_ids,
    retrieval_score: retrieval_score,
    synthesized: synthesized,
    rating: rating
  }
  emit_feedback_event(**event)

  { success: true, question_hash: fingerprint, rating: rating }
rescue StandardError => e
  { success: false, error: e.message }
end
68
+
49
69
  def retrieve_chunks(question, top_k)
50
70
  return [] unless defined?(Legion::Extensions::Apollo)
51
71
 
@@ -97,6 +117,53 @@ module Legion
97
117
  end
98
118
  private_class_method :average_score
99
119
 
120
# Summarises a set of retrieved chunks: score, counts, confidence and
# distance spreads, distinct source files, embedding coverage and a
# per-status tally. :latency_ms is only present when a value is given.
def build_metadata(chunks, score, latency_ms = nil)
  confidence_values = chunks.filter_map { |chunk| chunk[:confidence] }
  distance_values = chunks.filter_map { |chunk| chunk[:distance] }
  files = chunks.filter_map { |chunk| chunk.dig(:metadata, :source_file) || chunk[:source_file] }.uniq

  confidence_avg = nil
  confidence_range = nil
  unless confidence_values.empty?
    confidence_avg = (confidence_values.sum.to_f / confidence_values.size).round(4)
    confidence_range = confidence_values.minmax
  end

  summary = {
    retrieval_score: score,
    chunk_count: chunks.size,
    confidence_avg: confidence_avg,
    confidence_range: confidence_range,
    distance_range: distance_values.empty? ? nil : distance_values.minmax,
    source_files: files,
    source_file_count: files.size,
    all_embedded: chunks.none? { |chunk| zero_embedding?(chunk) },
    statuses: chunks.map { |chunk| chunk[:status] }.tally
  }
  return summary if latency_ms.nil?

  summary.merge(latency_ms: latency_ms)
end
142
+ private_class_method :build_metadata
143
+
144
# True when the chunk has no usable embedding: nil, an empty array, or an
# array made only of zeros. Any other non-array value counts as embedded.
def zero_embedding?(chunk)
  case (vector = chunk[:embedding])
  when nil then true
  when Array then vector.all?(&:zero?)
  else false
  end
end
150
+ private_class_method :zero_embedding?
151
+
152
# Publishes a 'knowledge.query_feedback' event through Legion::Events.
# Best effort: returns nil when the event bus is not loaded or emitting
# raises.
def emit_feedback_event(question_hash:, chunk_ids:, retrieval_score:, synthesized:, rating:)
  return unless defined?(Legion::Events)

  payload = {
    question_hash: question_hash,
    chunk_ids: chunk_ids,
    retrieval_score: retrieval_score,
    synthesized: synthesized,
    rating: rating
  }
  Legion::Events.emit('knowledge.query_feedback', payload)
rescue StandardError
  nil
end
165
+ private_class_method :emit_feedback_event
166
+
100
167
  def llm_available?
101
168
  defined?(Legion::LLM)
102
169
  end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Knowledge
6
- VERSION = '0.2.0'
6
+ VERSION = '0.5.0'
7
7
  end
8
8
  end
9
9
  end
@@ -2,11 +2,13 @@
2
2
 
3
3
  require_relative 'knowledge/version'
4
4
  require_relative 'knowledge/helpers/manifest'
5
+ require_relative 'knowledge/helpers/manifest_store'
5
6
  require_relative 'knowledge/helpers/parser'
6
7
  require_relative 'knowledge/helpers/chunker'
7
8
  require_relative 'knowledge/runners/ingest'
8
9
  require_relative 'knowledge/runners/query'
9
10
  require_relative 'knowledge/runners/corpus'
11
+ require_relative 'knowledge/runners/maintenance'
10
12
  require_relative 'knowledge/client'
11
13
 
12
14
  if defined?(Legion::Transport)
@@ -16,6 +18,7 @@ if defined?(Legion::Transport)
16
18
  end
17
19
 
18
20
  require_relative 'knowledge/actors/corpus_watcher' if defined?(Legion::Extensions::Actors::Every)
21
+ require_relative 'knowledge/actors/maintenance_runner' if defined?(Legion::Extensions::Actors::Every)
19
22
 
20
23
  require_relative 'knowledge/actors/corpus_ingest' if defined?(Legion::Extensions::Actors::Subscription)
21
24
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-knowledge
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson
@@ -118,12 +118,15 @@ files:
118
118
  - lib/legion/extensions/knowledge.rb
119
119
  - lib/legion/extensions/knowledge/actors/corpus_ingest.rb
120
120
  - lib/legion/extensions/knowledge/actors/corpus_watcher.rb
121
+ - lib/legion/extensions/knowledge/actors/maintenance_runner.rb
121
122
  - lib/legion/extensions/knowledge/client.rb
122
123
  - lib/legion/extensions/knowledge/helpers/chunker.rb
123
124
  - lib/legion/extensions/knowledge/helpers/manifest.rb
125
+ - lib/legion/extensions/knowledge/helpers/manifest_store.rb
124
126
  - lib/legion/extensions/knowledge/helpers/parser.rb
125
127
  - lib/legion/extensions/knowledge/runners/corpus.rb
126
128
  - lib/legion/extensions/knowledge/runners/ingest.rb
129
+ - lib/legion/extensions/knowledge/runners/maintenance.rb
127
130
  - lib/legion/extensions/knowledge/runners/query.rb
128
131
  - lib/legion/extensions/knowledge/transport/exchanges/knowledge.rb
129
132
  - lib/legion/extensions/knowledge/transport/messages/ingest_message.rb