lex-knowledge 0.2.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +48 -0
- data/lib/legion/extensions/knowledge/client.rb +1 -0
- data/lib/legion/extensions/knowledge/helpers/manifest_store.rb +51 -0
- data/lib/legion/extensions/knowledge/helpers/parser.rb +35 -12
- data/lib/legion/extensions/knowledge/runners/corpus.rb +4 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +86 -32
- data/lib/legion/extensions/knowledge/runners/maintenance.rb +315 -0
- data/lib/legion/extensions/knowledge/runners/query.rb +75 -8
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +3 -0
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8d512db91b31e6d8a9747a3987aa90dd406d04f8987400787f6de27b13eb10c5
|
|
4
|
+
data.tar.gz: 2d9df5e1289bb80f603dbd882863d1802bf942e8ff12940d259ba44e25b728f3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2022e4654a3a815e8c5433daaad7c2e9767b039c51513e12d9cd6c71a2de47e3ee883c6e0624aa9ac8f6108439c19d4552260694482db4e2da9d58851fa093bd
|
|
7
|
+
data.tar.gz: d1afc8cb8fdcd0317ee9fa416c91827ca3536e40fed404791a355b4b4662360211e3c064dc09e8504d0ed5eb359f1dc906edaa57277f205b1aa9eafedcf261da
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Extensions
    module Knowledge
      module Actor
        # Interval actor that dispatches the `health` function of the knowledge
        # Maintenance runner for the configured corpus path.
        #
        # NOTE(review): the interval is presumably expressed in seconds
        # (21_600 would then be six hours) -- confirm against Actors::Every.
        class MaintenanceRunner < Legion::Extensions::Actors::Every
          # Fully-qualified name of the runner module this actor dispatches to.
          def runner_class
            'Legion::Extensions::Knowledge::Runners::Maintenance'
          end

          # Name of the runner function invoked on every tick.
          def runner_function
            'health'
          end

          def check_subtask?
            false
          end

          def generate_task?
            false
          end

          # Interval read from settings key knowledge.actors.maintenance_interval.
          # Falls back to 21_600 when the settings are missing, the key is unset,
          # or reading the settings raises.
          def every_interval
            return 21_600 unless knowledge_settings?

            Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || 21_600
          rescue StandardError
            21_600
          end

          # The actor is enabled only when a non-empty corpus path is configured.
          # Any error while checking is treated as "disabled".
          def enabled?
            location = corpus_path
            location ? !location.empty? : false
          rescue StandardError
            false
          end

          # Keyword arguments handed to the runner function.
          def args
            { path: corpus_path }
          end

          private

          # Corpus path from settings key knowledge.corpus_path, or nil when the
          # knowledge settings are unavailable or reading them raises.
          def corpus_path
            return nil unless knowledge_settings?

            Legion::Settings.dig(:knowledge, :corpus_path)
          rescue StandardError
            nil
          end

          # Truthy when Legion::Settings is loaded and has a :knowledge section.
          def knowledge_settings?
            defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'tempfile'
|
|
7
|
+
|
|
8
|
+
module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Persists corpus manifests (arrays of per-file entry hashes) as JSON
        # files under STORE_DIR, one file per corpus path.
        module ManifestStore
          module_function

          # Directory that holds every stored manifest file.
          STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze

          # Reads the stored manifest for a corpus.
          #
          # @param corpus_path [#to_s] corpus location used to derive the store file name
          # @return [Array<Hash>] manifest entries with symbolized keys; an empty
          #   array when the file is missing, unreadable, not valid JSON, or does
          #   not contain a JSON array
          def load(corpus_path:)
            path = store_path(corpus_path: corpus_path)
            return [] unless ::File.exist?(path)

            parsed = ::JSON.parse(::File.read(path, encoding: 'utf-8'), symbolize_names: true)
            # Callers index every entry by symbol key, so anything that is not an
            # array of hashes is treated like a corrupt manifest.
            parsed.is_a?(Array) ? parsed.grep(Hash) : []
          rescue StandardError
            []
          end

          # Writes the manifest for a corpus by writing a sibling ".tmp" file and
          # renaming it over the final path.
          #
          # @param corpus_path [#to_s] corpus location used to derive the store file name
          # @param manifest [Array<Hash>] entries; each entry's :mtime is stored as a String
          # @return [Boolean] true on success, false when any step raised
          def save(corpus_path:, manifest:)
            ::FileUtils.mkdir_p(STORE_DIR)
            path = store_path(corpus_path: corpus_path)
            tmp = "#{path}.tmp"
            ::File.write(tmp, ::JSON.generate(manifest.map { |e| serialize_entry(e) }))
            ::File.rename(tmp, path)
            true
          rescue StandardError
            # Do not leave a half-written temp file behind after a failed save.
            ::FileUtils.rm_f(tmp) if tmp
            false
          end

          # Path of the manifest file for a corpus: the first 16 hex characters of
          # the SHA-256 of the corpus path string, suffixed with ".manifest.json".
          #
          # @param corpus_path [#to_s]
          # @return [String]
          def store_path(corpus_path:)
            hash = ::Digest::SHA256.hexdigest(corpus_path.to_s)[0, 16]
            ::File.join(STORE_DIR, "#{hash}.manifest.json")
          end

          # Returns a copy of the entry with :mtime converted to a String so the
          # entry can be emitted as JSON.
          def serialize_entry(entry)
            entry.merge(mtime: entry[:mtime].to_s)
          end
          private_class_method :serialize_entry
        end
      end
    end
  end
end
|
|
@@ -15,6 +15,8 @@ module Legion
|
|
|
15
15
|
parse_markdown(file_path: file_path)
|
|
16
16
|
when '.txt'
|
|
17
17
|
parse_text(file_path: file_path)
|
|
18
|
+
when '.pdf', '.docx'
|
|
19
|
+
extract_via_data(file_path: file_path)
|
|
18
20
|
else
|
|
19
21
|
[{ error: 'unsupported format', source_file: file_path }]
|
|
20
22
|
end
|
|
@@ -22,32 +24,42 @@ module Legion
|
|
|
22
24
|
|
|
23
25
|
def parse_markdown(file_path:)
|
|
24
26
|
content = ::File.read(file_path, encoding: 'utf-8')
|
|
25
|
-
sections
|
|
27
|
+
sections = []
|
|
26
28
|
current_heading = ::File.basename(file_path, '.*')
|
|
27
29
|
current_lines = []
|
|
28
|
-
|
|
30
|
+
heading_stack = {}
|
|
29
31
|
|
|
30
32
|
content.each_line do |line|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
current_heading
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
current_heading = line.sub(/^#+\s*/, '').chomp
|
|
39
|
-
section_path = section_path.first(1) + [current_heading]
|
|
33
|
+
level = heading_level(line)
|
|
34
|
+
if level
|
|
35
|
+
flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
|
|
36
|
+
title = line.sub(/^#+\s*/, '').chomp
|
|
37
|
+
heading_stack.delete_if { |d, _| d >= level }
|
|
38
|
+
heading_stack[level] = title
|
|
39
|
+
current_heading = title
|
|
40
40
|
current_lines = []
|
|
41
41
|
else
|
|
42
42
|
current_lines << line
|
|
43
43
|
end
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
-
flush_section(sections, current_heading,
|
|
46
|
+
flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
|
|
47
47
|
|
|
48
48
|
sections.empty? ? [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: content.strip, source_file: file_path }] : sections
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
+
# Extracts text from a binary document (used for '.pdf' and '.docx') through
# Legion::Data::Extract and wraps it in a single section.
#
# Returns a one-element array: either the section hash, or an error hash
# ('unsupported format' when the extractor is not loaded, 'extraction_failed'
# when it returns no text or raises).
def extract_via_data(file_path:)
  if defined?(::Legion::Data::Extract)
    extracted = ::Legion::Data::Extract.extract(file_path, type: :auto)

    if extracted.is_a?(Hash) && extracted[:text]
      title = ::File.basename(file_path, '.*')
      [{ heading: title, section_path: [], content: extracted[:text].strip, source_file: file_path }]
    else
      [{ error: 'extraction_failed', source_file: file_path, detail: extracted }]
    end
  else
    [{ error: 'unsupported format', source_file: file_path }]
  end
rescue StandardError => e
  [{ error: 'extraction_failed', source_file: file_path, detail: e.message }]
end
|
|
62
|
+
|
|
51
63
|
def parse_text(file_path:)
|
|
52
64
|
content = ::File.read(file_path, encoding: 'utf-8')
|
|
53
65
|
heading = ::File.basename(file_path, '.*')
|
|
@@ -67,6 +79,17 @@ module Legion
|
|
|
67
79
|
}
|
|
68
80
|
end
|
|
69
81
|
private_class_method :flush_section
|
|
82
|
+
|
|
83
|
+
# Returns the heading depth (1-6) when the line starts with one to six '#'
# characters followed by whitespace; nil for any other line.
def heading_level(line)
  hashes = line[/^(\#{1,6})\s/, 1]
  hashes&.length
end
|
|
87
|
+
private_class_method :heading_level
|
|
88
|
+
|
|
89
|
+
# Turns the level => title heading stack into an array of titles ordered
# from the shallowest heading level to the deepest.
def build_section_path(stack)
  stack.sort_by { |level, _title| level }.map(&:last)
end
|
|
92
|
+
private_class_method :build_section_path
|
|
70
93
|
end
|
|
71
94
|
end
|
|
72
95
|
end
|
|
@@ -7,6 +7,10 @@ module Legion
|
|
|
7
7
|
module Corpus
|
|
8
8
|
module_function
|
|
9
9
|
|
|
10
|
+
# Location of the stored manifest file for the corpus at +path+
# (delegates to Helpers::ManifestStore.store_path).
def manifest_path(path:) = Helpers::ManifestStore.store_path(corpus_path: path)
|
|
13
|
+
|
|
10
14
|
def corpus_stats(path:, extensions: nil)
|
|
11
15
|
return { success: false, error: 'path does not exist' } unless ::File.exist?(path)
|
|
12
16
|
|
|
@@ -23,23 +23,32 @@ module Legion
|
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
def ingest_corpus(path:, dry_run: false, force: false)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
26
|
+
current = Helpers::Manifest.scan(path: path)
|
|
27
|
+
previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
|
|
28
|
+
delta = Helpers::Manifest.diff(current: current, previous: previous)
|
|
29
|
+
|
|
30
|
+
to_process = delta[:added] + delta[:changed]
|
|
31
|
+
chunks_created = 0
|
|
32
|
+
chunks_skipped = 0
|
|
33
|
+
chunks_updated = 0
|
|
34
|
+
|
|
35
|
+
to_process.each do |file_path|
|
|
36
|
+
result = process_file(file_path, dry_run: dry_run, force: force)
|
|
37
|
+
chunks_created += result[:created]
|
|
38
|
+
chunks_skipped += result[:skipped]
|
|
39
|
+
chunks_updated += result[:updated]
|
|
38
40
|
end
|
|
39
41
|
|
|
42
|
+
delta[:removed].each { |file_path| retire_file(file_path: file_path) } unless dry_run
|
|
43
|
+
|
|
44
|
+
Helpers::ManifestStore.save(corpus_path: path, manifest: current) unless dry_run
|
|
45
|
+
|
|
40
46
|
{
|
|
41
47
|
success: true,
|
|
42
|
-
files_scanned:
|
|
48
|
+
files_scanned: current.size,
|
|
49
|
+
files_added: delta[:added].size,
|
|
50
|
+
files_changed: delta[:changed].size,
|
|
51
|
+
files_removed: delta[:removed].size,
|
|
43
52
|
chunks_created: chunks_created,
|
|
44
53
|
chunks_skipped: chunks_skipped,
|
|
45
54
|
chunks_updated: chunks_updated
|
|
@@ -67,12 +76,18 @@ module Legion
|
|
|
67
76
|
return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)
|
|
68
77
|
|
|
69
78
|
chunks = Helpers::Chunker.chunk(sections: sections)
|
|
79
|
+
paired = if dry_run
|
|
80
|
+
chunks.map { |c| { chunk: c, embedding: nil } }
|
|
81
|
+
else
|
|
82
|
+
batch_embed_chunks(chunks, force: force)
|
|
83
|
+
end
|
|
84
|
+
|
|
70
85
|
created = 0
|
|
71
86
|
skipped = 0
|
|
72
87
|
updated = 0
|
|
73
88
|
|
|
74
|
-
|
|
75
|
-
outcome =
|
|
89
|
+
paired.each do |p|
|
|
90
|
+
outcome = upsert_chunk_with_embedding(p[:chunk], p[:embedding], dry_run: dry_run, force: force, exists: p[:exists] || false)
|
|
76
91
|
case outcome
|
|
77
92
|
when :created then created += 1
|
|
78
93
|
when :skipped then skipped += 1
|
|
@@ -84,21 +99,55 @@ module Legion
|
|
|
84
99
|
end
|
|
85
100
|
private_class_method :process_file
|
|
86
101
|
|
|
87
|
-
def
|
|
88
|
-
|
|
102
|
+
# Pairs every chunk with its embedding vector and an "already stored" flag.
#
# Unless +force+ is set, existence is looked up per chunk and only chunks that
# are not yet stored are sent for embedding. When batch embedding is not
# available the chunks are paired with nil embeddings. Any error falls back to
# nil embeddings with every chunk flagged as not existing.
#
# Returns an array of { chunk:, embedding:, exists: } hashes.
def batch_embed_chunks(chunks, force:)
  known = force ? {} : build_exists_map(chunks)
  return paired_without_embed(chunks, known) unless llm_embed_available?

  pending = force ? chunks : chunks.reject { |c| known[c[:content_hash]] }
  vectors = pending.empty? ? {} : build_embed_map(pending)

  chunks.map do |c|
    digest = c[:content_hash]
    { chunk: c, embedding: vectors[digest], exists: known.fetch(digest, false) }
  end
rescue StandardError
  paired_without_embed(chunks, {})
end
|
|
113
|
+
private_class_method :batch_embed_chunks
|
|
93
114
|
|
|
94
|
-
|
|
95
|
-
|
|
115
|
+
# Maps each chunk's content hash to the result of chunk_exists? for that hash.
def build_exists_map(chunks)
  chunks.each_with_object({}) do |c, seen|
    digest = c[:content_hash]
    seen[digest] = chunk_exists?(digest)
  end
end
|
|
118
|
+
private_class_method :build_exists_map
|
|
119
|
+
|
|
120
|
+
# Truthy only when Legion::LLM is loaded and responds to embed_batch;
# nil when Legion::LLM is not defined at all.
def llm_embed_available?
  return unless defined?(Legion::LLM)

  Legion::LLM.respond_to?(:embed_batch)
end
|
|
123
|
+
private_class_method :llm_embed_available?
|
|
124
|
+
|
|
125
|
+
# Pairs every chunk with a nil embedding, carrying over its existence flag
# from +exists_map+ (false when the hash is not in the map).
def paired_without_embed(chunks, exists_map)
  chunks.map do |c|
    already_stored = exists_map.fetch(c[:content_hash], false)
    { chunk: c, embedding: nil, exists: already_stored }
  end
end
|
|
128
|
+
private_class_method :paired_without_embed
|
|
96
129
|
|
|
130
|
+
# Embeds the content of +needs_embed+ in one Legion::LLM.embed_batch call and
# returns a content_hash => vector map. Results carrying an :error are left
# out; each result is matched back to its chunk through result[:index].
# Returns an empty hash when the call or the mapping raises.
def build_embed_map(needs_embed)
  texts = needs_embed.map { |c| c[:content] }
  vectors = {}

  Legion::LLM.embed_batch(texts).each do |res|
    next if res[:error]

    vectors[needs_embed[res[:index]][:content_hash]] = res[:vector]
  end

  vectors
rescue StandardError
  {}
end
|
|
138
|
+
private_class_method :build_embed_map
|
|
139
|
+
|
|
140
|
+
# Sends one chunk (with its precomputed embedding) to Apollo and reports the
# outcome as a symbol.
#
# Returns :created for dry runs and when the Apollo extension is not loaded,
# :skipped when the chunk already exists and +force+ is off or when ingestion
# raises, :updated after a forced ingest, and :created otherwise.
def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, exists: false)
  return :created if dry_run || !defined?(Legion::Extensions::Apollo)
  return :skipped if exists && !force

  ingest_to_apollo(chunk, embedding)
  force ? :updated : :created
rescue StandardError
  :skipped
end
|
|
101
|
-
private_class_method :
|
|
150
|
+
private_class_method :upsert_chunk_with_embedding
|
|
102
151
|
|
|
103
152
|
def chunk_exists?(content_hash)
|
|
104
153
|
return false unless defined?(Legion::Data::Model::ApolloEntry)
|
|
@@ -112,16 +161,6 @@ module Legion
|
|
|
112
161
|
end
|
|
113
162
|
private_class_method :chunk_exists?
|
|
114
163
|
|
|
115
|
-
def generate_embedding(content)
|
|
116
|
-
return nil unless defined?(Legion::LLM) && Legion::LLM.respond_to?(:embed)
|
|
117
|
-
|
|
118
|
-
result = Legion::LLM.embed(content)
|
|
119
|
-
result.is_a?(Hash) ? result[:vector] : nil
|
|
120
|
-
rescue StandardError
|
|
121
|
-
nil
|
|
122
|
-
end
|
|
123
|
-
private_class_method :generate_embedding
|
|
124
|
-
|
|
125
164
|
def ingest_to_apollo(chunk, embedding)
|
|
126
165
|
return unless defined?(Legion::Extensions::Apollo)
|
|
127
166
|
|
|
@@ -143,6 +182,21 @@ module Legion
|
|
|
143
182
|
Legion::Extensions::Apollo::Runners::Knowledge.handle_ingest(**payload)
|
|
144
183
|
end
|
|
145
184
|
private_class_method :ingest_to_apollo
|
|
185
|
+
|
|
186
|
+
# Records that a source file was removed from the corpus by ingesting a
# 'document_retired' entry through Legion::Apollo.
#
# Does nothing (returns nil) when Legion::Apollo is not loaded, does not
# respond to ingest, or is not started; errors are swallowed and yield nil.
def retire_file(file_path:)
  return unless defined?(Legion::Apollo) && Legion::Apollo.respond_to?(:ingest) && Legion::Apollo.started?

  retirement = {
    content: file_path,
    content_type: 'document_retired',
    tags: [file_path, 'retired', 'document_chunk'].uniq,
    metadata: { source_file: file_path, retired: true }
  }
  Legion::Apollo.ingest(**retirement)
rescue StandardError
  nil
end
|
|
199
|
+
private_class_method :retire_file
|
|
146
200
|
end
|
|
147
201
|
end
|
|
148
202
|
end
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Extensions
    module Knowledge
      module Runners
        # Maintenance and reporting runners for an ingested knowledge corpus.
        #
        # Every public entry point returns a Hash with a :success flag and turns
        # any StandardError into { success: false, error: message }. The
        # database-backed helpers return empty defaults when
        # Legion::Data::Model::ApolloEntry is not loaded or a query raises.
        module Maintenance
          module_function

          # Finds source files that still have non-archived 'document_chunk'
          # entries in Apollo but are absent from the stored manifest of +path+.
          #
          # @param path [String] corpus path used to locate the stored manifest
          # @return [Hash] :orphan_count, :orphan_files, :total_apollo_chunks and
          #   :total_manifest_files on success
          def detect_orphans(path:)
            manifest_files = load_manifest_files(path)
            orphan_files = load_apollo_source_files - manifest_files

            {
              success: true,
              orphan_count: orphan_files.size,
              orphan_files: orphan_files,
              total_apollo_chunks: count_apollo_chunks,
              total_manifest_files: manifest_files.size
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Archives the Apollo entries of orphaned source files.
          #
          # With +dry_run+ (the default) nothing is written and the detection
          # result is returned with the counts that would be affected.
          #
          # @param path [String] corpus path used to locate the stored manifest
          # @param dry_run [Boolean] report only, do not archive
          # @return [Hash] includes :archived, :files_cleaned and :dry_run
          def cleanup_orphans(path:, dry_run: true)
            detection = detect_orphans(path: path)
            return detection unless detection[:success]
            return detection.merge(archived: 0, files_cleaned: 0, dry_run: dry_run) if detection[:orphan_count].zero?

            orphan_files = detection[:orphan_files]
            if dry_run
              return detection.merge(archived: detection[:orphan_count], files_cleaned: orphan_files.size,
                                     dry_run: true)
            end

            archived = archive_orphan_entries(orphan_files)

            { success: true, archived: archived, files_cleaned: orphan_files.size, dry_run: false }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Deletes the stored manifest and re-ingests the whole corpus with force.
          #
          # @param path [String] corpus path
          # @return [Hash] the result of Runners::Ingest.ingest_corpus
          def reindex(path:)
            store_path = Helpers::ManifestStore.store_path(corpus_path: path)
            ::FileUtils.rm_f(store_path)

            Runners::Ingest.ingest_corpus(path: path, force: true)
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Builds a health snapshot with :local (files on disk and manifest
          # state), :apollo (stored chunk statistics) and :sync (orphan/missing
          # counts) sections.
          #
          # @param path [String] corpus path
          # @return [Hash]
          def health(path:)
            scan_entries = Helpers::Manifest.scan(path: path)
            store_path = Helpers::ManifestStore.store_path(corpus_path: path)
            manifest_file = ::File.exist?(store_path)
            # The manifest file's mtime stands in for the time of the last ingest.
            last_ingest = manifest_file ? ::File.mtime(store_path).iso8601 : nil

            {
              success: true,
              local: build_local_stats(path, scan_entries, manifest_file, last_ingest),
              apollo: build_apollo_stats,
              sync: build_sync_stats(path, scan_entries)
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Reports the most accessed, never accessed and low-confidence chunks.
          #
          # @param limit [Integer, nil] rows per list; defaults to the
          #   knowledge.maintenance.quality_report_limit setting (10 when unset)
          # @return [Hash] :hot_chunks, :cold_chunks, :low_confidence,
          #   :poor_retrieval (always empty here) and :summary
          def quality_report(limit: nil)
            resolved_limit = limit || settings_quality_limit

            {
              success: true,
              hot_chunks: hot_chunks(resolved_limit),
              cold_chunks: cold_chunks(resolved_limit),
              low_confidence: low_confidence_chunks(resolved_limit),
              poor_retrieval: [],
              summary: quality_summary
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Dataset of every non-archived Apollo entry tagged 'document_chunk'.
          # Callers must first check that ApolloEntry is defined.
          def active_document_chunks
            Legion::Data::Model::ApolloEntry
              .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
              .exclude(status: 'archived')
          end
          private_class_method :active_document_chunks

          # Statistics about the corpus files currently on disk.
          def build_local_stats(path, scan_entries, manifest_file, last_ingest)
            {
              corpus_path: path,
              file_count: scan_entries.size,
              total_bytes: scan_entries.sum { |e| e[:size] },
              manifest_exists: manifest_file,
              last_ingest: last_ingest
            }
          end
          private_class_method :build_local_stats

          # Statistics about stored chunks; all-zero defaults when the model is
          # not loaded, no chunk exists, or a query raises.
          def build_apollo_stats
            return apollo_defaults unless defined?(Legion::Data::Model::ApolloEntry)

            base = active_document_chunks
            total = base.count
            return apollo_defaults if total.zero?

            rows = base.select(:confidence, :status, :access_count, :embedding, :created_at).all
            apollo_stats_from_rows(base, rows, total)
          rescue StandardError
            apollo_defaults
          end
          private_class_method :build_apollo_stats

          # Aggregates the fetched rows into the :apollo section of the health report.
          def apollo_stats_from_rows(base, rows, total)
            confidences = rows.map { |r| r[:confidence].to_f }
            with_embeddings = rows.count { |r| !r[:embedding].nil? }
            stale_threshold = settings_stale_threshold
            timestamps = rows.map { |r| r[:created_at] }

            {
              total_chunks: total,
              by_status: base.group_and_count(:status).as_hash(:status, :count).transform_keys(&:to_sym),
              embedding_coverage: (with_embeddings.to_f / total).round(4),
              avg_confidence: confidences.sum / confidences.size.to_f,
              confidence_range: confidences.minmax,
              stale_count: confidences.count { |c| c < stale_threshold },
              never_accessed: rows.count { |r| r[:access_count].to_i.zero? },
              unique_source_files: load_apollo_source_files.size,
              oldest_chunk: timestamps.min&.iso8601,
              newest_chunk: timestamps.max&.iso8601
            }
          end
          private_class_method :apollo_stats_from_rows

          # The :apollo section used when no chunk statistics are available.
          def apollo_defaults
            {
              total_chunks: 0,
              by_status: {},
              embedding_coverage: 0.0,
              avg_confidence: 0.0,
              confidence_range: [0.0, 0.0],
              stale_count: 0,
              never_accessed: 0,
              unique_source_files: 0,
              oldest_chunk: nil,
              newest_chunk: nil
            }
          end
          private_class_method :apollo_defaults

          # Counts files stored in Apollo but missing from the manifest (orphans)
          # and files found on disk that have no chunks in Apollo (missing).
          def build_sync_stats(path, scan_entries)
            manifest_paths = load_manifest_files(path)
            apollo_paths = load_apollo_source_files
            scan_paths = scan_entries.map { |e| e[:path] }

            {
              orphan_count: (apollo_paths - manifest_paths).size,
              missing_count: (scan_paths - apollo_paths).size
            }
          end
          private_class_method :build_sync_stats

          # Unique file paths recorded in the stored manifest.
          def load_manifest_files(path)
            manifest = Helpers::ManifestStore.load(corpus_path: path)
            manifest.map { |e| e[:path] }.compact.uniq
          end
          private_class_method :load_manifest_files

          # Unique source_file values of all non-archived document chunks.
          def load_apollo_source_files
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            active_document_chunks
              .select_map(Sequel.lit("source_context->>'source_file'"))
              .compact
              .uniq
          rescue StandardError
            []
          end
          private_class_method :load_apollo_source_files

          # Number of non-archived document chunks.
          def count_apollo_chunks
            return 0 unless defined?(Legion::Data::Model::ApolloEntry)

            active_document_chunks.count
          rescue StandardError
            0
          end
          private_class_method :count_apollo_chunks

          # Marks every non-archived chunk of the given source files as archived
          # and returns the value of the dataset update call. Errors propagate
          # to the caller.
          def archive_orphan_entries(orphan_files)
            return 0 unless defined?(Legion::Data::Model::ApolloEntry)

            active_document_chunks
              .where(Sequel.lit("source_context->>'source_file' IN ?", orphan_files))
              .update(status: 'archived', updated_at: Time.now)
          end
          private_class_method :archive_orphan_entries

          # Chunks with at least one access, most accessed first.
          def hot_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            active_document_chunks
              # Inside a Sequel virtual-row block `access_count` is an SQL
              # identifier, not a number: use the SQL comparison operator,
              # because Ruby's Numeric#positive? does not exist on it.
              .where { access_count > 0 } # rubocop:disable Style/NumericPredicate
              .order(Sequel.desc(:access_count))
              .limit(limit)
              .select_map([:id, :access_count, :confidence,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], access_count: r[1], confidence: r[2], source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :hot_chunks

          # Never-accessed chunks created more than the configured number of
          # days ago, oldest first.
          def cold_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            days = settings_cold_chunk_days
            cutoff = Time.now - (days * 86_400)

            active_document_chunks
              .where(access_count: 0)
              .where { created_at < cutoff }
              .order(:created_at)
              .limit(limit)
              .select_map([:id, :confidence, :created_at,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], confidence: r[1], created_at: r[2]&.iso8601, source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :cold_chunks

          # Chunks whose confidence is below the stale threshold, lowest first.
          def low_confidence_chunks(limit)
            return [] unless defined?(Legion::Data::Model::ApolloEntry)

            threshold = settings_stale_threshold

            active_document_chunks
              .where { confidence < threshold }
              .order(:confidence)
              .limit(limit)
              .select_map([:id, :confidence, :access_count,
                           Sequel.lit("source_context->>'source_file' AS source_file")])
              .map { |r| { id: r[0], confidence: r[1], access_count: r[2], source_file: r[3] } }
          rescue StandardError
            []
          end
          private_class_method :low_confidence_chunks

          # Totals for the quality report; zeroed defaults when the model is not
          # loaded or a query raises.
          def quality_summary
            defaults = { total_queries: 0, avg_retrieval_score: nil, chunks_never_accessed: 0,
                         chunks_below_threshold: 0 }
            return defaults unless defined?(Legion::Data::Model::ApolloEntry)

            # Resolve the threshold outside the block: a zero-arity Sequel block
            # is instance_eval'd on a virtual row, where a bare method name would
            # be taken as a column instead of calling this module's method.
            threshold = settings_stale_threshold
            base = active_document_chunks

            {
              total_queries: query_count,
              avg_retrieval_score: nil,
              chunks_never_accessed: base.where(access_count: 0).count,
              chunks_below_threshold: base.where { confidence < threshold }.count
            }
          rescue StandardError
            defaults
          end
          private_class_method :quality_summary

          # Number of access-log rows with action 'knowledge_query'; 0 when the
          # access-log model is not loaded or the query raises.
          def query_count
            return 0 unless defined?(Legion::Data::Model::ApolloAccessLog)

            Legion::Data::Model::ApolloAccessLog.where(action: 'knowledge_query').count
          rescue StandardError
            0
          end
          private_class_method :query_count

          # knowledge.maintenance.stale_threshold setting, default 0.3.
          def settings_stale_threshold
            return 0.3 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :stale_threshold) || 0.3
          rescue StandardError
            0.3
          end
          private_class_method :settings_stale_threshold

          # knowledge.maintenance.cold_chunk_days setting, default 7.
          def settings_cold_chunk_days
            return 7 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :cold_chunk_days) || 7
          rescue StandardError
            7
          end
          private_class_method :settings_cold_chunk_days

          # knowledge.maintenance.quality_report_limit setting, default 10.
          def settings_quality_limit
            return 10 unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :maintenance, :quality_report_limit) || 10
          rescue StandardError
            10
          end
          private_class_method :settings_quality_limit
        end
      end
    end
  end
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'digest'
|
|
4
|
+
|
|
3
5
|
module Legion
|
|
4
6
|
module Extensions
|
|
5
7
|
module Knowledge
|
|
@@ -17,15 +19,21 @@ module Legion
|
|
|
17
19
|
|
|
18
20
|
latency_ms = ((::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - started) * 1000).round
|
|
19
21
|
|
|
22
|
+
score = average_score(chunks)
|
|
23
|
+
unless chunks.empty?
|
|
24
|
+
record_feedback(
|
|
25
|
+
question: question,
|
|
26
|
+
chunk_ids: chunks.filter_map { |c| c[:id] },
|
|
27
|
+
retrieval_score: score.to_f,
|
|
28
|
+
synthesized: synthesize && llm_available?
|
|
29
|
+
)
|
|
30
|
+
end
|
|
31
|
+
|
|
20
32
|
{
|
|
21
33
|
success: true,
|
|
22
34
|
answer: answer,
|
|
23
35
|
sources: chunks.map { |c| format_source(c) },
|
|
24
|
-
metadata:
|
|
25
|
-
retrieval_score: average_score(chunks),
|
|
26
|
-
chunk_count: chunks.size,
|
|
27
|
-
latency_ms: latency_ms
|
|
28
|
-
}
|
|
36
|
+
metadata: build_metadata(chunks, score, latency_ms)
|
|
29
37
|
}
|
|
30
38
|
rescue StandardError => e
|
|
31
39
|
{ success: false, error: e.message }
|
|
@@ -38,14 +46,26 @@ module Legion
|
|
|
38
46
|
{
|
|
39
47
|
success: true,
|
|
40
48
|
sources: chunks.map { |c| format_source(c) },
|
|
41
|
-
metadata:
|
|
42
|
-
chunk_count: chunks.size
|
|
43
|
-
}
|
|
49
|
+
metadata: build_metadata(chunks, average_score(chunks))
|
|
44
50
|
}
|
|
45
51
|
rescue StandardError => e
|
|
46
52
|
{ success: false, error: e.message }
|
|
47
53
|
end
|
|
48
54
|
|
|
55
|
+
# Emits a feedback event for a query and returns a confirmation hash.
#
# The question itself is not forwarded: only the first 16 hex characters of
# its SHA-256 digest are. Returns { success: true, question_hash:, rating: },
# or { success: false, error: } when anything raises.
def record_feedback(question:, chunk_ids:, retrieval_score:, synthesized: true, rating: nil)
  digest = ::Digest::SHA256.hexdigest(question.to_s)[0, 16]
  event = {
    question_hash: digest,
    chunk_ids: chunk_ids,
    retrieval_score: retrieval_score,
    synthesized: synthesized,
    rating: rating
  }
  emit_feedback_event(**event)

  { success: true, question_hash: digest, rating: rating }
rescue StandardError => e
  { success: false, error: e.message }
end
|
|
68
|
+
|
|
49
69
|
def retrieve_chunks(question, top_k)
|
|
50
70
|
return [] unless defined?(Legion::Extensions::Apollo)
|
|
51
71
|
|
|
@@ -97,6 +117,53 @@ module Legion
|
|
|
97
117
|
end
|
|
98
118
|
private_class_method :average_score
|
|
99
119
|
|
|
120
|
+
# Summarises retrieved chunks for the :metadata field of a query response.
#
# Reports the retrieval score and chunk count, average and range of the
# chunks' :confidence values, range of their :distance values, the distinct
# source files (from metadata.source_file or source_file), whether every chunk
# has a non-zero embedding, and a status => count breakdown. :latency_ms is
# added only when a latency is given.
def build_metadata(chunks, score, latency_ms = nil)
  confidence_values = chunks.filter_map { |c| c[:confidence] }
  distance_values = chunks.filter_map { |c| c[:distance] }
  files = chunks.filter_map { |c| c.dig(:metadata, :source_file) || c[:source_file] }.uniq

  confidence_avg = nil
  confidence_avg = (confidence_values.sum.to_f / confidence_values.size).round(4) unless confidence_values.empty?

  summary = {
    retrieval_score: score,
    chunk_count: chunks.size,
    confidence_avg: confidence_avg,
    confidence_range: (confidence_values.minmax unless confidence_values.empty?),
    distance_range: (distance_values.minmax unless distance_values.empty?),
    source_files: files,
    source_file_count: files.size,
    all_embedded: chunks.none? { |c| zero_embedding?(c) },
    statuses: chunks.map { |c| c[:status] }.tally
  }

  latency_ms.nil? ? summary : summary.merge(latency_ms: latency_ms)
end
|
|
142
|
+
private_class_method :build_metadata
|
|
143
|
+
|
|
144
|
+
# True when the chunk has no usable embedding: the :embedding value is nil,
# an empty array, or an array containing only zeros. Any other non-array
# value counts as embedded.
def zero_embedding?(chunk)
  vector = chunk[:embedding]

  case vector
  when nil then true
  when Array then vector.empty? || vector.all?(&:zero?)
  else false
  end
end
|
|
150
|
+
private_class_method :zero_embedding?
|
|
151
|
+
|
|
152
|
+
# Publishes a 'knowledge.query_feedback' event through Legion::Events.
# Does nothing when Legion::Events is not loaded; errors raised while emitting
# are swallowed and yield nil.
def emit_feedback_event(question_hash:, chunk_ids:, retrieval_score:, synthesized:, rating:)
  return unless defined?(Legion::Events)

  payload = {
    question_hash: question_hash,
    chunk_ids: chunk_ids,
    retrieval_score: retrieval_score,
    synthesized: synthesized,
    rating: rating
  }
  Legion::Events.emit('knowledge.query_feedback', payload)
rescue StandardError
  nil
end
|
|
165
|
+
private_class_method :emit_feedback_event
|
|
166
|
+
|
|
100
167
|
def llm_available?
|
|
101
168
|
defined?(Legion::LLM)
|
|
102
169
|
end
|
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative 'knowledge/version'
|
|
4
4
|
require_relative 'knowledge/helpers/manifest'
|
|
5
|
+
require_relative 'knowledge/helpers/manifest_store'
|
|
5
6
|
require_relative 'knowledge/helpers/parser'
|
|
6
7
|
require_relative 'knowledge/helpers/chunker'
|
|
7
8
|
require_relative 'knowledge/runners/ingest'
|
|
8
9
|
require_relative 'knowledge/runners/query'
|
|
9
10
|
require_relative 'knowledge/runners/corpus'
|
|
11
|
+
require_relative 'knowledge/runners/maintenance'
|
|
10
12
|
require_relative 'knowledge/client'
|
|
11
13
|
|
|
12
14
|
if defined?(Legion::Transport)
|
|
@@ -16,6 +18,7 @@ if defined?(Legion::Transport)
|
|
|
16
18
|
end
|
|
17
19
|
|
|
18
20
|
require_relative 'knowledge/actors/corpus_watcher' if defined?(Legion::Extensions::Actors::Every)
|
|
21
|
+
require_relative 'knowledge/actors/maintenance_runner' if defined?(Legion::Extensions::Actors::Every)
|
|
19
22
|
|
|
20
23
|
require_relative 'knowledge/actors/corpus_ingest' if defined?(Legion::Extensions::Actors::Subscription)
|
|
21
24
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-knowledge
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -118,12 +118,15 @@ files:
|
|
|
118
118
|
- lib/legion/extensions/knowledge.rb
|
|
119
119
|
- lib/legion/extensions/knowledge/actors/corpus_ingest.rb
|
|
120
120
|
- lib/legion/extensions/knowledge/actors/corpus_watcher.rb
|
|
121
|
+
- lib/legion/extensions/knowledge/actors/maintenance_runner.rb
|
|
121
122
|
- lib/legion/extensions/knowledge/client.rb
|
|
122
123
|
- lib/legion/extensions/knowledge/helpers/chunker.rb
|
|
123
124
|
- lib/legion/extensions/knowledge/helpers/manifest.rb
|
|
125
|
+
- lib/legion/extensions/knowledge/helpers/manifest_store.rb
|
|
124
126
|
- lib/legion/extensions/knowledge/helpers/parser.rb
|
|
125
127
|
- lib/legion/extensions/knowledge/runners/corpus.rb
|
|
126
128
|
- lib/legion/extensions/knowledge/runners/ingest.rb
|
|
129
|
+
- lib/legion/extensions/knowledge/runners/maintenance.rb
|
|
127
130
|
- lib/legion/extensions/knowledge/runners/query.rb
|
|
128
131
|
- lib/legion/extensions/knowledge/transport/exchanges/knowledge.rb
|
|
129
132
|
- lib/legion/extensions/knowledge/transport/messages/ingest_message.rb
|