lex-knowledge 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +77 -0
- data/lib/legion/extensions/knowledge/client.rb +13 -0
- data/lib/legion/extensions/knowledge/helpers/chunker.rb +99 -0
- data/lib/legion/extensions/knowledge/helpers/manifest.rb +55 -0
- data/lib/legion/extensions/knowledge/helpers/parser.rb +74 -0
- data/lib/legion/extensions/knowledge/runners/corpus.rb +17 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +152 -0
- data/lib/legion/extensions/knowledge/runners/query.rb +117 -0
- data/lib/legion/extensions/knowledge/version.rb +9 -0
- data/lib/legion/extensions/knowledge.rb +18 -0
- metadata +149 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: ea1661f154fc5184b961caac74654775240dcc49fd7d8c6be469f6cd5520429c
|
|
4
|
+
data.tar.gz: 3499bcb535edb1f56e896ed9c462a2cfabb9d7d1804dc55ecd559fc3b1941ae9
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 8b50acc322c82ae7629e5152e31c84ce9b348fdd1226e96aa7bc3c1f9f67cd330b809212f49c19636d93a45a199ce4e936a3d172cf71997ffad5aa69eb9712ef
|
|
7
|
+
data.tar.gz: c1feac59942cf706beb303cbac9989e569937048d2f666af3e762ef50d69659c46de6bb350f69cc16a5103581d54682c3f68886f84a4a840b16b1f0e8cfb82e6
|
data/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# lex-knowledge
|
|
2
|
+
|
|
3
|
+
Document corpus ingestion and knowledge query pipeline for LegionIO.
|
|
4
|
+
|
|
5
|
+
`lex-knowledge` walks a directory of documents, parses them into sections, splits sections into token-aware chunks, and writes each chunk to Apollo as a searchable knowledge entry. A query runner retrieves relevant chunks via semantic search and optionally synthesizes an answer through the LLM pipeline.
|
|
6
|
+
|
|
7
|
+
## Phase A: Corpus Ingestion
|
|
8
|
+
|
|
9
|
+
This gem implements Phase A of the knowledge pipeline:
|
|
10
|
+
|
|
11
|
+
- **Manifest**: file walker with SHA256 fingerprinting and incremental diff support
|
|
12
|
+
- **Parser**: section-aware extraction for Markdown and plain text
|
|
13
|
+
- **Chunker**: paragraph-respecting splits with configurable token budget and overlap
|
|
14
|
+
- **Ingest runners**: full corpus or single-file ingestion, writing chunks to Apollo
|
|
15
|
+
- **Query runners**: retrieval-only or retrieval + LLM synthesis
|
|
16
|
+
|
|
17
|
+
`.docx` and `.pdf` parsing are deferred to a later phase.
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```ruby
|
|
22
|
+
require 'legion/extensions/knowledge'
|
|
23
|
+
|
|
24
|
+
# Ingest an entire directory
|
|
25
|
+
Legion::Extensions::Knowledge::Runners::Ingest.ingest_corpus(
|
|
26
|
+
path: '/path/to/docs',
|
|
27
|
+
dry_run: false,
|
|
28
|
+
force: false
|
|
29
|
+
)
|
|
30
|
+
# => { success: true, files_scanned: 12, chunks_created: 84, chunks_skipped: 0, chunks_updated: 0 }
|
|
31
|
+
|
|
32
|
+
# Ingest a single file
|
|
33
|
+
Legion::Extensions::Knowledge::Runners::Ingest.ingest_file(
|
|
34
|
+
file_path: '/path/to/docs/guide.md'
|
|
35
|
+
)
|
|
36
|
+
# => { success: true, file: '...', chunks_created: 7, chunks_skipped: 0, chunks_updated: 0 }
|
|
37
|
+
|
|
38
|
+
# Query with LLM synthesis
|
|
39
|
+
Legion::Extensions::Knowledge::Runners::Query.query(
|
|
40
|
+
question: 'How does Legion route tasks?',
|
|
41
|
+
top_k: 5,
|
|
42
|
+
synthesize: true
|
|
43
|
+
)
|
|
44
|
+
# => { success: true, answer: '...', sources: [...], metadata: { retrieval_score: 0.87, chunk_count: 5, latency_ms: 312 } }
|
|
45
|
+
|
|
46
|
+
# Retrieval only (no LLM)
|
|
47
|
+
Legion::Extensions::Knowledge::Runners::Query.retrieve(
|
|
48
|
+
question: 'What is a LEX extension?',
|
|
49
|
+
top_k: 3
|
|
50
|
+
)
|
|
51
|
+
# => { success: true, sources: [...], metadata: { chunk_count: 3 } }
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Configuration
|
|
55
|
+
|
|
56
|
+
Settings are read from `Legion::Settings` under the `:knowledge` key:
|
|
57
|
+
|
|
58
|
+
```yaml
|
|
59
|
+
knowledge:
|
|
60
|
+
chunker:
|
|
61
|
+
max_tokens: 512 # default 512
|
|
62
|
+
overlap_tokens: 128 # default 128
|
|
63
|
+
query:
|
|
64
|
+
top_k: 5 # default 5
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Dependencies
|
|
68
|
+
|
|
69
|
+
- `legion-cache`, `legion-crypt`, `legion-data`, `legion-json`, `legion-logging`, `legion-settings`, `legion-transport`
|
|
70
|
+
- `lex-apollo` (optional): chunk storage and vector retrieval
|
|
71
|
+
- `legion-llm` (optional): answer synthesis
|
|
72
|
+
|
|
73
|
+
Both optional dependencies are guarded with `defined?()` — the gem degrades gracefully when they are absent.
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'digest'

module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Splits parsed document sections into token-budgeted chunks.
        #
        # Token counts are approximated with a fixed characters-per-token
        # ratio (CHARS_PER_TOKEN) so no tokenizer dependency is required.
        module Chunker
          CHARS_PER_TOKEN = 4

          module_function

          # Chunk every section into pieces of at most +max_tokens+, carrying
          # up to +overlap_tokens+ of trailing context between adjacent chunks
          # of the same section.
          #
          # @param sections [Array<Hash>] parser output
          #   (:content, :heading, :section_path, :source_file)
          # @param max_tokens [Integer, nil] per-chunk budget
          #   (falls back to settings, then 512)
          # @param overlap_tokens [Integer, nil] carried context
          #   (falls back to settings, then 128)
          # @return [Array<Hash>] chunk hashes (see build_chunk)
          def chunk(sections:, max_tokens: nil, overlap_tokens: nil)
            resolved_max = max_tokens || settings_max_tokens || 512
            resolved_overlap = overlap_tokens || settings_overlap_tokens || 128

            max_chars = resolved_max * CHARS_PER_TOKEN
            overlap_chars = resolved_overlap * CHARS_PER_TOKEN

            sections.flat_map { |section| split_section(section, max_chars, overlap_chars) }
          end

          # Paragraph-respecting split of a single section.
          #
          # BUGFIX: the previous implementation computed an overlap tail when
          # flushing a full buffer, then immediately overwrote the buffer with
          # the bare paragraph in the following branch, so the configured
          # overlap was never applied. The tail is now actually carried into
          # the next chunk (unless doing so would exceed the chunk budget).
          def split_section(section, max_chars, overlap_chars)
            paragraphs = section[:content].split(/\n\n+/)
            chunks = []
            buffer = ''
            chunk_idx = 0

            paragraphs.each do |para|
              candidate = buffer.empty? ? para : "#{buffer}\n\n#{para}"

              if candidate.length <= max_chars
                buffer = candidate
                next
              end

              # Budget exceeded: flush whatever is buffered first.
              unless buffer.empty?
                chunks << build_chunk(section, buffer, chunk_idx)
                chunk_idx += 1
              end

              if para.length > max_chars
                # Oversized paragraph: hard-split into fixed-size slices
                # (char-based, multibyte safe). No overlap within a hard split.
                para.chars.each_slice(max_chars).with_index do |slice, i|
                  chunks << build_chunk(section, slice.join, chunk_idx + i)
                end
                chunk_idx += (para.length.to_f / max_chars).ceil
                buffer = ''
              else
                # Seed the next buffer with the tail of the flushed chunk so
                # adjacent chunks share context; drop the overlap when the
                # combination would itself exceed the budget.
                tail = overlap_tail(buffer, overlap_chars)
                combined = tail.empty? ? para : "#{tail}\n\n#{para}"
                buffer = combined.length <= max_chars ? combined : para
              end
            end

            chunks << build_chunk(section, buffer, chunk_idx) unless buffer.empty?
            chunks
          end
          private_class_method :split_section

          # Last +overlap_chars+ characters of +text+ (the whole text when it
          # is shorter). BUGFIX: returns '' for a non-positive overlap —
          # text[-0..] would otherwise return the entire string.
          def overlap_tail(text, overlap_chars)
            return '' unless overlap_chars.positive?

            text.length > overlap_chars ? text[-overlap_chars..] : text
          end
          private_class_method :overlap_tail

          # Build the chunk hash stored downstream. token_count is the same
          # chars-per-token estimate used for the budget; content_hash allows
          # idempotent re-ingestion.
          def build_chunk(section, content, index)
            {
              content: content,
              heading: section[:heading],
              section_path: section[:section_path],
              source_file: section[:source_file],
              token_count: (content.length.to_f / CHARS_PER_TOKEN).ceil,
              chunk_index: index,
              content_hash: ::Digest::SHA256.hexdigest(content)
            }
          end
          private_class_method :build_chunk

          # Configured max_tokens, or nil when settings are unavailable.
          def settings_max_tokens
            return nil unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :chunker, :max_tokens)
          rescue StandardError
            nil
          end
          private_class_method :settings_max_tokens

          # Configured overlap_tokens, or nil when settings are unavailable.
          def settings_overlap_tokens
            return nil unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :chunker, :overlap_tokens)
          rescue StandardError
            nil
          end
          private_class_method :settings_overlap_tokens
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'digest'
require 'find'

module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Walks a document tree, fingerprinting files for incremental
        # ingestion, and diffs two manifests to find added/changed/removed
        # files.
        module Manifest
          module_function

          # Recursively collect fingerprint entries for every matching file.
          #
          # @param path [String] root directory (or single file) to walk
          # @param extensions [Array<String>] lowercase extensions to accept
          # @return [Array<Hash>] entries from build_entry
          def scan(path:, extensions: %w[.md .txt .docx .pdf])
            results = []

            Find.find(path) do |entry|
              basename = ::File.basename(entry)
              # Skip hidden files/directories. BUGFIX: never prune the scan
              # root itself — previously scan(path: '.') pruned immediately
              # and returned nothing.
              Find.prune if basename.start_with?('.') && entry != path

              next unless ::File.file?(entry)
              next unless extensions.include?(::File.extname(entry).downcase)

              results << build_entry(entry)
            end

            results
          end

          # Compare two manifests by path + sha256.
          #
          # @return [Hash] { added:, changed:, removed: } arrays of paths
          def diff(current:, previous:)
            current_map = current.to_h { |e| [e[:path], e[:sha256]] }
            previous_map = previous.to_h { |e| [e[:path], e[:sha256]] }

            added = current_map.keys - previous_map.keys
            removed = previous_map.keys - current_map.keys
            changed = current_map.keys.select do |p|
              previous_map.key?(p) && previous_map[p] != current_map[p]
            end

            { added: added, changed: changed, removed: removed }
          end

          # Fingerprint one file: size and mtime for cheap checks, SHA256 for
          # content identity.
          def build_entry(path)
            {
              path: path,
              size: ::File.size(path),
              mtime: ::File.mtime(path),
              sha256: ::Digest::SHA256.file(path).hexdigest
            }
          end
          private_class_method :build_entry
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Extracts an ordered list of section hashes from source documents.
        # Markdown is split on H1/H2 headings; plain text becomes a single
        # section. Each section hash carries :heading, :section_path,
        # :content and :source_file.
        module Parser
          module_function

          # Dispatch on file extension. Unsupported formats yield a single
          # error entry rather than raising.
          def parse(file_path:)
            case ::File.extname(file_path).downcase
            when '.md' then parse_markdown(file_path: file_path)
            when '.txt' then parse_text(file_path: file_path)
            else [{ error: 'unsupported format', source_file: file_path }]
            end
          end

          # Walk the file line by line, opening a new section at each H1/H2
          # heading. Content before the first heading is attributed to a
          # pseudo-heading named after the file. A file with no headings (or
          # only empty sections) falls back to one whole-file section.
          def parse_markdown(file_path:)
            raw = ::File.read(file_path, encoding: 'utf-8')
            sections = []
            heading = ::File.basename(file_path, '.*')
            crumb = []
            pending = []

            raw.each_line do |row|
              depth = if row.start_with?('# ')
                        1
                      elsif row.start_with?('## ')
                        2
                      else
                        0
                      end

              if depth.zero?
                pending << row
                next
              end

              flush_section(sections, heading, crumb, pending, file_path) unless pending.empty?
              heading = row.sub(/^#+\s*/, '').chomp
              # H1 resets the breadcrumb; H2 keeps (at most) the current H1.
              crumb = depth == 1 ? [heading] : crumb.first(1) + [heading]
              pending = []
            end

            flush_section(sections, heading, crumb, pending, file_path) unless pending.empty?

            return sections unless sections.empty?

            [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: raw.strip, source_file: file_path }]
          end

          # Plain text: the whole file is one section named after the file.
          def parse_text(file_path:)
            [{
              heading: ::File.basename(file_path, '.*'),
              section_path: [],
              content: ::File.read(file_path, encoding: 'utf-8').strip,
              source_file: file_path
            }]
          end

          # Append a section built from the buffered lines; whitespace-only
          # buffers are dropped.
          def flush_section(sections, heading, section_path, lines, file_path)
            body = lines.join.strip
            return if body.empty?

            sections << { heading: heading, section_path: section_path.dup, content: body, source_file: file_path }
          end
          private_class_method :flush_section
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module Extensions
    module Knowledge
      module Runners
        # Corpus-level statistics runner. Currently a placeholder; real
        # aggregation arrives in a later phase of the knowledge pipeline.
        module Corpus
          module_function

          # Stub endpoint kept so the runner surface is stable for callers.
          #
          # @return [Hash] { success: true, info: 'not yet implemented' }
          def corpus_stats
            { success: true, info: 'not yet implemented' }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module Extensions
    module Knowledge
      module Runners
        # Ingestion runners: parse files into sections, chunk them, and write
        # each chunk to Apollo. All Apollo calls are guarded with defined?()
        # and rescued, so the runner degrades to counting-only behavior when
        # lex-apollo is absent or failing.
        module Ingest
          module_function

          # Dry inventory of a corpus directory: file count, byte total and
          # paths — no parsing or writes.
          #
          # @param path [String] root directory to walk
          # @param extensions [Array<String>, nil] override the Manifest
          #   default extension list when given
          def scan_corpus(path:, extensions: nil)
            # Only pass :extensions through when supplied so Manifest.scan's
            # own default list applies otherwise.
            opts = { path: path }
            opts[:extensions] = extensions if extensions

            entries = Helpers::Manifest.scan(**opts)

            {
              success: true,
              path: path,
              file_count: entries.size,
              total_bytes: entries.sum { |e| e[:size] },
              files: entries.map { |e| e[:path] }
            }
          end

          # Ingest every file under +path+, accumulating per-chunk outcome
          # counters. Any unexpected error aborts the whole run with
          # { success: false, error: }.
          #
          # @param dry_run [Boolean] count chunks without writing to Apollo
          # @param force [Boolean] re-write chunks even when already present
          def ingest_corpus(path:, dry_run: false, force: false)
            entries = Helpers::Manifest.scan(path: path)

            files_scanned = entries.size
            chunks_created = 0
            chunks_skipped = 0
            chunks_updated = 0

            entries.each do |entry|
              result = process_file(entry[:path], dry_run: dry_run, force: force)
              chunks_created += result[:created]
              chunks_skipped += result[:skipped]
              chunks_updated += result[:updated]
            end

            {
              success: true,
              files_scanned: files_scanned,
              chunks_created: chunks_created,
              chunks_skipped: chunks_skipped,
              chunks_updated: chunks_updated
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Single-file variant of ingest_corpus (never a dry run).
          # NOTE(review): an unsupported/unparsable file still reports
          # success: true with all-zero counters — callers cannot distinguish
          # "nothing to do" from "could not parse".
          def ingest_file(file_path:, force: false)
            result = process_file(file_path, dry_run: false, force: force)

            {
              success: true,
              file: file_path,
              chunks_created: result[:created],
              chunks_skipped: result[:skipped],
              chunks_updated: result[:updated]
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Parse -> chunk -> upsert one file, tallying outcomes.
          # Parser signals unsupported formats with a single {error:} entry,
          # which short-circuits to zero counts.
          def process_file(file_path, dry_run: false, force: false)
            sections = Helpers::Parser.parse(file_path: file_path)
            return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)

            chunks = Helpers::Chunker.chunk(sections: sections)
            created = 0
            skipped = 0
            updated = 0

            chunks.each do |chunk|
              outcome = upsert_chunk(chunk, dry_run: dry_run, force: force)
              case outcome
              when :created then created += 1
              when :skipped then skipped += 1
              when :updated then updated += 1
              end
            end

            { created: created, skipped: skipped, updated: updated }
          end
          private_class_method :process_file

          # Write one chunk to Apollo and classify the outcome.
          # Guard order matters:
          #   1. dry_run counts every chunk as :created without touching Apollo
          #   2. without Apollo loaded, chunks are also counted as :created
          #   3. existing chunks are :skipped unless force
          # NOTE(review): with force, brand-new chunks are reported as
          # :updated, and any write failure is silently folded into :skipped
          # — deliberate best-effort, but it masks errors in the counters.
          def upsert_chunk(chunk, dry_run: false, force: false)
            return :created if dry_run

            return :created unless defined?(Legion::Extensions::Apollo)

            return :skipped if !force && chunk_exists?(chunk[:content_hash])

            embedding = generate_embedding(chunk[:content])
            ingest_to_apollo(chunk, embedding)

            force ? :updated : :created
          rescue StandardError
            :skipped
          end
          private_class_method :upsert_chunk

          # Deduplication probe: ask Apollo for any entry whose content_hash
          # matches. Assumes retrieve_relevant supports a :filter keyword and
          # exact-match filtering on content_hash — TODO confirm against
          # lex-apollo. Failures are treated as "does not exist".
          def chunk_exists?(content_hash)
            return false unless defined?(Legion::Extensions::Apollo)

            Legion::Extensions::Apollo::Runners::Knowledge.retrieve_relevant(
              query: content_hash,
              limit: 1,
              tags: ['document_chunk'],
              filter: { content_hash: content_hash }
            ).any?
          rescue StandardError
            false
          end
          private_class_method :chunk_exists?

          # Best-effort embedding generation; nil when Apollo's embedding
          # helper is missing or raises (chunk is then stored without one).
          def generate_embedding(content)
            return nil unless defined?(Legion::Extensions::Apollo)
            return nil unless defined?(Legion::Extensions::Apollo::Helpers::Embedding)

            Legion::Extensions::Apollo::Helpers::Embedding.generate(content)
          rescue StandardError
            nil
          end
          private_class_method :generate_embedding

          # Build the Apollo payload for a chunk and hand it off. Tags carry
          # source file + heading for filtered retrieval; the full provenance
          # lives under :metadata. :embedding is only attached when present.
          def ingest_to_apollo(chunk, embedding)
            return unless defined?(Legion::Extensions::Apollo)

            payload = {
              content: chunk[:content],
              content_type: 'document_chunk',
              content_hash: chunk[:content_hash],
              tags: [chunk[:source_file], chunk[:heading], 'document_chunk'].compact.uniq,
              metadata: {
                source_file: chunk[:source_file],
                heading: chunk[:heading],
                section_path: chunk[:section_path],
                chunk_index: chunk[:chunk_index],
                token_count: chunk[:token_count]
              }
            }
            payload[:embedding] = embedding if embedding

            Legion::Extensions::Apollo::Runners::Knowledge.handle_ingest(**payload)
          end
          private_class_method :ingest_to_apollo
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module Extensions
    module Knowledge
      module Runners
        # Retrieval (and optional LLM synthesis) over previously ingested
        # document chunks. External services (Apollo, Legion::LLM,
        # Legion::Settings) are all optional; when absent the runner degrades
        # to empty results rather than raising.
        module Query
          module_function

          # Answer a question from the knowledge corpus.
          #
          # @param question [String]
          # @param top_k [Integer, nil] chunks to retrieve
          #   (falls back to settings, then 5)
          # @param synthesize [Boolean] when true and Legion::LLM is loaded,
          #   generate an answer from the retrieved context
          # @return [Hash] { success:, answer:, sources:, metadata: }, or
          #   { success: false, error: } on unexpected failure
          def query(question:, top_k: nil, synthesize: true)
            # Monotonic clock so latency is immune to wall-clock adjustments.
            started = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
            resolved_k = top_k || settings_top_k || 5

            chunks = retrieve_chunks(question, resolved_k)

            # answer stays nil when synthesis is off or the LLM is absent.
            answer = (synthesize_answer(question, chunks) if synthesize && llm_available?)

            latency_ms = ((::Process.clock_gettime(::Process::CLOCK_MONOTONIC) - started) * 1000).round

            {
              success: true,
              answer: answer,
              sources: chunks.map { |c| format_source(c) },
              metadata: {
                retrieval_score: average_score(chunks),
                chunk_count: chunks.size,
                latency_ms: latency_ms
              }
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Retrieval-only variant: same sources, no LLM call, no latency.
          def retrieve(question:, top_k: nil)
            resolved_k = top_k || settings_top_k || 5
            chunks = retrieve_chunks(question, resolved_k)

            {
              success: true,
              sources: chunks.map { |c| format_source(c) },
              metadata: {
                chunk_count: chunks.size
              }
            }
          rescue StandardError => e
            { success: false, error: e.message }
          end

          # Fetch the top_k most relevant chunks from Apollo; [] when Apollo
          # is not loaded or the call fails.
          def retrieve_chunks(question, top_k)
            return [] unless defined?(Legion::Extensions::Apollo)

            Legion::Extensions::Apollo::Runners::Knowledge.retrieve_relevant(
              query: question,
              limit: top_k,
              tags: ['document_chunk']
            )
          rescue StandardError
            []
          end
          private_class_method :retrieve_chunks

          # Build a context-grounded prompt and ask the LLM. Returns the
          # answer text, or an error string on failure (the surrounding query
          # still reports success: true — deliberate best-effort behavior).
          def synthesize_answer(question, chunks)
            return nil unless llm_available?

            context_text = chunks.map { |c| c[:content] }.join("\n\n---\n\n")
            prompt = if context_text.empty?
                       question
                     else
                       "You are a helpful assistant. Use the context below to answer the question.\n\n" \
                       "Context:\n#{context_text}\n\nQuestion: #{question}\n\nAnswer:"
                     end

            result = Legion::LLM.chat(message: prompt, caller: { extension: 'lex-knowledge' })
            result.is_a?(Hash) ? result[:content] : result
          rescue StandardError => e
            "Error generating answer: #{e.message}"
          end
          private_class_method :synthesize_answer

          # Normalize an Apollo chunk into the public source shape; falls back
          # to top-level keys when the chunk carries no :metadata hash.
          def format_source(chunk)
            {
              content: chunk[:content],
              source_file: chunk.dig(:metadata, :source_file) || chunk[:source_file],
              heading: chunk.dig(:metadata, :heading) || chunk[:heading],
              distance: chunk[:distance] || chunk[:score]
            }
          end
          private_class_method :format_source

          # Mean of whatever per-chunk metric Apollo returned (:distance or
          # :score). NOTE(review): distances and scores have opposite polarity
          # — confirm which lex-apollo emits before interpreting this value.
          # nil when no chunks carry a metric.
          def average_score(chunks)
            return nil if chunks.empty?

            scores = chunks.filter_map { |c| c[:distance] || c[:score] }
            return nil if scores.empty?

            (scores.sum.to_f / scores.size).round(4)
          end
          private_class_method :average_score

          # FIX: defined? returns a String ("constant") or nil; coerce to a
          # real boolean so this predicate honors the `?` convention instead
          # of leaking a truthy string to callers.
          def llm_available?
            !defined?(Legion::LLM).nil?
          end
          private_class_method :llm_available?

          # Configured default top_k, or nil when settings are unavailable.
          def settings_top_k
            return nil unless defined?(Legion::Settings)

            Legion::Settings.dig(:knowledge, :query, :top_k)
          rescue StandardError
            nil
          end
          private_class_method :settings_top_k
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Entry point for the lex-knowledge gem: loads every helper and runner, then
# registers the extension with the Legion framework when it is present.
require_relative 'knowledge/version'
require_relative 'knowledge/helpers/manifest'
require_relative 'knowledge/helpers/parser'
require_relative 'knowledge/helpers/chunker'
require_relative 'knowledge/runners/ingest'
require_relative 'knowledge/runners/query'
require_relative 'knowledge/runners/corpus'
require_relative 'knowledge/client'

module Legion
  module Extensions
    # Namespace for the knowledge extension. The guarded extend keeps the gem
    # loadable in isolation (tests, tooling) when Legion::Extensions::Core —
    # which wires runners into the framework — has not been loaded.
    module Knowledge
      extend Legion::Extensions::Core if defined?(Legion::Extensions::Core)
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: lex-knowledge
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.2
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Matthew Iverson
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: legion-cache
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: 1.3.13
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: 1.3.13
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: legion-crypt
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 1.4.9
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 1.4.9
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: legion-data
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - ">="
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: 1.5.0
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: 1.5.0
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: legion-json
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: 1.2.1
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: 1.2.1
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: legion-logging
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: 1.3.3
|
|
75
|
+
type: :runtime
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: 1.3.3
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: legion-settings
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - ">="
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: 1.3.15
|
|
89
|
+
type: :runtime
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: 1.3.15
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: legion-transport
|
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
|
99
|
+
requirements:
|
|
100
|
+
- - ">="
|
|
101
|
+
- !ruby/object:Gem::Version
|
|
102
|
+
version: 1.3.11
|
|
103
|
+
type: :runtime
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - ">="
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: 1.3.11
|
|
110
|
+
description: Document corpus ingestion and knowledge query pipeline for LegionIO
|
|
111
|
+
email:
|
|
112
|
+
- matt@iverson.io
|
|
113
|
+
executables: []
|
|
114
|
+
extensions: []
|
|
115
|
+
extra_rdoc_files: []
|
|
116
|
+
files:
|
|
117
|
+
- README.md
|
|
118
|
+
- lib/legion/extensions/knowledge.rb
|
|
119
|
+
- lib/legion/extensions/knowledge/client.rb
|
|
120
|
+
- lib/legion/extensions/knowledge/helpers/chunker.rb
|
|
121
|
+
- lib/legion/extensions/knowledge/helpers/manifest.rb
|
|
122
|
+
- lib/legion/extensions/knowledge/helpers/parser.rb
|
|
123
|
+
- lib/legion/extensions/knowledge/runners/corpus.rb
|
|
124
|
+
- lib/legion/extensions/knowledge/runners/ingest.rb
|
|
125
|
+
- lib/legion/extensions/knowledge/runners/query.rb
|
|
126
|
+
- lib/legion/extensions/knowledge/version.rb
|
|
127
|
+
homepage: https://github.com/LegionIO/lex-knowledge
|
|
128
|
+
licenses:
|
|
129
|
+
- MIT
|
|
130
|
+
metadata:
|
|
131
|
+
rubygems_mfa_required: 'true'
|
|
132
|
+
rdoc_options: []
|
|
133
|
+
require_paths:
|
|
134
|
+
- lib
|
|
135
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
136
|
+
requirements:
|
|
137
|
+
- - ">="
|
|
138
|
+
- !ruby/object:Gem::Version
|
|
139
|
+
version: '3.4'
|
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
141
|
+
requirements:
|
|
142
|
+
- - ">="
|
|
143
|
+
- !ruby/object:Gem::Version
|
|
144
|
+
version: '0'
|
|
145
|
+
requirements: []
|
|
146
|
+
rubygems_version: 3.6.9
|
|
147
|
+
specification_version: 4
|
|
148
|
+
summary: Document corpus ingestion and knowledge query pipeline for LegionIO
|
|
149
|
+
test_files: []
|