lex-knowledge 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/helpers/manifest_store.rb +51 -0
- data/lib/legion/extensions/knowledge/helpers/parser.rb +35 -12
- data/lib/legion/extensions/knowledge/runners/corpus.rb +4 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +86 -32
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +1 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: da332d52d481d4e45e8f7ee771a60fb13cebb74144a18ac54ff7b7ec4cb26f27
|
|
4
|
+
data.tar.gz: 3a858fbdae4511ec6e34573fccd85f330b33c7be818a2a78271af33cb0dcbfbd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e7e15c2174b28ea518c3fee6ecc68723bc628ee3c6d9121a2292706ad32125d32f766da164aa581c56915cd178e54379014a67d8ad1d5cd765485a6eb88d0610
|
|
7
|
+
data.tar.gz: eb7f343423ea41ff46b6a478b3c32a22b0dd3ab3939ad858d0e8d540b5faaf8f01496a5839c79b5ccb1847529367ad9c55ffdc72bebf5959891ff4a7c06b72e4
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'digest'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require 'json'
|
|
6
|
+
require 'tempfile'
|
|
7
|
+
|
|
8
|
+
module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Persists the file manifest of a corpus between ingest runs so that a
        # later run can be diffed against the previous one.
        #
        # Each corpus gets one JSON file inside the store directory, named after
        # the first 16 hex characters of the SHA-256 of the corpus path.
        module ManifestStore
          module_function

          # Default directory that holds one manifest file per corpus.
          STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze

          # Reads the stored manifest for a corpus.
          #
          # @param corpus_path [String] corpus root used to derive the manifest file name
          # @param store_dir [String] directory holding manifest files
          # @return [Array<Hash>] stored entries with symbolized keys; +[]+ when the
          #   file is missing, unreadable, not valid JSON, or not a JSON array
          #
          # NOTE(review): +save+ writes +:mtime+ via +to_s+ and nothing here converts
          # it back, so loaded entries carry +:mtime+ as a String - confirm the
          # consumer (Manifest.diff) does not compare it against Time objects.
          def load(corpus_path:, store_dir: STORE_DIR)
            path = store_path(corpus_path: corpus_path, store_dir: store_dir)
            return [] unless ::File.exist?(path)

            parsed = ::JSON.parse(::File.read(path, encoding: 'utf-8'), symbolize_names: true)
            # A valid-but-unexpected document (object, scalar) must not leak to
            # callers that iterate the manifest as an array of entries.
            parsed.is_a?(::Array) ? parsed : []
          rescue StandardError
            []
          end

          # Writes the manifest for a corpus, replacing any previous one.
          #
          # The JSON is written to a per-process temp file first and then renamed
          # over the destination, so readers never observe a half-written manifest.
          #
          # @param corpus_path [String] corpus root used to derive the manifest file name
          # @param manifest [Array<Hash>] entries to persist; each must respond to +merge+
          # @param store_dir [String] directory holding manifest files (created if absent)
          # @return [Boolean] +true+ on success, +false+ if anything failed
          def save(corpus_path:, manifest:, store_dir: STORE_DIR)
            ::FileUtils.mkdir_p(store_dir)
            path = store_path(corpus_path: corpus_path, store_dir: store_dir)
            # The pid keeps two concurrent ingest processes from sharing a temp file.
            tmp = "#{path}.#{::Process.pid}.tmp"
            ::File.write(tmp, ::JSON.generate(manifest.map { |e| serialize_entry(e) }))
            ::File.rename(tmp, path)
            true
          rescue StandardError
            false
          ensure
            # After a successful rename the temp file is gone and this is a no-op;
            # after a failure it removes the partial file instead of leaking it.
            ::FileUtils.rm_f(tmp) if tmp
          end

          # Computes the manifest file location for a corpus.
          #
          # @param corpus_path [#to_s] corpus root
          # @param store_dir [String] directory holding manifest files
          # @return [String] "<store_dir>/<first 16 hex chars of SHA-256>.manifest.json"
          def store_path(corpus_path:, store_dir: STORE_DIR)
            hash = ::Digest::SHA256.hexdigest(corpus_path.to_s)[0, 16]
            ::File.join(store_dir, "#{hash}.manifest.json")
          end

          # Returns a copy of +entry+ whose +:mtime+ is stringified for JSON output.
          def serialize_entry(entry)
            entry.merge(mtime: entry[:mtime].to_s)
          end
          private_class_method :serialize_entry
        end
      end
    end
  end
end
|
|
@@ -15,6 +15,8 @@ module Legion
|
|
|
15
15
|
parse_markdown(file_path: file_path)
|
|
16
16
|
when '.txt'
|
|
17
17
|
parse_text(file_path: file_path)
|
|
18
|
+
when '.pdf', '.docx'
|
|
19
|
+
extract_via_data(file_path: file_path)
|
|
18
20
|
else
|
|
19
21
|
[{ error: 'unsupported format', source_file: file_path }]
|
|
20
22
|
end
|
|
@@ -22,32 +24,42 @@ module Legion
|
|
|
22
24
|
|
|
23
25
|
# Splits a Markdown file into sections, one per ATX heading (`#` to `######`).
#
# Text before the first heading is filed under the file's base name. Every
# section is handed to +flush_section+ together with the chain of enclosing
# headings (for "# A" followed by "## B" the path is ["A", "B"]).
#
# Lines inside a fenced code block (``` or ~~~) are never treated as headings,
# so a shell comment such as "# install deps" stays in its section's content
# instead of starting a bogus section.
#
# @param file_path [String] path of the Markdown file to read (UTF-8)
# @return [Array<Hash>] the sections collected by +flush_section+; when none
#   were collected, a single entry holding the whole stripped file content
def parse_markdown(file_path:)
  content = ::File.read(file_path, encoding: 'utf-8')
  sections = []
  current_heading = ::File.basename(file_path, '.*')
  current_lines = []
  heading_stack = {}
  open_fence = nil # marker string of the code fence we are currently inside, if any

  content.each_line do |line|
    inside_fence = !open_fence.nil?
    marker = line[/\A {0,3}(`{3,}|~{3,})/, 1]
    if open_fence.nil?
      open_fence = marker
    elsif marker && marker[0] == open_fence[0] && marker.length >= open_fence.length && line.strip == marker
      # A fence closes only on a bare run of the same character, at least as long as the opener.
      open_fence = nil
    end

    level = inside_fence ? nil : heading_level(line)
    if level
      flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
      title = line.sub(/^#+\s*/, '').chomp
      # Entering a heading closes every heading at the same or a deeper level.
      heading_stack.delete_if { |d, _| d >= level }
      heading_stack[level] = title
      current_heading = title
      current_lines = []
    else
      current_lines << line
    end
  end

  flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)

  sections.empty? ? [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: content.strip, source_file: file_path }] : sections
end
|
|
50
50
|
|
|
51
|
+
# Extracts text from a binary document by delegating to Legion::Data::Extract.
#
# @param file_path [String] path of the document to extract
# @return [Array<Hash>] a single-element array: either one section whose
#   heading is the file's base name, or an error entry
#   ('unsupported format' when the extractor is not loaded,
#   'extraction_failed' when it returns no text or raises)
def extract_via_data(file_path:)
  if defined?(::Legion::Data::Extract)
    extracted = ::Legion::Data::Extract.extract(file_path, type: :auto)
    text = extracted[:text] if extracted.is_a?(Hash)

    if text
      [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: text.strip, source_file: file_path }]
    else
      # Pass the raw extractor result along so callers can see why no text came back.
      [{ error: 'extraction_failed', source_file: file_path, detail: extracted }]
    end
  else
    [{ error: 'unsupported format', source_file: file_path }]
  end
rescue StandardError => e
  [{ error: 'extraction_failed', source_file: file_path, detail: e.message }]
end
|
|
62
|
+
|
|
51
63
|
def parse_text(file_path:)
|
|
52
64
|
content = ::File.read(file_path, encoding: 'utf-8')
|
|
53
65
|
heading = ::File.basename(file_path, '.*')
|
|
@@ -67,6 +79,17 @@ module Legion
|
|
|
67
79
|
}
|
|
68
80
|
end
|
|
69
81
|
private_class_method :flush_section
|
|
82
|
+
|
|
83
|
+
# Returns the ATX heading depth of +line+ (1-6), or nil when the line is not
# a heading. A heading is one to six `#` characters at the start of the line
# followed by whitespace.
def heading_level(line)
  hashes = line[/^(\#{1,6})\s/, 1]
  hashes&.length
end
|
|
87
|
+
private_class_method :heading_level
|
|
88
|
+
|
|
89
|
+
# Turns the level => title heading stack into an ordered list of titles,
# shallowest heading first.
def build_section_path(stack)
  stack.sort_by { |depth, _title| depth }.map(&:last)
end
|
|
92
|
+
private_class_method :build_section_path
|
|
70
93
|
end
|
|
71
94
|
end
|
|
72
95
|
end
|
|
@@ -7,6 +7,10 @@ module Legion
|
|
|
7
7
|
module Corpus
|
|
8
8
|
module_function
|
|
9
9
|
|
|
10
|
+
# Returns where the manifest for the corpus at +path+ is stored, as reported
# by Helpers::ManifestStore.store_path.
#
# @param path [String] corpus root
def manifest_path(path:)
  store = Helpers::ManifestStore
  store.store_path(corpus_path: path)
end
|
|
13
|
+
|
|
10
14
|
def corpus_stats(path:, extensions: nil)
|
|
11
15
|
return { success: false, error: 'path does not exist' } unless ::File.exist?(path)
|
|
12
16
|
|
|
@@ -23,23 +23,32 @@ module Legion
|
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
def ingest_corpus(path:, dry_run: false, force: false)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
26
|
+
current = Helpers::Manifest.scan(path: path)
|
|
27
|
+
previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
|
|
28
|
+
delta = Helpers::Manifest.diff(current: current, previous: previous)
|
|
29
|
+
|
|
30
|
+
to_process = delta[:added] + delta[:changed]
|
|
31
|
+
chunks_created = 0
|
|
32
|
+
chunks_skipped = 0
|
|
33
|
+
chunks_updated = 0
|
|
34
|
+
|
|
35
|
+
to_process.each do |file_path|
|
|
36
|
+
result = process_file(file_path, dry_run: dry_run, force: force)
|
|
37
|
+
chunks_created += result[:created]
|
|
38
|
+
chunks_skipped += result[:skipped]
|
|
39
|
+
chunks_updated += result[:updated]
|
|
38
40
|
end
|
|
39
41
|
|
|
42
|
+
delta[:removed].each { |file_path| retire_file(file_path: file_path) } unless dry_run
|
|
43
|
+
|
|
44
|
+
Helpers::ManifestStore.save(corpus_path: path, manifest: current) unless dry_run
|
|
45
|
+
|
|
40
46
|
{
|
|
41
47
|
success: true,
|
|
42
|
-
files_scanned:
|
|
48
|
+
files_scanned: current.size,
|
|
49
|
+
files_added: delta[:added].size,
|
|
50
|
+
files_changed: delta[:changed].size,
|
|
51
|
+
files_removed: delta[:removed].size,
|
|
43
52
|
chunks_created: chunks_created,
|
|
44
53
|
chunks_skipped: chunks_skipped,
|
|
45
54
|
chunks_updated: chunks_updated
|
|
@@ -67,12 +76,18 @@ module Legion
|
|
|
67
76
|
return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)
|
|
68
77
|
|
|
69
78
|
chunks = Helpers::Chunker.chunk(sections: sections)
|
|
79
|
+
paired = if dry_run
|
|
80
|
+
chunks.map { |c| { chunk: c, embedding: nil } }
|
|
81
|
+
else
|
|
82
|
+
batch_embed_chunks(chunks, force: force)
|
|
83
|
+
end
|
|
84
|
+
|
|
70
85
|
created = 0
|
|
71
86
|
skipped = 0
|
|
72
87
|
updated = 0
|
|
73
88
|
|
|
74
|
-
|
|
75
|
-
outcome =
|
|
89
|
+
paired.each do |p|
|
|
90
|
+
outcome = upsert_chunk_with_embedding(p[:chunk], p[:embedding], dry_run: dry_run, force: force, exists: p[:exists] || false)
|
|
76
91
|
case outcome
|
|
77
92
|
when :created then created += 1
|
|
78
93
|
when :skipped then skipped += 1
|
|
@@ -84,21 +99,55 @@ module Legion
|
|
|
84
99
|
end
|
|
85
100
|
private_class_method :process_file
|
|
86
101
|
|
|
87
|
-
def
|
|
88
|
-
|
|
102
|
+
# Pairs every chunk with its embedding vector and an "already stored" flag.
#
# Unless +force+ is set, chunks that already exist are not sent for embedding.
# When batch embedding is unavailable, or anything raises, every chunk is
# paired with a nil embedding instead.
#
# @param chunks [Array<Hash>] chunks carrying :content_hash and :content
# @param force [Boolean] skip the existence lookup and embed every chunk
# @return [Array<Hash>] one { chunk:, embedding:, exists: } hash per chunk, in order
def batch_embed_chunks(chunks, force:)
  known = force ? {} : build_exists_map(chunks)
  return paired_without_embed(chunks, known) unless llm_embed_available?

  pending = if force
              chunks
            else
              chunks.reject { |chunk| known[chunk[:content_hash]] }
            end
  vectors = pending.empty? ? {} : build_embed_map(pending)

  chunks.map do |chunk|
    digest = chunk[:content_hash]
    { chunk: chunk, embedding: vectors[digest], exists: known.fetch(digest, false) }
  end
rescue StandardError
  # Best effort: fall back to ingesting without embeddings or existence info.
  paired_without_embed(chunks, {})
end
|
|
113
|
+
private_class_method :batch_embed_chunks
|
|
93
114
|
|
|
94
|
-
|
|
95
|
-
|
|
115
|
+
# Builds a content_hash => exists lookup by asking +chunk_exists?+ once per chunk.
#
# @param chunks [Array<Hash>] chunks carrying :content_hash
# @return [Hash] content hash mapped to the result of +chunk_exists?+
def build_exists_map(chunks)
  chunks.each_with_object({}) do |chunk, seen|
    digest = chunk[:content_hash]
    seen[digest] = chunk_exists?(digest)
  end
end
|
|
118
|
+
private_class_method :build_exists_map
|
|
119
|
+
|
|
120
|
+
# Tells whether batch embedding can be used: Legion::LLM must be loaded and
# respond to +embed_batch+. Returns nil (falsy) when Legion::LLM is not defined.
def llm_embed_available?
  return unless defined?(Legion::LLM)

  Legion::LLM.respond_to?(:embed_batch)
end
|
|
123
|
+
private_class_method :llm_embed_available?
|
|
124
|
+
|
|
125
|
+
# Pairs each chunk with a nil embedding, keeping the existence flag from
# +exists_map+ (false for hashes the map does not contain).
#
# @param chunks [Array<Hash>] chunks carrying :content_hash
# @param exists_map [Hash] content hash => already-stored flag
# @return [Array<Hash>] one { chunk:, embedding: nil, exists: } hash per chunk
def paired_without_embed(chunks, exists_map)
  chunks.map do |chunk|
    already_stored = exists_map.fetch(chunk[:content_hash], false)
    { chunk: chunk, embedding: nil, exists: already_stored }
  end
end
|
|
128
|
+
private_class_method :paired_without_embed
|
|
96
129
|
|
|
130
|
+
# Embeds the content of +needs_embed+ in one Legion::LLM.embed_batch call and
# maps each returned vector back to its chunk's content hash.
#
# Results flagged with :error are skipped. Results whose :index does not point
# at one of the submitted chunks (missing, non-integer, negative, out of range)
# are skipped individually, so one malformed result no longer discards the
# vectors of every other chunk in the batch, and a negative index can no longer
# attach a vector to the wrong chunk.
#
# @param needs_embed [Array<Hash>] chunks carrying :content and :content_hash
# @return [Hash] content hash => vector; +{}+ if the batch call itself fails
def build_embed_map(needs_embed)
  results = Legion::LLM.embed_batch(needs_embed.map { |c| c[:content] })
  results.each_with_object({}) do |result, vectors|
    next if result[:error]

    index = result[:index]
    chunk = needs_embed[index] if index.is_a?(Integer) && index >= 0
    next unless chunk

    vectors[chunk[:content_hash]] = result[:vector]
  end
rescue StandardError
  {}
end
|
|
138
|
+
private_class_method :build_embed_map
|
|
139
|
+
|
|
140
|
+
# Sends one chunk (with its precomputed embedding) to Apollo and reports the outcome.
#
# @param chunk [Hash] chunk to ingest
# @param embedding [Array, nil] embedding vector, if one was produced
# @param dry_run [Boolean] when true nothing is ingested and :created is reported
# @param force [Boolean] ingest even if the chunk already exists; reported as :updated
# @param exists [Boolean] whether the chunk is already stored
# @return [Symbol] :created, :updated or :skipped (:skipped also covers ingest errors;
#   :created is also returned when the Apollo extension is not loaded)
def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, exists: false)
  if dry_run || !defined?(Legion::Extensions::Apollo)
    :created
  elsif exists && !force
    :skipped
  else
    ingest_to_apollo(chunk, embedding)
    force ? :updated : :created
  end
rescue StandardError
  :skipped
end
|
|
101
|
-
private_class_method :
|
|
150
|
+
private_class_method :upsert_chunk_with_embedding
|
|
102
151
|
|
|
103
152
|
def chunk_exists?(content_hash)
|
|
104
153
|
return false unless defined?(Legion::Data::Model::ApolloEntry)
|
|
@@ -112,16 +161,6 @@ module Legion
|
|
|
112
161
|
end
|
|
113
162
|
private_class_method :chunk_exists?
|
|
114
163
|
|
|
115
|
-
def generate_embedding(content)
|
|
116
|
-
return nil unless defined?(Legion::LLM) && Legion::LLM.respond_to?(:embed)
|
|
117
|
-
|
|
118
|
-
result = Legion::LLM.embed(content)
|
|
119
|
-
result.is_a?(Hash) ? result[:vector] : nil
|
|
120
|
-
rescue StandardError
|
|
121
|
-
nil
|
|
122
|
-
end
|
|
123
|
-
private_class_method :generate_embedding
|
|
124
|
-
|
|
125
164
|
def ingest_to_apollo(chunk, embedding)
|
|
126
165
|
return unless defined?(Legion::Extensions::Apollo)
|
|
127
166
|
|
|
@@ -143,6 +182,21 @@ module Legion
|
|
|
143
182
|
Legion::Extensions::Apollo::Runners::Knowledge.handle_ingest(**payload)
|
|
144
183
|
end
|
|
145
184
|
private_class_method :ingest_to_apollo
|
|
185
|
+
|
|
186
|
+
# Records that a file has disappeared from the corpus by sending a
# 'document_retired' entry through Legion::Apollo.ingest.
#
# Does nothing (returns nil) when Legion::Apollo is not loaded, does not
# respond to +ingest+, or reports that it is not started. Errors are swallowed.
#
# @param file_path [String] path of the removed file
# @return [Object, nil] whatever Legion::Apollo.ingest returns, or nil
def retire_file(file_path:)
  return unless defined?(Legion::Apollo)

  apollo = Legion::Apollo
  return unless apollo.respond_to?(:ingest)
  return unless apollo.started?

  payload = {
    content: file_path,
    content_type: 'document_retired',
    # uniq guards against the path itself being one of the fixed tag names
    tags: [file_path, 'retired', 'document_chunk'].uniq,
    metadata: { source_file: file_path, retired: true }
  }
  apollo.ingest(**payload)
rescue StandardError
  nil
end
|
|
199
|
+
private_class_method :retire_file
|
|
146
200
|
end
|
|
147
201
|
end
|
|
148
202
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative 'knowledge/version'
|
|
4
4
|
require_relative 'knowledge/helpers/manifest'
|
|
5
|
+
require_relative 'knowledge/helpers/manifest_store'
|
|
5
6
|
require_relative 'knowledge/helpers/parser'
|
|
6
7
|
require_relative 'knowledge/helpers/chunker'
|
|
7
8
|
require_relative 'knowledge/runners/ingest'
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-knowledge
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -121,6 +121,7 @@ files:
|
|
|
121
121
|
- lib/legion/extensions/knowledge/client.rb
|
|
122
122
|
- lib/legion/extensions/knowledge/helpers/chunker.rb
|
|
123
123
|
- lib/legion/extensions/knowledge/helpers/manifest.rb
|
|
124
|
+
- lib/legion/extensions/knowledge/helpers/manifest_store.rb
|
|
124
125
|
- lib/legion/extensions/knowledge/helpers/parser.rb
|
|
125
126
|
- lib/legion/extensions/knowledge/runners/corpus.rb
|
|
126
127
|
- lib/legion/extensions/knowledge/runners/ingest.rb
|