lex-knowledge 0.6.10 → 0.6.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/corpus_ingest.rb +5 -1
- data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +7 -12
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +8 -15
- data/lib/legion/extensions/knowledge/helpers/apollo_models.rb +45 -0
- data/lib/legion/extensions/knowledge/helpers/chunker.rb +5 -20
- data/lib/legion/extensions/knowledge/helpers/manifest.rb +3 -6
- data/lib/legion/extensions/knowledge/helpers/manifest_store.rb +10 -5
- data/lib/legion/extensions/knowledge/helpers/parser.rb +3 -0
- data/lib/legion/extensions/knowledge/runners/corpus.rb +3 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +98 -42
- data/lib/legion/extensions/knowledge/runners/maintenance.rb +95 -104
- data/lib/legion/extensions/knowledge/runners/monitor.rb +20 -17
- data/lib/legion/extensions/knowledge/runners/query.rb +155 -17
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +34 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 21c773508b11dfb12b4d07bf2e37705483b82f4100382c29d84d883f3790ce5c
|
|
4
|
+
data.tar.gz: 96c6e3da83bb20eabb5db9f55b8ef4b7715ac08d3110ec4c6680ccf6b62210e4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5e2a5993ca4b4213c1dfb25eff797aaf7b89af3b5bc4c37072683d564bc49db49aad6f30702f47c19bd66c008bcdcc9b9a29b075920b66c03ab237dd4e9070e0
|
|
7
|
+
data.tar.gz: 04f3d33195e5c81c4b48c361170909b6f8a53e28660a0d30370584453a847f8d787443261ffa1b47a383c58aa387281616a29f1a2366c6c46e43308d82284808
|
|
data/lib/legion/extensions/knowledge/actors/corpus_ingest.rb
CHANGED
|
@@ -5,6 +5,9 @@ module Legion
|
|
|
5
5
|
module Knowledge
|
|
6
6
|
module Actor
|
|
7
7
|
class CorpusIngest < Legion::Extensions::Actors::Subscription
|
|
8
|
+
include Legion::Logging::Helper
|
|
9
|
+
include Legion::Settings::Helper
|
|
10
|
+
|
|
8
11
|
def runner_class = 'Legion::Extensions::Knowledge::Runners::Ingest'
|
|
9
12
|
def runner_function = 'ingest_file'
|
|
10
13
|
def check_subtask? = false
|
|
@@ -13,7 +16,8 @@ module Legion
|
|
|
13
16
|
def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects
|
|
14
17
|
Legion.const_defined?(:Transport, false) &&
|
|
15
18
|
defined?(Legion::Extensions::Knowledge::Runners::Ingest)
|
|
16
|
-
rescue StandardError =>
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
handle_exception(e, level: :warn, operation: 'knowledge.corpus_ingest.enabled')
|
|
17
21
|
false
|
|
18
22
|
end
|
|
19
23
|
end
|
|
data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb
CHANGED
|
@@ -5,26 +5,25 @@ module Legion
|
|
|
5
5
|
module Knowledge
|
|
6
6
|
module Actor
|
|
7
7
|
class CorpusWatcher < Legion::Extensions::Actors::Every # rubocop:disable Legion/Extension/EveryActorRequiresTime
|
|
8
|
+
include Legion::Logging::Helper
|
|
9
|
+
include Legion::Settings::Helper
|
|
10
|
+
|
|
8
11
|
def runner_class = 'Legion::Extensions::Knowledge::Runners::Ingest'
|
|
9
12
|
def runner_function = 'ingest_corpus'
|
|
10
13
|
def check_subtask? = false
|
|
11
14
|
def generate_task? = false
|
|
12
15
|
|
|
13
16
|
def time
|
|
14
|
-
|
|
15
|
-
Legion::Settings.dig(:knowledge, :actors, :watcher_interval) || 300
|
|
16
|
-
else
|
|
17
|
-
300
|
|
18
|
-
end
|
|
17
|
+
settings[:actors][:watcher_interval]
|
|
19
18
|
rescue StandardError => e
|
|
20
|
-
|
|
19
|
+
handle_exception(e, level: :warn, operation: 'knowledge.corpus_watcher.time')
|
|
21
20
|
300
|
|
22
21
|
end
|
|
23
22
|
|
|
24
23
|
def enabled? # rubocop:disable Legion/Extension/ActorEnabledSideEffects
|
|
25
24
|
resolve_monitors.any?
|
|
26
25
|
rescue StandardError => e
|
|
27
|
-
|
|
26
|
+
handle_exception(e, level: :warn, operation: 'knowledge.corpus_watcher.enabled')
|
|
28
27
|
false
|
|
29
28
|
end
|
|
30
29
|
|
|
@@ -34,14 +33,10 @@ module Legion
|
|
|
34
33
|
|
|
35
34
|
private
|
|
36
35
|
|
|
37
|
-
def log
|
|
38
|
-
Legion::Logging
|
|
39
|
-
end
|
|
40
|
-
|
|
41
36
|
def resolve_monitors
|
|
42
37
|
Runners::Monitor.resolve_monitors
|
|
43
38
|
rescue StandardError => e
|
|
44
|
-
|
|
39
|
+
handle_exception(e, level: :warn, operation: 'knowledge.corpus_watcher.resolve_monitors')
|
|
45
40
|
[]
|
|
46
41
|
end
|
|
47
42
|
end
|
|
data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb
CHANGED
|
@@ -5,19 +5,18 @@ module Legion
|
|
|
5
5
|
module Knowledge
|
|
6
6
|
module Actor
|
|
7
7
|
class MaintenanceRunner < Legion::Extensions::Actors::Every # rubocop:disable Legion/Extension/EveryActorRequiresTime
|
|
8
|
+
include Legion::Logging::Helper
|
|
9
|
+
include Legion::Settings::Helper
|
|
10
|
+
|
|
8
11
|
def runner_class = 'Legion::Extensions::Knowledge::Runners::Maintenance'
|
|
9
12
|
def runner_function = 'health'
|
|
10
13
|
def check_subtask? = false
|
|
11
14
|
def generate_task? = false
|
|
12
15
|
|
|
13
16
|
def time
|
|
14
|
-
|
|
15
|
-
Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || 21_600
|
|
16
|
-
else
|
|
17
|
-
21_600
|
|
18
|
-
end
|
|
17
|
+
settings[:actors][:maintenance_interval]
|
|
19
18
|
rescue StandardError => e
|
|
20
|
-
|
|
19
|
+
handle_exception(e, level: :warn, operation: 'knowledge.maintenance_runner.time')
|
|
21
20
|
21_600
|
|
22
21
|
end
|
|
23
22
|
|
|
@@ -26,7 +25,7 @@ module Legion
|
|
|
26
25
|
|
|
27
26
|
true
|
|
28
27
|
rescue StandardError => e
|
|
29
|
-
|
|
28
|
+
handle_exception(e, level: :warn, operation: 'knowledge.maintenance_runner.enabled')
|
|
30
29
|
false
|
|
31
30
|
end
|
|
32
31
|
|
|
@@ -36,16 +35,10 @@ module Legion
|
|
|
36
35
|
|
|
37
36
|
private
|
|
38
37
|
|
|
39
|
-
def log
|
|
40
|
-
Legion::Logging
|
|
41
|
-
end
|
|
42
|
-
|
|
43
38
|
def corpus_path
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
39
|
+
settings[:corpus_path]
|
|
47
40
|
rescue StandardError => e
|
|
48
|
-
|
|
41
|
+
handle_exception(e, level: :warn, operation: 'knowledge.maintenance_runner.corpus_path')
|
|
49
42
|
nil
|
|
50
43
|
end
|
|
51
44
|
end
|
|
data/lib/legion/extensions/knowledge/helpers/apollo_models.rb
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Knowledge
|
|
6
|
+
module Helpers
|
|
7
|
+
module ApolloModels
|
|
8
|
+
class << self
|
|
9
|
+
def entry
|
|
10
|
+
namespaced_apollo_model(:Entry) || legacy_model(:ApolloEntry)
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def access_log
|
|
14
|
+
namespaced_apollo_model(:AccessLog) || legacy_model(:ApolloAccessLog)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def entry_available?
|
|
18
|
+
!entry.nil?
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def access_log_available?
|
|
22
|
+
!access_log.nil?
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
|
|
27
|
+
def namespaced_apollo_model(name)
|
|
28
|
+
return nil unless defined?(Legion::Data::Model::Apollo)
|
|
29
|
+
return nil unless Legion::Data::Model::Apollo.const_defined?(name, false)
|
|
30
|
+
|
|
31
|
+
Legion::Data::Model::Apollo.const_get(name, false)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
def legacy_model(name)
|
|
35
|
+
return nil unless defined?(Legion::Data::Model)
|
|
36
|
+
return nil unless Legion::Data::Model.const_defined?(name, false)
|
|
37
|
+
|
|
38
|
+
Legion::Data::Model.const_get(name, false)
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
data/lib/legion/extensions/knowledge/helpers/chunker.rb
CHANGED
|
@@ -7,13 +7,16 @@ module Legion
|
|
|
7
7
|
module Knowledge
|
|
8
8
|
module Helpers
|
|
9
9
|
module Chunker
|
|
10
|
+
extend Legion::Logging::Helper
|
|
11
|
+
extend Legion::Settings::Helper
|
|
12
|
+
|
|
10
13
|
CHARS_PER_TOKEN = 4
|
|
11
14
|
|
|
12
15
|
module_function
|
|
13
16
|
|
|
14
17
|
def chunk(sections:, max_tokens: nil, overlap_tokens: nil)
|
|
15
|
-
resolved_max = max_tokens ||
|
|
16
|
-
resolved_overlap = overlap_tokens ||
|
|
18
|
+
resolved_max = max_tokens || settings[:chunker][:max_tokens]
|
|
19
|
+
resolved_overlap = overlap_tokens || settings[:chunker][:overlap_tokens]
|
|
17
20
|
|
|
18
21
|
max_chars = resolved_max * CHARS_PER_TOKEN
|
|
19
22
|
overlap_chars = resolved_overlap * CHARS_PER_TOKEN
|
|
@@ -89,24 +92,6 @@ module Legion
|
|
|
89
92
|
end
|
|
90
93
|
end
|
|
91
94
|
private_class_method :apollo_compatible_content_hash
|
|
92
|
-
|
|
93
|
-
def settings_max_tokens
|
|
94
|
-
return nil unless defined?(Legion::Settings)
|
|
95
|
-
|
|
96
|
-
Legion::Settings.dig(:knowledge, :chunker, :max_tokens)
|
|
97
|
-
rescue StandardError => _e
|
|
98
|
-
nil
|
|
99
|
-
end
|
|
100
|
-
private_class_method :settings_max_tokens
|
|
101
|
-
|
|
102
|
-
def settings_overlap_tokens
|
|
103
|
-
return nil unless defined?(Legion::Settings)
|
|
104
|
-
|
|
105
|
-
Legion::Settings.dig(:knowledge, :chunker, :overlap_tokens)
|
|
106
|
-
rescue StandardError => _e
|
|
107
|
-
nil
|
|
108
|
-
end
|
|
109
|
-
private_class_method :settings_overlap_tokens
|
|
110
95
|
end
|
|
111
96
|
end
|
|
112
97
|
end
|
|
data/lib/legion/extensions/knowledge/helpers/manifest.rb
CHANGED
|
@@ -7,6 +7,8 @@ module Legion
|
|
|
7
7
|
module Knowledge
|
|
8
8
|
module Helpers
|
|
9
9
|
module Manifest
|
|
10
|
+
extend Legion::Logging::Helper
|
|
11
|
+
|
|
10
12
|
module_function
|
|
11
13
|
|
|
12
14
|
def scan(path:, extensions: %w[.md .txt .docx .pdf])
|
|
@@ -25,15 +27,10 @@ module Legion
|
|
|
25
27
|
results << build_entry(entry)
|
|
26
28
|
end
|
|
27
29
|
rescue Errno::EPERM, Errno::EACCES, Errno::ELOOP, Errno::ENOENT => e
|
|
28
|
-
|
|
30
|
+
handle_exception(e, level: :warn, operation: 'knowledge.manifest.walk', entry: entry)
|
|
29
31
|
end
|
|
30
32
|
private_class_method :walk
|
|
31
33
|
|
|
32
|
-
def log
|
|
33
|
-
Legion::Logging
|
|
34
|
-
end
|
|
35
|
-
private_class_method :log
|
|
36
|
-
|
|
37
34
|
def diff(current:, previous:)
|
|
38
35
|
current_map = current.to_h { |e| [e[:path], e[:sha256]] }
|
|
39
36
|
previous_map = previous.to_h { |e| [e[:path], e[:sha256]] }
|
|
data/lib/legion/extensions/knowledge/helpers/manifest_store.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'digest'
|
|
4
4
|
require 'fileutils'
|
|
5
|
-
require 'json'
|
|
5
|
+
require 'legion/json'
|
|
6
6
|
require 'tempfile'
|
|
7
7
|
|
|
8
8
|
module Legion
|
|
@@ -10,6 +10,9 @@ module Legion
|
|
|
10
10
|
module Knowledge
|
|
11
11
|
module Helpers
|
|
12
12
|
module ManifestStore
|
|
13
|
+
extend Legion::Logging::Helper
|
|
14
|
+
extend Legion::JSON::Helper
|
|
15
|
+
|
|
13
16
|
module_function
|
|
14
17
|
|
|
15
18
|
STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze
|
|
@@ -19,8 +22,9 @@ module Legion
|
|
|
19
22
|
return [] unless ::File.exist?(path)
|
|
20
23
|
|
|
21
24
|
raw = ::File.read(path, encoding: 'utf-8')
|
|
22
|
-
|
|
23
|
-
rescue StandardError =>
|
|
25
|
+
json_parse(raw)
|
|
26
|
+
rescue StandardError => e
|
|
27
|
+
handle_exception(e, level: :warn, operation: 'knowledge.manifest_store.load', corpus_path: corpus_path)
|
|
24
28
|
[]
|
|
25
29
|
end
|
|
26
30
|
|
|
@@ -28,10 +32,11 @@ module Legion
|
|
|
28
32
|
::FileUtils.mkdir_p(STORE_DIR)
|
|
29
33
|
path = store_path(corpus_path: corpus_path)
|
|
30
34
|
tmp = "#{path}.tmp"
|
|
31
|
-
::File.write(tmp,
|
|
35
|
+
::File.write(tmp, json_generate(manifest.map { |e| serialize_entry(e) }))
|
|
32
36
|
::File.rename(tmp, path)
|
|
33
37
|
true
|
|
34
|
-
rescue StandardError =>
|
|
38
|
+
rescue StandardError => e
|
|
39
|
+
handle_exception(e, level: :warn, operation: 'knowledge.manifest_store.save', corpus_path: corpus_path)
|
|
35
40
|
false
|
|
36
41
|
end
|
|
37
42
|
|
|
data/lib/legion/extensions/knowledge/helpers/parser.rb
CHANGED
|
@@ -5,6 +5,8 @@ module Legion
|
|
|
5
5
|
module Knowledge
|
|
6
6
|
module Helpers
|
|
7
7
|
module Parser
|
|
8
|
+
extend Legion::Logging::Helper
|
|
9
|
+
|
|
8
10
|
module_function
|
|
9
11
|
|
|
10
12
|
def parse(file_path:)
|
|
@@ -57,6 +59,7 @@ module Legion
|
|
|
57
59
|
heading = ::File.basename(file_path, '.*')
|
|
58
60
|
[{ heading: heading, section_path: [], content: result[:text].strip, source_file: file_path }]
|
|
59
61
|
rescue StandardError => e
|
|
62
|
+
handle_exception(e, level: :warn, operation: 'knowledge.parser.extract_via_data', file_path: file_path)
|
|
60
63
|
[{ error: 'extraction_failed', source_file: file_path, detail: e.message }]
|
|
61
64
|
end
|
|
62
65
|
|
|
data/lib/legion/extensions/knowledge/runners/corpus.rb
CHANGED
|
@@ -5,6 +5,8 @@ module Legion
|
|
|
5
5
|
module Knowledge
|
|
6
6
|
module Runners
|
|
7
7
|
module Corpus # rubocop:disable Legion/Extension/RunnerIncludeHelpers
|
|
8
|
+
extend Legion::Logging::Helper
|
|
9
|
+
|
|
8
10
|
module_function
|
|
9
11
|
|
|
10
12
|
def manifest_path(path:)
|
|
@@ -32,6 +34,7 @@ module Legion
|
|
|
32
34
|
total_bytes: entries.sum { |e| e[:size] }
|
|
33
35
|
}
|
|
34
36
|
rescue StandardError => e
|
|
37
|
+
handle_exception(e, level: :warn, operation: 'knowledge.corpus.corpus_stats', path: path)
|
|
35
38
|
{ success: false, error: e.message }
|
|
36
39
|
end
|
|
37
40
|
end
|
|
data/lib/legion/extensions/knowledge/runners/ingest.rb
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../helpers/apollo_models'
|
|
4
|
+
|
|
3
5
|
require 'securerandom'
|
|
4
6
|
|
|
5
7
|
module Legion
|
|
@@ -7,12 +9,10 @@ module Legion
|
|
|
7
9
|
module Knowledge
|
|
8
10
|
module Runners
|
|
9
11
|
module Ingest # rubocop:disable Legion/Extension/RunnerIncludeHelpers
|
|
10
|
-
|
|
12
|
+
extend Legion::Logging::Helper
|
|
13
|
+
extend Legion::Settings::Helper
|
|
11
14
|
|
|
12
|
-
|
|
13
|
-
Legion::Logging
|
|
14
|
-
end
|
|
15
|
-
private_class_method :log
|
|
15
|
+
module_function
|
|
16
16
|
|
|
17
17
|
def scan_corpus(path:, extensions: nil)
|
|
18
18
|
opts = { path: path }
|
|
@@ -29,17 +29,27 @@ module Legion
|
|
|
29
29
|
}
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
|
|
33
|
-
|
|
32
|
+
FILTER_SCHEMA = {
|
|
33
|
+
type: 'object',
|
|
34
|
+
properties: {
|
|
35
|
+
relevant: { type: 'boolean' },
|
|
36
|
+
confidence: { type: 'number' },
|
|
37
|
+
reason: { type: 'string' }
|
|
38
|
+
},
|
|
39
|
+
required: %w[relevant confidence]
|
|
40
|
+
}.freeze
|
|
41
|
+
|
|
42
|
+
def ingest_corpus(path: nil, monitors: nil, dry_run: false, force: false, filter: true)
|
|
43
|
+
return ingest_monitors(monitors: monitors, dry_run: dry_run, force: force, filter: filter) if monitors&.any?
|
|
34
44
|
raise ArgumentError, 'path is required when monitors is not provided' if path.nil?
|
|
35
45
|
|
|
36
|
-
ingest_corpus_path(path: path, dry_run: dry_run, force: force)
|
|
46
|
+
ingest_corpus_path(path: path, dry_run: dry_run, force: force, filter: filter)
|
|
37
47
|
rescue ArgumentError => e
|
|
38
|
-
|
|
48
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_corpus')
|
|
39
49
|
{ success: false, error: e.message }
|
|
40
50
|
end
|
|
41
51
|
|
|
42
|
-
def ingest_corpus_path(path:, dry_run: false, force: false)
|
|
52
|
+
def ingest_corpus_path(path:, dry_run: false, force: false, filter: true)
|
|
43
53
|
current = Helpers::Manifest.scan(path: path)
|
|
44
54
|
previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
|
|
45
55
|
delta = Helpers::Manifest.diff(current: current, previous: previous)
|
|
@@ -50,7 +60,7 @@ module Legion
|
|
|
50
60
|
chunks_updated = 0
|
|
51
61
|
|
|
52
62
|
to_process.each do |file_path|
|
|
53
|
-
result = process_file(file_path, dry_run: dry_run, force: force)
|
|
63
|
+
result = process_file(file_path, dry_run: dry_run, force: force, filter: filter)
|
|
54
64
|
chunks_created += result[:created]
|
|
55
65
|
chunks_skipped += result[:skipped]
|
|
56
66
|
chunks_updated += result[:updated]
|
|
@@ -71,16 +81,16 @@ module Legion
|
|
|
71
81
|
chunks_updated: chunks_updated
|
|
72
82
|
}
|
|
73
83
|
rescue StandardError => e
|
|
74
|
-
|
|
84
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_corpus_path', path: path)
|
|
75
85
|
{ success: false, error: e.message }
|
|
76
86
|
end
|
|
77
87
|
private_class_method :ingest_corpus_path
|
|
78
88
|
|
|
79
|
-
def ingest_monitors(monitors:, dry_run: false, force: false)
|
|
89
|
+
def ingest_monitors(monitors:, dry_run: false, force: false, filter: true)
|
|
80
90
|
results = monitors.map do |monitor|
|
|
81
|
-
ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force)
|
|
91
|
+
ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force, filter: filter)
|
|
82
92
|
rescue StandardError => e
|
|
83
|
-
|
|
93
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_monitor', path: monitor[:path])
|
|
84
94
|
{ success: false, path: monitor[:path], error: e.message }
|
|
85
95
|
end
|
|
86
96
|
|
|
@@ -101,7 +111,7 @@ module Legion
|
|
|
101
111
|
|
|
102
112
|
{ success: true, monitors_processed: results.size, **total }
|
|
103
113
|
rescue StandardError => e
|
|
104
|
-
|
|
114
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_monitors')
|
|
105
115
|
{ success: false, error: e.message }
|
|
106
116
|
end
|
|
107
117
|
private_class_method :ingest_monitors
|
|
@@ -114,17 +124,17 @@ module Legion
|
|
|
114
124
|
section_path: [source_type.to_s],
|
|
115
125
|
source_file: source_path
|
|
116
126
|
}
|
|
117
|
-
chunks = Helpers::Chunker.chunk(sections: [section])
|
|
127
|
+
chunks = filter_chunks(Helpers::Chunker.chunk(sections: [section]), filter: true)
|
|
118
128
|
paired = batch_embed_chunks(chunks, force: false)
|
|
119
129
|
paired.each { |p| upsert_chunk_with_embedding(p[:chunk], p[:embedding], force: false, exists: p[:exists] || false) }
|
|
120
130
|
{ status: :ingested, chunks: chunks.size, source_type: source_type, metadata: metadata }
|
|
121
131
|
rescue StandardError => e
|
|
122
|
-
|
|
132
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_content', source_type: source_type)
|
|
123
133
|
{ status: :failed, error: e.message, source_type: source_type, metadata: metadata }
|
|
124
134
|
end
|
|
125
135
|
|
|
126
|
-
def ingest_file(file_path:, force: false)
|
|
127
|
-
result = process_file(file_path, dry_run: false, force: force)
|
|
136
|
+
def ingest_file(file_path:, force: false, filter: true)
|
|
137
|
+
result = process_file(file_path, dry_run: false, force: force, filter: filter)
|
|
128
138
|
|
|
129
139
|
{
|
|
130
140
|
success: true,
|
|
@@ -134,23 +144,24 @@ module Legion
|
|
|
134
144
|
chunks_updated: result[:updated]
|
|
135
145
|
}
|
|
136
146
|
rescue StandardError => e
|
|
137
|
-
|
|
147
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.ingest_file', file_path: file_path)
|
|
138
148
|
{ success: false, error: e.message }
|
|
139
149
|
end
|
|
140
150
|
|
|
141
|
-
def process_file(file_path, dry_run: false, force: false)
|
|
151
|
+
def process_file(file_path, dry_run: false, force: false, filter: true)
|
|
142
152
|
sections = Helpers::Parser.parse(file_path: file_path)
|
|
143
153
|
return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)
|
|
144
154
|
|
|
145
|
-
chunks
|
|
155
|
+
chunks = Helpers::Chunker.chunk(sections: sections)
|
|
156
|
+
filtered_chunks = filter_chunks(chunks, filter: filter)
|
|
146
157
|
paired = if dry_run
|
|
147
|
-
|
|
158
|
+
filtered_chunks.map { |c| { chunk: c, embedding: nil } }
|
|
148
159
|
else
|
|
149
|
-
batch_embed_chunks(
|
|
160
|
+
batch_embed_chunks(filtered_chunks, force: force)
|
|
150
161
|
end
|
|
151
162
|
|
|
152
163
|
created = 0
|
|
153
|
-
skipped =
|
|
164
|
+
skipped = chunks.size - filtered_chunks.size
|
|
154
165
|
updated = 0
|
|
155
166
|
|
|
156
167
|
paired.each do |p|
|
|
@@ -166,6 +177,49 @@ module Legion
|
|
|
166
177
|
end
|
|
167
178
|
private_class_method :process_file
|
|
168
179
|
|
|
180
|
+
def filter_chunks(chunks, filter:)
|
|
181
|
+
return chunks unless filter
|
|
182
|
+
|
|
183
|
+
prompt = settings[:ingest][:filter_prompt]
|
|
184
|
+
return chunks if prompt.to_s.strip.empty? || !llm_structured_available?
|
|
185
|
+
|
|
186
|
+
chunks.select { |chunk| chunk_allowed_by_filter?(chunk, prompt: prompt, threshold: settings[:ingest][:filter_threshold]) }
|
|
187
|
+
rescue StandardError => e
|
|
188
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.filter_chunks')
|
|
189
|
+
chunks
|
|
190
|
+
end
|
|
191
|
+
private_class_method :filter_chunks
|
|
192
|
+
|
|
193
|
+
def chunk_allowed_by_filter?(chunk, prompt:, threshold:)
|
|
194
|
+
hash = chunk[:content_hash] || Helpers::Chunker.send(:apollo_compatible_content_hash, chunk[:content].to_s)
|
|
195
|
+
return filter_cache[hash] if filter_cache.key?(hash)
|
|
196
|
+
|
|
197
|
+
result = Legion::LLM.structured( # rubocop:disable Legion/HelperMigration/DirectLlm
|
|
198
|
+
messages: [
|
|
199
|
+
{ role: 'system', content: prompt },
|
|
200
|
+
{ role: 'user', content: chunk[:content].to_s }
|
|
201
|
+
],
|
|
202
|
+
schema: FILTER_SCHEMA,
|
|
203
|
+
caller: { extension: 'lex-knowledge', runner: 'ingest', operation: 'filter_chunk' }
|
|
204
|
+
)
|
|
205
|
+
data = result.is_a?(Hash) ? (result[:data] || result) : {}
|
|
206
|
+
filter_cache[hash] = data[:relevant] == true && data[:confidence].to_f >= threshold.to_f
|
|
207
|
+
rescue StandardError => e
|
|
208
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.filter_chunk', content_hash: hash)
|
|
209
|
+
filter_cache[hash] = true
|
|
210
|
+
end
|
|
211
|
+
private_class_method :chunk_allowed_by_filter?
|
|
212
|
+
|
|
213
|
+
def filter_cache
|
|
214
|
+
Thread.current[:lex_knowledge_filter_cache] ||= {}
|
|
215
|
+
end
|
|
216
|
+
private_class_method :filter_cache
|
|
217
|
+
|
|
218
|
+
def llm_structured_available?
|
|
219
|
+
defined?(Legion::LLM) && Legion::LLM.respond_to?(:structured)
|
|
220
|
+
end
|
|
221
|
+
private_class_method :llm_structured_available?
|
|
222
|
+
|
|
169
223
|
def batch_embed_chunks(chunks, force:)
|
|
170
224
|
exists_map = force ? {} : build_exists_map(chunks)
|
|
171
225
|
return paired_without_embed(chunks, exists_map) unless llm_embed_available?
|
|
@@ -175,7 +229,7 @@ module Legion
|
|
|
175
229
|
|
|
176
230
|
chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } }
|
|
177
231
|
rescue StandardError => e
|
|
178
|
-
|
|
232
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.batch_embed_chunks')
|
|
179
233
|
paired_without_embed(chunks, {})
|
|
180
234
|
end
|
|
181
235
|
private_class_method :batch_embed_chunks
|
|
@@ -201,7 +255,7 @@ module Legion
|
|
|
201
255
|
h[needs_embed[r[:index]][:content_hash]] = r[:vector] unless r[:error]
|
|
202
256
|
end
|
|
203
257
|
rescue StandardError => e
|
|
204
|
-
|
|
258
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.build_embed_map')
|
|
205
259
|
{}
|
|
206
260
|
end
|
|
207
261
|
private_class_method :build_embed_map
|
|
@@ -228,19 +282,19 @@ module Legion
|
|
|
228
282
|
end
|
|
229
283
|
force ? :updated : :created
|
|
230
284
|
rescue StandardError => e
|
|
231
|
-
|
|
285
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.upsert_chunk', content_hash: chunk[:content_hash]&.slice(0, 12))
|
|
232
286
|
:skipped
|
|
233
287
|
end
|
|
234
288
|
private_class_method :upsert_chunk_with_embedding
|
|
235
289
|
|
|
236
290
|
def chunk_exists?(content_hash)
|
|
237
|
-
return false unless
|
|
291
|
+
return false unless Helpers::ApolloModels.entry_available?
|
|
238
292
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
293
|
+
Helpers::ApolloModels.entry
|
|
294
|
+
.where(content_hash: content_hash)
|
|
295
|
+
.any?
|
|
242
296
|
rescue StandardError => e
|
|
243
|
-
|
|
297
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.chunk_exists', content_hash: content_hash)
|
|
244
298
|
false
|
|
245
299
|
end
|
|
246
300
|
private_class_method :chunk_exists?
|
|
@@ -248,18 +302,20 @@ module Legion
|
|
|
248
302
|
def ingest_to_apollo(chunk, embedding)
|
|
249
303
|
return unless defined?(Legion::Extensions::Apollo)
|
|
250
304
|
|
|
305
|
+
context = {
|
|
306
|
+
source_file: chunk[:source_file],
|
|
307
|
+
heading: chunk[:heading],
|
|
308
|
+
section_path: chunk[:section_path],
|
|
309
|
+
chunk_index: chunk[:chunk_index],
|
|
310
|
+
token_count: chunk[:token_count]
|
|
311
|
+
}
|
|
251
312
|
payload = {
|
|
252
313
|
content: chunk[:content],
|
|
253
314
|
content_type: 'document_chunk',
|
|
254
315
|
content_hash: chunk[:content_hash],
|
|
255
316
|
tags: [chunk[:source_file], chunk[:heading], 'document_chunk'].compact.uniq,
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
heading: chunk[:heading],
|
|
259
|
-
section_path: chunk[:section_path],
|
|
260
|
-
chunk_index: chunk[:chunk_index],
|
|
261
|
-
token_count: chunk[:token_count]
|
|
262
|
-
}
|
|
317
|
+
context: context,
|
|
318
|
+
metadata: context
|
|
263
319
|
}
|
|
264
320
|
payload[:embedding] = embedding if embedding
|
|
265
321
|
|
|
@@ -278,7 +334,7 @@ module Legion
|
|
|
278
334
|
metadata: { source_file: file_path, retired: true }
|
|
279
335
|
)
|
|
280
336
|
rescue StandardError => e
|
|
281
|
-
|
|
337
|
+
handle_exception(e, level: :warn, operation: 'knowledge.ingest.retire_file', file_path: file_path)
|
|
282
338
|
nil
|
|
283
339
|
end
|
|
284
340
|
private_class_method :retire_file
|