lex-knowledge 0.6.10 → 0.6.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/corpus_ingest.rb +5 -1
- data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +7 -12
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +8 -15
- data/lib/legion/extensions/knowledge/helpers/apollo_models.rb +45 -0
- data/lib/legion/extensions/knowledge/helpers/chunker.rb +5 -20
- data/lib/legion/extensions/knowledge/helpers/manifest.rb +3 -6
- data/lib/legion/extensions/knowledge/helpers/manifest_store.rb +10 -5
- data/lib/legion/extensions/knowledge/helpers/parser.rb +3 -0
- data/lib/legion/extensions/knowledge/runners/corpus.rb +3 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +98 -42
- data/lib/legion/extensions/knowledge/runners/maintenance.rb +95 -104
- data/lib/legion/extensions/knowledge/runners/monitor.rb +20 -17
- data/lib/legion/extensions/knowledge/runners/query.rb +155 -17
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +34 -0
- metadata +2 -1
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../helpers/apollo_models'
|
|
4
|
+
|
|
3
5
|
require 'digest'
|
|
4
6
|
|
|
5
7
|
module Legion
|
|
@@ -7,13 +9,23 @@ module Legion
|
|
|
7
9
|
module Knowledge
|
|
8
10
|
module Runners
|
|
9
11
|
module Query # rubocop:disable Legion/Extension/RunnerIncludeHelpers
|
|
12
|
+
extend Legion::Logging::Helper
|
|
13
|
+
extend Legion::JSON::Helper
|
|
14
|
+
extend Legion::Settings::Helper
|
|
15
|
+
|
|
10
16
|
module_function
|
|
11
17
|
|
|
12
|
-
def query(question:, top_k: nil, synthesize: true)
|
|
18
|
+
def query(question:, top_k: nil, synthesize: true, expand_neighbors: false, neighbor_radius: nil)
|
|
13
19
|
started = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
|
|
14
|
-
resolved_k
|
|
20
|
+
resolved_k = top_k || settings[:query][:top_k]
|
|
21
|
+
resolved_radius = resolve_neighbor_radius(neighbor_radius)
|
|
15
22
|
|
|
16
|
-
chunks = retrieve_chunks(
|
|
23
|
+
chunks = retrieve_chunks(
|
|
24
|
+
question,
|
|
25
|
+
resolved_k,
|
|
26
|
+
expand_neighbors: expand_neighbors,
|
|
27
|
+
neighbor_radius: resolved_radius
|
|
28
|
+
)
|
|
17
29
|
|
|
18
30
|
answer = (synthesize_answer(question, chunks) if synthesize && llm_available?)
|
|
19
31
|
|
|
@@ -36,12 +48,19 @@ module Legion
|
|
|
36
48
|
metadata: build_metadata(chunks, score, latency_ms)
|
|
37
49
|
}
|
|
38
50
|
rescue StandardError => e
|
|
51
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.query')
|
|
39
52
|
{ success: false, error: e.message }
|
|
40
53
|
end
|
|
41
54
|
|
|
42
|
-
def retrieve(question:, top_k: nil)
|
|
43
|
-
resolved_k
|
|
44
|
-
|
|
55
|
+
def retrieve(question:, top_k: nil, expand_neighbors: false, neighbor_radius: nil)
|
|
56
|
+
resolved_k = top_k || settings[:query][:top_k]
|
|
57
|
+
resolved_radius = resolve_neighbor_radius(neighbor_radius)
|
|
58
|
+
chunks = retrieve_chunks(
|
|
59
|
+
question,
|
|
60
|
+
resolved_k,
|
|
61
|
+
expand_neighbors: expand_neighbors,
|
|
62
|
+
neighbor_radius: resolved_radius
|
|
63
|
+
)
|
|
45
64
|
|
|
46
65
|
{
|
|
47
66
|
success: true,
|
|
@@ -49,6 +68,7 @@ module Legion
|
|
|
49
68
|
metadata: build_metadata(chunks, average_score(chunks))
|
|
50
69
|
}
|
|
51
70
|
rescue StandardError => e
|
|
71
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.retrieve')
|
|
52
72
|
{ success: false, error: e.message }
|
|
53
73
|
end
|
|
54
74
|
|
|
@@ -63,10 +83,11 @@ module Legion
|
|
|
63
83
|
)
|
|
64
84
|
{ success: true, question_hash: question_hash, rating: rating }
|
|
65
85
|
rescue StandardError => e
|
|
86
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.record_feedback')
|
|
66
87
|
{ success: false, error: e.message }
|
|
67
88
|
end
|
|
68
89
|
|
|
69
|
-
def retrieve_chunks(question, top_k)
|
|
90
|
+
def retrieve_chunks(question, top_k, expand_neighbors: false, neighbor_radius: 1)
|
|
70
91
|
return [] unless defined?(Legion::Extensions::Apollo)
|
|
71
92
|
|
|
72
93
|
result = Legion::Extensions::Apollo::Runners::Knowledge.retrieve_relevant(
|
|
@@ -74,12 +95,81 @@ module Legion
|
|
|
74
95
|
limit: top_k,
|
|
75
96
|
tags: ['document_chunk']
|
|
76
97
|
)
|
|
77
|
-
result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : []
|
|
78
|
-
|
|
98
|
+
chunks = result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : []
|
|
99
|
+
expand_neighbors ? expand_neighbor_chunks(chunks, neighbor_radius) : chunks
|
|
100
|
+
rescue StandardError => e
|
|
101
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.retrieve_chunks')
|
|
79
102
|
[]
|
|
80
103
|
end
|
|
81
104
|
private_class_method :retrieve_chunks
|
|
82
105
|
|
|
106
|
+
# Widens a list of retrieved chunks with the stored chunks that surround them.
#
# The input is returned untouched when it is empty, when the radius is not a
# positive integer, or when Helpers::ApolloModels.entry_available? is false.
# Any StandardError raised while expanding is reported through
# handle_exception and the original, unexpanded list is returned.
#
# @param chunks [Array<Hash>] chunks returned by retrieval
# @param neighbor_radius [#to_i] number of neighbors to pull in on each side
# @return [Array<Hash>] de-duplicated chunks including their neighbors
def expand_neighbor_chunks(chunks, neighbor_radius)
  return chunks if chunks.empty?

  span = neighbor_radius.to_i
  return chunks if !span.positive? || !Helpers::ApolloModels.entry_available?

  windows = chunks.flat_map { |hit| neighbor_window_for(hit, span) }
  merge_neighbor_chunks(windows)
rescue StandardError => e
  handle_exception(e, level: :warn, operation: 'knowledge.query.expand_neighbor_chunks')
  chunks
end
|
|
117
|
+
private_class_method :expand_neighbor_chunks
|
|
118
|
+
|
|
119
|
+
# Builds the window of stored chunks surrounding one retrieved chunk.
#
# The chunk's source file and index are resolved through chunk_context; every
# stored chunk of that file whose index lies within +radius+ of it is loaded
# via neighbor_dataset and converted with chunk_from_entry. The stored copy of
# the retrieved chunk itself carries no relevance information, so the
# retrieved chunk's :distance / :score are copied onto it; otherwise the hit
# would lose its score as soon as expansion is enabled.
#
# @param chunk [Hash] a retrieved chunk
# @param radius [Integer] number of neighbors to include on each side
# @return [Array<Hash>] the window ordered by chunk index; +[chunk]+ when the
#   chunk's source file or index is unknown, or when the lookup raises
def neighbor_window_for(chunk, radius)
  context = chunk_context(chunk)
  source_file = context[:source_file]
  return [chunk] if !source_file || context[:chunk_index].nil?

  center = context[:chunk_index].to_i
  hit_key = chunk_dedupe_key(chunk)
  relevance = chunk.slice(:distance, :score).compact
  hit_found = false

  rows = neighbor_dataset(source_file, center - radius, center + radius).all.map do |entry|
    row = chunk_from_entry(entry)
    next row unless chunk_dedupe_key(row) == hit_key

    # This row is the hit itself: keep the stored fields, restore its relevance.
    hit_found = true
    row.merge(relevance)
  end

  # The hit may be missing from the stored window; never drop it.
  rows << chunk unless hit_found
  rows.sort_by { |row| chunk_context(row)[:chunk_index].to_i }
rescue StandardError => e
  handle_exception(e, level: :warn, operation: 'knowledge.query.neighbor_window')
  [chunk]
end
|
|
135
|
+
private_class_method :neighbor_window_for
|
|
136
|
+
|
|
137
|
+
# Builds the dataset of stored document chunks for one source file whose
# chunk index lies between +lower+ and +upper+ (inclusive), ordered by index.
#
# The file name and index are read out of the JSON +source_context+ column
# with the ->> operator and an ::integer cast.
# NOTE(review): that SQL looks PostgreSQL-specific — confirm the target database.
#
# @param source_file [String] value to match against source_context.source_file
# @param lower [Integer] smallest chunk index to include
# @param upper [Integer] largest chunk index to include
# @return [Object] the filtered, ordered dataset (not yet loaded)
def neighbor_dataset(source_file, lower, upper)
  same_file = Sequel.lit("source_context->>'source_file' = ?", source_file)
  in_window = Sequel.lit("(source_context->>'chunk_index')::integer BETWEEN ? AND ?", lower, upper)
  by_position = Sequel.lit("(source_context->>'chunk_index')::integer ASC")

  document_chunks = Helpers::ApolloModels.entry.where(content_type: 'document_chunk')
  document_chunks.where(same_file).where(in_window).order(by_position)
end
|
|
144
|
+
private_class_method :neighbor_dataset
|
|
145
|
+
|
|
146
|
+
# Converts a stored entry into the chunk hash shape used by the query runner.
#
# Accepts either a plain Hash row or an object that exposes its columns via
# #values (NOTE(review): presumably a Sequel model instance — confirm). A Hash
# is used as-is: Hash#values returns an Array of values, which cannot be
# indexed by column name, so calling it on a Hash row raised a TypeError.
#
# @param entry [Hash, #values] stored entry
# @return [Hash] chunk with nil-valued fields removed; the entry's context
#   (:source_context, :metadata or :context, normalized) is stored under :metadata
def chunk_from_entry(entry)
  columns = entry
  columns = entry.values if !entry.is_a?(Hash) && entry.respond_to?(:values)
  context = normalize_context(columns[:source_context] || columns[:metadata] || columns[:context])

  {
    id: columns[:id],
    content: columns[:content],
    content_type: columns[:content_type],
    confidence: columns[:confidence],
    tags: columns[:tags],
    source_agent: columns[:source_agent],
    knowledge_domain: columns[:knowledge_domain],
    status: columns[:status],
    content_hash: columns[:content_hash],
    metadata: context
  }.compact
end
|
|
163
|
+
private_class_method :chunk_from_entry
|
|
164
|
+
|
|
165
|
+
# De-duplicates chunks by chunk_dedupe_key while keeping first-seen order.
#
# When the same key appears more than once, the first occurrence is kept
# unless it has neither :distance nor :score and a later occurrence does. This
# stops a score-less neighbor copy of a chunk from shadowing the retrieved hit
# with the same key that appears later in the list.
#
# @param chunks [Array<Hash>] chunks, possibly containing duplicates
# @return [Array<Hash>] one chunk per key, ordered by first appearance
def merge_neighbor_chunks(chunks)
  scored = ->(candidate) { !(candidate[:distance] || candidate[:score]).nil? }

  chunks.each_with_object({}) do |chunk, merged|
    key = chunk_dedupe_key(chunk)
    current = merged[key]
    # Re-assigning an existing key keeps its original position in the Hash.
    merged[key] = chunk if current.nil? || (!scored.call(current) && scored.call(chunk))
  end.values
end
|
|
171
|
+
private_class_method :merge_neighbor_chunks
|
|
172
|
+
|
|
83
173
|
def synthesize_answer(question, chunks)
|
|
84
174
|
return nil unless llm_available?
|
|
85
175
|
|
|
@@ -94,6 +184,7 @@ module Legion
|
|
|
94
184
|
result = llm_chat(message: prompt, caller: { extension: 'lex-knowledge' })
|
|
95
185
|
result.is_a?(Hash) ? result[:content] : result
|
|
96
186
|
rescue StandardError => e
|
|
187
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.synthesize_answer')
|
|
97
188
|
"Error generating answer: #{e.message}"
|
|
98
189
|
end
|
|
99
190
|
private_class_method :synthesize_answer
|
|
@@ -103,11 +194,61 @@ module Legion
|
|
|
103
194
|
content: chunk[:content],
|
|
104
195
|
source_file: chunk.dig(:metadata, :source_file) || chunk[:source_file],
|
|
105
196
|
heading: chunk.dig(:metadata, :heading) || chunk[:heading],
|
|
197
|
+
chunk_index: chunk.dig(:metadata, :chunk_index) || chunk[:chunk_index],
|
|
106
198
|
distance: chunk[:distance] || chunk[:score]
|
|
107
199
|
}
|
|
108
200
|
end
|
|
109
201
|
private_class_method :format_source
|
|
110
202
|
|
|
203
|
+
# Resolves the context (source file, chunk index, heading, ...) of a chunk.
#
# The context is read from the chunk's :metadata, :source_context or :context
# (first truthy one). If the source file or chunk index is still unknown and
# the chunk has an :id, the stored entry is looked up and its context merged
# on top. Finally, top-level :source_file, :chunk_index and :heading on the
# chunk fill any remaining gaps. Errors are reported through handle_exception
# and yield an empty Hash.
#
# @param chunk [Hash] a chunk from retrieval or from chunk_from_entry
# @return [Hash] symbol-keyed context
def chunk_context(chunk)
  raw = chunk[:metadata] || chunk[:source_context] || chunk[:context]
  context = normalize_context(raw)

  incomplete = context[:source_file].nil? || context[:chunk_index].nil?
  if incomplete && chunk[:id] && Helpers::ApolloModels.entry_available?
    stored = Helpers::ApolloModels.entry.where(id: chunk[:id]).first
    context = context.merge(normalize_context(row_context(stored))) if stored
  end

  %i[source_file chunk_index heading].each { |field| context[field] ||= chunk[field] }
  context
rescue StandardError => e
  handle_exception(e, level: :warn, operation: 'knowledge.query.chunk_context')
  {}
end
|
|
218
|
+
private_class_method :chunk_context
|
|
219
|
+
|
|
220
|
+
# Extracts the raw context payload from a stored entry.
#
# Accepts either a plain Hash row or an object that exposes its columns via
# #values (NOTE(review): presumably a Sequel model instance — confirm). A Hash
# is used directly: Hash#values returns an Array of values, which cannot be
# indexed by column name, so the Hash fallback previously raised a TypeError.
#
# @param row [Hash, #values] stored entry
# @return [Object, nil] the first truthy value among :source_context,
#   :metadata and :context
def row_context(row)
  columns = (!row.is_a?(Hash) && row.respond_to?(:values)) ? row.values : row
  columns[:source_context] || columns[:metadata] || columns[:context]
end
|
|
224
|
+
private_class_method :row_context
|
|
225
|
+
|
|
226
|
+
# Normalizes a chunk context into a Hash with symbol keys (top level only).
#
# * Hash   — keys are symbolized where they respond to #to_sym.
# * String — blank strings give {}; anything else is parsed with json_parse.
# * other  — gives {}.
#
# Parsed JSON that is not an object (an array, a number, ...) is valid input
# but not a context, so it yields {} directly instead of going through the
# error path and being logged as a failure. Genuine errors, such as malformed
# JSON, are still reported through handle_exception and yield {}.
#
# @param context [Hash, String, nil] raw context
# @return [Hash] symbol-keyed context, possibly empty
def normalize_context(context)
  parsed = case context
           when Hash then context
           when String then context.strip.empty? ? {} : json_parse(context)
           end
  return {} unless parsed.respond_to?(:transform_keys)

  parsed.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
rescue StandardError => e
  handle_exception(e, level: :warn, operation: 'knowledge.query.normalize_context')
  {}
end
|
|
241
|
+
private_class_method :normalize_context
|
|
242
|
+
|
|
243
|
+
# Returns the key used to recognize two chunks as the same chunk.
#
# Prefers the chunk's :id, then its :content_hash. Only when both are missing
# is the chunk's context resolved, giving the triple
# [source_file, chunk_index, content]. The context is resolved once, because
# chunk_context may need to look the entry up in the store.
#
# @param chunk [Hash] a chunk
# @return [Object, Array] the id, the content hash, or the fallback triple
def chunk_dedupe_key(chunk)
  identity = chunk[:id] || chunk[:content_hash]
  return identity if identity

  context = chunk_context(chunk)
  [context[:source_file], context[:chunk_index], chunk[:content]]
end
|
|
250
|
+
private_class_method :chunk_dedupe_key
|
|
251
|
+
|
|
111
252
|
def average_score(chunks)
|
|
112
253
|
return nil if chunks.empty?
|
|
113
254
|
|
|
@@ -160,7 +301,8 @@ module Legion
|
|
|
160
301
|
synthesized: synthesized,
|
|
161
302
|
rating: rating
|
|
162
303
|
})
|
|
163
|
-
rescue StandardError =>
|
|
304
|
+
rescue StandardError => e
|
|
305
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.emit_feedback_event')
|
|
164
306
|
nil
|
|
165
307
|
end
|
|
166
308
|
private_class_method :emit_feedback_event
|
|
@@ -170,14 +312,10 @@ module Legion
|
|
|
170
312
|
end
|
|
171
313
|
private_class_method :llm_available?
|
|
172
314
|
|
|
173
|
-
def
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
Legion::Settings.dig(:knowledge, :query, :top_k)
|
|
177
|
-
rescue StandardError => _e
|
|
178
|
-
nil
|
|
315
|
+
# Picks the neighbor radius for a query: the caller's value when one is given,
# otherwise the configured settings[:query][:neighbor_radius].
#
# @param neighbor_radius [#to_i, nil] explicit radius, or nil to use settings
# @return [Integer] the chosen radius converted with #to_i
def resolve_neighbor_radius(neighbor_radius)
  radius = neighbor_radius
  radius ||= settings[:query][:neighbor_radius]
  radius.to_i
end
|
|
180
|
-
private_class_method :
|
|
318
|
+
private_class_method :resolve_neighbor_radius
|
|
181
319
|
end
|
|
182
320
|
end
|
|
183
321
|
end
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'legion/logging'
|
|
4
|
+
require 'legion/settings'
|
|
5
|
+
require 'legion/json'
|
|
3
6
|
require_relative 'knowledge/version'
|
|
4
7
|
require_relative 'knowledge/helpers/manifest'
|
|
5
8
|
require_relative 'knowledge/helpers/manifest_store'
|
|
6
9
|
require_relative 'knowledge/helpers/parser'
|
|
7
10
|
require_relative 'knowledge/helpers/chunker'
|
|
11
|
+
require_relative 'knowledge/helpers/apollo_models'
|
|
8
12
|
require_relative 'knowledge/runners/ingest'
|
|
9
13
|
require_relative 'knowledge/runners/query'
|
|
10
14
|
require_relative 'knowledge/runners/corpus'
|
|
@@ -27,11 +31,41 @@ require_relative 'knowledge/actors/corpus_ingest'
|
|
|
27
31
|
module Legion
  module Extensions
    # Entry point of the knowledge extension: mixes in the logging and
    # settings helpers (and Legion::Extensions::Core when it is loaded) and
    # declares the extension's default settings.
    module Knowledge
      extend Legion::Logging::Helper
      extend Legion::Settings::Helper
      extend Legion::Extensions::Core if defined?(Legion::Extensions::Core)

      class << self
        # @return [Boolean] always false
        def remote_invocable?
          false
        end

        # Default configuration for the extension, grouped by area.
        # NOTE(review): the units of the intervals and day counts are not
        # stated here — confirm against the actors and maintenance runner.
        #
        # @return [Hash] a freshly built settings hash on every call
        def default_settings
          {
            corpus_path: nil,
            monitors: [],
            chunker: { max_tokens: 512, overlap_tokens: 128 },
            query: { top_k: 5, neighbor_radius: 1 },
            ingest: { filter_prompt: nil, filter_threshold: 0.5 },
            maintenance: { stale_threshold: 0.3, cold_chunk_days: 7, quality_report_limit: 10 },
            actors: { watcher_interval: 300, maintenance_interval: 21_600 }
          }
        end
      end
    end
  end
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-knowledge
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.10
|
|
4
|
+
version: 0.6.14
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -120,6 +120,7 @@ files:
|
|
|
120
120
|
- lib/legion/extensions/knowledge/actors/corpus_watcher.rb
|
|
121
121
|
- lib/legion/extensions/knowledge/actors/maintenance_runner.rb
|
|
122
122
|
- lib/legion/extensions/knowledge/client.rb
|
|
123
|
+
- lib/legion/extensions/knowledge/helpers/apollo_models.rb
|
|
123
124
|
- lib/legion/extensions/knowledge/helpers/chunker.rb
|
|
124
125
|
- lib/legion/extensions/knowledge/helpers/manifest.rb
|
|
125
126
|
- lib/legion/extensions/knowledge/helpers/manifest_store.rb
|