lex-knowledge 0.6.10 → 0.6.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/corpus_ingest.rb +5 -1
- data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +7 -12
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +15 -18
- data/lib/legion/extensions/knowledge/helpers/apollo_models.rb +45 -0
- data/lib/legion/extensions/knowledge/helpers/chunker.rb +5 -20
- data/lib/legion/extensions/knowledge/helpers/manifest.rb +3 -6
- data/lib/legion/extensions/knowledge/helpers/manifest_store.rb +10 -5
- data/lib/legion/extensions/knowledge/helpers/parser.rb +3 -0
- data/lib/legion/extensions/knowledge/runners/corpus.rb +3 -0
- data/lib/legion/extensions/knowledge/runners/ingest.rb +115 -49
- data/lib/legion/extensions/knowledge/runners/maintenance.rb +95 -104
- data/lib/legion/extensions/knowledge/runners/monitor.rb +20 -17
- data/lib/legion/extensions/knowledge/runners/query.rb +161 -20
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +34 -0
- metadata +2 -1
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative '../helpers/apollo_models'
|
|
4
|
+
|
|
3
5
|
require 'digest'
|
|
4
6
|
|
|
5
7
|
module Legion
|
|
@@ -7,13 +9,23 @@ module Legion
|
|
|
7
9
|
module Knowledge
|
|
8
10
|
module Runners
|
|
9
11
|
module Query # rubocop:disable Legion/Extension/RunnerIncludeHelpers
|
|
12
|
+
extend Legion::Logging::Helper
|
|
13
|
+
extend Legion::JSON::Helper
|
|
14
|
+
extend Legion::Settings::Helper
|
|
15
|
+
|
|
10
16
|
module_function
|
|
11
17
|
|
|
12
|
-
def query(question:, top_k: nil, synthesize: true)
|
|
18
|
+
def query(question:, top_k: nil, synthesize: true, expand_neighbors: false, neighbor_radius: nil)
|
|
13
19
|
started = ::Process.clock_gettime(::Process::CLOCK_MONOTONIC)
|
|
14
|
-
resolved_k
|
|
20
|
+
resolved_k = top_k || settings[:query][:top_k]
|
|
21
|
+
resolved_radius = resolve_neighbor_radius(neighbor_radius)
|
|
15
22
|
|
|
16
|
-
chunks = retrieve_chunks(
|
|
23
|
+
chunks = retrieve_chunks(
|
|
24
|
+
question,
|
|
25
|
+
resolved_k,
|
|
26
|
+
expand_neighbors: expand_neighbors,
|
|
27
|
+
neighbor_radius: resolved_radius
|
|
28
|
+
)
|
|
17
29
|
|
|
18
30
|
answer = (synthesize_answer(question, chunks) if synthesize && llm_available?)
|
|
19
31
|
|
|
@@ -36,12 +48,19 @@ module Legion
|
|
|
36
48
|
metadata: build_metadata(chunks, score, latency_ms)
|
|
37
49
|
}
|
|
38
50
|
rescue StandardError => e
|
|
51
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.query')
|
|
39
52
|
{ success: false, error: e.message }
|
|
40
53
|
end
|
|
41
54
|
|
|
42
|
-
def retrieve(question:, top_k: nil)
|
|
43
|
-
resolved_k
|
|
44
|
-
|
|
55
|
+
def retrieve(question:, top_k: nil, expand_neighbors: false, neighbor_radius: nil)
|
|
56
|
+
resolved_k = top_k || settings[:query][:top_k]
|
|
57
|
+
resolved_radius = resolve_neighbor_radius(neighbor_radius)
|
|
58
|
+
chunks = retrieve_chunks(
|
|
59
|
+
question,
|
|
60
|
+
resolved_k,
|
|
61
|
+
expand_neighbors: expand_neighbors,
|
|
62
|
+
neighbor_radius: resolved_radius
|
|
63
|
+
)
|
|
45
64
|
|
|
46
65
|
{
|
|
47
66
|
success: true,
|
|
@@ -49,6 +68,7 @@ module Legion
|
|
|
49
68
|
metadata: build_metadata(chunks, average_score(chunks))
|
|
50
69
|
}
|
|
51
70
|
rescue StandardError => e
|
|
71
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.retrieve')
|
|
52
72
|
{ success: false, error: e.message }
|
|
53
73
|
end
|
|
54
74
|
|
|
@@ -63,10 +83,11 @@ module Legion
|
|
|
63
83
|
)
|
|
64
84
|
{ success: true, question_hash: question_hash, rating: rating }
|
|
65
85
|
rescue StandardError => e
|
|
86
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.record_feedback')
|
|
66
87
|
{ success: false, error: e.message }
|
|
67
88
|
end
|
|
68
89
|
|
|
69
|
-
def retrieve_chunks(question, top_k)
|
|
90
|
+
def retrieve_chunks(question, top_k, expand_neighbors: false, neighbor_radius: 1)
|
|
70
91
|
return [] unless defined?(Legion::Extensions::Apollo)
|
|
71
92
|
|
|
72
93
|
result = Legion::Extensions::Apollo::Runners::Knowledge.retrieve_relevant(
|
|
@@ -74,12 +95,81 @@ module Legion
|
|
|
74
95
|
limit: top_k,
|
|
75
96
|
tags: ['document_chunk']
|
|
76
97
|
)
|
|
77
|
-
result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : []
|
|
78
|
-
|
|
98
|
+
chunks = result.is_a?(Hash) && result[:success] ? Array(result[:entries]) : []
|
|
99
|
+
expand_neighbors ? expand_neighbor_chunks(chunks, neighbor_radius) : chunks
|
|
100
|
+
rescue StandardError => e
|
|
101
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.retrieve_chunks')
|
|
79
102
|
[]
|
|
80
103
|
end
|
|
81
104
|
private_class_method :retrieve_chunks
|
|
82
105
|
|
|
106
|
+
def expand_neighbor_chunks(chunks, neighbor_radius)
|
|
107
|
+
return chunks if chunks.empty?
|
|
108
|
+
|
|
109
|
+
radius = neighbor_radius.to_i
|
|
110
|
+
return chunks unless radius.positive? && Helpers::ApolloModels.entry_available?
|
|
111
|
+
|
|
112
|
+
merge_neighbor_chunks(chunks.flat_map { |chunk| neighbor_window_for(chunk, radius) })
|
|
113
|
+
rescue StandardError => e
|
|
114
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.expand_neighbor_chunks')
|
|
115
|
+
chunks
|
|
116
|
+
end
|
|
117
|
+
private_class_method :expand_neighbor_chunks
|
|
118
|
+
|
|
119
|
+
# Builds the window of stored chunks surrounding a retrieval hit.
#
# The hit's context supplies the source file and chunk index; the store is
# then asked for every document chunk of that file whose index lies within
# +radius+ of the hit. Rows are returned ordered by chunk index.
#
# Rows built from the store carry no :distance / :score. Those fields exist
# only on the retrieval hit, and format_source reports them, so when the
# store returns the hit's own row the hit's score fields are copied onto it
# instead of being dropped. A hit the store did not return is appended as-is.
#
# @param chunk [Hash] retrieval hit (read with Symbol keys)
# @param radius [Integer] number of neighbouring indexes to include on each side
# @return [Array<Hash>] the window sorted by chunk index; +[chunk]+ when the
#   hit has no usable context or any error occurs (the error is reported
#   through handle_exception)
def neighbor_window_for(chunk, radius)
  context = chunk_context(chunk)
  return [chunk] unless context[:source_file] && !context[:chunk_index].nil?

  center = context[:chunk_index].to_i
  # Resolve the hit's key once; it is compared against every returned row.
  hit_key = chunk_dedupe_key(chunk)
  retrieval_fields = { distance: chunk[:distance], score: chunk[:score] }.compact
  hit_found = false

  rows = neighbor_dataset(context[:source_file], center - radius, center + radius).all.map do |entry|
    row = chunk_from_entry(entry)
    next row unless chunk_dedupe_key(row) == hit_key

    hit_found = true
    row.merge(retrieval_fields)
  end

  rows << chunk unless hit_found
  rows.sort_by { |row| chunk_context(row)[:chunk_index].to_i }
rescue StandardError => e
  handle_exception(e, level: :warn, operation: 'knowledge.query.neighbor_window')
  [chunk]
end
|
|
135
|
+
private_class_method :neighbor_window_for
|
|
136
|
+
|
|
137
|
+
def neighbor_dataset(source_file, lower, upper)
|
|
138
|
+
Helpers::ApolloModels.entry
|
|
139
|
+
.where(content_type: 'document_chunk')
|
|
140
|
+
.where(Sequel.lit("source_context->>'source_file' = ?", source_file))
|
|
141
|
+
.where(Sequel.lit("(source_context->>'chunk_index')::integer BETWEEN ? AND ?", lower, upper))
|
|
142
|
+
.order(Sequel.lit("(source_context->>'chunk_index')::integer ASC"))
|
|
143
|
+
end
|
|
144
|
+
private_class_method :neighbor_dataset
|
|
145
|
+
|
|
146
|
+
# Converts a stored entry into the chunk Hash shape used by this runner.
#
# The entry may be a plain Hash or an object exposing its column Hash
# through +values+. Hashes are detected first: Hash#values returns an Array
# of values, which cannot be indexed by Symbol.
#
# @param entry [Hash, #values] stored row
# @return [Hash] chunk attributes with nil-valued keys removed; the
#   normalized context (taken from :source_context, :metadata or :context,
#   in that order) is stored under :metadata
def chunk_from_entry(entry)
  values =
    if entry.is_a?(Hash)
      entry
    elsif entry.respond_to?(:values)
      entry.values
    else
      entry
    end
  context = normalize_context(values[:source_context] || values[:metadata] || values[:context])

  {
    id: values[:id],
    content: values[:content],
    content_type: values[:content_type],
    confidence: values[:confidence],
    tags: values[:tags],
    source_agent: values[:source_agent],
    knowledge_domain: values[:knowledge_domain],
    status: values[:status],
    content_hash: values[:content_hash],
    metadata: context
  }.compact
end
|
|
163
|
+
private_class_method :chunk_from_entry
|
|
164
|
+
|
|
165
|
+
# Collapses overlapping neighbour windows into one list with one chunk per
# dedupe key, keeping first-seen order.
#
# When the same chunk appears more than once, a copy carrying retrieval
# information (:distance or :score) replaces an earlier copy that has none,
# so a direct hit is not shadowed by an unscored neighbour row of another
# hit's window. Otherwise the first occurrence wins.
#
# @param chunks [Array<Hash>] chunks from all windows, possibly overlapping
# @return [Array<Hash>] de-duplicated chunks
def merge_neighbor_chunks(chunks)
  scored = ->(candidate) { !(candidate[:distance] || candidate[:score]).nil? }

  chunks.each_with_object({}) do |chunk, merged|
    key = chunk_dedupe_key(chunk)
    kept = merged[key]
    # Assigning to an existing key keeps its original insertion position.
    merged[key] = chunk if kept.nil? || (!scored.call(kept) && scored.call(chunk))
  end.values
end
|
|
171
|
+
private_class_method :merge_neighbor_chunks
|
|
172
|
+
|
|
83
173
|
def synthesize_answer(question, chunks)
|
|
84
174
|
return nil unless llm_available?
|
|
85
175
|
|
|
@@ -91,10 +181,14 @@ module Legion
|
|
|
91
181
|
"Context:\n#{context_text}\n\nQuestion: #{question}\n\nAnswer:"
|
|
92
182
|
end
|
|
93
183
|
|
|
94
|
-
result =
|
|
95
|
-
|
|
184
|
+
result = Legion::LLM.chat( # rubocop:disable Legion/HelperMigration/DirectLlm
|
|
185
|
+
message: prompt,
|
|
186
|
+
caller: { extension: 'lex-knowledge' }
|
|
187
|
+
)
|
|
188
|
+
result.is_a?(Hash) ? result[:content] : result.content
|
|
96
189
|
rescue StandardError => e
|
|
97
|
-
|
|
190
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.synthesize_answer')
|
|
191
|
+
nil
|
|
98
192
|
end
|
|
99
193
|
private_class_method :synthesize_answer
|
|
100
194
|
|
|
@@ -103,11 +197,61 @@ module Legion
|
|
|
103
197
|
content: chunk[:content],
|
|
104
198
|
source_file: chunk.dig(:metadata, :source_file) || chunk[:source_file],
|
|
105
199
|
heading: chunk.dig(:metadata, :heading) || chunk[:heading],
|
|
200
|
+
chunk_index: chunk.dig(:metadata, :chunk_index) || chunk[:chunk_index],
|
|
106
201
|
distance: chunk[:distance] || chunk[:score]
|
|
107
202
|
}
|
|
108
203
|
end
|
|
109
204
|
private_class_method :format_source
|
|
110
205
|
|
|
206
|
+
def chunk_context(chunk)
|
|
207
|
+
context = normalize_context(chunk[:metadata] || chunk[:source_context] || chunk[:context])
|
|
208
|
+
if (context[:source_file].nil? || context[:chunk_index].nil?) && chunk[:id] && Helpers::ApolloModels.entry_available?
|
|
209
|
+
row = Helpers::ApolloModels.entry.where(id: chunk[:id]).first
|
|
210
|
+
context = context.merge(normalize_context(row_context(row))) if row
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
context[:source_file] ||= chunk[:source_file]
|
|
214
|
+
context[:chunk_index] ||= chunk[:chunk_index]
|
|
215
|
+
context[:heading] ||= chunk[:heading]
|
|
216
|
+
context
|
|
217
|
+
rescue StandardError => e
|
|
218
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.chunk_context')
|
|
219
|
+
{}
|
|
220
|
+
end
|
|
221
|
+
private_class_method :chunk_context
|
|
222
|
+
|
|
223
|
+
# Extracts the raw context payload from a stored row.
#
# The row may be a plain Hash or an object exposing its column Hash through
# +values+. Hashes are detected first: Hash#values returns an Array of
# values, which cannot be indexed by Symbol.
#
# @param row [Hash, #values, nil] stored row
# @return [Object, nil] the first truthy value among :source_context,
#   :metadata and :context; nil when none is set or +row+ is nil
def row_context(row)
  return nil if row.nil?

  values =
    if row.is_a?(Hash)
      row
    elsif row.respond_to?(:values)
      row.values
    else
      row
    end
  values[:source_context] || values[:metadata] || values[:context]
end
|
|
227
|
+
private_class_method :row_context
|
|
228
|
+
|
|
229
|
+
def normalize_context(context)
|
|
230
|
+
normalized = case context
|
|
231
|
+
when String
|
|
232
|
+
context.strip.empty? ? {} : json_parse(context)
|
|
233
|
+
when Hash
|
|
234
|
+
context
|
|
235
|
+
else
|
|
236
|
+
{}
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
normalized.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
|
|
240
|
+
rescue StandardError => e
|
|
241
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.normalize_context')
|
|
242
|
+
{}
|
|
243
|
+
end
|
|
244
|
+
private_class_method :normalize_context
|
|
245
|
+
|
|
246
|
+
# Returns the identity used to detect duplicate chunks.
#
# The chunk's :id is preferred, then its :content_hash. Only when neither
# is present is the key derived from the chunk's context; chunk_context is
# resolved a single time for that fallback.
#
# @param chunk [Hash] chunk read with Symbol keys
# @return [Object] the id, the content hash, or
#   +[source_file, chunk_index, content]+
def chunk_dedupe_key(chunk)
  direct = chunk[:id] || chunk[:content_hash]
  return direct if direct

  context = chunk_context(chunk)
  [context[:source_file], context[:chunk_index], chunk[:content]]
end
|
|
253
|
+
private_class_method :chunk_dedupe_key
|
|
254
|
+
|
|
111
255
|
def average_score(chunks)
|
|
112
256
|
return nil if chunks.empty?
|
|
113
257
|
|
|
@@ -160,7 +304,8 @@ module Legion
|
|
|
160
304
|
synthesized: synthesized,
|
|
161
305
|
rating: rating
|
|
162
306
|
})
|
|
163
|
-
rescue StandardError =>
|
|
307
|
+
rescue StandardError => e
|
|
308
|
+
handle_exception(e, level: :warn, operation: 'knowledge.query.emit_feedback_event')
|
|
164
309
|
nil
|
|
165
310
|
end
|
|
166
311
|
private_class_method :emit_feedback_event
|
|
@@ -170,14 +315,10 @@ module Legion
|
|
|
170
315
|
end
|
|
171
316
|
private_class_method :llm_available?
|
|
172
317
|
|
|
173
|
-
def
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
Legion::Settings.dig(:knowledge, :query, :top_k)
|
|
177
|
-
rescue StandardError => _e
|
|
178
|
-
nil
|
|
318
|
+
def resolve_neighbor_radius(neighbor_radius)
|
|
319
|
+
(neighbor_radius || settings[:query][:neighbor_radius]).to_i
|
|
179
320
|
end
|
|
180
|
-
private_class_method :
|
|
321
|
+
private_class_method :resolve_neighbor_radius
|
|
181
322
|
end
|
|
182
323
|
end
|
|
183
324
|
end
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'legion/logging'
|
|
4
|
+
require 'legion/settings'
|
|
5
|
+
require 'legion/json'
|
|
3
6
|
require_relative 'knowledge/version'
|
|
4
7
|
require_relative 'knowledge/helpers/manifest'
|
|
5
8
|
require_relative 'knowledge/helpers/manifest_store'
|
|
6
9
|
require_relative 'knowledge/helpers/parser'
|
|
7
10
|
require_relative 'knowledge/helpers/chunker'
|
|
11
|
+
require_relative 'knowledge/helpers/apollo_models'
|
|
8
12
|
require_relative 'knowledge/runners/ingest'
|
|
9
13
|
require_relative 'knowledge/runners/query'
|
|
10
14
|
require_relative 'knowledge/runners/corpus'
|
|
@@ -27,11 +31,41 @@ require_relative 'knowledge/actors/corpus_ingest'
|
|
|
27
31
|
module Legion
|
|
28
32
|
module Extensions
|
|
29
33
|
module Knowledge
|
|
34
|
+
extend Legion::Logging::Helper
|
|
35
|
+
extend Legion::Settings::Helper
|
|
30
36
|
extend Legion::Extensions::Core if defined?(Legion::Extensions::Core)
|
|
31
37
|
|
|
32
38
|
def self.remote_invocable?
|
|
33
39
|
false
|
|
34
40
|
end
|
|
41
|
+
|
|
42
|
+
def self.default_settings
|
|
43
|
+
{
|
|
44
|
+
corpus_path: nil,
|
|
45
|
+
monitors: [],
|
|
46
|
+
chunker: {
|
|
47
|
+
max_tokens: 512,
|
|
48
|
+
overlap_tokens: 128
|
|
49
|
+
},
|
|
50
|
+
query: {
|
|
51
|
+
top_k: 5,
|
|
52
|
+
neighbor_radius: 1
|
|
53
|
+
},
|
|
54
|
+
ingest: {
|
|
55
|
+
filter_prompt: nil,
|
|
56
|
+
filter_threshold: 0.5
|
|
57
|
+
},
|
|
58
|
+
maintenance: {
|
|
59
|
+
stale_threshold: 0.3,
|
|
60
|
+
cold_chunk_days: 7,
|
|
61
|
+
quality_report_limit: 10
|
|
62
|
+
},
|
|
63
|
+
actors: {
|
|
64
|
+
watcher_interval: 300,
|
|
65
|
+
maintenance_interval: 21_600
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
end
|
|
35
69
|
end
|
|
36
70
|
end
|
|
37
71
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-knowledge
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.
|
|
4
|
+
version: 0.6.15
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -120,6 +120,7 @@ files:
|
|
|
120
120
|
- lib/legion/extensions/knowledge/actors/corpus_watcher.rb
|
|
121
121
|
- lib/legion/extensions/knowledge/actors/maintenance_runner.rb
|
|
122
122
|
- lib/legion/extensions/knowledge/client.rb
|
|
123
|
+
- lib/legion/extensions/knowledge/helpers/apollo_models.rb
|
|
123
124
|
- lib/legion/extensions/knowledge/helpers/chunker.rb
|
|
124
125
|
- lib/legion/extensions/knowledge/helpers/manifest.rb
|
|
125
126
|
- lib/legion/extensions/knowledge/helpers/manifest_store.rb
|