lex-knowledge 0.6.0 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/legion/extensions/knowledge/actors/corpus_watcher.rb +11 -4
- data/lib/legion/extensions/knowledge/actors/maintenance_runner.rb +11 -4
- data/lib/legion/extensions/knowledge/runners/ingest.rb +39 -5
- data/lib/legion/extensions/knowledge/version.rb +1 -1
- data/lib/legion/extensions/knowledge.rb +4 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 40412f676840f9927c0e66d7603678bbde1489dcb9a2faba958b1a684fd47b3d
|
|
4
|
+
data.tar.gz: f9247b163289a8e3e07deaf63efbab11703e153150e91d77745eca010cb49b21
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 83cdc62303978876ae5f04c50d3bcf5fcd08faddaa3ee49dc8ac092c49df9665419fb2c52452d7247357b5e424d6122111d82b9e772879bd2afd0068dc780918
|
|
7
|
+
data.tar.gz: 24f0a1208fc396058ae014c497ef5bc49aa96dfe185ad32d0628a19b8fee4ab1c8754048d77d06a73a4a3d42f7c2c1537e6653fecae2b2b280c4b38e6f169745
|
|
@@ -10,19 +10,21 @@ module Legion
|
|
|
10
10
|
def check_subtask? = false
|
|
11
11
|
def generate_task? = false
|
|
12
12
|
|
|
13
|
-
def
|
|
13
|
+
def time
|
|
14
14
|
if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
15
15
|
Legion::Settings.dig(:knowledge, :actors, :watcher_interval) || 300
|
|
16
16
|
else
|
|
17
17
|
300
|
|
18
18
|
end
|
|
19
|
-
rescue StandardError
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
log.warn(e.message)
|
|
20
21
|
300
|
|
21
22
|
end
|
|
22
23
|
|
|
23
24
|
def enabled?
|
|
24
25
|
resolve_monitors.any?
|
|
25
|
-
rescue StandardError
|
|
26
|
+
rescue StandardError => e
|
|
27
|
+
log.warn(e.message)
|
|
26
28
|
false
|
|
27
29
|
end
|
|
28
30
|
|
|
@@ -32,9 +34,14 @@ module Legion
|
|
|
32
34
|
|
|
33
35
|
private
|
|
34
36
|
|
|
37
|
+
def log
|
|
38
|
+
Legion::Logging
|
|
39
|
+
end
|
|
40
|
+
|
|
35
41
|
def resolve_monitors
|
|
36
42
|
Runners::Monitor.resolve_monitors
|
|
37
|
-
rescue StandardError
|
|
43
|
+
rescue StandardError => e
|
|
44
|
+
log.warn(e.message)
|
|
38
45
|
[]
|
|
39
46
|
end
|
|
40
47
|
end
|
|
@@ -10,13 +10,14 @@ module Legion
|
|
|
10
10
|
def check_subtask? = false
|
|
11
11
|
def generate_task? = false
|
|
12
12
|
|
|
13
|
-
def
|
|
13
|
+
def time
|
|
14
14
|
if defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
15
15
|
Legion::Settings.dig(:knowledge, :actors, :maintenance_interval) || 21_600
|
|
16
16
|
else
|
|
17
17
|
21_600
|
|
18
18
|
end
|
|
19
|
-
rescue StandardError
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
log.warn(e.message)
|
|
20
21
|
21_600
|
|
21
22
|
end
|
|
22
23
|
|
|
@@ -24,7 +25,8 @@ module Legion
|
|
|
24
25
|
return false unless corpus_path && !corpus_path.empty?
|
|
25
26
|
|
|
26
27
|
true
|
|
27
|
-
rescue StandardError
|
|
28
|
+
rescue StandardError => e
|
|
29
|
+
log.warn(e.message)
|
|
28
30
|
false
|
|
29
31
|
end
|
|
30
32
|
|
|
@@ -34,11 +36,16 @@ module Legion
|
|
|
34
36
|
|
|
35
37
|
private
|
|
36
38
|
|
|
39
|
+
def log
|
|
40
|
+
Legion::Logging
|
|
41
|
+
end
|
|
42
|
+
|
|
37
43
|
def corpus_path
|
|
38
44
|
return nil unless defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
|
|
39
45
|
|
|
40
46
|
Legion::Settings.dig(:knowledge, :corpus_path)
|
|
41
|
-
rescue StandardError
|
|
47
|
+
rescue StandardError => e
|
|
48
|
+
log.warn(e.message)
|
|
42
49
|
nil
|
|
43
50
|
end
|
|
44
51
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'securerandom'
|
|
4
|
+
|
|
3
5
|
module Legion
|
|
4
6
|
module Extensions
|
|
5
7
|
module Knowledge
|
|
@@ -7,6 +9,11 @@ module Legion
|
|
|
7
9
|
module Ingest
|
|
8
10
|
module_function
|
|
9
11
|
|
|
12
|
+
def log
|
|
13
|
+
Legion::Logging
|
|
14
|
+
end
|
|
15
|
+
private_class_method :log
|
|
16
|
+
|
|
10
17
|
def scan_corpus(path:, extensions: nil)
|
|
11
18
|
opts = { path: path }
|
|
12
19
|
opts[:extensions] = extensions if extensions
|
|
@@ -28,6 +35,7 @@ module Legion
|
|
|
28
35
|
|
|
29
36
|
ingest_corpus_path(path: path, dry_run: dry_run, force: force)
|
|
30
37
|
rescue ArgumentError => e
|
|
38
|
+
log.warn(e.message)
|
|
31
39
|
{ success: false, error: e.message }
|
|
32
40
|
end
|
|
33
41
|
|
|
@@ -63,6 +71,7 @@ module Legion
|
|
|
63
71
|
chunks_updated: chunks_updated
|
|
64
72
|
}
|
|
65
73
|
rescue StandardError => e
|
|
74
|
+
log.warn(e.message)
|
|
66
75
|
{ success: false, error: e.message }
|
|
67
76
|
end
|
|
68
77
|
private_class_method :ingest_corpus_path
|
|
@@ -71,6 +80,7 @@ module Legion
|
|
|
71
80
|
results = monitors.map do |monitor|
|
|
72
81
|
ingest_corpus(path: monitor[:path], dry_run: dry_run, force: force)
|
|
73
82
|
rescue StandardError => e
|
|
83
|
+
log.warn(e.message)
|
|
74
84
|
{ success: false, path: monitor[:path], error: e.message }
|
|
75
85
|
end
|
|
76
86
|
|
|
@@ -91,10 +101,28 @@ module Legion
|
|
|
91
101
|
|
|
92
102
|
{ success: true, monitors_processed: results.size, **total }
|
|
93
103
|
rescue StandardError => e
|
|
104
|
+
log.warn(e.message)
|
|
94
105
|
{ success: false, error: e.message }
|
|
95
106
|
end
|
|
96
107
|
private_class_method :ingest_monitors
|
|
97
108
|
|
|
109
|
+
def ingest_content(content:, source_type: :text, metadata: {})
|
|
110
|
+
source_path = "content://#{source_type}/#{SecureRandom.uuid}"
|
|
111
|
+
section = {
|
|
112
|
+
content: content,
|
|
113
|
+
heading: source_type.to_s,
|
|
114
|
+
section_path: [source_type.to_s],
|
|
115
|
+
source_file: source_path
|
|
116
|
+
}
|
|
117
|
+
chunks = Helpers::Chunker.chunk(sections: [section])
|
|
118
|
+
paired = batch_embed_chunks(chunks, force: false)
|
|
119
|
+
paired.each { |p| upsert_chunk_with_embedding(p[:chunk], p[:embedding], force: false, exists: p[:exists] || false) }
|
|
120
|
+
{ status: :ingested, chunks: chunks.size, source_type: source_type, metadata: metadata }
|
|
121
|
+
rescue StandardError => e
|
|
122
|
+
log.warn(e.message)
|
|
123
|
+
{ status: :failed, error: e.message, source_type: source_type, metadata: metadata }
|
|
124
|
+
end
|
|
125
|
+
|
|
98
126
|
def ingest_file(file_path:, force: false)
|
|
99
127
|
result = process_file(file_path, dry_run: false, force: force)
|
|
100
128
|
|
|
@@ -106,6 +134,7 @@ module Legion
|
|
|
106
134
|
chunks_updated: result[:updated]
|
|
107
135
|
}
|
|
108
136
|
rescue StandardError => e
|
|
137
|
+
log.warn(e.message)
|
|
109
138
|
{ success: false, error: e.message }
|
|
110
139
|
end
|
|
111
140
|
|
|
@@ -145,7 +174,8 @@ module Legion
|
|
|
145
174
|
embed_map = needs_embed.empty? ? {} : build_embed_map(needs_embed)
|
|
146
175
|
|
|
147
176
|
chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } }
|
|
148
|
-
rescue StandardError
|
|
177
|
+
rescue StandardError => e
|
|
178
|
+
log.warn(e.message)
|
|
149
179
|
paired_without_embed(chunks, {})
|
|
150
180
|
end
|
|
151
181
|
private_class_method :batch_embed_chunks
|
|
@@ -170,7 +200,8 @@ module Legion
|
|
|
170
200
|
results.each_with_object({}) do |r, h|
|
|
171
201
|
h[needs_embed[r[:index]][:content_hash]] = r[:vector] unless r[:error]
|
|
172
202
|
end
|
|
173
|
-
rescue StandardError
|
|
203
|
+
rescue StandardError => e
|
|
204
|
+
log.warn(e.message)
|
|
174
205
|
{}
|
|
175
206
|
end
|
|
176
207
|
private_class_method :build_embed_map
|
|
@@ -182,7 +213,8 @@ module Legion
|
|
|
182
213
|
|
|
183
214
|
ingest_to_apollo(chunk, embedding)
|
|
184
215
|
force ? :updated : :created
|
|
185
|
-
rescue StandardError
|
|
216
|
+
rescue StandardError => e
|
|
217
|
+
log.warn(e.message)
|
|
186
218
|
:skipped
|
|
187
219
|
end
|
|
188
220
|
private_class_method :upsert_chunk_with_embedding
|
|
@@ -194,7 +226,8 @@ module Legion
|
|
|
194
226
|
.where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
|
|
195
227
|
.where(Sequel.like(:content, "%#{content_hash}%"))
|
|
196
228
|
.any?
|
|
197
|
-
rescue StandardError
|
|
229
|
+
rescue StandardError => e
|
|
230
|
+
log.warn(e.message)
|
|
198
231
|
false
|
|
199
232
|
end
|
|
200
233
|
private_class_method :chunk_exists?
|
|
@@ -231,7 +264,8 @@ module Legion
|
|
|
231
264
|
tags: [file_path, 'retired', 'document_chunk'].uniq,
|
|
232
265
|
metadata: { source_file: file_path, retired: true }
|
|
233
266
|
)
|
|
234
|
-
rescue StandardError
|
|
267
|
+
rescue StandardError => e
|
|
268
|
+
log.warn(e.message)
|
|
235
269
|
nil
|
|
236
270
|
end
|
|
237
271
|
private_class_method :retire_file
|