lex-knowledge 0.1.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea1661f154fc5184b961caac74654775240dcc49fd7d8c6be469f6cd5520429c
4
- data.tar.gz: 3499bcb535edb1f56e896ed9c462a2cfabb9d7d1804dc55ecd559fc3b1941ae9
3
+ metadata.gz: da332d52d481d4e45e8f7ee771a60fb13cebb74144a18ac54ff7b7ec4cb26f27
4
+ data.tar.gz: 3a858fbdae4511ec6e34573fccd85f330b33c7be818a2a78271af33cb0dcbfbd
5
5
  SHA512:
6
- metadata.gz: 8b50acc322c82ae7629e5152e31c84ce9b348fdd1226e96aa7bc3c1f9f67cd330b809212f49c19636d93a45a199ce4e936a3d172cf71997ffad5aa69eb9712ef
7
- data.tar.gz: c1feac59942cf706beb303cbac9989e569937048d2f666af3e762ef50d69659c46de6bb350f69cc16a5103581d54682c3f68886f84a4a840b16b1f0e8cfb82e6
6
+ metadata.gz: e7e15c2174b28ea518c3fee6ecc68723bc628ee3c6d9121a2292706ad32125d32f766da164aa581c56915cd178e54379014a67d8ad1d5cd765485a6eb88d0610
7
+ data.tar.gz: eb7f343423ea41ff46b6a478b3c32a22b0dd3ab3939ad858d0e8d540b5faaf8f01496a5839c79b5ccb1847529367ad9c55ffdc72bebf5959891ff4a7c06b72e4
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Knowledge
      module Actor
        # Subscription actor that names Runners::Ingest / ingest_file as its
        # runner target and opts out of subtask checks and task generation.
        class CorpusIngest < Legion::Extensions::Actors::Subscription
          # @return [String] fully qualified name of the runner module
          def runner_class
            'Legion::Extensions::Knowledge::Runners::Ingest'
          end

          # @return [String] name of the runner function to invoke
          def runner_function
            'ingest_file'
          end

          # @return [false]
          def check_subtask?
            false
          end

          # @return [false]
          def generate_task?
            false
          end

          # Truthy only when both Legion::Transport and the Ingest runner are
          # defined; falls back to false if the check itself raises.
          #
          # @return [String, nil, false] result of `defined?`, not a strict boolean
          def enabled?
            return nil unless defined?(Legion::Transport)

            defined?(Legion::Extensions::Knowledge::Runners::Ingest)
          rescue StandardError
            false
          end
        end
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Knowledge
      module Actor
        # Interval actor that names Runners::Ingest / ingest_corpus as its
        # runner target and passes the configured corpus path as arguments.
        class CorpusWatcher < Legion::Extensions::Actors::Every
          # @return [String] fully qualified name of the runner module
          def runner_class
            'Legion::Extensions::Knowledge::Runners::Ingest'
          end

          # @return [String] name of the runner function to invoke
          def runner_function
            'ingest_corpus'
          end

          # @return [false]
          def check_subtask?
            false
          end

          # @return [false]
          def generate_task?
            false
          end

          # Interval read from settings at knowledge.actors.watcher_interval,
          # defaulting to 300 when unset, unavailable, or on any error.
          # NOTE(review): unit is presumably seconds — confirm against Actors::Every.
          #
          # @return [Object] the configured value as-is, or 300
          def every_interval
            return 300 unless knowledge_settings?

            Legion::Settings.dig(:knowledge, :actors, :watcher_interval) || 300
          rescue StandardError
            300
          end

          # Truthy only when a non-empty corpus path is configured.
          #
          # @return [Boolean, nil]
          def enabled?
            configured = corpus_path
            configured && !configured.empty?
          rescue StandardError
            false
          end

          # @return [Hash] keyword arguments for the runner: { path: corpus_path }
          def args
            { path: corpus_path }
          end

          private

          # Reads knowledge.corpus_path from settings; nil when the settings
          # are unavailable or reading them raises.
          def corpus_path
            return nil unless knowledge_settings?

            Legion::Settings.dig(:knowledge, :corpus_path)
          rescue StandardError
            nil
          end

          # True when Legion::Settings is loaded and has a :knowledge section.
          def knowledge_settings?
            defined?(Legion::Settings) && !Legion::Settings[:knowledge].nil?
          end
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require 'fileutils'
5
+ require 'json'
6
+ require 'tempfile'
7
+
8
module Legion
  module Extensions
    module Knowledge
      module Helpers
        # Persists corpus manifests as JSON files under STORE_DIR, one file per
        # corpus path (named after a truncated SHA-256 of that path).
        module ManifestStore
          module_function

          STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze

          # Reads the stored manifest for a corpus.
          #
          # @param corpus_path [#to_s] corpus root used to derive the store file
          # @return [Array] parsed entries with symbolized keys; [] when the file
          #   is missing, unreadable, not valid JSON, or not a JSON array
          def load(corpus_path:)
            path = store_path(corpus_path: corpus_path)
            return [] unless ::File.exist?(path)

            parsed = ::JSON.parse(::File.read(path, encoding: 'utf-8'), symbolize_names: true)
            # Callers iterate the manifest; anything but an array is treated as corrupt.
            parsed.is_a?(::Array) ? parsed : []
          rescue StandardError
            []
          end

          # Writes the manifest through a temp file that is renamed into place.
          #
          # @param corpus_path [#to_s] corpus root used to derive the store file
          # @param manifest [Array<Hash>] entries; each :mtime is stored as a string
          # @return [Boolean] true on success, false on any error
          def save(corpus_path:, manifest:)
            ::FileUtils.mkdir_p(STORE_DIR)
            path = store_path(corpus_path: corpus_path)
            # PID-qualified so two processes saving the same corpus do not share a temp file.
            tmp = "#{path}.#{::Process.pid}.tmp"
            ::File.write(tmp, ::JSON.generate(manifest.map { |e| serialize_entry(e) }))
            ::File.rename(tmp, path)
            true
          rescue StandardError
            false
          ensure
            # No-op after a successful rename; removes the leftover when write/rename failed.
            ::FileUtils.rm_f(tmp) if tmp
          end

          # @param corpus_path [#to_s]
          # @return [String] STORE_DIR/<first 16 hex chars of SHA-256(corpus_path)>.manifest.json
          def store_path(corpus_path:)
            hash = ::Digest::SHA256.hexdigest(corpus_path.to_s)[0, 16]
            ::File.join(STORE_DIR, "#{hash}.manifest.json")
          end

          # Returns a copy of the entry with :mtime converted via #to_s so it
          # serializes to JSON as a plain string.
          def serialize_entry(entry)
            entry.merge(mtime: entry[:mtime].to_s)
          end
          private_class_method :serialize_entry
        end
      end
    end
  end
end
@@ -15,6 +15,8 @@ module Legion
15
15
  parse_markdown(file_path: file_path)
16
16
  when '.txt'
17
17
  parse_text(file_path: file_path)
18
+ when '.pdf', '.docx'
19
+ extract_via_data(file_path: file_path)
18
20
  else
19
21
  [{ error: 'unsupported format', source_file: file_path }]
20
22
  end
@@ -22,32 +24,42 @@ module Legion
22
24
 
23
25
  def parse_markdown(file_path:)
24
26
  content = ::File.read(file_path, encoding: 'utf-8')
25
- sections = []
27
+ sections = []
26
28
  current_heading = ::File.basename(file_path, '.*')
27
29
  current_lines = []
28
- section_path = []
30
+ heading_stack = {}
29
31
 
30
32
  content.each_line do |line|
31
- if line.start_with?('# ')
32
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
33
- current_heading = line.sub(/^#+\s*/, '').chomp
34
- section_path = [current_heading]
35
- current_lines = []
36
- elsif line.start_with?('## ')
37
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
38
- current_heading = line.sub(/^#+\s*/, '').chomp
39
- section_path = section_path.first(1) + [current_heading]
33
+ level = heading_level(line)
34
+ if level
35
+ flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
36
+ title = line.sub(/^#+\s*/, '').chomp
37
+ heading_stack.delete_if { |d, _| d >= level }
38
+ heading_stack[level] = title
39
+ current_heading = title
40
40
  current_lines = []
41
41
  else
42
42
  current_lines << line
43
43
  end
44
44
  end
45
45
 
46
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
46
+ flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
47
47
 
48
48
  sections.empty? ? [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: content.strip, source_file: file_path }] : sections
49
49
  end
50
50
 
51
# Extracts text from a file through Legion::Data::Extract and wraps it in a
# single section named after the file's basename.
#
# @param file_path [String] path handed to the extractor
# @return [Array<Hash>] one section hash, or one error hash when the extractor
#   is not loaded ('unsupported format') or yields no text / raises
#   ('extraction_failed', with :detail)
def extract_via_data(file_path:)
  if defined?(::Legion::Data::Extract)
    extracted = ::Legion::Data::Extract.extract(file_path, type: :auto)
    text = extracted.is_a?(Hash) ? extracted[:text] : nil

    if text
      [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: text.strip, source_file: file_path }]
    else
      [{ error: 'extraction_failed', source_file: file_path, detail: extracted }]
    end
  else
    [{ error: 'unsupported format', source_file: file_path }]
  end
rescue StandardError => failure
  [{ error: 'extraction_failed', source_file: file_path, detail: failure.message }]
end
62
+
51
63
  def parse_text(file_path:)
52
64
  content = ::File.read(file_path, encoding: 'utf-8')
53
65
  heading = ::File.basename(file_path, '.*')
@@ -67,6 +79,17 @@ module Legion
67
79
  }
68
80
  end
69
81
  private_class_method :flush_section
82
+
83
# Depth of a Markdown ATX heading line.
#
# @param line [String] a single line of text
# @return [Integer, nil] number of leading '#' (1-6) when they are followed by
#   whitespace, otherwise nil
def heading_level(line)
  hashes = line[/^(\#{1,6})\s/, 1]
  hashes&.length
end
87
+ private_class_method :heading_level
88
+
89
# Flattens a heading stack into an ordered list of titles.
#
# @param stack [Hash] heading level => title
# @return [Array] titles ordered by ascending level
def build_section_path(stack)
  ordered = stack.to_a.sort
  ordered.map(&:last)
end
92
+ private_class_method :build_section_path
70
93
  end
71
94
  end
72
95
  end
@@ -7,8 +7,32 @@ module Legion
7
7
  module Corpus
8
8
  module_function
9
9
 
10
- def corpus_stats
11
- { success: true, info: 'not yet implemented' }
10
# Location of the persisted manifest for a corpus, as computed by
# Helpers::ManifestStore.store_path.
#
# @param path [String] corpus root
# @return [Object] whatever ManifestStore.store_path returns for that corpus
def manifest_path(path:)
  store = Helpers::ManifestStore
  store.store_path(corpus_path: path)
end
13
+
14
# Summarizes a corpus directory: file count, total bytes and an estimated
# chunk count obtained by parsing and chunking every scanned file.
#
# @param path [String] corpus root; must exist on disk
# @param extensions [Array, nil] forwarded to Manifest.scan only when given
# @return [Hash] { success: true, path:, file_count:, estimated_chunks:,
#   total_bytes: } or { success: false, error: } when the path is missing or
#   anything raises
def corpus_stats(path:, extensions: nil)
  if ::File.exist?(path)
    scan_args = extensions ? { path: path, extensions: extensions } : { path: path }
    files = Helpers::Manifest.scan(**scan_args)

    estimated = files.sum do |file|
      parsed = Helpers::Parser.parse(file_path: file[:path])
      # Files the parser rejects contribute no chunks.
      parsed.first&.key?(:error) ? 0 : Helpers::Chunker.chunk(sections: parsed).size
    end

    {
      success: true,
      path: path,
      file_count: files.size,
      estimated_chunks: estimated,
      total_bytes: files.sum { |file| file[:size] }
    }
  else
    { success: false, error: 'path does not exist' }
  end
rescue StandardError => failure
  { success: false, error: failure.message }
end
13
37
  end
14
38
  end
@@ -23,23 +23,32 @@ module Legion
23
23
  end
24
24
 
25
25
  def ingest_corpus(path:, dry_run: false, force: false)
26
- entries = Helpers::Manifest.scan(path: path)
27
-
28
- files_scanned = entries.size
29
- chunks_created = 0
30
- chunks_skipped = 0
31
- chunks_updated = 0
32
-
33
- entries.each do |entry|
34
- result = process_file(entry[:path], dry_run: dry_run, force: force)
35
- chunks_created += result[:created]
36
- chunks_skipped += result[:skipped]
37
- chunks_updated += result[:updated]
26
+ current = Helpers::Manifest.scan(path: path)
27
+ previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
28
+ delta = Helpers::Manifest.diff(current: current, previous: previous)
29
+
30
+ to_process = delta[:added] + delta[:changed]
31
+ chunks_created = 0
32
+ chunks_skipped = 0
33
+ chunks_updated = 0
34
+
35
+ to_process.each do |file_path|
36
+ result = process_file(file_path, dry_run: dry_run, force: force)
37
+ chunks_created += result[:created]
38
+ chunks_skipped += result[:skipped]
39
+ chunks_updated += result[:updated]
38
40
  end
39
41
 
42
+ delta[:removed].each { |file_path| retire_file(file_path: file_path) } unless dry_run
43
+
44
+ Helpers::ManifestStore.save(corpus_path: path, manifest: current) unless dry_run
45
+
40
46
  {
41
47
  success: true,
42
- files_scanned: files_scanned,
48
+ files_scanned: current.size,
49
+ files_added: delta[:added].size,
50
+ files_changed: delta[:changed].size,
51
+ files_removed: delta[:removed].size,
43
52
  chunks_created: chunks_created,
44
53
  chunks_skipped: chunks_skipped,
45
54
  chunks_updated: chunks_updated
@@ -67,12 +76,18 @@ module Legion
67
76
  return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)
68
77
 
69
78
  chunks = Helpers::Chunker.chunk(sections: sections)
79
+ paired = if dry_run
80
+ chunks.map { |c| { chunk: c, embedding: nil } }
81
+ else
82
+ batch_embed_chunks(chunks, force: force)
83
+ end
84
+
70
85
  created = 0
71
86
  skipped = 0
72
87
  updated = 0
73
88
 
74
- chunks.each do |chunk|
75
- outcome = upsert_chunk(chunk, dry_run: dry_run, force: force)
89
+ paired.each do |p|
90
+ outcome = upsert_chunk_with_embedding(p[:chunk], p[:embedding], dry_run: dry_run, force: force, exists: p[:exists] || false)
76
91
  case outcome
77
92
  when :created then created += 1
78
93
  when :skipped then skipped += 1
@@ -84,46 +99,68 @@ module Legion
84
99
  end
85
100
  private_class_method :process_file
86
101
 
87
- def upsert_chunk(chunk, dry_run: false, force: false)
88
- return :created if dry_run
102
# Pairs every chunk with its embedding (when one could be produced) and a flag
# telling whether the chunk is already stored.
#
# @param chunks [Array<Hash>] chunk hashes carrying :content_hash and :content
# @param force [Boolean] when true, existence lookups are skipped and every
#   chunk is embedded
# @return [Array<Hash>] one { chunk:, embedding:, exists: } hash per chunk, in
#   input order; :embedding is nil when no vector is available
def batch_embed_chunks(chunks, force:)
  # Assigned before anything can raise so the rescue path can still use
  # whatever existence information was gathered.
  exists_map = {}
  exists_map = build_exists_map(chunks) unless force
  return paired_without_embed(chunks, exists_map) unless llm_embed_available?

  # With force the map is empty, so nothing is rejected and all chunks are embedded.
  needs_embed = chunks.reject { |c| exists_map[c[:content_hash]] }
  embed_map = needs_embed.empty? ? {} : build_embed_map(needs_embed)

  chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } }
rescue StandardError
  # Degrade to "no embeddings" but keep known existence flags, so chunks that
  # are already stored are not reported as new.
  paired_without_embed(chunks, exists_map || {})
end
113
+ private_class_method :batch_embed_chunks
93
114
 
94
- embedding = generate_embedding(chunk[:content])
95
- ingest_to_apollo(chunk, embedding)
115
# Maps each distinct content hash to whether a chunk with that hash is already
# stored. Hashes are de-duplicated first so chunk_exists? runs once per
# distinct hash rather than once per chunk.
#
# @param chunks [Array<Hash>] chunk hashes carrying :content_hash
# @return [Hash] content_hash => result of chunk_exists?
def build_exists_map(chunks)
  chunks.map { |c| c[:content_hash] }.uniq.to_h { |hash| [hash, chunk_exists?(hash)] }
end
118
+ private_class_method :build_exists_map
119
+
120
# Whether batch embedding can be used: Legion::LLM must be loaded and respond
# to embed_batch.
#
# @return [Boolean, nil] nil when Legion::LLM is not defined
def llm_embed_available?
  return unless defined?(Legion::LLM)

  Legion::LLM.respond_to?(:embed_batch)
end
123
+ private_class_method :llm_embed_available?
124
+
125
# Builds the chunk/embedding pairing for the case where no embeddings are
# produced: every :embedding is nil.
#
# @param chunks [Array<Hash>] chunk hashes carrying :content_hash
# @param exists_map [Hash] content_hash => exists flag; missing hashes count as false
# @return [Array<Hash>] one { chunk:, embedding: nil, exists: } hash per chunk
def paired_without_embed(chunks, exists_map)
  chunks.map do |chunk|
    known = exists_map.fetch(chunk[:content_hash], false)
    { chunk: chunk, embedding: nil, exists: known }
  end
end
128
+ private_class_method :paired_without_embed
129
+
130
# Embeds the given chunks in one Legion::LLM.embed_batch call and indexes the
# resulting vectors by content hash.
#
# Results are matched back to chunks through their :index. A result that
# reports an :error, is not a Hash, or carries an index outside
# 0...needs_embed.size is skipped on its own, so one malformed result neither
# discards the other vectors nor (via a negative index) attaches a vector to
# the wrong chunk.
#
# @param needs_embed [Array<Hash>] chunk hashes carrying :content and :content_hash
# @return [Hash] content_hash => vector; {} when the batch call itself fails
def build_embed_map(needs_embed)
  results = Legion::LLM.embed_batch(needs_embed.map { |c| c[:content] })
  results.each_with_object({}) do |result, vectors|
    next unless result.is_a?(Hash)
    next if result[:error]

    position = result[:index]
    next unless position.is_a?(Integer) && position.between?(0, needs_embed.size - 1)

    vectors[needs_embed[position][:content_hash]] = result[:vector]
  end
rescue StandardError
  {}
end
138
+ private_class_method :build_embed_map
139
+
140
# Stores one chunk through ingest_to_apollo and reports the outcome.
#
# @param chunk [Hash] chunk to store
# @param embedding [Object, nil] precomputed embedding passed through unchanged
# @param dry_run [Boolean] when true nothing is stored and :created is reported
# @param force [Boolean] when true the chunk is stored even if it exists
# @param exists [Boolean] whether the chunk is already stored
# @return [Symbol] :created (also for dry runs and when the Apollo extension
#   is not loaded), :updated when forced, :skipped when it already exists or
#   storing raised
def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, exists: false)
  return :created if dry_run || !defined?(Legion::Extensions::Apollo)
  return :skipped if exists && !force

  ingest_to_apollo(chunk, embedding)
  force ? :updated : :created
rescue StandardError
  :skipped
end
101
- private_class_method :upsert_chunk
150
+ private_class_method :upsert_chunk_with_embedding
102
151
 
103
152
  def chunk_exists?(content_hash)
104
- return false unless defined?(Legion::Extensions::Apollo)
105
-
106
- Legion::Extensions::Apollo::Runners::Knowledge.retrieve_relevant(
107
- query: content_hash,
108
- limit: 1,
109
- tags: ['document_chunk'],
110
- filter: { content_hash: content_hash }
111
- ).any?
153
+ return false unless defined?(Legion::Data::Model::ApolloEntry)
154
+
155
+ Legion::Data::Model::ApolloEntry
156
+ .where(Sequel.pg_array_op(:tags).contains(Sequel.pg_array(['document_chunk'])))
157
+ .where(Sequel.like(:content, "%#{content_hash}%"))
158
+ .any?
112
159
  rescue StandardError
113
160
  false
114
161
  end
115
162
  private_class_method :chunk_exists?
116
163
 
117
- def generate_embedding(content)
118
- return nil unless defined?(Legion::Extensions::Apollo)
119
- return nil unless defined?(Legion::Extensions::Apollo::Helpers::Embedding)
120
-
121
- Legion::Extensions::Apollo::Helpers::Embedding.generate(content)
122
- rescue StandardError
123
- nil
124
- end
125
- private_class_method :generate_embedding
126
-
127
164
  def ingest_to_apollo(chunk, embedding)
128
165
  return unless defined?(Legion::Extensions::Apollo)
129
166
 
@@ -145,6 +182,21 @@ module Legion
145
182
  Legion::Extensions::Apollo::Runners::Knowledge.handle_ingest(**payload)
146
183
  end
147
184
  private_class_method :ingest_to_apollo
185
+
186
# Records that a source file was removed by ingesting a 'document_retired'
# marker through Legion::Apollo. Does nothing when Legion::Apollo is not
# loaded, does not respond to ingest, or is not started; errors are swallowed.
# NOTE(review): this checks Legion::Apollo, not Legion::Extensions::Apollo —
# confirm that is the intended entry point.
#
# @param file_path [String] path of the removed file
# @return [Object, nil] result of Legion::Apollo.ingest, or nil when skipped or on error
def retire_file(file_path:)
  apollo_ready = defined?(Legion::Apollo) &&
                 Legion::Apollo.respond_to?(:ingest) &&
                 Legion::Apollo.started?
  return unless apollo_ready

  marker_tags = [file_path, 'retired', 'document_chunk'].uniq
  Legion::Apollo.ingest(
    content: file_path,
    content_type: 'document_retired',
    tags: marker_tags,
    metadata: { source_file: file_path, retired: true }
  )
rescue StandardError
  nil
end
199
+ private_class_method :retire_file
148
200
  end
149
201
  end
150
202
  end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Knowledge
      module Transport
        module Exchanges
          # Exchange definition named 'knowledge': topic type, durable.
          class Knowledge < Legion::Transport::Exchange
            # @return [String]
            def exchange_name
              'knowledge'
            end

            # @return [String]
            def type
              'topic'
            end

            # @return [true]
            def durable
              true
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Knowledge
      module Transport
        module Messages
          # Message definition targeting the 'knowledge' exchange with the
          # 'knowledge.ingest' routing key.
          class IngestMessage < Legion::Transport::Message
            # @return [String]
            def exchange_name
              'knowledge'
            end

            # @return [String]
            def routing_key
              'knowledge.ingest'
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Knowledge
      module Transport
        module Queues
          # Durable queue definition 'knowledge.ingest' on the 'knowledge'
          # exchange, using the 'knowledge.ingest' routing key.
          class Ingest < Legion::Transport::Queue
            # @return [String]
            def queue_name
              'knowledge.ingest'
            end

            # @return [String]
            def exchange_name
              'knowledge'
            end

            # @return [String]
            def routing_key
              'knowledge.ingest'
            end

            # @return [true]
            def durable
              true
            end
          end
        end
      end
    end
  end
end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Knowledge
6
- VERSION = '0.1.2'
6
+ VERSION = '0.4.0'
7
7
  end
8
8
  end
9
9
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 'knowledge/version'
4
4
  require_relative 'knowledge/helpers/manifest'
5
+ require_relative 'knowledge/helpers/manifest_store'
5
6
  require_relative 'knowledge/helpers/parser'
6
7
  require_relative 'knowledge/helpers/chunker'
7
8
  require_relative 'knowledge/runners/ingest'
@@ -9,6 +10,16 @@ require_relative 'knowledge/runners/query'
9
10
  require_relative 'knowledge/runners/corpus'
10
11
  require_relative 'knowledge/client'
11
12
 
13
+ if defined?(Legion::Transport)
14
+ require_relative 'knowledge/transport/exchanges/knowledge'
15
+ require_relative 'knowledge/transport/queues/ingest'
16
+ require_relative 'knowledge/transport/messages/ingest_message'
17
+ end
18
+
19
+ require_relative 'knowledge/actors/corpus_watcher' if defined?(Legion::Extensions::Actors::Every)
20
+
21
+ require_relative 'knowledge/actors/corpus_ingest' if defined?(Legion::Extensions::Actors::Subscription)
22
+
12
23
  module Legion
13
24
  module Extensions
14
25
  module Knowledge
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-knowledge
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson
@@ -116,13 +116,19 @@ extra_rdoc_files: []
116
116
  files:
117
117
  - README.md
118
118
  - lib/legion/extensions/knowledge.rb
119
+ - lib/legion/extensions/knowledge/actors/corpus_ingest.rb
120
+ - lib/legion/extensions/knowledge/actors/corpus_watcher.rb
119
121
  - lib/legion/extensions/knowledge/client.rb
120
122
  - lib/legion/extensions/knowledge/helpers/chunker.rb
121
123
  - lib/legion/extensions/knowledge/helpers/manifest.rb
124
+ - lib/legion/extensions/knowledge/helpers/manifest_store.rb
122
125
  - lib/legion/extensions/knowledge/helpers/parser.rb
123
126
  - lib/legion/extensions/knowledge/runners/corpus.rb
124
127
  - lib/legion/extensions/knowledge/runners/ingest.rb
125
128
  - lib/legion/extensions/knowledge/runners/query.rb
129
+ - lib/legion/extensions/knowledge/transport/exchanges/knowledge.rb
130
+ - lib/legion/extensions/knowledge/transport/messages/ingest_message.rb
131
+ - lib/legion/extensions/knowledge/transport/queues/ingest.rb
126
132
  - lib/legion/extensions/knowledge/version.rb
127
133
  homepage: https://github.com/LegionIO/lex-knowledge
128
134
  licenses: