lex-knowledge 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a5307a5b8c19abaedd5f7d6be95d0fc3d068fa0b6d7366293c70da0edec7825
4
- data.tar.gz: 756114f38b345f356a826e09c50714d8b14ff2a4baa6ef0087ddab9f2d75ff78
3
+ metadata.gz: da332d52d481d4e45e8f7ee771a60fb13cebb74144a18ac54ff7b7ec4cb26f27
4
+ data.tar.gz: 3a858fbdae4511ec6e34573fccd85f330b33c7be818a2a78271af33cb0dcbfbd
5
5
  SHA512:
6
- metadata.gz: 6d77ae8947c2ac53af380a35935c33cb075f4ae1a7b8c2b495389701fb15652a7baaefeeddd32606bc81d9e4c6f8f319562fe869906523b34ca4563ce0e245c4
7
- data.tar.gz: a30c57db2c8cb0da0d54ac73afc178ef55892123e4a6878aa1bf1ca62014663c508c699857970f5582a0e0361a5fd3c29349dbe913a6d1ae8a19aed78515fcc0
6
+ metadata.gz: e7e15c2174b28ea518c3fee6ecc68723bc628ee3c6d9121a2292706ad32125d32f766da164aa581c56915cd178e54379014a67d8ad1d5cd765485a6eb88d0610
7
+ data.tar.gz: eb7f343423ea41ff46b6a478b3c32a22b0dd3ab3939ad858d0e8d540b5faaf8f01496a5839c79b5ccb1847529367ad9c55ffdc72bebf5959891ff4a7c06b72e4
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'digest'
4
+ require 'fileutils'
5
+ require 'json'
6
+ require 'tempfile'
7
+
8
+ module Legion
9
+ module Extensions
10
+ module Knowledge
11
+ module Helpers
12
+ module ManifestStore
13
+ module_function
14
+
15
+ STORE_DIR = ::File.expand_path('~/.legionio/knowledge').freeze
16
+
17
+ def load(corpus_path:)
18
+ path = store_path(corpus_path: corpus_path)
19
+ return [] unless ::File.exist?(path)
20
+
21
+ raw = ::File.read(path, encoding: 'utf-8')
22
+ ::JSON.parse(raw, symbolize_names: true)
23
+ rescue StandardError
24
+ []
25
+ end
26
+
27
+ def save(corpus_path:, manifest:)
28
+ ::FileUtils.mkdir_p(STORE_DIR)
29
+ path = store_path(corpus_path: corpus_path)
30
+ tmp = "#{path}.tmp"
31
+ ::File.write(tmp, ::JSON.generate(manifest.map { |e| serialize_entry(e) }))
32
+ ::File.rename(tmp, path)
33
+ true
34
+ rescue StandardError
35
+ false
36
+ end
37
+
38
+ def store_path(corpus_path:)
39
+ hash = ::Digest::SHA256.hexdigest(corpus_path.to_s)[0, 16]
40
+ ::File.join(STORE_DIR, "#{hash}.manifest.json")
41
+ end
42
+
43
+ def serialize_entry(entry)
44
+ entry.merge(mtime: entry[:mtime].to_s)
45
+ end
46
+ private_class_method :serialize_entry
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -15,6 +15,8 @@ module Legion
15
15
  parse_markdown(file_path: file_path)
16
16
  when '.txt'
17
17
  parse_text(file_path: file_path)
18
+ when '.pdf', '.docx'
19
+ extract_via_data(file_path: file_path)
18
20
  else
19
21
  [{ error: 'unsupported format', source_file: file_path }]
20
22
  end
@@ -22,32 +24,42 @@ module Legion
22
24
 
23
25
  def parse_markdown(file_path:)
24
26
  content = ::File.read(file_path, encoding: 'utf-8')
25
- sections = []
27
+ sections = []
26
28
  current_heading = ::File.basename(file_path, '.*')
27
29
  current_lines = []
28
- section_path = []
30
+ heading_stack = {}
29
31
 
30
32
  content.each_line do |line|
31
- if line.start_with?('# ')
32
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
33
- current_heading = line.sub(/^#+\s*/, '').chomp
34
- section_path = [current_heading]
35
- current_lines = []
36
- elsif line.start_with?('## ')
37
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
38
- current_heading = line.sub(/^#+\s*/, '').chomp
39
- section_path = section_path.first(1) + [current_heading]
33
+ level = heading_level(line)
34
+ if level
35
+ flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
36
+ title = line.sub(/^#+\s*/, '').chomp
37
+ heading_stack.delete_if { |d, _| d >= level }
38
+ heading_stack[level] = title
39
+ current_heading = title
40
40
  current_lines = []
41
41
  else
42
42
  current_lines << line
43
43
  end
44
44
  end
45
45
 
46
- flush_section(sections, current_heading, section_path, current_lines, file_path) unless current_lines.empty?
46
+ flush_section(sections, current_heading, build_section_path(heading_stack), current_lines, file_path)
47
47
 
48
48
  sections.empty? ? [{ heading: ::File.basename(file_path, '.*'), section_path: [], content: content.strip, source_file: file_path }] : sections
49
49
  end
50
50
 
51
+ def extract_via_data(file_path:)
52
+ return [{ error: 'unsupported format', source_file: file_path }] unless defined?(::Legion::Data::Extract)
53
+
54
+ result = ::Legion::Data::Extract.extract(file_path, type: :auto)
55
+ return [{ error: 'extraction_failed', source_file: file_path, detail: result }] unless result.is_a?(Hash) && result[:text]
56
+
57
+ heading = ::File.basename(file_path, '.*')
58
+ [{ heading: heading, section_path: [], content: result[:text].strip, source_file: file_path }]
59
+ rescue StandardError => e
60
+ [{ error: 'extraction_failed', source_file: file_path, detail: e.message }]
61
+ end
62
+
51
63
  def parse_text(file_path:)
52
64
  content = ::File.read(file_path, encoding: 'utf-8')
53
65
  heading = ::File.basename(file_path, '.*')
@@ -67,6 +79,17 @@ module Legion
67
79
  }
68
80
  end
69
81
  private_class_method :flush_section
82
+
83
+ def heading_level(line)
84
+ m = line.match(/^(\#{1,6})\s/)
85
+ m ? m[1].length : nil
86
+ end
87
+ private_class_method :heading_level
88
+
89
+ def build_section_path(stack)
90
+ stack.sort.map { |_, title| title }
91
+ end
92
+ private_class_method :build_section_path
70
93
  end
71
94
  end
72
95
  end
@@ -7,6 +7,10 @@ module Legion
7
7
  module Corpus
8
8
  module_function
9
9
 
10
+ def manifest_path(path:)
11
+ Helpers::ManifestStore.store_path(corpus_path: path)
12
+ end
13
+
10
14
  def corpus_stats(path:, extensions: nil)
11
15
  return { success: false, error: 'path does not exist' } unless ::File.exist?(path)
12
16
 
@@ -23,23 +23,32 @@ module Legion
23
23
  end
24
24
 
25
25
  def ingest_corpus(path:, dry_run: false, force: false)
26
- entries = Helpers::Manifest.scan(path: path)
27
-
28
- files_scanned = entries.size
29
- chunks_created = 0
30
- chunks_skipped = 0
31
- chunks_updated = 0
32
-
33
- entries.each do |entry|
34
- result = process_file(entry[:path], dry_run: dry_run, force: force)
35
- chunks_created += result[:created]
36
- chunks_skipped += result[:skipped]
37
- chunks_updated += result[:updated]
26
+ current = Helpers::Manifest.scan(path: path)
27
+ previous = force ? [] : Helpers::ManifestStore.load(corpus_path: path)
28
+ delta = Helpers::Manifest.diff(current: current, previous: previous)
29
+
30
+ to_process = delta[:added] + delta[:changed]
31
+ chunks_created = 0
32
+ chunks_skipped = 0
33
+ chunks_updated = 0
34
+
35
+ to_process.each do |file_path|
36
+ result = process_file(file_path, dry_run: dry_run, force: force)
37
+ chunks_created += result[:created]
38
+ chunks_skipped += result[:skipped]
39
+ chunks_updated += result[:updated]
38
40
  end
39
41
 
42
+ delta[:removed].each { |file_path| retire_file(file_path: file_path) } unless dry_run
43
+
44
+ Helpers::ManifestStore.save(corpus_path: path, manifest: current) unless dry_run
45
+
40
46
  {
41
47
  success: true,
42
- files_scanned: files_scanned,
48
+ files_scanned: current.size,
49
+ files_added: delta[:added].size,
50
+ files_changed: delta[:changed].size,
51
+ files_removed: delta[:removed].size,
43
52
  chunks_created: chunks_created,
44
53
  chunks_skipped: chunks_skipped,
45
54
  chunks_updated: chunks_updated
@@ -67,12 +76,18 @@ module Legion
67
76
  return { created: 0, skipped: 0, updated: 0 } if sections.first&.key?(:error)
68
77
 
69
78
  chunks = Helpers::Chunker.chunk(sections: sections)
79
+ paired = if dry_run
80
+ chunks.map { |c| { chunk: c, embedding: nil } }
81
+ else
82
+ batch_embed_chunks(chunks, force: force)
83
+ end
84
+
70
85
  created = 0
71
86
  skipped = 0
72
87
  updated = 0
73
88
 
74
- chunks.each do |chunk|
75
- outcome = upsert_chunk(chunk, dry_run: dry_run, force: force)
89
+ paired.each do |p|
90
+ outcome = upsert_chunk_with_embedding(p[:chunk], p[:embedding], dry_run: dry_run, force: force, exists: p[:exists] || false)
76
91
  case outcome
77
92
  when :created then created += 1
78
93
  when :skipped then skipped += 1
@@ -84,21 +99,55 @@ module Legion
84
99
  end
85
100
  private_class_method :process_file
86
101
 
87
- def upsert_chunk(chunk, dry_run: false, force: false)
88
- return :created if dry_run
102
+ def batch_embed_chunks(chunks, force:)
103
+ exists_map = force ? {} : build_exists_map(chunks)
104
+ return paired_without_embed(chunks, exists_map) unless llm_embed_available?
89
105
 
90
- return :created unless defined?(Legion::Extensions::Apollo)
106
+ needs_embed = force ? chunks : chunks.reject { |c| exists_map[c[:content_hash]] }
107
+ embed_map = needs_embed.empty? ? {} : build_embed_map(needs_embed)
91
108
 
92
- return :skipped if !force && chunk_exists?(chunk[:content_hash])
109
+ chunks.map { |c| { chunk: c, embedding: embed_map[c[:content_hash]], exists: exists_map.fetch(c[:content_hash], false) } }
110
+ rescue StandardError
111
+ paired_without_embed(chunks, {})
112
+ end
113
+ private_class_method :batch_embed_chunks
93
114
 
94
- embedding = generate_embedding(chunk[:content])
95
- ingest_to_apollo(chunk, embedding)
115
+ def build_exists_map(chunks)
116
+ chunks.to_h { |c| [c[:content_hash], chunk_exists?(c[:content_hash])] }
117
+ end
118
+ private_class_method :build_exists_map
119
+
120
+ def llm_embed_available?
121
+ defined?(Legion::LLM) && Legion::LLM.respond_to?(:embed_batch)
122
+ end
123
+ private_class_method :llm_embed_available?
124
+
125
+ def paired_without_embed(chunks, exists_map)
126
+ chunks.map { |c| { chunk: c, embedding: nil, exists: exists_map.fetch(c[:content_hash], false) } }
127
+ end
128
+ private_class_method :paired_without_embed
96
129
 
130
+ def build_embed_map(needs_embed)
131
+ results = Legion::LLM.embed_batch(needs_embed.map { |c| c[:content] })
132
+ results.each_with_object({}) do |r, h|
133
+ h[needs_embed[r[:index]][:content_hash]] = r[:vector] unless r[:error]
134
+ end
135
+ rescue StandardError
136
+ {}
137
+ end
138
+ private_class_method :build_embed_map
139
+
140
+ def upsert_chunk_with_embedding(chunk, embedding, dry_run: false, force: false, exists: false)
141
+ return :created if dry_run
142
+ return :created unless defined?(Legion::Extensions::Apollo)
143
+ return :skipped if !force && exists
144
+
145
+ ingest_to_apollo(chunk, embedding)
97
146
  force ? :updated : :created
98
147
  rescue StandardError
99
148
  :skipped
100
149
  end
101
- private_class_method :upsert_chunk
150
+ private_class_method :upsert_chunk_with_embedding
102
151
 
103
152
  def chunk_exists?(content_hash)
104
153
  return false unless defined?(Legion::Data::Model::ApolloEntry)
@@ -112,16 +161,6 @@ module Legion
112
161
  end
113
162
  private_class_method :chunk_exists?
114
163
 
115
- def generate_embedding(content)
116
- return nil unless defined?(Legion::LLM) && Legion::LLM.respond_to?(:embed)
117
-
118
- result = Legion::LLM.embed(content)
119
- result.is_a?(Hash) ? result[:vector] : nil
120
- rescue StandardError
121
- nil
122
- end
123
- private_class_method :generate_embedding
124
-
125
164
  def ingest_to_apollo(chunk, embedding)
126
165
  return unless defined?(Legion::Extensions::Apollo)
127
166
 
@@ -143,6 +182,21 @@ module Legion
143
182
  Legion::Extensions::Apollo::Runners::Knowledge.handle_ingest(**payload)
144
183
  end
145
184
  private_class_method :ingest_to_apollo
185
+
186
+ def retire_file(file_path:)
187
+ return unless defined?(Legion::Apollo)
188
+ return unless Legion::Apollo.respond_to?(:ingest) && Legion::Apollo.started?
189
+
190
+ Legion::Apollo.ingest(
191
+ content: file_path,
192
+ content_type: 'document_retired',
193
+ tags: [file_path, 'retired', 'document_chunk'].uniq,
194
+ metadata: { source_file: file_path, retired: true }
195
+ )
196
+ rescue StandardError
197
+ nil
198
+ end
199
+ private_class_method :retire_file
146
200
  end
147
201
  end
148
202
  end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Knowledge
6
- VERSION = '0.2.0'
6
+ VERSION = '0.4.0'
7
7
  end
8
8
  end
9
9
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 'knowledge/version'
4
4
  require_relative 'knowledge/helpers/manifest'
5
+ require_relative 'knowledge/helpers/manifest_store'
5
6
  require_relative 'knowledge/helpers/parser'
6
7
  require_relative 'knowledge/helpers/chunker'
7
8
  require_relative 'knowledge/runners/ingest'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-knowledge
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson
@@ -121,6 +121,7 @@ files:
121
121
  - lib/legion/extensions/knowledge/client.rb
122
122
  - lib/legion/extensions/knowledge/helpers/chunker.rb
123
123
  - lib/legion/extensions/knowledge/helpers/manifest.rb
124
+ - lib/legion/extensions/knowledge/helpers/manifest_store.rb
124
125
  - lib/legion/extensions/knowledge/helpers/parser.rb
125
126
  - lib/legion/extensions/knowledge/runners/corpus.rb
126
127
  - lib/legion/extensions/knowledge/runners/ingest.rb