legion-data 1.6.3 → 1.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -0
- data/legion-data.gemspec +1 -0
- data/lib/legion/data/extract/handlers/base.rb +58 -0
- data/lib/legion/data/extract/handlers/csv.rb +26 -0
- data/lib/legion/data/extract/handlers/docx.rb +28 -0
- data/lib/legion/data/extract/handlers/html.rb +33 -0
- data/lib/legion/data/extract/handlers/json.rb +26 -0
- data/lib/legion/data/extract/handlers/jsonl.rb +26 -0
- data/lib/legion/data/extract/handlers/markdown.rb +24 -0
- data/lib/legion/data/extract/handlers/pdf.rb +27 -0
- data/lib/legion/data/extract/handlers/pptx.rb +36 -0
- data/lib/legion/data/extract/handlers/text.rb +22 -0
- data/lib/legion/data/extract/handlers/xlsx.rb +37 -0
- data/lib/legion/data/extract/type_detector.rb +45 -0
- data/lib/legion/data/extract.rb +60 -0
- data/lib/legion/data/migrations/047_apollo_knowledge_capture.rb +152 -0
- data/lib/legion/data/version.rb +1 -1
- data/lib/legion/data.rb +46 -0
- metadata +29 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 55c32ec7725b3d773d0050f52873892de6d9091082546c25969288a7abe159a9
|
|
4
|
+
data.tar.gz: 9d837ec4d10952b7103373972354d32886737f845bd623ad09d3671eddcd9f1d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c4d314992432f8b4e6c67b1d8c69bc60d0f0da5c28f0e7906ecd455e8d12e8c7756d32b99fa96e7ce57ed2c1478ce88f1ffe8a0d17f9fdd0fa0896e5501d0ac4
|
|
7
|
+
data.tar.gz: b98331ae6bb79d68525cbe361ec054d0919ee14ee1e4322c3d191ad0d15f86c1c587e33ff358cf5fdd824b37e47ae40ad7b7d73d96dc6cbf38b917d6f01c7a5c
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
# Legion::Data Changelog
|
|
2
2
|
|
|
3
|
+
## [1.6.6] - 2026-03-25
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `connected?` — returns true when the shared DB is connected (reads `Settings[:data][:connected]`)
|
|
7
|
+
- `can_write?(table_name)` — checks INSERT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
|
|
8
|
+
- `can_read?(table_name)` — checks SELECT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
|
|
9
|
+
- `reset_privileges!` — clears cached privilege results (used in tests and after re-connect)
|
|
10
|
+
- `Legion::Data::Extract` — file format extraction with handler registry
|
|
11
|
+
- Built-in handlers: text, markdown, csv, json, jsonl (no external gems required)
|
|
12
|
+
- Optional handlers: pdf (pdf-reader), docx (docx), pptx (rubyzip), xlsx (rubyXL), html (nokogiri) — lazy-loaded, degrade gracefully if gem not installed
|
|
13
|
+
- `Extract.register_handler(type, klass)` — register custom format handlers
|
|
14
|
+
- `Extract.can_extract?(type)` — check if a type can be extracted (handler present and gem available)
|
|
15
|
+
- `Extract.supported_types` — list all registered types
|
|
16
|
+
- Added `csv` gem dependency (Ruby 3.4 stdlib split)
|
|
17
|
+
|
|
18
|
+
## [1.6.4] - 2026-03-25
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
- Migration 047: Apollo identity columns (submitted_by, submitted_from), content hash dedup, apollo_operations table, apollo_entries_archive table, comprehensive indexes including partial HNSW on active entries only
|
|
22
|
+
|
|
3
23
|
## [1.6.2] - 2026-03-25
|
|
4
24
|
|
|
5
25
|
### Changed
|
data/legion-data.gemspec
CHANGED
|
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
|
26
26
|
'rubygems_mfa_required' => 'true'
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
+
spec.add_dependency 'csv', '>= 3.2'
|
|
29
30
|
spec.add_dependency 'legion-logging', '>= 1.2.8'
|
|
30
31
|
spec.add_dependency 'legion-settings', '>= 1.3.12'
|
|
31
32
|
spec.add_dependency 'sequel', '>= 5.70'
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Abstract parent of every extraction handler and owner of the shared
        # type => handler-class registry.
        #
        # A handler is a subclass that overrides the class-level hooks below
        # (+type+, +extensions+, +gem_name+, +extract+). Subclasses are picked
        # up automatically: +inherited+ queues them, and the queue is folded
        # into the registry the next time the registry is read. By then the
        # class body has run, so +type+ is defined.
        #
        # This replaces a per-subclass TracePoint(:end) hook. That hook stayed
        # enabled forever for classes built with Class.new (no :end event fires
        # for them) and raised for subclasses of subclasses, because the hook
        # ran against the intermediate class whose own @registry is nil.
        class Base
          @registry = {}
          @pending = []

          class << self
            # The single, mutable registry held by Base. Subclasses delegate
            # here, so every caller sees and writes the same Hash.
            #
            # @return [Hash] handler classes keyed by their +type+
            def registry
              return Base.registry unless equal?(Base)

              resolve_pending
              @registry
            end

            # Ruby hook: called whenever a class inherits from Base or from
            # one of its descendants. Registration is deferred because +type+
            # is not defined yet when this hook fires.
            def inherited(subclass)
              super
              Base.defer_registration(subclass)
            end

            # Internal: queue a freshly created handler class until its +type+
            # can be read. Always stored on Base, whatever the receiver.
            def defer_registration(subclass)
              return Base.defer_registration(subclass) unless equal?(Base)

              @pending << subclass
            end

            # Register (or replace) a handler under its own +type+.
            #
            # @param handler_class [Class] object responding to +type+
            def register(handler_class)
              registry[handler_class.type] = handler_class
            end

            # @param type [Symbol, String, nil]
            # @return [Class, nil] the handler registered for +type+
            def for_type(type)
              registry[type&.to_sym]
            end

            # @return [Array] every registered type key
            def supported_types
              registry.keys
            end

            # Override in subclasses
            def type = nil
            def extensions = []
            def gem_name = nil

            # @raise [NotImplementedError] unless overridden by the subclass
            def extract(_source)
              raise NotImplementedError, "#{name} must implement .extract"
            end

            # True when the handler needs no gem, or when requiring +gem_name+
            # succeeds; false when that require raises LoadError.
            def available?
              return true if gem_name.nil?

              require gem_name
              true
            rescue LoadError
              false
            end

            private

            # Move every queued subclass that now reports a type into the
            # registry. Classes whose +type+ is still nil (abstract
            # intermediates, half-defined classes) stay queued for a later
            # attempt.
            def resolve_pending
              @pending.reject! do |klass|
                key = klass.type
                next false unless key

                @registry[key] = klass
                true
              end
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'csv'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for comma-separated files. Every data row is rendered as a
        # single "header: value, header: value" line.
        class Csv < Base
          class << self
            def type = :csv
            def extensions = %w[.csv]
            def gem_name = nil

            # @param source [#read, #to_s] readable object, or a path
            # @return [Hash] +text+ plus row/column/header metadata on success,
            #   or +text: nil+ with the error message on any StandardError
            def extract(source)
              raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              parsed = ::CSV.parse(raw, headers: true)
              rendered_rows = parsed.map do |row|
                row.to_h.map { |header, value| "#{header}: #{value}" }.join(', ')
              end
              details = { rows: parsed.size, columns: parsed.headers.size, headers: parsed.headers }
              { text: rendered_rows.join("\n"), metadata: details }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for Word documents, backed by the optional `docx` gem.
        class Docx < Base
          class << self
            def type = :docx
            def extensions = %w[.docx]
            def gem_name = 'docx'

            # Joins all non-empty paragraph texts with a blank line in between.
            #
            # @param source passed straight to Docx::Document.open
            # @return [Hash] +text+ and paragraph count; +:gem_not_installed+
            #   when the gem cannot be required; the error message otherwise
            def extract(source)
              require 'docx'

              document = ::Docx::Document.open(source)
              blocks = document.paragraphs.filter_map do |paragraph|
                body = paragraph.text
                body unless body.empty?
              end
              { text: blocks.join("\n\n"), metadata: { paragraphs: blocks.size } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for HTML pages, backed by the optional `nokogiri` gem.
        class Html < Base
          class << self
            def type = :html
            def extensions = %w[.html .htm]
            def gem_name = 'nokogiri'

            # Drops script/style/noscript nodes, then returns the remaining
            # document text with every whitespace run squeezed to one space.
            #
            # @param source [#read, #to_s] readable object, or a path
            # @return [Hash] +text+ and the stripped <title> (nil if absent);
            #   +:gem_not_installed+ when nokogiri cannot be required; the
            #   error message otherwise
            def extract(source)
              require 'nokogiri'

              markup = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              document = ::Nokogiri::HTML(markup)

              # Remove script and style elements
              document.css('script, style, noscript').each { |node| node.remove }

              page_title = document.at_css('title')&.text&.strip
              visible = document.text.gsub(/\s+/, ' ').strip
              { text: visible, metadata: { title: page_title } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for JSON documents: parses the input and re-emits it
        # pretty-printed.
        class Json < Base
          class << self
            def type = :json
            def extensions = %w[.json]
            def gem_name = nil

            # @param source [#read, #to_s] readable object, or a path
            # @return [Hash] pretty-printed +text+ and, for object documents,
            #   the top-level keys (nil for any other root value); +text: nil+
            #   with the error message on any StandardError
            def extract(source)
              raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              document = ::JSON.parse(raw)
              pretty = ::JSON.pretty_generate(document)
              top_level = document.keys if document.is_a?(Hash)
              { text: pretty, metadata: { keys: top_level } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for JSON Lines files (one JSON value per line).
        #
        # Each non-blank line becomes one record; records are joined with a
        # "---" separator line. Lines that are not valid JSON are kept verbatim
        # so that one bad line never discards the rest of the file.
        class Jsonl < Base
          def self.type = :jsonl
          def self.extensions = %w[.jsonl]
          def self.gem_name = nil

          # @param source [#read, #to_s] readable object, or a path
          # @return [Hash] +text+ and the number of records; +text: nil+ with
          #   the error message on any StandardError
          def self.extract(source)
            content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            # Blank lines carry no record; skipping them keeps empty "---"
            # sections out of the text and out of the line count.
            records = content.each_line.map(&:strip).reject(&:empty?).map { |line| parse_line(line) }
            text = records.map { |record| render(record) }.join("\n---\n")
            { text: text, metadata: { lines: records.size } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end

          # Parse one stripped line, falling back to the raw text. The rescue is
          # deliberately broad: this is a best-effort, per-line fallback.
          def self.parse_line(line)
            ::JSON.parse(line)
          rescue StandardError
            line
          end

          # Render one record as text. Arrays must be serialised here: handing
          # a raw Array to Array#join would flatten it recursively and splice
          # the record separator between its elements.
          def self.render(record)
            case record
            when Hash, Array then ::JSON.pretty_generate(record)
            else record.to_s
            end
          end

          private_class_method :parse_line, :render
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for Markdown files. The Markdown itself is returned as-is;
        # only a leading YAML frontmatter block is removed.
        class Markdown < Base
          # Frontmatter: a "---" fence on the very first line, an optional
          # body, and a closing "---" fence on its own line. Accepts LF and
          # CRLF line endings, trailing blanks on the fence lines, and a
          # closing fence that is the last line of the file with no newline.
          FRONTMATTER = /\A---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?:\r?\n|\z)/m

          def self.type = :markdown
          def self.extensions = %w[.md .markdown]
          def self.gem_name = nil

          # @param source [#read, #to_s] readable object, or a path
          # @return [Hash] stripped +text+, the byte size of the original
          #   content, and whether frontmatter was removed; +text: nil+ with
          #   the error message on any StandardError
          def self.extract(source)
            content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            # Strip YAML frontmatter if present
            text = content.sub(FRONTMATTER, '')
            { text: text.strip, metadata: { bytes: content.bytesize, has_frontmatter: content != text } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for PDF documents, backed by the optional `pdf-reader` gem.
        class Pdf < Base
          class << self
            def type = :pdf
            def extensions = %w[.pdf]
            def gem_name = 'pdf-reader'

            # Concatenates the text of every page, separated by a blank line.
            #
            # @param source passed straight to PDF::Reader.new
            # @return [Hash] +text+ with page count and document title;
            #   +:gem_not_installed+ when the gem cannot be required; the
            #   error message otherwise
            def extract(source)
              require 'pdf-reader'

              pdf = ::PDF::Reader.new(source)
              page_texts = pdf.pages.map { |page| page.text }
              details = { pages: pdf.page_count, title: pdf.info[:Title] }
              { text: page_texts.join("\n\n"), metadata: details }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for PowerPoint decks. A .pptx file is a zip archive; the
        # text runs (<a:t> elements) of each slide XML are collected with
        # rubyzip and REXML.
        class Pptx < Base
          def self.type = :pptx
          def self.extensions = %w[.pptx]
          def self.gem_name = 'rubyzip'

          # The gem is named "rubyzip" but is loaded with `require 'zip'`.
          # The inherited check requires +gem_name+, which can never succeed
          # for this gem, so availability is probed with the real require path.
          def self.available?
            require 'zip'
            true
          rescue LoadError
            false
          end

          # @param source passed straight to Zip::File.open
          # @return [Hash] +text+ ("Slide N: ..." paragraphs) and the number
          #   of slides that contained text; +:gem_not_installed+ when a
          #   require fails; the error message otherwise
          def self.extract(source)
            require 'zip'
            require 'rexml/document'

            slides = []
            ::Zip::File.open(source) do |zip|
              # Order by the numeric slide index. A plain name sort would put
              # slide10.xml before slide2.xml.
              entries = zip.glob('ppt/slides/slide*.xml').sort_by do |entry|
                [entry.name[/(\d+)\.xml\z/, 1].to_i, entry.name]
              end
              entries.each do |entry|
                doc = REXML::Document.new(entry.get_input_stream.read)
                texts = []
                doc.each_element('//a:t') { |e| texts << e.text }
                slides << texts.join(' ') unless texts.empty?
              end
            end
            text = slides.each_with_index.map { |s, i| "Slide #{i + 1}: #{s}" }.join("\n\n")
            { text: text, metadata: { slides: slides.size } }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: 'rubyzip' }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for plain-text files: the content is returned untouched.
        class Text < Base
          class << self
            def type = :text
            def extensions = %w[.txt]
            def gem_name = nil

            # @param source [#read, #to_s] readable object, or a path
            # @return [Hash] the content as +text+ with its byte size;
            #   +text: nil+ with the error message on any StandardError
            def extract(source)
              body = if source.respond_to?(:read)
                       source.read
                     else
                       File.read(source.to_s)
                     end
              { text: body, metadata: { bytes: body.bytesize } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Handler for Excel workbooks, backed by the optional `rubyXL` gem.
        class Xlsx < Base
          class << self
            def type = :xlsx
            def extensions = %w[.xlsx .xls]
            def gem_name = 'rubyXL'

            # Renders each worksheet that has rows as a "Sheet: <name>" header
            # followed by one comma-separated line per row.
            #
            # @param source passed straight to RubyXL::Parser.parse
            # @return [Hash] +text+ and the total worksheet count;
            #   +:gem_not_installed+ when a require fails; the error message
            #   otherwise
            def extract(source)
              require 'rubyXL'
              require 'rubyXL/convenience_methods'

              book = ::RubyXL::Parser.parse(source)
              rendered = book.worksheets.each_with_object([]) do |sheet, acc|
                # Missing rows are skipped; missing cells become "".
                lines = sheet.each.filter_map do |row|
                  row.cells.map { |cell| cell&.value.to_s }.join(', ') if row
                end
                acc << "Sheet: #{sheet.sheet_name}\n#{lines.join("\n")}" unless lines.empty?
              end
              { text: rendered.join("\n\n"), metadata: { sheets: book.worksheets.size } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      # Maps a source to an extraction type symbol by file extension.
      module TypeDetector
        # Lower-case extension (with leading dot) => handler type.
        EXTENSION_MAP = {
          '.pdf' => :pdf,
          '.docx' => :docx,
          '.pptx' => :pptx,
          '.xlsx' => :xlsx,
          '.xls' => :xlsx,
          '.md' => :markdown,
          '.markdown' => :markdown,
          '.txt' => :text,
          '.csv' => :csv,
          '.json' => :json,
          '.jsonl' => :jsonl,
          '.html' => :html,
          '.htm' => :html
        }.freeze

        module_function

        # Detect the type of +source+.
        #
        # - String: treated as a path only when that file exists; any other
        #   String yields nil.
        # - Object responding to +path+ (File, Tempfile, ...): its path is used.
        # - Object responding to +to_path+ only (e.g. Pathname): its path is
        #   used. The handlers already read such sources via +to_s+, so
        #   auto-detection now accepts them as well.
        #
        # @return [Symbol, nil] the mapped type, or nil when it cannot be told
        def detect(source)
          if source.is_a?(String)
            return File.exist?(source) ? detect_from_path(source) : nil
          end
          return detect_from_io(source) if source.respond_to?(:path)
          return detect_from_path(source.to_path) if source.respond_to?(:to_path)

          nil
        end

        # @param path [String] file name or path; the extension match is
        #   case-insensitive
        # @return [Symbol, nil]
        def detect_from_path(path)
          ext = File.extname(path).downcase
          EXTENSION_MAP[ext]
        end

        # @param io [#path] IO-like object
        # @return [Symbol, nil] nil when the object has no usable path
        def detect_from_io(io)
          return nil unless io.respond_to?(:path) && io.path

          detect_from_path(io.path)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'extract/type_detector'
|
|
4
|
+
require_relative 'extract/handlers/base'
|
|
5
|
+
|
|
6
|
+
module Legion
  module Data
    # Entry point for turning files of various formats into plain text.
    #
    # Handler classes live under extract/handlers and are required lazily,
    # once, the first time any public method here needs the registry.
    module Extract
      class << self
        # Extract text from +source+.
        #
        # @param source path, IO-like object, or whatever the handler accepts
        # @param type [Symbol, String] explicit type, or :auto to detect it
        #   from the source
        # @return [Hash] always has +:success+ and +:text+; on success also
        #   +:metadata+ and +:type+; on failure +:error+ (a Symbol such as
        #   :unknown_type, :no_handler, :gem_not_installed, or a message)
        def extract(source, type: :auto)
          # The built-in handlers only register themselves when their files
          # are required. Without this, a first call would report :no_handler.
          load_all_handlers

          detected_type = type == :auto ? TypeDetector.detect(source) : type&.to_sym
          return { success: false, text: nil, error: :unknown_type } unless detected_type

          handler = Handlers::Base.for_type(detected_type)
          return { success: false, text: nil, error: :no_handler, type: detected_type } unless handler

          unless handler.available?
            return { success: false, text: nil, error: :gem_not_installed,
                     gem: handler.gem_name, type: detected_type }
          end

          result = handler.extract(source)
          if result[:text]
            { success: true, text: result[:text], metadata: result[:metadata], type: detected_type }
          else
            { success: false, text: nil, error: result[:error], type: detected_type }
          end
        rescue StandardError => e
          { success: false, text: nil, error: e.message, type: detected_type }
        end

        # @return [Array] every registered type
        def supported_types
          load_all_handlers
          Handlers::Base.supported_types
        end

        # @return [Boolean] true when a handler exists for +type+ and reports
        #   itself available
        def can_extract?(type)
          load_all_handlers
          handler = Handlers::Base.for_type(type&.to_sym)
          handler&.available? || false
        end

        # Register (or replace) the handler used for +type+.
        def register_handler(type, klass)
          # Load the built-ins first so that one loaded later cannot overwrite
          # a custom handler registered for the same type.
          load_all_handlers
          Handlers::Base.registry[type.to_sym] = klass
        end

        private

        # Require every handler file except base.rb; runs at most once.
        def load_all_handlers
          return if @handlers_loaded

          Dir[File.join(__dir__, 'extract', 'handlers', '*.rb')].each do |f|
            require f unless f.end_with?('base.rb')
          end
          @handlers_loaded = true
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Migration 047: Apollo knowledge capture.
#
# up:   adds submitted_by / submitted_from / content_hash to apollo_entries,
#       creates apollo_operations and apollo_entries_archive, adds indexes on
#       the apollo tables, and replaces the embedding index with a partial one.
# down: restores a non-partial embedding index, drops both new tables, the new
#       indexes and the three added columns.
#
# Both directions do nothing unless the adapter scheme is :postgres.
Sequel.migration do
  up do
    next unless adapter_scheme == :postgres

    # --- Identity columns on apollo_entries ---
    # Unlike the IF NOT EXISTS statements below, these add_column calls are not
    # guarded against the columns already existing.
    alter_table(:apollo_entries) do
      add_column :submitted_by, String, size: 255
      add_column :submitted_from, String, size: 255
      # Fixed-width 32-character column.
      # NOTE(review): presumably a 32-char hex digest — confirm with the writer.
      add_column :content_hash, String, fixed: true, size: 32
    end

    # --- apollo_operations table ---
    run <<~SQL
      CREATE TABLE IF NOT EXISTS apollo_operations (
        id BIGSERIAL PRIMARY KEY,
        operation VARCHAR(50) NOT NULL,
        actor VARCHAR(100) NOT NULL,
        target_type VARCHAR(50),
        target_ids INTEGER[],
        summary TEXT,
        detail JSONB,
        old_state JSONB,
        new_state JSONB,
        reason TEXT,
        principal_id VARCHAR(255),
        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
      );
    SQL

    # --- apollo_entries_archive table ---
    # Clones the apollo_entries definition as it stands at this point (the
    # three columns above are already present) and appends two archive columns.
    run <<~SQL
      CREATE TABLE IF NOT EXISTS apollo_entries_archive (
        LIKE apollo_entries INCLUDING ALL,
        archived_at TIMESTAMPTZ DEFAULT NOW(),
        archive_reason TEXT
      );
    SQL

    # --- Indexes: apollo_entries ---
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_submitted_by ON apollo_entries (submitted_by);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_submitted_from ON apollo_entries (submitted_from);'

    # Content hash dedup (unique among active entries only)
    run <<~SQL
      CREATE UNIQUE INDEX IF NOT EXISTS idx_apollo_content_hash
      ON apollo_entries (content_hash)
      WHERE status != 'archived';
    SQL

    # Status filtering (every read query filters on status)
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_status ON apollo_entries (status);'

    # Partial index: active entries only (hot path)
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_active
      ON apollo_entries (id)
      WHERE status IN ('candidate', 'confirmed', 'disputed');
    SQL

    # Confidence ranking and decay targeting
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_confidence ON apollo_entries (confidence);'

    # Time-based: decay age, archival sweep
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_created ON apollo_entries (created_at);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_updated ON apollo_entries (updated_at);'

    # Composite: decay cycle targets
    # (single-column index on updated_at, restricted to non-archived rows)
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_decay_target
      ON apollo_entries (updated_at)
      WHERE status != 'archived';
    SQL

    # Composite: corroboration targets
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_candidates
      ON apollo_entries (status, source_provider, source_channel)
      WHERE status = 'candidate' AND embedding IS NOT NULL;
    SQL

    # Knowledge domain (expertise, RBAC)
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_domain ON apollo_entries (knowledge_domain);'

    # Source agent (expertise aggregation)
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_source_agent ON apollo_entries (source_agent);'

    # Drop existing HNSW index and recreate as partial (active entries only)
    # NOTE(review): assumes the earlier index is named
    # apollo_entries_embedding_idx and that the hnsw access method and
    # vector_cosine_ops are available — confirm against the migration that
    # created the embedding column.
    run 'DROP INDEX IF EXISTS apollo_entries_embedding_idx;'
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_embedding_active
      ON apollo_entries USING hnsw (embedding vector_cosine_ops)
      WITH (m = 16, ef_construction = 64)
      WHERE status IN ('candidate', 'confirmed', 'disputed');
    SQL

    # --- Indexes: apollo_relations ---
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_rel_from ON apollo_relations (from_entry_id);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_rel_to ON apollo_relations (to_entry_id);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_rel_type ON apollo_relations (relation_type);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_rel_composite ON apollo_relations (from_entry_id, relation_type);'

    # --- Indexes: apollo_expertise ---
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_exp_agent ON apollo_expertise (agent_id);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_exp_domain ON apollo_expertise (domain);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_exp_composite ON apollo_expertise (agent_id, domain);'

    # --- Indexes: apollo_operations ---
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_ops_created ON apollo_operations (created_at);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_ops_operation ON apollo_operations (operation);'
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_ops_actor ON apollo_operations (actor);'
    # GIN index on the target_ids array column.
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_ops_target ON apollo_operations USING GIN (target_ids);'

    # --- Indexes: apollo_entries_archive ---
    run 'CREATE INDEX IF NOT EXISTS idx_archive_content_hash ON apollo_entries_archive (content_hash);'
    run 'CREATE INDEX IF NOT EXISTS idx_archive_source_agent ON apollo_entries_archive (source_agent);'
    run 'CREATE INDEX IF NOT EXISTS idx_archive_archived_at ON apollo_entries_archive (archived_at);'
  end

  down do
    next unless adapter_scheme == :postgres

    # Restore original HNSW index (non-partial)
    # The recreated index carries no WITH (...) build options.
    run 'DROP INDEX IF EXISTS idx_apollo_embedding_active;'
    run <<~SQL
      CREATE INDEX IF NOT EXISTS apollo_entries_embedding_idx
      ON apollo_entries USING hnsw (embedding vector_cosine_ops);
    SQL

    drop_table?(:apollo_entries_archive)
    drop_table?(:apollo_operations)

    # Drop new indexes
    # The two tables above are already dropped at this point, so the ops/archive
    # names in this list rely on IF EXISTS to be harmless.
    %w[
      idx_apollo_submitted_by idx_apollo_submitted_from idx_apollo_content_hash
      idx_apollo_status idx_apollo_active idx_apollo_confidence
      idx_apollo_created idx_apollo_updated idx_apollo_decay_target
      idx_apollo_candidates idx_apollo_domain idx_apollo_source_agent
      idx_apollo_rel_from idx_apollo_rel_to idx_apollo_rel_type idx_apollo_rel_composite
      idx_apollo_exp_agent idx_apollo_exp_domain idx_apollo_exp_composite
      idx_apollo_ops_created idx_apollo_ops_operation idx_apollo_ops_actor idx_apollo_ops_target
      idx_archive_content_hash idx_archive_source_agent idx_archive_archived_at
    ].each { |idx| run "DROP INDEX IF EXISTS #{idx};" }

    # Remove the columns added by `up`, in reverse order of creation.
    alter_table(:apollo_entries) do
      drop_column :content_hash
      drop_column :submitted_from
      drop_column :submitted_by
    end
  end
end
|
data/lib/legion/data/version.rb
CHANGED
data/lib/legion/data.rb
CHANGED
|
@@ -13,6 +13,7 @@ require_relative 'data/partition_manager'
|
|
|
13
13
|
require_relative 'data/archiver'
|
|
14
14
|
require_relative 'data/helper'
|
|
15
15
|
require_relative 'data/rls'
|
|
16
|
+
require_relative 'data/extract'
|
|
16
17
|
|
|
17
18
|
module Legion
|
|
18
19
|
module Data
|
|
@@ -55,6 +56,51 @@ module Legion
|
|
|
55
56
|
}
|
|
56
57
|
end
|
|
57
58
|
|
|
59
|
+
# Whether the shared database connection is flagged as connected in settings.
#
# @return [Boolean] true only when Settings[:data][:connected] is exactly
#   true; false otherwise, including when reading the setting raises
def connected?
  flag = Legion::Settings[:data][:connected]
  flag == true
rescue StandardError
  false
end
|
|
64
|
+
|
|
65
|
+
# Whether the current database user may INSERT into +table_name+.
#
# Returns false when not connected and true for the sqlite adapter. For any
# other adapter it asks the database through has_table_privilege and caches
# the answer per table name (including a false recorded after an error)
# until reset_privileges! is called.
#
# @param table_name [Symbol, String]
# @return [Boolean]
def can_write?(table_name)
  return false unless connected?
  return true if Legion::Settings[:data][:adapter]&.to_s == 'sqlite'

  @write_privileges ||= {}
  cached = @write_privileges[table_name]
  return cached unless cached.nil?

  query = "SELECT has_table_privilege(current_user, ?, 'INSERT') AS can"
  row = connection.fetch(query, table_name.to_s).first
  @write_privileges[table_name] = (row[:can] == true)
rescue StandardError
  # Remember the failure, but only if the cache hash was already created.
  @write_privileges[table_name] = false if @write_privileges
  false
end
|
|
81
|
+
|
|
82
|
+
# Whether the current database user may SELECT from +table_name+.
#
# Returns false when not connected and true for the sqlite adapter. For any
# other adapter it asks the database through has_table_privilege and caches
# the answer per table name (including a false recorded after an error)
# until reset_privileges! is called.
#
# @param table_name [Symbol, String]
# @return [Boolean]
def can_read?(table_name)
  return false unless connected?
  return true if Legion::Settings[:data][:adapter]&.to_s == 'sqlite'

  @read_privileges ||= {}
  cached = @read_privileges[table_name]
  return cached unless cached.nil?

  query = "SELECT has_table_privilege(current_user, ?, 'SELECT') AS can"
  row = connection.fetch(query, table_name.to_s).first
  @read_privileges[table_name] = (row[:can] == true)
rescue StandardError
  # Remember the failure, but only if the cache hash was already created.
  @read_privileges[table_name] = false if @read_privileges
  false
end
|
|
98
|
+
|
|
99
|
+
# Forget every cached can_write? / can_read? answer so the next call asks the
# database again.
#
# @return [nil]
def reset_privileges!
  @write_privileges = @read_privileges = nil
end
|
|
103
|
+
|
|
58
104
|
def setup_cache
|
|
59
105
|
cache_settings = Legion::Settings[:data][:cache]
|
|
60
106
|
setup_static_cache if cache_settings[:static_cache]
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: legion-data
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.6.3
|
|
4
|
+
version: 1.6.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Esity
|
|
@@ -9,6 +9,20 @@ bindir: bin
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: csv
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '3.2'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '3.2'
|
|
12
26
|
- !ruby/object:Gem::Dependency
|
|
13
27
|
name: legion-logging
|
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -98,6 +112,19 @@ files:
|
|
|
98
112
|
- lib/legion/data/encryption/sequel_plugin.rb
|
|
99
113
|
- lib/legion/data/event_store.rb
|
|
100
114
|
- lib/legion/data/event_store/projection.rb
|
|
115
|
+
- lib/legion/data/extract.rb
|
|
116
|
+
- lib/legion/data/extract/handlers/base.rb
|
|
117
|
+
- lib/legion/data/extract/handlers/csv.rb
|
|
118
|
+
- lib/legion/data/extract/handlers/docx.rb
|
|
119
|
+
- lib/legion/data/extract/handlers/html.rb
|
|
120
|
+
- lib/legion/data/extract/handlers/json.rb
|
|
121
|
+
- lib/legion/data/extract/handlers/jsonl.rb
|
|
122
|
+
- lib/legion/data/extract/handlers/markdown.rb
|
|
123
|
+
- lib/legion/data/extract/handlers/pdf.rb
|
|
124
|
+
- lib/legion/data/extract/handlers/pptx.rb
|
|
125
|
+
- lib/legion/data/extract/handlers/text.rb
|
|
126
|
+
- lib/legion/data/extract/handlers/xlsx.rb
|
|
127
|
+
- lib/legion/data/extract/type_detector.rb
|
|
101
128
|
- lib/legion/data/helper.rb
|
|
102
129
|
- lib/legion/data/local.rb
|
|
103
130
|
- lib/legion/data/migration.rb
|
|
@@ -147,6 +174,7 @@ files:
|
|
|
147
174
|
- lib/legion/data/migrations/044_expand_memory_traces.rb
|
|
148
175
|
- lib/legion/data/migrations/045_add_memory_associations.rb
|
|
149
176
|
- lib/legion/data/migrations/046_add_metering_hourly_rollup.rb
|
|
177
|
+
- lib/legion/data/migrations/047_apollo_knowledge_capture.rb
|
|
150
178
|
- lib/legion/data/model.rb
|
|
151
179
|
- lib/legion/data/models/apollo_access_log.rb
|
|
152
180
|
- lib/legion/data/models/apollo_entry.rb
|