legion-data 1.6.4 → 1.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/legion-data.gemspec +1 -0
- data/lib/legion/data/extract/handlers/base.rb +58 -0
- data/lib/legion/data/extract/handlers/csv.rb +26 -0
- data/lib/legion/data/extract/handlers/docx.rb +28 -0
- data/lib/legion/data/extract/handlers/html.rb +33 -0
- data/lib/legion/data/extract/handlers/json.rb +26 -0
- data/lib/legion/data/extract/handlers/jsonl.rb +26 -0
- data/lib/legion/data/extract/handlers/markdown.rb +24 -0
- data/lib/legion/data/extract/handlers/pdf.rb +27 -0
- data/lib/legion/data/extract/handlers/pptx.rb +36 -0
- data/lib/legion/data/extract/handlers/text.rb +22 -0
- data/lib/legion/data/extract/handlers/xlsx.rb +37 -0
- data/lib/legion/data/extract/type_detector.rb +45 -0
- data/lib/legion/data/extract.rb +60 -0
- data/lib/legion/data/version.rb +1 -1
- data/lib/legion/data.rb +46 -0
- metadata +28 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 55c32ec7725b3d773d0050f52873892de6d9091082546c25969288a7abe159a9
|
|
4
|
+
data.tar.gz: 9d837ec4d10952b7103373972354d32886737f845bd623ad09d3671eddcd9f1d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c4d314992432f8b4e6c67b1d8c69bc60d0f0da5c28f0e7906ecd455e8d12e8c7756d32b99fa96e7ce57ed2c1478ce88f1ffe8a0d17f9fdd0fa0896e5501d0ac4
|
|
7
|
+
data.tar.gz: b98331ae6bb79d68525cbe361ec054d0919ee14ee1e4322c3d191ad0d15f86c1c587e33ff358cf5fdd824b37e47ae40ad7b7d73d96dc6cbf38b917d6f01c7a5c
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,20 @@
|
|
|
1
1
|
# Legion::Data Changelog
|
|
2
2
|
|
|
3
|
+
## [1.6.6] - 2026-03-25
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `connected?` — returns true when the shared DB is connected (reads `Settings[:data][:connected]`)
|
|
7
|
+
- `can_write?(table_name)` — checks INSERT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
|
|
8
|
+
- `can_read?(table_name)` — checks SELECT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
|
|
9
|
+
- `reset_privileges!` — clears cached privilege results (used in tests and after re-connect)
|
|
10
|
+
- `Legion::Data::Extract` — file format extraction with handler registry
|
|
11
|
+
- Built-in handlers: text, markdown, csv, json, jsonl (no external gems required)
|
|
12
|
+
- Optional handlers: pdf (pdf-reader), docx (docx), pptx (rubyzip), xlsx (rubyXL), html (nokogiri) — lazy-loaded, degrade gracefully if gem not installed
|
|
13
|
+
- `Extract.register_handler(type, klass)` — register custom format handlers
|
|
14
|
+
- `Extract.can_extract?(type)` — check if a type can be extracted (handler present and gem available)
|
|
15
|
+
- `Extract.supported_types` — list all registered types
|
|
16
|
+
- Added `csv` gem dependency (Ruby 3.4 stdlib split)
|
|
17
|
+
|
|
3
18
|
## [1.6.4] - 2026-03-25
|
|
4
19
|
|
|
5
20
|
### Added
|
data/legion-data.gemspec
CHANGED
|
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
|
|
|
26
26
|
'rubygems_mfa_required' => 'true'
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
+
spec.add_dependency 'csv', '>= 3.2'
|
|
29
30
|
spec.add_dependency 'legion-logging', '>= 1.2.8'
|
|
30
31
|
spec.add_dependency 'legion-settings', '>= 1.3.12'
|
|
31
32
|
spec.add_dependency 'sequel', '>= 5.70'
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Abstract parent of every extraction handler and owner of the
        # type => handler-class registry.
        #
        # Concrete handlers subclass Base and override the class-level hooks
        # +type+, +extensions+, +gem_name+ and +extract+. A subclass is picked
        # up automatically: +inherited+ records it, and its +type+ is read the
        # next time the registry is consulted (by then the class body has run,
        # so +type+ is defined). This replaces a globally enabled
        # TracePoint(:end) hook, which stayed enabled forever for classes
        # created without a class body (e.g. Class.new(Base)) and which failed
        # when a handler was itself subclassed.
        class Base
          @registry = {}
          @pending = []

          class << self
            # Ruby hook: remember every descendant for deferred registration.
            # Always queues on Base so handlers of handlers are tracked too.
            def inherited(subclass)
              super
              Base.defer_registration(subclass)
            end

            # Queue a handler class whose +type+ will be resolved later.
            # Internal plumbing for +inherited+; not meant for external use.
            def defer_registration(handler_class)
              @pending << handler_class
            end

            # Live Hash of type Symbol => handler class. Callers may write to
            # it directly. Pending subclasses that now report a type are moved
            # into the Hash first; ones whose type is still nil stay pending.
            def registry
              return Base.registry unless equal?(Base)

              @pending.reject! { |klass| (key = klass.type) && (@registry[key] = klass) }
              @registry
            end

            # Explicitly register +handler_class+ under its own +type+.
            def register(handler_class)
              registry[handler_class.type] = handler_class
            end

            # @param type [Symbol, String, nil]
            # @return [Class, nil] the handler registered for +type+
            def for_type(type)
              registry[type&.to_sym]
            end

            # @return [Array<Symbol>] every registered type
            def supported_types
              registry.keys
            end

            # Override in subclasses
            def type = nil
            def extensions = []
            def gem_name = nil

            # Subclasses return { text:, metadata: } on success or
            # { text: nil, error: } on failure.
            #
            # NOTE: NotImplementedError is a ScriptError, so a plain
            # `rescue StandardError` in a caller will not catch it.
            def extract(_source)
              raise NotImplementedError, "#{name} must implement .extract"
            end

            # True when the handler needs no gem, or when requiring
            # +gem_name+ succeeds.
            def available?
              return true if gem_name.nil?

              require gem_name
              true
            rescue LoadError
              false
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'csv'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Built-in handler for comma-separated files (needs no optional gem).
        class Csv < Base
          class << self
            def type = :csv
            def extensions = %w[.csv]
            def gem_name = nil

            # Parses +source+ (anything responding to +read+, otherwise a
            # path) as CSV with a header row. Each data row is rendered as
            # "header: value" pairs joined by ", ", one row per line.
            #
            # @return [Hash] { text:, metadata: { rows:, columns:, headers: } }
            #   or { text: nil, error: } when reading or parsing raises
            def extract(source)
              raw = if source.respond_to?(:read)
                      source.read
                    else
                      File.read(source.to_s)
                    end
              table = ::CSV.parse(raw, headers: true)
              rendered_rows = table.map do |row|
                row.to_h.map { |header, value| "#{header}: #{value}" }.join(', ')
              end
              column_names = table.headers

              {
                text: rendered_rows.join("\n"),
                metadata: { rows: table.size, columns: column_names.size, headers: column_names }
              }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Optional handler for Word documents; relies on the `docx` gem,
        # which is required lazily inside +extract+.
        class Docx < Base
          class << self
            def type = :docx
            def extensions = %w[.docx]
            def gem_name = 'docx'

            # Collects the text of every non-empty paragraph and joins the
            # paragraphs with a blank line.
            #
            # @return [Hash] { text:, metadata: { paragraphs: } } on success,
            #   { text: nil, error: :gem_not_installed, gem: } when the gem
            #   cannot be loaded, { text: nil, error: } for any other failure
            def extract(source)
              require 'docx'

              document = ::Docx::Document.open(source)
              blocks = document.paragraphs.filter_map do |paragraph|
                body = paragraph.text
                body unless body.empty?
              end

              { text: blocks.join("\n\n"), metadata: { paragraphs: blocks.size } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Optional handler for HTML pages; relies on the `nokogiri` gem,
        # which is required lazily inside +extract+.
        class Html < Base
          class << self
            def type = :html
            def extensions = %w[.html .htm]
            def gem_name = 'nokogiri'

            # Parses the markup, drops script/style/noscript elements, and
            # returns the remaining document text with whitespace runs
            # collapsed to single spaces. The <title> text, if any, is
            # reported in the metadata.
            #
            # @return [Hash] { text:, metadata: { title: } } on success,
            #   { text: nil, error: :gem_not_installed, gem: } when the gem
            #   cannot be loaded, { text: nil, error: } for any other failure
            def extract(source)
              require 'nokogiri'

              markup = if source.respond_to?(:read)
                         source.read
                       else
                         File.read(source.to_s)
                       end
              page = ::Nokogiri::HTML(markup)

              # Non-visible content must not leak into the extracted text.
              page.css('script, style, noscript').each { |node| node.remove }

              heading = page.at_css('title')&.text&.strip
              visible = page.text.gsub(/\s+/, ' ').strip

              { text: visible, metadata: { title: heading } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Built-in handler for JSON documents (needs no optional gem).
        class Json < Base
          class << self
            def type = :json
            def extensions = %w[.json]
            def gem_name = nil

            # Parses +source+ (anything responding to +read+, otherwise a
            # path) and re-emits it pretty-printed.
            #
            # @return [Hash] { text:, metadata: { keys: } } where +keys+ is
            #   the top-level key list for an object document and nil
            #   otherwise; { text: nil, error: } when reading/parsing raises
            def extract(source)
              raw = if source.respond_to?(:read)
                      source.read
                    else
                      File.read(source.to_s)
                    end
              document = ::JSON.parse(raw)
              pretty = ::JSON.pretty_generate(document)
              top_level_keys = document.keys if document.is_a?(Hash)

              { text: pretty, metadata: { keys: top_level_keys } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Built-in handler for JSON Lines files (one JSON value per line).
        class Jsonl < Base
          def self.type = :jsonl
          def self.extensions = %w[.jsonl]
          def self.gem_name = nil

          # Reads +source+ (anything responding to +read+, otherwise a path),
          # parses each non-blank line as JSON and joins the rendered records
          # with "\n---\n".
          #
          # Blank lines (including a trailing newline-only line) are skipped
          # so they neither produce empty records nor inflate the count.
          # Lines that fail to parse are kept verbatim (best effort).
          # Objects and arrays are pretty-printed; arrays must be rendered
          # explicitly because Array#join would otherwise flatten them and
          # splice the record separator between their elements.
          #
          # @return [Hash] { text:, metadata: { lines: } } where +lines+ is
          #   the number of records; { text: nil, error: } when reading raises
          def self.extract(source)
            content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            entries = content.each_line.map(&:strip).reject(&:empty?)
            records = entries.map do |entry|
              ::JSON.parse(entry)
            rescue StandardError
              entry
            end

            rendered = records.map do |record|
              case record
              when Hash, Array then ::JSON.pretty_generate(record)
              else record.to_s
              end
            end

            { text: rendered.join("\n---\n"), metadata: { lines: records.size } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Built-in handler for Markdown files (needs no optional gem).
        class Markdown < Base
          # A leading YAML front matter block: an opening `---` line, an
          # optional body, and a closing `---` line. Accepts LF or CRLF line
          # endings, trailing spaces/tabs on the fence lines, an empty body,
          # and a closing fence that ends the file without a newline.
          FRONTMATTER = /\A---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?:\r?\n|\z)/m

          def self.type = :markdown
          def self.extensions = %w[.md .markdown]
          def self.gem_name = nil

          # Reads +source+ (anything responding to +read+, otherwise a path)
          # and returns the Markdown with any leading front matter removed
          # and surrounding whitespace stripped.
          #
          # @return [Hash] { text:, metadata: { bytes:, has_frontmatter: } }
          #   where +bytes+ is the size of the original content;
          #   { text: nil, error: } when reading raises
          def self.extract(source)
            content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            # Strip YAML frontmatter if present
            text = content.sub(FRONTMATTER, '')
            { text: text.strip, metadata: { bytes: content.bytesize, has_frontmatter: content != text } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Optional handler for PDF files; relies on the `pdf-reader` gem,
        # which is required lazily inside +extract+.
        class Pdf < Base
          class << self
            def type = :pdf
            def extensions = %w[.pdf]
            def gem_name = 'pdf-reader'

            # Concatenates the text of every page, separated by blank lines.
            #
            # @return [Hash] { text:, metadata: { pages:, title: } } on
            #   success, { text: nil, error: :gem_not_installed, gem: } when
            #   the gem cannot be loaded, { text: nil, error: } otherwise
            def extract(source)
              require 'pdf-reader'

              document = ::PDF::Reader.new(source)
              page_texts = document.pages.map { |page| page.text }
              body = page_texts.join("\n\n")
              details = { pages: document.page_count, title: document.info[:Title] }

              { text: body, metadata: details }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Optional handler for PowerPoint decks. Reads the slide XML parts
        # straight out of the .pptx archive using the `rubyzip` gem and REXML.
        class Pptx < Base
          def self.type = :pptx
          def self.extensions = %w[.pptx]
          def self.gem_name = 'rubyzip'

          # The rubyzip gem is loaded with `require 'zip'`, not
          # `require 'rubyzip'`, so the inherited check (which requires
          # +gem_name+) would always report this handler as unavailable.
          def self.available?
            require 'zip'
            true
          rescue LoadError
            false
          end

          # Extracts the text runs (a:t elements) of each slide part, in
          # numeric slide-file order, and renders non-empty slides as
          # "Slide N: text" blocks separated by blank lines. N counts the
          # non-empty slides only.
          #
          # @return [Hash] { text:, metadata: { slides: } } on success,
          #   { text: nil, error: :gem_not_installed, gem: } when a required
          #   library cannot be loaded, { text: nil, error: } otherwise
          def self.extract(source)
            require 'zip'
            require 'rexml/document'

            slides = []
            ::Zip::File.open(source) do |zip|
              # Sort numerically: a plain name sort puts slide10 before slide2.
              ordered = zip.glob('ppt/slides/slide*.xml').sort_by { |entry| [slide_index(entry.name), entry.name] }
              ordered.each do |entry|
                doc = REXML::Document.new(entry.get_input_stream.read)
                texts = []
                doc.each_element('//a:t') { |e| texts << e.text }
                slides << texts.join(' ') unless texts.empty?
              end
            end
            text = slides.each_with_index.map { |s, i| "Slide #{i + 1}: #{s}" }.join("\n\n")
            { text: text, metadata: { slides: slides.size } }
          rescue LoadError => e
            # Name the library that is actually missing instead of always
            # blaming rubyzip.
            missing = e.path.to_s.start_with?('rexml') ? 'rexml' : gem_name
            { text: nil, error: :gem_not_installed, gem: missing }
          rescue StandardError => e
            { text: nil, error: e.message }
          end

          # Numeric suffix of a slide part name ("ppt/slides/slide12.xml" => 12);
          # 0 when the name carries no number.
          def self.slide_index(entry_name)
            entry_name[/slide(\d+)\.xml\z/i, 1].to_i
          end
          private_class_method :slide_index
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Built-in handler for plain-text files (needs no optional gem).
        class Text < Base
          class << self
            def type = :text
            def extensions = %w[.txt]
            def gem_name = nil

            # Returns the content of +source+ unchanged. +source+ is read
            # directly when it responds to +read+, otherwise it is treated
            # as a file path.
            #
            # @return [Hash] { text:, metadata: { bytes: } } on success,
            #   { text: nil, error: } when reading raises
            def extract(source)
              body = if source.respond_to?(:read)
                       source.read
                     else
                       File.read(source.to_s)
                     end

              { text: body, metadata: { bytes: body.bytesize } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      module Handlers
        # Optional handler for Excel workbooks; relies on the `rubyXL` gem,
        # which is required lazily inside +extract+.
        class Xlsx < Base
          class << self
            def type = :xlsx
            def extensions = %w[.xlsx .xls]
            def gem_name = 'rubyXL'

            # Renders every worksheet that has at least one row as a
            # "Sheet: <name>" heading followed by one comma-separated line
            # per row; sheets are separated by blank lines. The metadata
            # counts all worksheets, including ones that rendered nothing.
            #
            # @return [Hash] { text:, metadata: { sheets: } } on success,
            #   { text: nil, error: :gem_not_installed, gem: } when the gem
            #   cannot be loaded, { text: nil, error: } otherwise
            def extract(source)
              require 'rubyXL'
              require 'rubyXL/convenience_methods'

              book = ::RubyXL::Parser.parse(source)
              rendered = book.worksheets.filter_map do |sheet|
                # Missing rows are skipped; missing cells become empty strings.
                lines = sheet.each.filter_map do |row|
                  row.cells.map { |cell| cell&.value.to_s }.join(', ') if row
                end
                "Sheet: #{sheet.sheet_name}\n#{lines.join("\n")}" unless lines.empty?
              end

              { text: rendered.join("\n\n"), metadata: { sheets: book.worksheets.size } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
  module Data
    module Extract
      # Maps a source (path or IO-like object) to an extraction type by
      # looking at its file extension.
      module TypeDetector
        # Lower-cased file extension => handler type.
        EXTENSION_MAP = {
          '.pdf' => :pdf,
          '.docx' => :docx,
          '.pptx' => :pptx,
          '.xlsx' => :xlsx,
          '.xls' => :xlsx,
          '.md' => :markdown,
          '.markdown' => :markdown,
          '.txt' => :text,
          '.csv' => :csv,
          '.json' => :json,
          '.jsonl' => :jsonl,
          '.html' => :html,
          '.htm' => :html
        }.freeze

        module_function

        # Detects the type of +source+.
        #
        # * Objects responding to +path+ (File, Tempfile, ...) are resolved
        #   through that path.
        # * Strings, and path-like objects exposing +to_path+ (e.g. Pathname),
        #   are resolved only when the file exists on disk.
        #
        # @return [Symbol, nil] the mapped type, or nil when the source has
        #   no usable path, the file is missing, or the extension is unknown
        def detect(source)
          return detect_from_io(source) if source.respond_to?(:path)

          location = source.respond_to?(:to_path) ? source.to_path : source
          return detect_from_path(location) if location.is_a?(String) && File.exist?(location)

          nil
        end

        # Looks up the (case-insensitive) extension of +path+.
        #
        # @return [Symbol, nil]
        def detect_from_path(path)
          ext = File.extname(path).downcase
          EXTENSION_MAP[ext]
        end

        # Resolves an IO-like object through its +path+; nil when the object
        # has no path.
        #
        # @return [Symbol, nil]
        def detect_from_io(io)
          return nil unless io.respond_to?(:path) && io.path

          detect_from_path(io.path)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'extract/type_detector'
|
|
4
|
+
require_relative 'extract/handlers/base'
|
|
5
|
+
|
|
6
|
+
module Legion
  module Data
    # File-format extraction facade: detects a source's type, finds the
    # registered handler and returns a uniform result Hash.
    module Extract
      class << self
        # Extracts text from +source+.
        #
        # Built-in handlers are loaded before the lookup; without that, the
        # registry is empty on a first call and every type would come back
        # as :no_handler.
        #
        # @param source [String, #read, #path] path or IO-like object
        # @param type [Symbol, String] explicit type, or :auto to detect it
        #   from the source's file extension
        # @return [Hash] on success
        #   { success: true, text:, metadata:, type: }; on failure
        #   { success: false, text: nil, error:, ... } where +error+ is
        #   :unknown_type, :no_handler, :gem_not_installed (with +gem+), the
        #   handler's own error, or the message of a rescued StandardError
        def extract(source, type: :auto)
          load_all_handlers

          detected_type = type == :auto ? TypeDetector.detect(source) : type&.to_sym
          return { success: false, text: nil, error: :unknown_type } unless detected_type

          handler = Handlers::Base.for_type(detected_type)
          return { success: false, text: nil, error: :no_handler, type: detected_type } unless handler

          unless handler.available?
            return { success: false, text: nil, error: :gem_not_installed,
                     gem: handler.gem_name, type: detected_type }
          end

          result = handler.extract(source)
          if result[:text]
            { success: true, text: result[:text], metadata: result[:metadata], type: detected_type }
          else
            { success: false, text: nil, error: result[:error], type: detected_type }
          end
        rescue StandardError => e
          { success: false, text: nil, error: e.message, type: detected_type }
        end

        # @return [Array<Symbol>] every registered type (built-ins included)
        def supported_types
          load_all_handlers
          Handlers::Base.supported_types
        end

        # @return [Boolean] true when a handler is registered for +type+ and
        #   reports itself available
        def can_extract?(type)
          load_all_handlers
          handler = Handlers::Base.for_type(type&.to_sym)
          handler&.available? || false
        end

        # Registers +klass+ as the handler for +type+. Built-ins are loaded
        # first so that a later lazy load cannot overwrite a custom handler
        # registered for a built-in type.
        def register_handler(type, klass)
          load_all_handlers
          Handlers::Base.registry[type.to_sym] = klass
        end

        private

        # Requires every handler file next to this one (except base.rb,
        # which is already loaded) exactly once.
        def load_all_handlers
          return if @handlers_loaded

          Dir[File.join(__dir__, 'extract', 'handlers', '*.rb')].each do |f|
            require f unless f.end_with?('base.rb')
          end
          @handlers_loaded = true
        end
      end
    end
  end
end
|
data/lib/legion/data/version.rb
CHANGED
data/lib/legion/data.rb
CHANGED
|
@@ -13,6 +13,7 @@ require_relative 'data/partition_manager'
|
|
|
13
13
|
require_relative 'data/archiver'
|
|
14
14
|
require_relative 'data/helper'
|
|
15
15
|
require_relative 'data/rls'
|
|
16
|
+
require_relative 'data/extract'
|
|
16
17
|
|
|
17
18
|
module Legion
|
|
18
19
|
module Data
|
|
@@ -55,6 +56,51 @@ module Legion
|
|
|
55
56
|
}
|
|
56
57
|
end
|
|
57
58
|
|
|
59
|
+
# True only when the :connected flag under Settings[:data] is exactly +true+.
# Any error while reading the settings is treated as "not connected".
def connected?
  flag = Legion::Settings[:data][:connected]
  flag == true
rescue StandardError
  false
end
|
|
64
|
+
|
|
65
|
+
# Whether the current database user may INSERT into +table_name+.
#
# Returns false when not connected and true for the sqlite adapter without
# querying. Otherwise asks the database via has_table_privilege and caches
# the answer per table. The cache is keyed by the table name as a String
# (the same value sent to the query) so that :users and 'users' share one
# entry instead of each issuing a query.
#
# Any error yields false; that false is also cached when the cache exists,
# so it persists until reset_privileges! is called.
#
# @param table_name [String, Symbol]
# @return [Boolean]
def can_write?(table_name)
  return false unless connected?

  adapter = Legion::Settings[:data][:adapter]&.to_s
  return true if adapter == 'sqlite'

  cache_key = table_name.to_s
  @write_privileges ||= {}
  return @write_privileges[cache_key] unless @write_privileges[cache_key].nil?

  @write_privileges[cache_key] = connection
                                 .fetch("SELECT has_table_privilege(current_user, ?, 'INSERT') AS can", cache_key)
                                 .first[:can] == true
rescue StandardError
  @write_privileges[table_name.to_s] = false if @write_privileges
  false
end
|
|
81
|
+
|
|
82
|
+
# Whether the current database user may SELECT from +table_name+.
#
# Returns false when not connected and true for the sqlite adapter without
# querying. Otherwise asks the database via has_table_privilege and caches
# the answer per table. The cache is keyed by the table name as a String
# (the same value sent to the query) so that :users and 'users' share one
# entry instead of each issuing a query.
#
# Any error yields false; that false is also cached when the cache exists,
# so it persists until reset_privileges! is called.
#
# @param table_name [String, Symbol]
# @return [Boolean]
def can_read?(table_name)
  return false unless connected?

  adapter = Legion::Settings[:data][:adapter]&.to_s
  return true if adapter == 'sqlite'

  cache_key = table_name.to_s
  @read_privileges ||= {}
  return @read_privileges[cache_key] unless @read_privileges[cache_key].nil?

  @read_privileges[cache_key] = connection
                                .fetch("SELECT has_table_privilege(current_user, ?, 'SELECT') AS can", cache_key)
                                .first[:can] == true
rescue StandardError
  @read_privileges[table_name.to_s] = false if @read_privileges
  false
end
|
|
98
|
+
|
|
99
|
+
# Drops both privilege caches so the next can_write? / can_read? call
# queries the database again. Returns nil.
def reset_privileges!
  @write_privileges = @read_privileges = nil
end
|
|
103
|
+
|
|
58
104
|
def setup_cache
|
|
59
105
|
cache_settings = Legion::Settings[:data][:cache]
|
|
60
106
|
setup_static_cache if cache_settings[:static_cache]
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: legion-data
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.6.4
|
|
4
|
+
version: 1.6.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Esity
|
|
@@ -9,6 +9,20 @@ bindir: bin
|
|
|
9
9
|
cert_chain: []
|
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: csv
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '3.2'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '3.2'
|
|
12
26
|
- !ruby/object:Gem::Dependency
|
|
13
27
|
name: legion-logging
|
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -98,6 +112,19 @@ files:
|
|
|
98
112
|
- lib/legion/data/encryption/sequel_plugin.rb
|
|
99
113
|
- lib/legion/data/event_store.rb
|
|
100
114
|
- lib/legion/data/event_store/projection.rb
|
|
115
|
+
- lib/legion/data/extract.rb
|
|
116
|
+
- lib/legion/data/extract/handlers/base.rb
|
|
117
|
+
- lib/legion/data/extract/handlers/csv.rb
|
|
118
|
+
- lib/legion/data/extract/handlers/docx.rb
|
|
119
|
+
- lib/legion/data/extract/handlers/html.rb
|
|
120
|
+
- lib/legion/data/extract/handlers/json.rb
|
|
121
|
+
- lib/legion/data/extract/handlers/jsonl.rb
|
|
122
|
+
- lib/legion/data/extract/handlers/markdown.rb
|
|
123
|
+
- lib/legion/data/extract/handlers/pdf.rb
|
|
124
|
+
- lib/legion/data/extract/handlers/pptx.rb
|
|
125
|
+
- lib/legion/data/extract/handlers/text.rb
|
|
126
|
+
- lib/legion/data/extract/handlers/xlsx.rb
|
|
127
|
+
- lib/legion/data/extract/type_detector.rb
|
|
101
128
|
- lib/legion/data/helper.rb
|
|
102
129
|
- lib/legion/data/local.rb
|
|
103
130
|
- lib/legion/data/migration.rb
|