legion-data 1.6.4 → 1.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4affe9deaa1903b9ff010b805e8f37814bec22ca669186daf69de345a5e8d711
4
- data.tar.gz: 6e78bf1992c58cd92f9c05395486f1b1c6f211ae3ad653fc385240563bef96ba
3
+ metadata.gz: 55c32ec7725b3d773d0050f52873892de6d9091082546c25969288a7abe159a9
4
+ data.tar.gz: 9d837ec4d10952b7103373972354d32886737f845bd623ad09d3671eddcd9f1d
5
5
  SHA512:
6
- metadata.gz: 8a16917741977b4bd9c3ed8b873769dc6f3356cdb950a2bef2abd9809cbc340ec2df966bc71700e5a6b116d86453988f3cbd79db5b9f828e74dabba350e106a8
7
- data.tar.gz: 9f32dc1dd38e9a9181b0f799509c06e1dd58e0661251426a135cf04bad8e64ca39f38f8ec93adc11e5715bbb3d0c95e878d4c82ebd9977425ace69be3b23cd84
6
+ metadata.gz: c4d314992432f8b4e6c67b1d8c69bc60d0f0da5c28f0e7906ecd455e8d12e8c7756d32b99fa96e7ce57ed2c1478ce88f1ffe8a0d17f9fdd0fa0896e5501d0ac4
7
+ data.tar.gz: b98331ae6bb79d68525cbe361ec054d0919ee14ee1e4322c3d191ad0d15f86c1c587e33ff358cf5fdd824b37e47ae40ad7b7d73d96dc6cbf38b917d6f01c7a5c
data/CHANGELOG.md CHANGED
@@ -1,5 +1,20 @@
1
1
  # Legion::Data Changelog
2
2
 
3
+ ## [1.6.6] - 2026-03-25
4
+
5
+ ### Added
6
+ - `connected?` — returns true when the shared DB is connected (reads `Settings[:data][:connected]`)
7
+ - `can_write?(table_name)` — checks INSERT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
8
+ - `can_read?(table_name)` — checks SELECT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
9
+ - `reset_privileges!` — clears cached privilege results (used in tests and after re-connect)
10
+ - `Legion::Data::Extract` — file format extraction with handler registry
11
+ - Built-in handlers: text, markdown, csv, json, jsonl (no external gems required)
12
+ - Optional handlers: pdf (pdf-reader), docx (docx), pptx (rubyzip), xlsx (rubyXL), html (nokogiri) — lazy-loaded, degrade gracefully if gem not installed
13
+ - `Extract.register_handler(type, klass)` — register custom format handlers
14
+ - `Extract.can_extract?(type)` — check if a type can be extracted (handler present and gem available)
15
+ - `Extract.supported_types` — list all registered types
16
+ - Added `csv` gem dependency (Ruby 3.4 stdlib split)
17
+
3
18
  ## [1.6.4] - 2026-03-25
4
19
 
5
20
  ### Added
data/legion-data.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  'rubygems_mfa_required' => 'true'
27
27
  }
28
28
 
29
+ spec.add_dependency 'csv', '>= 3.2'
29
30
  spec.add_dependency 'legion-logging', '>= 1.2.8'
30
31
  spec.add_dependency 'legion-settings', '>= 1.3.12'
31
32
  spec.add_dependency 'sequel', '>= 5.70'
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Abstract extraction handler and the shared handler registry.
        #
        # A concrete handler subclasses Base and overrides the class-level hooks
        # +type+, +extensions+, +gem_name+ and +extract+. Every subclass is
        # remembered when it is created and added to the registry the next time
        # the registry is read, which is after its class body has defined +type+.
        # This replaces a per-subclass TracePoint that stayed enabled forever for
        # classes built with Class.new and that failed for subclasses of
        # subclasses (their own @registry is nil).
        class Base
          @registry = {}
          @pending_handlers = []

          class << self
            # Remembers +subclass+ for deferred registration; +type+ is not
            # defined yet when this hook runs.
            def inherited(subclass)
              super
              Base.__send__(:pending_handlers) << subclass
            end

            # @return [Hash] the single registry (type => handler class),
            #   owned by Base even when called on a subclass
            def registry
              return Base.registry unless equal?(Base)

              register_pending
              @registry
            end

            # Registers +handler_class+ under its own +type+.
            # @return [Class] the handler class
            def register(handler_class)
              registry[handler_class.type] = handler_class
            end

            # @param type [Symbol, String, nil]
            # @return [Class, nil] the handler registered for +type+
            def for_type(type)
              registry[type&.to_sym]
            end

            # @return [Array] all registered types
            def supported_types
              registry.keys
            end

            # Override in subclasses
            def type = nil
            def extensions = []
            def gem_name = nil

            # @raise [NotImplementedError] unless overridden
            def extract(_source)
              raise NotImplementedError, "#{name} must implement .extract"
            end

            # @return [Boolean] true when no gem is needed or +gem_name+ can be
            #   required
            def available?
              return true if gem_name.nil?

              require gem_name
              true
            rescue LoadError
              false
            end

            private

            # Subclasses created but not yet registered (kept on Base).
            def pending_handlers
              @pending_handlers ||= []
            end

            # Moves every pending subclass that now reports a type into the
            # registry, in definition order; subclasses whose type is still nil
            # stay pending.
            def register_pending
              pending_handlers.reject! do |handler_class|
                handler_type = handler_class.type
                next false unless handler_type

                @registry[handler_type] = handler_class
                true
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module Legion
  module Data
    module Extract
      module Handlers
        # CSV handler: parses the input with a header row and renders each data
        # row as "header: value" pairs.
        class Csv < Base
          def self.type = :csv
          def self.extensions = %w[.csv]
          def self.gem_name = nil

          # @param source [#read, #to_s] an IO-like object or a file path
          # @return [Hash] +:text+ and +:metadata+ (rows, columns, headers) on
          #   success, or +text: nil+ with +:error+ on failure
          def self.extract(source)
            raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            parsed = ::CSV.parse(raw, headers: true)
            rendered_rows = parsed.map do |row|
              row.to_h.map { |header, value| "#{header}: #{value}" }.join(', ')
            end
            details = { rows: parsed.size, columns: parsed.headers.size, headers: parsed.headers }
            { text: rendered_rows.join("\n"), metadata: details }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # DOCX handler backed by the optional `docx` gem, which is required
        # only when extraction is attempted.
        class Docx < Base
          def self.type = :docx
          def self.extensions = %w[.docx]
          def self.gem_name = 'docx'

          # @param source the value handed to Docx::Document.open
          # @return [Hash] non-empty paragraph texts joined by blank lines plus a
          #   paragraph count, or +text: nil+ with +:error+ on failure
          def self.extract(source)
            require 'docx'

            document = ::Docx::Document.open(source)
            non_empty = document.paragraphs.map(&:text).reject(&:empty?)
            { text: non_empty.join("\n\n"), metadata: { paragraphs: non_empty.size } }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: gem_name }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # HTML handler backed by the optional `nokogiri` gem.
        class Html < Base
          def self.type = :html
          def self.extensions = %w[.html .htm]
          def self.gem_name = 'nokogiri'

          # @param source [#read, #to_s] an IO-like object or a file path
          # @return [Hash] whitespace-collapsed document text and the stripped
          #   <title> text, or +text: nil+ with +:error+ on failure
          def self.extract(source)
            require 'nokogiri'

            markup = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            page = ::Nokogiri::HTML(markup)

            # Drop elements whose content is not readable page text.
            page.css('script, style, noscript').each(&:remove)

            page_title = page.at_css('title')&.text&.strip
            collapsed = page.text.gsub(/\s+/, ' ').strip
            { text: collapsed, metadata: { title: page_title } }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: gem_name }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Legion
  module Data
    module Extract
      module Handlers
        # JSON handler: parses the document and re-emits it pretty-printed.
        class Json < Base
          def self.type = :json
          def self.extensions = %w[.json]
          def self.gem_name = nil

          # @param source [#read, #to_s] an IO-like object or a file path
          # @return [Hash] pretty-printed JSON text; +metadata[:keys]+ holds the
          #   top-level keys for an object document and nil otherwise. On
          #   failure returns +text: nil+ with +:error+.
          def self.extract(source)
            raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            document = ::JSON.parse(raw)
            pretty = ::JSON.pretty_generate(document)
            top_level_keys = document.is_a?(Hash) ? document.keys : nil
            { text: pretty, metadata: { keys: top_level_keys } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Legion
  module Data
    module Extract
      module Handlers
        # JSON Lines handler: one JSON value per non-blank line.
        #
        # Lines that fail to parse are kept as their stripped raw text. Blank
        # lines are skipped and not counted. Hash and Array records are
        # pretty-printed; leaving Arrays to Array#join would flatten them and
        # insert the record separator between their elements.
        class Jsonl < Base
          def self.type = :jsonl
          def self.extensions = %w[.jsonl]
          def self.gem_name = nil

          # @param source [#read, #to_s] an IO-like object or a file path
          # @return [Hash] records joined by "\n---\n" with +metadata[:lines]+
          #   set to the record count, or +text: nil+ with +:error+ on failure
          def self.extract(source)
            content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            records = content.each_line.map(&:strip).reject(&:empty?).map { |line| parse_line(line) }
            text = records.map { |record| render_record(record) }.join("\n---\n")
            { text: text, metadata: { lines: records.size } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end

          # Parses one stripped line, falling back to the raw text so a single
          # bad line does not fail the whole file.
          def self.parse_line(line)
            ::JSON.parse(line)
          rescue StandardError
            line
          end

          # Renders one record as text.
          def self.render_record(record)
            case record
            when Hash, Array then ::JSON.pretty_generate(record)
            else record.to_s
            end
          end

          private_class_method :parse_line, :render_record
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Markdown handler: returns the document body without a leading
        # "---" delimited frontmatter block.
        class Markdown < Base
          def self.type = :markdown
          def self.extensions = %w[.md .markdown]
          def self.gem_name = nil

          # @param source [#read, #to_s] an IO-like object or a file path
          # @return [Hash] stripped body text, the byte size of the original
          #   content and whether a frontmatter block was removed; on failure
          #   +text: nil+ with +:error+
          def self.extract(source)
            raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            # Only a block that opens on the very first line is treated as frontmatter.
            body = raw.sub(/\A---\n.*?\n---\n/m, '')
            stripped_frontmatter = raw != body
            { text: body.strip, metadata: { bytes: raw.bytesize, has_frontmatter: stripped_frontmatter } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # PDF handler backed by the optional `pdf-reader` gem.
        class Pdf < Base
          def self.type = :pdf
          def self.extensions = %w[.pdf]
          def self.gem_name = 'pdf-reader'

          # @param source the value handed to PDF::Reader.new
          # @return [Hash] page texts joined by blank lines, with the page count
          #   and the Info dictionary's :Title as metadata; on failure
          #   +text: nil+ with +:error+
          def self.extract(source)
            require 'pdf-reader'

            pdf = ::PDF::Reader.new(source)
            page_texts = pdf.pages.map(&:text)
            details = { pages: pdf.page_count, title: pdf.info[:Title] }
            { text: page_texts.join("\n\n"), metadata: details }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: gem_name }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # PPTX handler: reads the slide XML parts out of the archive with the
        # optional `rubyzip` gem and collects the text of every <a:t> element.
        class Pptx < Base
          # Captures the numeric index of a slide part name such as "slide12.xml".
          SLIDE_INDEX = /slide(\d+)\.xml\z/

          def self.type = :pptx
          def self.extensions = %w[.pptx]
          def self.gem_name = 'rubyzip'

          # The rubyzip gem is loaded with `require 'zip'`, so the inherited
          # check (which requires +gem_name+) would always report the handler
          # as unavailable. Checks the libraries +extract+ actually requires.
          # @return [Boolean]
          def self.available?
            require 'zip'
            require 'rexml/document'
            true
          rescue LoadError
            false
          end

          # @param source the value handed to Zip::File.open
          # @return [Hash] one "Slide N: ..." paragraph per slide that contains
          #   text and the count of such slides; on failure +text: nil+ with
          #   +:error+
          def self.extract(source)
            require 'zip'
            require 'rexml/document'

            slides = []
            ::Zip::File.open(source) do |zip|
              # Order by slide number: a plain name sort puts slide10 before slide2.
              ordered = zip.glob('ppt/slides/slide*.xml').sort_by do |entry|
                [entry.name[SLIDE_INDEX, 1].to_i, entry.name]
              end
              ordered.each do |entry|
                doc = REXML::Document.new(entry.get_input_stream.read)
                texts = []
                doc.each_element('//a:t') { |e| texts << e.text }
                slides << texts.join(' ') unless texts.empty?
              end
            end
            text = slides.each_with_index.map { |s, i| "Slide #{i + 1}: #{s}" }.join("\n\n")
            { text: text, metadata: { slides: slides.size } }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: 'rubyzip' }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Plain-text handler: returns the content unchanged.
        class Text < Base
          def self.type = :text
          def self.extensions = %w[.txt]
          def self.gem_name = nil

          # @param source [#read, #to_s] an IO-like object or a file path
          # @return [Hash] the raw content and its byte size, or +text: nil+
          #   with +:error+ on failure
          def self.extract(source)
            raw = if source.respond_to?(:read)
                    source.read
                  else
                    File.read(source.to_s)
                  end
            { text: raw, metadata: { bytes: raw.bytesize } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Spreadsheet handler backed by the optional `rubyXL` gem.
        class Xlsx < Base
          def self.type = :xlsx
          def self.extensions = %w[.xlsx .xls]
          def self.gem_name = 'rubyXL'

          # @param source the value handed to RubyXL::Parser.parse
          # @return [Hash] one "Sheet: name" section per non-empty worksheet
          #   with comma-joined cell values per row, plus the worksheet count;
          #   on failure +text: nil+ with +:error+
          def self.extract(source)
            require 'rubyXL'
            require 'rubyXL/convenience_methods'

            workbook = ::RubyXL::Parser.parse(source)
            sheets = []
            workbook.worksheets.each do |sheet|
              rows = []
              # Block form of Worksheet#each, so this does not depend on a
              # blockless `each` returning an Enumerator.
              # NOTE(review): confirm against the installed rubyXL version.
              sheet.each do |row|
                next unless row

                rows << row.cells.map { |c| c&.value.to_s }.join(', ')
              end
              sheets << "Sheet: #{sheet.sheet_name}\n#{rows.join("\n")}" unless rows.empty?
            end
            text = sheets.join("\n\n")
            { text: text, metadata: { sheets: workbook.worksheets.size } }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: gem_name }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      # Maps a source to an extraction type using only its file extension.
      module TypeDetector
        EXTENSION_MAP = {
          '.pdf' => :pdf,
          '.docx' => :docx,
          '.pptx' => :pptx,
          '.xlsx' => :xlsx,
          '.xls' => :xlsx,
          '.md' => :markdown,
          '.markdown' => :markdown,
          '.txt' => :text,
          '.csv' => :csv,
          '.json' => :json,
          '.jsonl' => :jsonl,
          '.html' => :html,
          '.htm' => :html
        }.freeze

        module_function

        # Detects the type of +source+.
        #
        # @param source [String, #path, #to_path] a path to an existing file, an
        #   IO-like object exposing +path+, or a path object such as Pathname
        # @return [Symbol, nil] the mapped type; nil when the source is not one
        #   of the above, the file does not exist, or the extension is unknown
        def detect(source)
          return detect_from_path(source) if source.is_a?(String) && File.exist?(source)
          return detect_from_io(source) if source.respond_to?(:path)
          # Pathname has no #path; the handlers accept it through source.to_s.
          return detect_from_path(source.to_path) if source.respond_to?(:to_path) && File.exist?(source.to_path)

          nil
        end

        # @param path [String] a file name or path; the file need not exist
        # @return [Symbol, nil] type for the case-insensitive extension
        def detect_from_path(path)
          ext = File.extname(path).downcase
          EXTENSION_MAP[ext]
        end

        # @param io [#path] an IO-like object
        # @return [Symbol, nil] type derived from +io.path+, nil without a path
        def detect_from_io(io)
          return nil unless io.respond_to?(:path) && io.path

          detect_from_path(io.path)
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'extract/type_detector'
4
+ require_relative 'extract/handlers/base'
5
+
6
module Legion
  module Data
    # File format extraction front end: resolves a type, looks up its handler
    # and normalises the handler result into a +success:+ hash.
    module Extract
      class << self
        # Extracts text from +source+.
        #
        # @param source a path or IO-like object, passed to the handler as is
        # @param type [Symbol, String] explicit type, or :auto to detect it
        #   from the source's file extension
        # @return [Hash] +success: true+ with +:text+, +:metadata+, +:type+; or
        #   +success: false+ with +:error+ (:unknown_type, :no_handler,
        #   :gem_not_installed, or the handler's error / exception message)
        def extract(source, type: :auto)
          detected_type = type == :auto ? TypeDetector.detect(source) : type&.to_sym
          return { success: false, text: nil, error: :unknown_type } unless detected_type

          # The handler files are loaded lazily; without this the registry is
          # empty until supported_types or can_extract? has been called.
          load_all_handlers
          handler = Handlers::Base.for_type(detected_type)
          return { success: false, text: nil, error: :no_handler, type: detected_type } unless handler

          unless handler.available?
            return { success: false, text: nil, error: :gem_not_installed,
                     gem: handler.gem_name, type: detected_type }
          end

          result = handler.extract(source)
          if result[:text]
            { success: true, text: result[:text], metadata: result[:metadata], type: detected_type }
          else
            { success: false, text: nil, error: result[:error], type: detected_type }
          end
        rescue StandardError => e
          { success: false, text: nil, error: e.message, type: detected_type }
        end

        # @return [Array] every registered type
        def supported_types
          load_all_handlers
          Handlers::Base.supported_types
        end

        # @param type [Symbol, String, nil]
        # @return [Boolean] true when a handler is registered for +type+ and
        #   reports itself available
        def can_extract?(type)
          load_all_handlers
          handler = Handlers::Base.for_type(type&.to_sym)
          handler&.available? || false
        end

        # Registers +klass+ as the handler for +type+, replacing any existing
        # handler for that type.
        def register_handler(type, klass)
          # Load the built-in handlers first so a later lazy load cannot
          # overwrite this registration.
          load_all_handlers
          Handlers::Base.registry[type.to_sym] = klass
        end

        private

        # Requires every handler file next to this one exactly once.
        def load_all_handlers
          return if @handlers_loaded

          Dir[File.join(__dir__, 'extract', 'handlers', '*.rb')].each do |file|
            # Compare the whole file name so e.g. "database.rb" is not skipped.
            require file unless File.basename(file) == 'base.rb'
          end
          @handlers_loaded = true
        end
      end
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Legion
4
4
  module Data
5
- VERSION = '1.6.4'
5
+ VERSION = '1.6.6'
6
6
  end
7
7
  end
data/lib/legion/data.rb CHANGED
@@ -13,6 +13,7 @@ require_relative 'data/partition_manager'
13
13
  require_relative 'data/archiver'
14
14
  require_relative 'data/helper'
15
15
  require_relative 'data/rls'
16
+ require_relative 'data/extract'
16
17
 
17
18
  module Legion
18
19
  module Data
@@ -55,6 +56,51 @@ module Legion
55
56
  }
56
57
  end
57
58
 
59
# Reports whether the data settings flag the connection as established.
# @return [Boolean] true only when Settings[:data][:connected] is exactly
#   true; false for any other value or when reading the settings raises
def connected?
  flag = Legion::Settings[:data][:connected]
  flag == true
rescue StandardError
  false
end
64
+
65
# Checks whether the current database user may INSERT into +table_name+.
#
# Returns false when not connected and true for the sqlite adapter. Otherwise
# it asks the database via has_table_privilege and caches the answer per table
# until reset_privileges! is called. The cache key is the table name as a
# String, so :users and 'users' share one entry and one query.
# NOTE(review): every non-sqlite adapter runs the has_table_privilege query;
# on an engine without that function the query fails and this returns false.
# Confirm that is intended.
#
# @param table_name [String, Symbol]
# @return [Boolean] false also when the check raises (the failure is cached)
def can_write?(table_name)
  cache_key = table_name.to_s
  return false unless connected?

  adapter = Legion::Settings[:data][:adapter]&.to_s
  return true if adapter == 'sqlite'

  @write_privileges ||= {}
  cached = @write_privileges[cache_key]
  return cached unless cached.nil?

  @write_privileges[cache_key] = connection
                                 .fetch("SELECT has_table_privilege(current_user, ?, 'INSERT') AS can", cache_key)
                                 .first[:can] == true
rescue StandardError
  @write_privileges[cache_key] = false if @write_privileges
  false
end
81
+
82
# Checks whether the current database user may SELECT from +table_name+.
#
# Returns false when not connected and true for the sqlite adapter. Otherwise
# it asks the database via has_table_privilege and caches the answer per table
# until reset_privileges! is called. The cache key is the table name as a
# String, so :users and 'users' share one entry and one query.
# NOTE(review): every non-sqlite adapter runs the has_table_privilege query;
# on an engine without that function the query fails and this returns false.
# Confirm that is intended.
#
# @param table_name [String, Symbol]
# @return [Boolean] false also when the check raises (the failure is cached)
def can_read?(table_name)
  cache_key = table_name.to_s
  return false unless connected?

  adapter = Legion::Settings[:data][:adapter]&.to_s
  return true if adapter == 'sqlite'

  @read_privileges ||= {}
  cached = @read_privileges[cache_key]
  return cached unless cached.nil?

  @read_privileges[cache_key] = connection
                                .fetch("SELECT has_table_privilege(current_user, ?, 'SELECT') AS can", cache_key)
                                .first[:can] == true
rescue StandardError
  @read_privileges[cache_key] = false if @read_privileges
  false
end
98
+
99
# Drops both privilege caches so the next can_write? / can_read? call
# queries the database again.
# @return [nil]
def reset_privileges!
  @write_privileges = @read_privileges = nil
end
103
+
58
104
  def setup_cache
59
105
  cache_settings = Legion::Settings[:data][:cache]
60
106
  setup_static_cache if cache_settings[:static_cache]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legion-data
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.4
4
+ version: 1.6.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esity
@@ -9,6 +9,20 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: csv
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '3.2'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '3.2'
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: legion-logging
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -98,6 +112,19 @@ files:
98
112
  - lib/legion/data/encryption/sequel_plugin.rb
99
113
  - lib/legion/data/event_store.rb
100
114
  - lib/legion/data/event_store/projection.rb
115
+ - lib/legion/data/extract.rb
116
+ - lib/legion/data/extract/handlers/base.rb
117
+ - lib/legion/data/extract/handlers/csv.rb
118
+ - lib/legion/data/extract/handlers/docx.rb
119
+ - lib/legion/data/extract/handlers/html.rb
120
+ - lib/legion/data/extract/handlers/json.rb
121
+ - lib/legion/data/extract/handlers/jsonl.rb
122
+ - lib/legion/data/extract/handlers/markdown.rb
123
+ - lib/legion/data/extract/handlers/pdf.rb
124
+ - lib/legion/data/extract/handlers/pptx.rb
125
+ - lib/legion/data/extract/handlers/text.rb
126
+ - lib/legion/data/extract/handlers/xlsx.rb
127
+ - lib/legion/data/extract/type_detector.rb
101
128
  - lib/legion/data/helper.rb
102
129
  - lib/legion/data/local.rb
103
130
  - lib/legion/data/migration.rb