legion-data 1.6.3 → 1.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '048cfb8d6ead50ef6ddc554765251d61121dc0f6e99491b9675e410f086cd2fe'
4
- data.tar.gz: 58cced043937061ce27fd3c111d5b0b5c5fd3416ab622a3a7700f1ce553628a4
3
+ metadata.gz: 55c32ec7725b3d773d0050f52873892de6d9091082546c25969288a7abe159a9
4
+ data.tar.gz: 9d837ec4d10952b7103373972354d32886737f845bd623ad09d3671eddcd9f1d
5
5
  SHA512:
6
- metadata.gz: 9ba6d97c8cee58a5f41cd5ecc5c2229e75d4880368c43ec44d2f3b969ff144c81c7d6f49fd91d84819b1d3665eab37a7f7ff08f22c0e579109a96ace83d24262
7
- data.tar.gz: ba69290675fbf53d9185dcdbc62e46e344b6aad3594c7dc4fd76fdf48af386a2b97d5b1e7fe70606b1c91dacee32e5a4117d6cc5ccf34a184fac89f4ce7b2e26
6
+ metadata.gz: c4d314992432f8b4e6c67b1d8c69bc60d0f0da5c28f0e7906ecd455e8d12e8c7756d32b99fa96e7ce57ed2c1478ce88f1ffe8a0d17f9fdd0fa0896e5501d0ac4
7
+ data.tar.gz: b98331ae6bb79d68525cbe361ec054d0919ee14ee1e4322c3d191ad0d15f86c1c587e33ff358cf5fdd824b37e47ae40ad7b7d73d96dc6cbf38b917d6f01c7a5c
data/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # Legion::Data Changelog
2
2
 
3
+ ## [1.6.6] - 2026-03-25
4
+
5
+ ### Added
6
+ - `connected?` — returns true when the shared DB is connected (reads `Settings[:data][:connected]`)
7
+ - `can_write?(table_name)` — checks INSERT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
8
+ - `can_read?(table_name)` — checks SELECT privilege; sqlite always returns true, postgres queries `has_table_privilege`, results cached per table
9
+ - `reset_privileges!` — clears cached privilege results (used in tests and after re-connect)
10
+ - `Legion::Data::Extract` — file format extraction with handler registry
11
+ - Built-in handlers: text, markdown, csv, json, jsonl (no external gems required)
12
+ - Optional handlers: pdf (pdf-reader), docx (docx), pptx (rubyzip), xlsx (rubyXL), html (nokogiri) — lazy-loaded, degrade gracefully if gem not installed
13
+ - `Extract.register_handler(type, klass)` — register custom format handlers
14
+ - `Extract.can_extract?(type)` — check if a type can be extracted (handler present and gem available)
15
+ - `Extract.supported_types` — list all registered types
16
+ - Added `csv` gem dependency (Ruby 3.4 stdlib split)
17
+
18
+ ## [1.6.4] - 2026-03-25
19
+
20
+ ### Added
21
+ - Migration 047: Apollo identity columns (submitted_by, submitted_from), content hash dedup, apollo_operations table, apollo_entries_archive table, comprehensive indexes including partial HNSW on active entries only
22
+
3
23
  ## [1.6.2] - 2026-03-25
4
24
 
5
25
  ### Changed
data/legion-data.gemspec CHANGED
@@ -26,6 +26,7 @@ Gem::Specification.new do |spec|
26
26
  'rubygems_mfa_required' => 'true'
27
27
  }
28
28
 
29
+ spec.add_dependency 'csv', '>= 3.2'
29
30
  spec.add_dependency 'legion-logging', '>= 1.2.8'
30
31
  spec.add_dependency 'legion-settings', '>= 1.3.12'
31
32
  spec.add_dependency 'sequel', '>= 5.70'
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Abstract parent of every extraction handler and owner of the
        # type => handler registry.
        #
        # Subclasses describe themselves through the class-level hooks `type`,
        # `extensions` and `gem_name`, and implement `extract`. A subclass is
        # recorded as "pending" when it is created and is moved into the
        # registry the next time the registry is read, once its `type` hook
        # returns a non-nil value. This replaces a TracePoint(:end) hook, which
        # stayed enabled forever for subclasses created without the `class`
        # keyword and failed for subclasses of subclasses.
        class Base
          @registry = {}
          @pending = []

          class << self
            # @return [Hash{Symbol => Class}] live registry; pending subclasses
            #   are registered before it is returned, so later writes by the
            #   caller win over automatic registration
            def registry
              flush_pending
              Base.instance_variable_get(:@registry)
            end

            # Remembers the new subclass; its `type` is usually not defined yet
            # at this point, so registration is deferred.
            def inherited(subclass)
              super
              Base.instance_variable_get(:@pending) << subclass
            end

            # Registers +handler_class+ under the value of its `type` hook.
            #
            # @param handler_class [Class] handler responding to `type`
            # @return [Class] the registered handler
            def register(handler_class)
              registry[handler_class.type] = handler_class
            end

            # @param type [Symbol, String, nil] type name to look up
            # @return [Class, nil] handler registered for +type+, if any
            def for_type(type)
              registry[type&.to_sym]
            end

            # @return [Array<Symbol>] every registered type
            def supported_types
              registry.keys
            end

            # Hooks below are meant to be overridden in subclasses.
            def type = nil
            def extensions = []
            def gem_name = nil

            # @raise [NotImplementedError] always, unless overridden
            def extract(_source)
              raise NotImplementedError, "#{name} must implement .extract"
            end

            # @return [Boolean] true when no gem is needed or when
            #   `require gem_name` succeeds
            def available?
              return true if gem_name.nil?

              require gem_name
              true
            rescue LoadError
              false
            end

            private

            # Moves every pending subclass that now reports a type into the
            # registry; subclasses whose type is still nil stay pending.
            def flush_pending
              table = Base.instance_variable_get(:@registry)
              Base.instance_variable_get(:@pending).reject! do |klass|
                key = klass.type
                next false unless key

                table[key] = klass
                true
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module Legion
  module Data
    module Extract
      module Handlers
        # Turns CSV input into one "header: value" line per data row.
        class Csv < Base
          class << self
            def type = :csv
            def extensions = %w[.csv]
            def gem_name = nil

            # @param source [#read, #to_s] readable object or a file path
            # @return [Hash] `{ text:, metadata: }` on success, or
            #   `{ text: nil, error: }` carrying the exception message
            def extract(source)
              raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              parsed = ::CSV.parse(raw, headers: true)

              rendered_rows = parsed.map do |record|
                record.to_h.map { |header, value| "#{header}: #{value}" }.join(', ')
              end

              {
                text: rendered_rows.join("\n"),
                metadata: { rows: parsed.size, columns: parsed.headers.size, headers: parsed.headers }
              }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Extracts paragraph text from Word documents through the optional
        # `docx` gem, which is required lazily inside `extract`.
        class Docx < Base
          class << self
            def type = :docx
            def extensions = %w[.docx]
            def gem_name = 'docx'

            # @param source document handed straight to `Docx::Document.open`
            # @return [Hash] `{ text:, metadata: { paragraphs: } }` on success;
            #   `{ text: nil, error: :gem_not_installed, gem: }` when the gem
            #   cannot be loaded; `{ text: nil, error: }` for any other failure
            def extract(source)
              require 'docx'

              document = ::Docx::Document.open(source)
              # Empty paragraphs are dropped before joining and counting.
              blocks = document.paragraphs.filter_map do |paragraph|
                body = paragraph.text
                body unless body.empty?
              end

              { text: blocks.join("\n\n"), metadata: { paragraphs: blocks.size } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Extracts visible text from HTML through the optional `nokogiri` gem,
        # which is required lazily inside `extract`.
        class Html < Base
          class << self
            def type = :html
            def extensions = %w[.html .htm]
            def gem_name = 'nokogiri'

            # @param source [#read, #to_s] readable object or a file path
            # @return [Hash] `{ text:, metadata: { title: } }` on success;
            #   `{ text: nil, error: :gem_not_installed, gem: }` when nokogiri
            #   cannot be loaded; `{ text: nil, error: }` for any other failure
            def extract(source)
              require 'nokogiri'

              markup = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              page = ::Nokogiri::HTML(markup)

              # Script, style and noscript nodes carry no readable content.
              page.css('script, style, noscript').each(&:remove)

              page_title = page.at_css('title')&.text&.strip
              # Collapse every whitespace run to a single space.
              flattened = page.text.gsub(/\s+/, ' ').strip

              { text: flattened, metadata: { title: page_title } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Legion
  module Data
    module Extract
      module Handlers
        # Parses a JSON document and returns it pretty-printed.
        class Json < Base
          class << self
            def type = :json
            def extensions = %w[.json]
            def gem_name = nil

            # @param source [#read, #to_s] readable object or a file path
            # @return [Hash] `{ text:, metadata: { keys: } }` where `keys` is
            #   the top-level key list for an object and nil otherwise;
            #   `{ text: nil, error: }` on failure
            def extract(source)
              raw = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              document = ::JSON.parse(raw)
              top_level_keys = (document.keys if document.is_a?(Hash))

              { text: ::JSON.pretty_generate(document), metadata: { keys: top_level_keys } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Legion
  module Data
    module Extract
      module Handlers
        # Extracts JSON Lines input: one JSON value per non-blank line.
        #
        # Blank lines are skipped (they previously became empty records and
        # were counted). Lines that are not valid JSON are kept verbatim.
        # Objects and arrays are pretty-printed as JSON; other parsed scalars
        # are emitted in their JSON form.
        class Jsonl < Base
          RECORD_SEPARATOR = "\n---\n"

          class << self
            def type = :jsonl
            def extensions = %w[.jsonl]
            def gem_name = nil

            # @param source [#read, #to_s] readable object or a file path
            # @return [Hash] `{ text:, metadata: { lines: } }` where `lines` is
            #   the number of non-blank records; `{ text: nil, error: }` on
            #   failure
            def extract(source)
              content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
              records = content.each_line.filter_map do |line|
                stripped = line.strip
                [parse_record(stripped)] unless stripped.empty?
              end.flatten(1)

              text = records.map { |record| render_record(record) }.join(RECORD_SEPARATOR)
              { text: text, metadata: { lines: records.size } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end

            private

            # Parses one line, falling back to the raw text only for JSON
            # syntax errors (other failures reach extract's rescue).
            def parse_record(line)
              ::JSON.parse(line)
            rescue ::JSON::ParserError
              line
            end

            # Strings (raw fallbacks or parsed JSON strings) pass through
            # unchanged; everything else is rendered as JSON.
            def render_record(record)
              case record
              when String then record
              when Hash, Array then ::JSON.pretty_generate(record)
              else ::JSON.generate(record)
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Returns Markdown content with a leading YAML front-matter block
        # removed.
        class Markdown < Base
          # Front matter: a `---` line at the very start, the shortest body up
          # to the next `---` line. Accepts LF or CRLF line endings and a
          # closing fence that ends the file without a trailing newline.
          FRONTMATTER = /\A---\r?\n.*?\r?\n---[ \t]*(?:\r?\n|\z)/m

          def self.type = :markdown
          def self.extensions = %w[.md .markdown]
          def self.gem_name = nil

          # @param source [#read, #to_s] readable object or a file path
          # @return [Hash] `{ text:, metadata: { bytes:, has_frontmatter: } }`
          #   where `bytes` is the size of the original content;
          #   `{ text: nil, error: }` on failure
          def self.extract(source)
            content = source.respond_to?(:read) ? source.read : File.read(source.to_s)
            text = content.sub(FRONTMATTER, '')
            { text: text.strip, metadata: { bytes: content.bytesize, has_frontmatter: content != text } }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Extracts page text from PDF files through the optional `pdf-reader`
        # gem, which is required lazily inside `extract`.
        class Pdf < Base
          class << self
            def type = :pdf
            def extensions = %w[.pdf]
            def gem_name = 'pdf-reader'

            # @param source input handed straight to `PDF::Reader.new`
            # @return [Hash] `{ text:, metadata: { pages:, title: } }` on
            #   success; `{ text: nil, error: :gem_not_installed, gem: }` when
            #   the gem cannot be loaded; `{ text: nil, error: }` otherwise
            def extract(source)
              require 'pdf-reader'

              document = ::PDF::Reader.new(source)
              page_texts = document.pages.map(&:text)
              details = { pages: document.page_count, title: document.info[:Title] }

              { text: page_texts.join("\n\n"), metadata: details }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Extracts slide text from PowerPoint files by reading the slide XML
        # out of the archive with the optional `rubyzip` gem.
        class Pptx < Base
          def self.type = :pptx
          def self.extensions = %w[.pptx]
          def self.gem_name = 'rubyzip'

          # The gem is named `rubyzip` but is loaded with `require 'zip'`, the
          # same path `extract` uses. The inherited check requires the gem
          # name instead, which would report the handler as unavailable even
          # when the gem is installed.
          #
          # @return [Boolean] true when `require 'zip'` succeeds
          def self.available?
            require 'zip'
            true
          rescue LoadError
            false
          end

          # @param source archive handed straight to `Zip::File.open`
          # @return [Hash] `{ text:, metadata: { slides: } }` on success, where
          #   only slides containing text are numbered and counted;
          #   `{ text: nil, error: :gem_not_installed, gem: }` when a required
          #   library cannot be loaded; `{ text: nil, error: }` otherwise
          def self.extract(source)
            require 'zip'
            require 'rexml/document'

            slides = []
            ::Zip::File.open(source) do |zip|
              # NOTE(review): sort is by entry name as a string, so slide10.xml
              # orders before slide2.xml — confirm this is intended.
              zip.glob('ppt/slides/slide*.xml').sort_by(&:name).each do |entry|
                doc = REXML::Document.new(entry.get_input_stream.read)
                texts = []
                # Collect the text of every `a:t` element in the slide.
                doc.each_element('//a:t') { |e| texts << e.text }
                slides << texts.join(' ') unless texts.empty?
              end
            end
            text = slides.each_with_index.map { |s, i| "Slide #{i + 1}: #{s}" }.join("\n\n")
            { text: text, metadata: { slides: slides.size } }
          rescue LoadError
            { text: nil, error: :gem_not_installed, gem: gem_name }
          rescue StandardError => e
            { text: nil, error: e.message }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Pass-through handler for plain text files.
        class Text < Base
          class << self
            def type = :text
            def extensions = %w[.txt]
            def gem_name = nil

            # @param source [#read, #to_s] readable object or a file path
            # @return [Hash] `{ text:, metadata: { bytes: } }` on success, or
            #   `{ text: nil, error: }` carrying the exception message
            def extract(source)
              body =
                if source.respond_to?(:read)
                  source.read
                else
                  File.read(source.to_s)
                end

              { text: body, metadata: { bytes: body.bytesize } }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      module Handlers
        # Extracts cell values from spreadsheets through the optional `rubyXL`
        # gem, which is required lazily inside `extract`.
        class Xlsx < Base
          class << self
            def type = :xlsx
            # NOTE(review): `.xls` is routed here as well — confirm the parser
            # accepts the legacy format.
            def extensions = %w[.xlsx .xls]
            def gem_name = 'rubyXL'

            # @param source input handed straight to `RubyXL::Parser.parse`
            # @return [Hash] `{ text:, metadata: { sheets: } }` on success,
            #   where `sheets` counts every worksheet, including ones that
            #   produced no rows; `{ text: nil, error: :gem_not_installed,
            #   gem: }` when the gem cannot be loaded; `{ text: nil, error: }`
            #   otherwise
            def extract(source)
              require 'rubyXL'
              require 'rubyXL/convenience_methods'

              book = ::RubyXL::Parser.parse(source)

              rendered = book.worksheets.filter_map do |sheet|
                # Missing rows are skipped; missing cells render as ''.
                lines = sheet.each.filter_map do |row|
                  row.cells.map { |cell| cell&.value.to_s }.join(', ') if row
                end

                "Sheet: #{sheet.sheet_name}\n#{lines.join("\n")}" unless lines.empty?
              end

              { text: rendered.join("\n\n"), metadata: { sheets: book.worksheets.size } }
            rescue LoadError
              { text: nil, error: :gem_not_installed, gem: gem_name }
            rescue StandardError => e
              { text: nil, error: e.message }
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Data
    module Extract
      # Maps a source (path string, path-like object, or IO exposing a path)
      # to an extraction type using the file extension only.
      module TypeDetector
        EXTENSION_MAP = {
          '.pdf' => :pdf,
          '.docx' => :docx,
          '.pptx' => :pptx,
          '.xlsx' => :xlsx,
          '.xls' => :xlsx,
          '.md' => :markdown,
          '.markdown' => :markdown,
          '.txt' => :text,
          '.csv' => :csv,
          '.json' => :json,
          '.jsonl' => :jsonl,
          '.html' => :html,
          '.htm' => :html
        }.freeze

        module_function

        # Detects the type of +source+.
        #
        # Strings are treated as paths and must name an existing file. Objects
        # responding to `path` are treated as IO-like. Objects responding to
        # `to_path` but not `path` (for example Pathname) are converted and
        # then treated like a String path; previously they were never
        # detected.
        #
        # @param source [String, #path, #to_path, Object]
        # @return [Symbol, nil] detected type, or nil when it cannot be
        #   determined
        def detect(source)
          return detect_from_path(source) if source.is_a?(String) && File.exist?(source)
          return detect_from_io(source) if source.respond_to?(:path)

          if source.respond_to?(:to_path)
            path = source.to_path
            return detect_from_path(path) if path.is_a?(String) && File.exist?(path)
          end

          nil
        end

        # @param path [String] file path; the extension is matched
        #   case-insensitively
        # @return [Symbol, nil]
        def detect_from_path(path)
          ext = File.extname(path).downcase
          EXTENSION_MAP[ext]
        end

        # @param io [#path] IO-like object; a nil path yields nil
        # @return [Symbol, nil]
        def detect_from_io(io)
          return nil unless io.respond_to?(:path) && io.path

          detect_from_path(io.path)
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'extract/type_detector'
4
+ require_relative 'extract/handlers/base'
5
+
6
module Legion
  module Data
    # File format extraction front end: detects the type of a source, finds
    # the handler registered for it and normalizes the handler's result.
    module Extract
      class << self
        # Extracts text from +source+.
        #
        # Handler files are loaded before the lookup; without this a fresh
        # process had an empty registry and every call returned `:no_handler`
        # unless `supported_types` or `can_extract?` had been called first.
        #
        # @param source path or IO-like object passed on to the handler
        # @param type [Symbol, String, nil] explicit type, or :auto to detect
        #   it from the source
        # @return [Hash] `{ success: true, text:, metadata:, type: }` on
        #   success; otherwise `{ success: false, text: nil, error:, ... }`
        #   with error `:unknown_type`, `:no_handler`, `:gem_not_installed`,
        #   or the handler's / exception's error value
        def extract(source, type: :auto)
          load_all_handlers

          detected_type = type == :auto ? TypeDetector.detect(source) : type&.to_sym
          return { success: false, text: nil, error: :unknown_type } unless detected_type

          handler = Handlers::Base.for_type(detected_type)
          return { success: false, text: nil, error: :no_handler, type: detected_type } unless handler

          unless handler.available?
            return { success: false, text: nil, error: :gem_not_installed,
                     gem: handler.gem_name, type: detected_type }
          end

          result = handler.extract(source)
          if result[:text]
            { success: true, text: result[:text], metadata: result[:metadata], type: detected_type }
          else
            { success: false, text: nil, error: result[:error], type: detected_type }
          end
        rescue StandardError => e
          # detected_type is nil here when the failure happened before detection.
          { success: false, text: nil, error: e.message, type: detected_type }
        end

        # @return [Array<Symbol>] every registered type
        def supported_types
          load_all_handlers
          Handlers::Base.supported_types
        end

        # @param type [Symbol, String, nil]
        # @return [Boolean] true when a handler is registered for +type+ and
        #   reports itself available
        def can_extract?(type)
          load_all_handlers
          handler = Handlers::Base.for_type(type&.to_sym)
          handler&.available? || false
        end

        # Registers a custom handler for +type+.
        #
        # Built-in handlers are loaded first so that a later lazy load cannot
        # overwrite the custom registration.
        #
        # @param type [Symbol, String]
        # @param klass [Class] handler class
        # @return [Class] the registered handler
        def register_handler(type, klass)
          load_all_handlers
          Handlers::Base.registry[type.to_sym] = klass
        end

        private

        # Requires every handler file next to this one (except base.rb), once.
        def load_all_handlers
          return if @handlers_loaded

          Dir[File.join(__dir__, 'extract', 'handlers', '*.rb')].each do |f|
            require f unless f.end_with?('base.rb')
          end
          @handlers_loaded = true
        end
      end
    end
  end
end
@@ -0,0 +1,152 @@
1
+ # frozen_string_literal: true
2
+
3
# Migration 047: Apollo identity columns, content-hash dedup, the
# apollo_operations and apollo_entries_archive tables, and supporting indexes.
# Both directions do nothing unless the adapter scheme is :postgres.
Sequel.migration do
  up do
    next unless adapter_scheme == :postgres

    # Emits a plain "CREATE INDEX IF NOT EXISTS" statement.
    plain_index = lambda do |name, table, columns|
      run "CREATE INDEX IF NOT EXISTS #{name} ON #{table} (#{columns});"
    end

    # --- Identity columns on apollo_entries ---
    alter_table(:apollo_entries) do
      add_column :submitted_by, String, size: 255
      add_column :submitted_from, String, size: 255
      add_column :content_hash, String, fixed: true, size: 32
    end

    # --- apollo_operations table ---
    run <<~SQL
      CREATE TABLE IF NOT EXISTS apollo_operations (
      id BIGSERIAL PRIMARY KEY,
      operation VARCHAR(50) NOT NULL,
      actor VARCHAR(100) NOT NULL,
      target_type VARCHAR(50),
      target_ids INTEGER[],
      summary TEXT,
      detail JSONB,
      old_state JSONB,
      new_state JSONB,
      reason TEXT,
      principal_id VARCHAR(255),
      created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
      );
    SQL

    # --- apollo_entries_archive table ---
    # Created after the new columns exist so LIKE copies them as well.
    run <<~SQL
      CREATE TABLE IF NOT EXISTS apollo_entries_archive (
      LIKE apollo_entries INCLUDING ALL,
      archived_at TIMESTAMPTZ DEFAULT NOW(),
      archive_reason TEXT
      );
    SQL

    # --- Indexes: apollo_entries ---
    plain_index.call('idx_apollo_submitted_by', 'apollo_entries', 'submitted_by')
    plain_index.call('idx_apollo_submitted_from', 'apollo_entries', 'submitted_from')

    # Content hash dedup (unique among non-archived entries only)
    run <<~SQL
      CREATE UNIQUE INDEX IF NOT EXISTS idx_apollo_content_hash
      ON apollo_entries (content_hash)
      WHERE status != 'archived';
    SQL

    # Status filtering
    plain_index.call('idx_apollo_status', 'apollo_entries', 'status')

    # Partial index restricted to candidate / confirmed / disputed entries
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_active
      ON apollo_entries (id)
      WHERE status IN ('candidate', 'confirmed', 'disputed');
    SQL

    # Confidence, then time-based columns
    plain_index.call('idx_apollo_confidence', 'apollo_entries', 'confidence')
    plain_index.call('idx_apollo_created', 'apollo_entries', 'created_at')
    plain_index.call('idx_apollo_updated', 'apollo_entries', 'updated_at')

    # Decay cycle targets: non-archived entries by update time
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_decay_target
      ON apollo_entries (updated_at)
      WHERE status != 'archived';
    SQL

    # Corroboration targets: candidates that have an embedding
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_candidates
      ON apollo_entries (status, source_provider, source_channel)
      WHERE status = 'candidate' AND embedding IS NOT NULL;
    SQL

    plain_index.call('idx_apollo_domain', 'apollo_entries', 'knowledge_domain')
    plain_index.call('idx_apollo_source_agent', 'apollo_entries', 'source_agent')

    # Swap the full HNSW index for a partial one over the same three statuses
    run 'DROP INDEX IF EXISTS apollo_entries_embedding_idx;'
    run <<~SQL
      CREATE INDEX IF NOT EXISTS idx_apollo_embedding_active
      ON apollo_entries USING hnsw (embedding vector_cosine_ops)
      WITH (m = 16, ef_construction = 64)
      WHERE status IN ('candidate', 'confirmed', 'disputed');
    SQL

    # --- Indexes: apollo_relations ---
    plain_index.call('idx_apollo_rel_from', 'apollo_relations', 'from_entry_id')
    plain_index.call('idx_apollo_rel_to', 'apollo_relations', 'to_entry_id')
    plain_index.call('idx_apollo_rel_type', 'apollo_relations', 'relation_type')
    plain_index.call('idx_apollo_rel_composite', 'apollo_relations', 'from_entry_id, relation_type')

    # --- Indexes: apollo_expertise ---
    plain_index.call('idx_apollo_exp_agent', 'apollo_expertise', 'agent_id')
    plain_index.call('idx_apollo_exp_domain', 'apollo_expertise', 'domain')
    plain_index.call('idx_apollo_exp_composite', 'apollo_expertise', 'agent_id, domain')

    # --- Indexes: apollo_operations ---
    plain_index.call('idx_apollo_ops_created', 'apollo_operations', 'created_at')
    plain_index.call('idx_apollo_ops_operation', 'apollo_operations', 'operation')
    plain_index.call('idx_apollo_ops_actor', 'apollo_operations', 'actor')
    run 'CREATE INDEX IF NOT EXISTS idx_apollo_ops_target ON apollo_operations USING GIN (target_ids);'

    # --- Indexes: apollo_entries_archive ---
    plain_index.call('idx_archive_content_hash', 'apollo_entries_archive', 'content_hash')
    plain_index.call('idx_archive_source_agent', 'apollo_entries_archive', 'source_agent')
    plain_index.call('idx_archive_archived_at', 'apollo_entries_archive', 'archived_at')
  end

  down do
    next unless adapter_scheme == :postgres

    # Restore the original (non-partial) HNSW index
    run 'DROP INDEX IF EXISTS idx_apollo_embedding_active;'
    run <<~SQL
      CREATE INDEX IF NOT EXISTS apollo_entries_embedding_idx
      ON apollo_entries USING hnsw (embedding vector_cosine_ops);
    SQL

    drop_table?(:apollo_entries_archive)
    drop_table?(:apollo_operations)

    # Every index created by `up`, dropped in creation order
    added_indexes = %w[
      idx_apollo_submitted_by idx_apollo_submitted_from idx_apollo_content_hash
      idx_apollo_status idx_apollo_active idx_apollo_confidence
      idx_apollo_created idx_apollo_updated idx_apollo_decay_target
      idx_apollo_candidates idx_apollo_domain idx_apollo_source_agent
      idx_apollo_rel_from idx_apollo_rel_to idx_apollo_rel_type idx_apollo_rel_composite
      idx_apollo_exp_agent idx_apollo_exp_domain idx_apollo_exp_composite
      idx_apollo_ops_created idx_apollo_ops_operation idx_apollo_ops_actor idx_apollo_ops_target
      idx_archive_content_hash idx_archive_source_agent idx_archive_archived_at
    ]
    added_indexes.each do |index_name|
      run "DROP INDEX IF EXISTS #{index_name};"
    end

    alter_table(:apollo_entries) do
      drop_column :content_hash
      drop_column :submitted_from
      drop_column :submitted_by
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Legion
4
4
  module Data
5
- VERSION = '1.6.3'
5
+ VERSION = '1.6.6'
6
6
  end
7
7
  end
data/lib/legion/data.rb CHANGED
@@ -13,6 +13,7 @@ require_relative 'data/partition_manager'
13
13
  require_relative 'data/archiver'
14
14
  require_relative 'data/helper'
15
15
  require_relative 'data/rls'
16
+ require_relative 'data/extract'
16
17
 
17
18
  module Legion
18
19
  module Data
@@ -55,6 +56,51 @@ module Legion
55
56
  }
56
57
  end
57
58
 
59
# Reports whether the shared database is marked as connected.
#
# @return [Boolean] true only when `Settings[:data][:connected]` equals true;
#   false for any other value or when reading the setting raises
def connected?
  flag = Legion::Settings[:data][:connected]
  flag == true
rescue StandardError
  false
end
64
+
65
# Checks whether the current database user may INSERT into +table_name+.
#
# Returns false when not connected and true for the sqlite adapter. Otherwise
# the answer comes from `has_table_privilege` and is cached per table name,
# exactly as passed in. A failed lookup returns false, and that false is
# cached too if the cache already exists.
#
# @param table_name [Symbol, String]
# @return [Boolean]
def can_write?(table_name)
  return false unless connected?
  return true if Legion::Settings[:data][:adapter].to_s == 'sqlite'

  cache = (@write_privileges ||= {})
  return cache[table_name] if cache.key?(table_name)

  sql = "SELECT has_table_privilege(current_user, ?, 'INSERT') AS can"
  row = connection.fetch(sql, table_name.to_s).first
  cache[table_name] = (row[:can] == true)
rescue StandardError
  @write_privileges[table_name] = false if @write_privileges
  false
end
81
+
82
# Checks whether the current database user may SELECT from +table_name+.
#
# Returns false when not connected and true for the sqlite adapter. Otherwise
# the answer comes from `has_table_privilege` and is cached per table name,
# exactly as passed in. A failed lookup returns false, and that false is
# cached too if the cache already exists.
#
# @param table_name [Symbol, String]
# @return [Boolean]
def can_read?(table_name)
  return false unless connected?
  return true if Legion::Settings[:data][:adapter].to_s == 'sqlite'

  cache = (@read_privileges ||= {})
  return cache[table_name] if cache.key?(table_name)

  sql = "SELECT has_table_privilege(current_user, ?, 'SELECT') AS can"
  row = connection.fetch(sql, table_name.to_s).first
  cache[table_name] = (row[:can] == true)
rescue StandardError
  @read_privileges[table_name] = false if @read_privileges
  false
end
98
+
99
# Discards both privilege caches so the next `can_write?` / `can_read?` call
# queries the database again.
#
# @return [nil]
def reset_privileges!
  @write_privileges = @read_privileges = nil
end
103
+
58
104
  def setup_cache
59
105
  cache_settings = Legion::Settings[:data][:cache]
60
106
  setup_static_cache if cache_settings[:static_cache]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: legion-data
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.3
4
+ version: 1.6.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Esity
@@ -9,6 +9,20 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: csv
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '3.2'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '3.2'
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: legion-logging
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -98,6 +112,19 @@ files:
98
112
  - lib/legion/data/encryption/sequel_plugin.rb
99
113
  - lib/legion/data/event_store.rb
100
114
  - lib/legion/data/event_store/projection.rb
115
+ - lib/legion/data/extract.rb
116
+ - lib/legion/data/extract/handlers/base.rb
117
+ - lib/legion/data/extract/handlers/csv.rb
118
+ - lib/legion/data/extract/handlers/docx.rb
119
+ - lib/legion/data/extract/handlers/html.rb
120
+ - lib/legion/data/extract/handlers/json.rb
121
+ - lib/legion/data/extract/handlers/jsonl.rb
122
+ - lib/legion/data/extract/handlers/markdown.rb
123
+ - lib/legion/data/extract/handlers/pdf.rb
124
+ - lib/legion/data/extract/handlers/pptx.rb
125
+ - lib/legion/data/extract/handlers/text.rb
126
+ - lib/legion/data/extract/handlers/xlsx.rb
127
+ - lib/legion/data/extract/type_detector.rb
101
128
  - lib/legion/data/helper.rb
102
129
  - lib/legion/data/local.rb
103
130
  - lib/legion/data/migration.rb
@@ -147,6 +174,7 @@ files:
147
174
  - lib/legion/data/migrations/044_expand_memory_traces.rb
148
175
  - lib/legion/data/migrations/045_add_memory_associations.rb
149
176
  - lib/legion/data/migrations/046_add_metering_hourly_rollup.rb
177
+ - lib/legion/data/migrations/047_apollo_knowledge_capture.rb
150
178
  - lib/legion/data/model.rb
151
179
  - lib/legion/data/models/apollo_access_log.rb
152
180
  - lib/legion/data/models/apollo_entry.rb