tabula-rb 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +39 -0
- data/CHANGELOG.md +59 -0
- data/LICENSE +21 -0
- data/README.md +176 -0
- data/Rakefile +28 -0
- data/exe/tabula +7 -0
- data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
- data/lib/tabula/algorithms/projection_profile.rb +109 -0
- data/lib/tabula/cli.rb +271 -0
- data/lib/tabula/configuration.rb +119 -0
- data/lib/tabula/core/point.rb +60 -0
- data/lib/tabula/core/rectangle.rb +218 -0
- data/lib/tabula/core/ruling.rb +303 -0
- data/lib/tabula/core/spatial_index.rb +120 -0
- data/lib/tabula/detectors/detection_algorithm.rb +34 -0
- data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
- data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
- data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
- data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
- data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
- data/lib/tabula/pdf/object_extractor.rb +400 -0
- data/lib/tabula/pdf/page.rb +230 -0
- data/lib/tabula/pdf/text_stripper.rb +150 -0
- data/lib/tabula/table/cell.rb +110 -0
- data/lib/tabula/table/table.rb +184 -0
- data/lib/tabula/text/line.rb +133 -0
- data/lib/tabula/text/text_chunk.rb +185 -0
- data/lib/tabula/text/text_element.rb +120 -0
- data/lib/tabula/version.rb +5 -0
- data/lib/tabula/writers/csv_writer.rb +49 -0
- data/lib/tabula/writers/json_writer.rb +41 -0
- data/lib/tabula/writers/markdown_writer.rb +71 -0
- data/lib/tabula/writers/tsv_writer.rb +35 -0
- data/lib/tabula/writers/writer.rb +39 -0
- data/lib/tabula.rb +160 -0
- data/mise.toml +2 -0
- data/tabula-rb.gemspec +44 -0
- metadata +115 -0
data/lib/tabula.rb
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'tabula/version'
|
|
4
|
+
require_relative 'tabula/configuration'
|
|
5
|
+
|
|
6
|
+
# Core geometry
|
|
7
|
+
require_relative 'tabula/core/point'
|
|
8
|
+
require_relative 'tabula/core/rectangle'
|
|
9
|
+
require_relative 'tabula/core/ruling'
|
|
10
|
+
require_relative 'tabula/core/spatial_index'
|
|
11
|
+
|
|
12
|
+
# Text handling
|
|
13
|
+
require_relative 'tabula/text/text_element'
|
|
14
|
+
require_relative 'tabula/text/text_chunk'
|
|
15
|
+
require_relative 'tabula/text/line'
|
|
16
|
+
|
|
17
|
+
# Table structures
|
|
18
|
+
require_relative 'tabula/table/cell'
|
|
19
|
+
require_relative 'tabula/table/table'
|
|
20
|
+
|
|
21
|
+
# PDF processing
|
|
22
|
+
require_relative 'tabula/pdf/page'
|
|
23
|
+
require_relative 'tabula/pdf/text_stripper'
|
|
24
|
+
require_relative 'tabula/pdf/object_extractor'
|
|
25
|
+
|
|
26
|
+
# Extraction algorithms
|
|
27
|
+
require_relative 'tabula/extractors/extraction_algorithm'
|
|
28
|
+
require_relative 'tabula/extractors/basic_extraction_algorithm'
|
|
29
|
+
require_relative 'tabula/extractors/spreadsheet_extraction_algorithm'
|
|
30
|
+
|
|
31
|
+
# Detection algorithms
|
|
32
|
+
require_relative 'tabula/detectors/detection_algorithm'
|
|
33
|
+
require_relative 'tabula/detectors/spreadsheet_detection_algorithm'
|
|
34
|
+
require_relative 'tabula/detectors/nurminen_detection_algorithm'
|
|
35
|
+
|
|
36
|
+
# Writers
|
|
37
|
+
require_relative 'tabula/writers/writer'
|
|
38
|
+
require_relative 'tabula/writers/csv_writer'
|
|
39
|
+
require_relative 'tabula/writers/tsv_writer'
|
|
40
|
+
require_relative 'tabula/writers/json_writer'
|
|
41
|
+
require_relative 'tabula/writers/markdown_writer'
|
|
42
|
+
|
|
43
|
+
# Geometric algorithms
|
|
44
|
+
require_relative 'tabula/algorithms/cohen_sutherland_clipping'
|
|
45
|
+
require_relative 'tabula/algorithms/projection_profile'
|
|
46
|
+
|
|
47
|
+
# Top-level namespace and public entry point of the library.
#
# {Tabula.extract} validates its arguments, opens the PDF through
# ObjectExtractor and runs the selected extraction algorithm on each page.
module Tabula
  # Base class for all errors defined by this library.
  class Error < StandardError; end

  # Not raised in this file; presumably signalled by the PDF layer for
  # unreadable documents -- verify against ObjectExtractor.
  class InvalidPDFError < Error; end

  # Not raised in this file; presumably signalled by the PDF layer when a
  # password is missing or wrong -- verify against ObjectExtractor.
  class PasswordRequiredError < Error; end

  # Raised by {Tabula.extract} when the path does not exist or is a directory.
  class FileNotFoundError < Error; end

  # Raised by {Tabula.extract} when :pages, :area or :method fail validation.
  class InvalidOptionsError < Error; end

  # Values accepted for the :method option.
  VALID_METHODS = %i[lattice stream auto].freeze

  # Names of the four :area components, in positional order. Used only to
  # build validation messages.
  AREA_LABELS = %w[top left bottom right].freeze
  private_constant :AREA_LABELS

  class << self
    # Extract tables from a PDF file
    #
    # @param path [String] path to PDF file
    # @param options [Hash] extraction options
    # @option options [Array<Integer>] :pages pages to extract (1-indexed, nil for all)
    # @option options [Symbol] :method extraction method (:lattice, :stream, or :auto)
    # @option options [Array<Float>] :area area to extract [top, left, bottom, right]
    # @option options [Array<Float>] :columns column boundaries
    # @option options [String] :password PDF password
    # @option options [Boolean] :guess auto-detect table areas
    # @return [Array<Table>] extracted tables
    # @raise [FileNotFoundError] if the file does not exist or is a directory
    # @raise [InvalidOptionsError] if options are invalid
    def extract(path, **options)
      validate_file!(path)
      validate_options!(options)

      extraction_method = options[:method] || :auto
      area = options[:area]
      columns = options[:columns]
      guess = options.fetch(:guess, false)

      ObjectExtractor.open(path, password: options[:password]) do |extractor|
        # The page count is only known once the document is open.
        page_numbers = options[:pages] || (1..extractor.page_count).to_a

        page_numbers.flat_map do |page_num|
          page = extractor.extract_page(page_num)
          page = page.get_area(*area) if area
          tables_for_page(page, extraction_method, columns, guess)
        end
      end
    end

    private

    # Extracts the tables of one page, either from the page as given or, when
    # +guess+ is truthy, from each area reported by the Nurminen detector.
    def tables_for_page(page, method, columns, guess)
      return extract_from_page(page, method, columns) unless guess

      Detectors::Nurminen.detect(page).flat_map do |detected_area|
        extract_from_page(page.get_area(*detected_area.bounds), method, columns)
      end
    end

    # Ensures +path+ points at something that can be opened as a file.
    #
    # @raise [FileNotFoundError] if the path is missing or is a directory
    def validate_file!(path)
      raise FileNotFoundError, "File not found: #{path}" unless File.exist?(path)

      # File.exist? is also true for directories; reject them here so the
      # caller gets the documented error instead of a failure from the PDF layer.
      raise FileNotFoundError, "Path is a directory, not a file: #{path}" if File.directory?(path)
    end

    # Validates the options that have dedicated checks (:pages, :area, :method).
    # Options that are absent or falsy are skipped.
    #
    # @raise [InvalidOptionsError] if any checked option is invalid
    def validate_options!(options)
      validate_pages!(options[:pages]) if options[:pages]
      validate_area!(options[:area]) if options[:area]
      validate_method!(options[:method]) if options[:method]
    end

    # @raise [InvalidOptionsError] unless +pages+ is an Array of positive Integers
    def validate_pages!(pages)
      raise InvalidOptionsError, "Pages must be an array, got #{pages.class}" unless pages.is_a?(Array)

      pages.each do |page|
        next if page.is_a?(Integer) && page.positive?

        raise InvalidOptionsError, "Page numbers must be positive integers, got #{page.inspect}"
      end
    end

    # @raise [InvalidOptionsError] unless +area+ is an Array of exactly four Numerics
    def validate_area!(area)
      unless area.is_a?(Array) && area.size == 4
        raise InvalidOptionsError,
              "Area must be an array of exactly 4 values [top, left, bottom, right], got #{area.inspect}"
      end

      AREA_LABELS.zip(area) do |label, value|
        next if value.is_a?(Numeric)

        raise InvalidOptionsError, "Area #{label} must be numeric, got #{value.inspect}"
      end
    end

    # @raise [InvalidOptionsError] unless +method+ is one of VALID_METHODS
    def validate_method!(method)
      return if VALID_METHODS.include?(method)

      raise InvalidOptionsError,
            "Method must be one of #{VALID_METHODS.map(&:inspect).join(', ')}, got #{method.inspect}"
    end

    # Runs the extraction algorithm selected by +method+ on +page+.
    # +columns+ is forwarded to the stream (Basic) extractor only.
    #
    # @raise [ArgumentError] if +method+ is not :lattice, :stream or :auto
    def extract_from_page(page, method, columns)
      case method
      when :lattice
        Extractors::Spreadsheet.extract(page)
      when :stream
        Extractors::Basic.extract(page, columns: columns)
      when :auto
        # Try lattice first, fall back to stream
        tables = Extractors::Spreadsheet.extract(page)
        tables.empty? ? Extractors::Basic.extract(page, columns: columns) : tables
      else
        raise ArgumentError, "Unknown extraction method: #{method}"
      end
    end
  end
end
|
data/mise.toml
ADDED
data/tabula-rb.gemspec
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'lib/tabula/version'
|
|
4
|
+
|
|
5
|
+
# Gem specification for tabula-rb. The version is read from
# lib/tabula/version.rb so it is defined in one place only.
Gem::Specification.new do |spec|
  spec.name = 'tabula-rb'
  spec.version = Tabula::VERSION
  spec.authors = ['Chris Hasiński']
  spec.email = ['krzysztof.hasinski@gmail.com']

  spec.summary = 'Extract tables from PDF files (Ruby port of tabula-java)'
  spec.description = <<~DESC
    Tabula is a Ruby port of tabula-java (https://github.com/tabulapdf/tabula-java),
    the library that powers the Tabula PDF table extraction tool. It supports both
    lattice-mode extraction (for PDFs with visible cell borders) and stream-mode
    extraction (for PDFs without ruling lines, using text positioning).
  DESC
  spec.homepage = 'https://github.com/tabulapdf/tabula-rb'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.1.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Package every file tracked by git except this gemspec and development-only
  # paths. The gemspec is matched by basename because `git ls-files` prints
  # paths relative to the repository root, whereas __FILE__ may be a relative
  # path depending on how the gemspec is loaded; comparing it with
  # File.expand_path(f) therefore did not reliably exclude the gemspec.
  gemspec = File.basename(__FILE__)
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
    end
  end
  spec.bindir = 'exe'
  # Every file under exe/ is installed as an executable.
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'csv', '~> 3.0'
  spec.add_dependency 'pdf-reader', '~> 2.0'

  # Optional dependencies for advanced features
  # spec.add_dependency "mini_magick", "~> 4.0" # For Nurminen detection

  spec.metadata['rubygems_mfa_required'] = 'true'
end
|
metadata
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: tabula-rb
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Chris Hasiński
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: csv
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '3.0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '3.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: pdf-reader
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '2.0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '2.0'
|
|
40
|
+
description: |
|
|
41
|
+
Tabula is a Ruby port of tabula-java (https://github.com/tabulapdf/tabula-java),
|
|
42
|
+
the library that powers the Tabula PDF table extraction tool. It supports both
|
|
43
|
+
lattice-mode extraction (for PDFs with visible cell borders) and stream-mode
|
|
44
|
+
extraction (for PDFs without ruling lines, using text positioning).
|
|
45
|
+
email:
|
|
46
|
+
- krzysztof.hasinski@gmail.com
|
|
47
|
+
executables:
|
|
48
|
+
- tabula
|
|
49
|
+
extensions: []
|
|
50
|
+
extra_rdoc_files: []
|
|
51
|
+
files:
|
|
52
|
+
- ".rspec"
|
|
53
|
+
- ".rubocop.yml"
|
|
54
|
+
- CHANGELOG.md
|
|
55
|
+
- LICENSE
|
|
56
|
+
- README.md
|
|
57
|
+
- Rakefile
|
|
58
|
+
- exe/tabula
|
|
59
|
+
- lib/tabula.rb
|
|
60
|
+
- lib/tabula/algorithms/cohen_sutherland_clipping.rb
|
|
61
|
+
- lib/tabula/algorithms/projection_profile.rb
|
|
62
|
+
- lib/tabula/cli.rb
|
|
63
|
+
- lib/tabula/configuration.rb
|
|
64
|
+
- lib/tabula/core/point.rb
|
|
65
|
+
- lib/tabula/core/rectangle.rb
|
|
66
|
+
- lib/tabula/core/ruling.rb
|
|
67
|
+
- lib/tabula/core/spatial_index.rb
|
|
68
|
+
- lib/tabula/detectors/detection_algorithm.rb
|
|
69
|
+
- lib/tabula/detectors/nurminen_detection_algorithm.rb
|
|
70
|
+
- lib/tabula/detectors/spreadsheet_detection_algorithm.rb
|
|
71
|
+
- lib/tabula/extractors/basic_extraction_algorithm.rb
|
|
72
|
+
- lib/tabula/extractors/extraction_algorithm.rb
|
|
73
|
+
- lib/tabula/extractors/spreadsheet_extraction_algorithm.rb
|
|
74
|
+
- lib/tabula/pdf/object_extractor.rb
|
|
75
|
+
- lib/tabula/pdf/page.rb
|
|
76
|
+
- lib/tabula/pdf/text_stripper.rb
|
|
77
|
+
- lib/tabula/table/cell.rb
|
|
78
|
+
- lib/tabula/table/table.rb
|
|
79
|
+
- lib/tabula/text/line.rb
|
|
80
|
+
- lib/tabula/text/text_chunk.rb
|
|
81
|
+
- lib/tabula/text/text_element.rb
|
|
82
|
+
- lib/tabula/version.rb
|
|
83
|
+
- lib/tabula/writers/csv_writer.rb
|
|
84
|
+
- lib/tabula/writers/json_writer.rb
|
|
85
|
+
- lib/tabula/writers/markdown_writer.rb
|
|
86
|
+
- lib/tabula/writers/tsv_writer.rb
|
|
87
|
+
- lib/tabula/writers/writer.rb
|
|
88
|
+
- mise.toml
|
|
89
|
+
- tabula-rb.gemspec
|
|
90
|
+
homepage: https://github.com/tabulapdf/tabula-rb
|
|
91
|
+
licenses:
|
|
92
|
+
- MIT
|
|
93
|
+
metadata:
|
|
94
|
+
homepage_uri: https://github.com/tabulapdf/tabula-rb
|
|
95
|
+
source_code_uri: https://github.com/tabulapdf/tabula-rb
|
|
96
|
+
changelog_uri: https://github.com/tabulapdf/tabula-rb/blob/main/CHANGELOG.md
|
|
97
|
+
rubygems_mfa_required: 'true'
|
|
98
|
+
rdoc_options: []
|
|
99
|
+
require_paths:
|
|
100
|
+
- lib
|
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
102
|
+
requirements:
|
|
103
|
+
- - ">="
|
|
104
|
+
- !ruby/object:Gem::Version
|
|
105
|
+
version: 3.1.0
|
|
106
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - ">="
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '0'
|
|
111
|
+
requirements: []
|
|
112
|
+
rubygems_version: 4.0.2
|
|
113
|
+
specification_version: 4
|
|
114
|
+
summary: Extract tables from PDF files (Ruby port of tabula-java)
|
|
115
|
+
test_files: []
|