tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
data/lib/tabula.rb ADDED
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'tabula/version'
4
+ require_relative 'tabula/configuration'
5
+
6
+ # Core geometry
7
+ require_relative 'tabula/core/point'
8
+ require_relative 'tabula/core/rectangle'
9
+ require_relative 'tabula/core/ruling'
10
+ require_relative 'tabula/core/spatial_index'
11
+
12
+ # Text handling
13
+ require_relative 'tabula/text/text_element'
14
+ require_relative 'tabula/text/text_chunk'
15
+ require_relative 'tabula/text/line'
16
+
17
+ # Table structures
18
+ require_relative 'tabula/table/cell'
19
+ require_relative 'tabula/table/table'
20
+
21
+ # PDF processing
22
+ require_relative 'tabula/pdf/page'
23
+ require_relative 'tabula/pdf/text_stripper'
24
+ require_relative 'tabula/pdf/object_extractor'
25
+
26
+ # Extraction algorithms
27
+ require_relative 'tabula/extractors/extraction_algorithm'
28
+ require_relative 'tabula/extractors/basic_extraction_algorithm'
29
+ require_relative 'tabula/extractors/spreadsheet_extraction_algorithm'
30
+
31
+ # Detection algorithms
32
+ require_relative 'tabula/detectors/detection_algorithm'
33
+ require_relative 'tabula/detectors/spreadsheet_detection_algorithm'
34
+ require_relative 'tabula/detectors/nurminen_detection_algorithm'
35
+
36
+ # Writers
37
+ require_relative 'tabula/writers/writer'
38
+ require_relative 'tabula/writers/csv_writer'
39
+ require_relative 'tabula/writers/tsv_writer'
40
+ require_relative 'tabula/writers/json_writer'
41
+ require_relative 'tabula/writers/markdown_writer'
42
+
43
+ # Geometric algorithms
44
+ require_relative 'tabula/algorithms/cohen_sutherland_clipping'
45
+ require_relative 'tabula/algorithms/projection_profile'
46
+
47
# Top-level namespace and public entry point for extracting tables from PDFs.
module Tabula
  # Error hierarchy: every library error descends from Tabula::Error so callers
  # can rescue a single class. Only FileNotFoundError and InvalidOptionsError
  # are raised from this file; the others are declared here for use elsewhere.
  class Error < StandardError; end
  class InvalidPDFError < Error; end
  class PasswordRequiredError < Error; end
  class FileNotFoundError < Error; end
  class InvalidOptionsError < Error; end

  # Extraction methods accepted by the :method option of {Tabula.extract}.
  VALID_METHODS = %i[lattice stream auto].freeze

  # Names of the four :area entries, in positional order (used in error messages).
  AREA_LABELS = %w[top left bottom right].freeze
  private_constant :AREA_LABELS

  class << self
    # Extract tables from a PDF file
    #
    # @param path [String] path to PDF file
    # @param options [Hash] extraction options
    # @option options [Array<Integer>] :pages pages to extract (1-indexed, nil for all)
    # @option options [Symbol] :method extraction method (:lattice, :stream, or :auto);
    #   defaults to :auto
    # @option options [Array<Float>] :area area to extract [top, left, bottom, right]
    # @option options [Array<Float>] :columns column boundaries (passed through to
    #   stream extraction unvalidated)
    # @option options [String] :password PDF password
    # @option options [Boolean] :guess auto-detect table areas (default false)
    # @return [Array<Table>] extracted tables
    # @raise [FileNotFoundError] if the path does not exist or is a directory
    # @raise [InvalidOptionsError] if options are invalid
    def extract(path, **options)
      validate_file!(path)
      validate_options!(options)

      # Named `mode` rather than `method` to avoid shadowing Kernel#method.
      mode = options[:method] || :auto
      area = options[:area]
      columns = options[:columns]
      guess = options.fetch(:guess, false)

      ObjectExtractor.open(path, password: options[:password]) do |extractor|
        page_numbers = options[:pages] || (1..extractor.page_count).to_a

        page_numbers.flat_map do |page_num|
          page = extractor.extract_page(page_num)
          # Crop to the requested area first; detection then runs on the crop.
          page = page.get_area(*area) if area

          if guess
            Detectors::Nurminen.detect(page).flat_map do |detected_area|
              extract_from_page(page.get_area(*detected_area.bounds), mode, columns)
            end
          else
            extract_from_page(page, mode, columns)
          end
        end
      end
    end

    private

    # Ensure +path+ names something readable as a file.
    #
    # A directory satisfies File.exist? but can never be a PDF, so it is
    # rejected here instead of failing later inside the PDF layer.
    #
    # @raise [FileNotFoundError] if the path is missing or is a directory
    def validate_file!(path)
      return if File.exist?(path) && !File.directory?(path)

      raise FileNotFoundError, "File not found: #{path}"
    end

    # Validate each option that was supplied; absent (nil/false) options are skipped.
    #
    # @raise [InvalidOptionsError] if any supplied option is malformed
    def validate_options!(options)
      validate_pages!(options[:pages]) if options[:pages]
      validate_area!(options[:area]) if options[:area]
      validate_method!(options[:method]) if options[:method]
    end

    # @raise [InvalidOptionsError] unless +pages+ is an Array of positive Integers
    def validate_pages!(pages)
      raise InvalidOptionsError, "Pages must be an array, got #{pages.class}" unless pages.is_a?(Array)

      pages.each do |page|
        next if page.is_a?(Integer) && page.positive?

        raise InvalidOptionsError, "Page numbers must be positive integers, got #{page.inspect}"
      end
    end

    # @raise [InvalidOptionsError] unless +area+ is an Array of exactly four Numerics
    def validate_area!(area)
      unless area.is_a?(Array) && area.size == AREA_LABELS.size
        raise InvalidOptionsError,
              "Area must be an array of exactly 4 values [top, left, bottom, right], got #{area.inspect}"
      end

      area.zip(AREA_LABELS) do |value, label|
        next if value.is_a?(Numeric)

        raise InvalidOptionsError, "Area #{label} must be numeric, got #{value.inspect}"
      end
    end

    # @raise [InvalidOptionsError] unless +method+ is one of VALID_METHODS
    def validate_method!(method)
      return if VALID_METHODS.include?(method)

      raise InvalidOptionsError,
            "Method must be one of #{VALID_METHODS.map(&:inspect).join(', ')}, got #{method.inspect}"
    end

    # Run the extractor selected by +method+ against a single page (or sub-page).
    #
    # @param page [Object] page or cropped page handed to the extractor
    # @param method [Symbol] :lattice, :stream, or :auto
    # @param columns [Array<Float>, nil] column boundaries, used by stream extraction only
    # @return [Array<Table>] whatever the chosen extractor returns
    # @raise [ArgumentError] if +method+ is not a known extraction method
    def extract_from_page(page, method, columns)
      case method
      when :lattice
        Extractors::Spreadsheet.extract(page)
      when :stream
        Extractors::Basic.extract(page, columns: columns)
      when :auto
        # Try lattice first, fall back to stream when it finds nothing
        lattice_tables = Extractors::Spreadsheet.extract(page)
        lattice_tables.empty? ? Extractors::Basic.extract(page, columns: columns) : lattice_tables
      else
        raise ArgumentError, "Unknown extraction method: #{method}"
      end
    end
  end
end
data/mise.toml ADDED
@@ -0,0 +1,2 @@
1
+ [tools]
2
+ ruby = "3.4"
data/tabula-rb.gemspec ADDED
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/tabula/version'
4
+
5
# Gem specification for tabula-rb: identity, packaged files, and dependencies.
Gem::Specification.new do |spec|
  spec.name = 'tabula-rb'
  spec.version = Tabula::VERSION
  spec.authors = ['Chris Hasiński']
  spec.email = ['krzysztof.hasinski@gmail.com']

  spec.summary = 'Extract tables from PDF files (Ruby port of tabula-java)'
  spec.description = <<~DESC
    Tabula is a Ruby port of tabula-java (https://github.com/tabulapdf/tabula-java),
    the library that powers the Tabula PDF table extraction tool. It supports both
    lattice-mode extraction (for PDFs with visible cell borders) and stream-mode
    extraction (for PDFs without ruling lines, using text positioning).
  DESC
  spec.homepage = 'https://github.com/tabulapdf/tabula-rb'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.1.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"
  spec.metadata['rubygems_mfa_required'] = 'true'

  # Package every git-tracked file except this gemspec and development-only paths.
  # The gemspec is matched by basename: `git ls-files` run from __dir__ prints paths
  # relative to that directory, whereas __FILE__ may be relative or absolute depending
  # on how the spec was loaded, so comparing expanded paths against __FILE__ can miss
  # and ship the gemspec inside the gem.
  gemspec = File.basename(__FILE__)
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec) ||
        f.start_with?(*%w[bin/ test/ spec/ features/ .git .github appveyor Gemfile])
    end
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'csv', '~> 3.0'
  spec.add_dependency 'pdf-reader', '~> 2.0'

  # Optional dependencies for advanced features
  # spec.add_dependency "mini_magick", "~> 4.0" # For Nurminen detection
end
metadata ADDED
@@ -0,0 +1,115 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tabula-rb
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Chris Hasiński
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: csv
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '3.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '3.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: pdf-reader
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '2.0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.0'
40
+ description: |
41
+ Tabula is a Ruby port of tabula-java (https://github.com/tabulapdf/tabula-java),
42
+ the library that powers the Tabula PDF table extraction tool. It supports both
43
+ lattice-mode extraction (for PDFs with visible cell borders) and stream-mode
44
+ extraction (for PDFs without ruling lines, using text positioning).
45
+ email:
46
+ - krzysztof.hasinski@gmail.com
47
+ executables:
48
+ - tabula
49
+ extensions: []
50
+ extra_rdoc_files: []
51
+ files:
52
+ - ".rspec"
53
+ - ".rubocop.yml"
54
+ - CHANGELOG.md
55
+ - LICENSE
56
+ - README.md
57
+ - Rakefile
58
+ - exe/tabula
59
+ - lib/tabula.rb
60
+ - lib/tabula/algorithms/cohen_sutherland_clipping.rb
61
+ - lib/tabula/algorithms/projection_profile.rb
62
+ - lib/tabula/cli.rb
63
+ - lib/tabula/configuration.rb
64
+ - lib/tabula/core/point.rb
65
+ - lib/tabula/core/rectangle.rb
66
+ - lib/tabula/core/ruling.rb
67
+ - lib/tabula/core/spatial_index.rb
68
+ - lib/tabula/detectors/detection_algorithm.rb
69
+ - lib/tabula/detectors/nurminen_detection_algorithm.rb
70
+ - lib/tabula/detectors/spreadsheet_detection_algorithm.rb
71
+ - lib/tabula/extractors/basic_extraction_algorithm.rb
72
+ - lib/tabula/extractors/extraction_algorithm.rb
73
+ - lib/tabula/extractors/spreadsheet_extraction_algorithm.rb
74
+ - lib/tabula/pdf/object_extractor.rb
75
+ - lib/tabula/pdf/page.rb
76
+ - lib/tabula/pdf/text_stripper.rb
77
+ - lib/tabula/table/cell.rb
78
+ - lib/tabula/table/table.rb
79
+ - lib/tabula/text/line.rb
80
+ - lib/tabula/text/text_chunk.rb
81
+ - lib/tabula/text/text_element.rb
82
+ - lib/tabula/version.rb
83
+ - lib/tabula/writers/csv_writer.rb
84
+ - lib/tabula/writers/json_writer.rb
85
+ - lib/tabula/writers/markdown_writer.rb
86
+ - lib/tabula/writers/tsv_writer.rb
87
+ - lib/tabula/writers/writer.rb
88
+ - mise.toml
89
+ - tabula-rb.gemspec
90
+ homepage: https://github.com/tabulapdf/tabula-rb
91
+ licenses:
92
+ - MIT
93
+ metadata:
94
+ homepage_uri: https://github.com/tabulapdf/tabula-rb
95
+ source_code_uri: https://github.com/tabulapdf/tabula-rb
96
+ changelog_uri: https://github.com/tabulapdf/tabula-rb/blob/main/CHANGELOG.md
97
+ rubygems_mfa_required: 'true'
98
+ rdoc_options: []
99
+ require_paths:
100
+ - lib
101
+ required_ruby_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: 3.1.0
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ requirements: []
112
+ rubygems_version: 4.0.2
113
+ specification_version: 4
114
+ summary: Extract tables from PDF files (Ruby port of tabula-java)
115
+ test_files: []