tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
data/lib/tabula/cli.rb ADDED
@@ -0,0 +1,271 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'optparse'
4
+
5
module Tabula
  # Command-line interface for tabula.
  #
  # Parses the argument list with OptionParser, then runs table extraction on
  # every PDF path that was given and writes the tables in the chosen format.
  class CLI
    # Output formats accepted by -f/--format.
    FORMATS = %w[CSV TSV JSON MARKDOWN].freeze

    # Lower-case spellings mapped to the canonical format names, so that
    # `-f json` is accepted as well as `-f JSON`.
    FORMAT_ALIASES = FORMATS.map { |name| [name.downcase, name] }.to_h.freeze

    # Labels for the four --area values, in the order they must be supplied.
    AREA_LABELS = %w[top left bottom right].freeze

    private_constant :FORMAT_ALIASES, :AREA_LABELS

    # Convenience entry point: build a CLI and run it.
    #
    # @param args [Array<String>] command-line arguments
    # @return [Integer] process exit status (0 on success, 1 on failure)
    def self.run(args)
      new.run(args)
    end

    def initialize
      @options = default_options
    end

    # Parse +args+ and process the listed files.
    #
    # Every error is reported on stderr and converted into exit status 1;
    # nothing is re-raised to the caller.
    #
    # @param args [Array<String>] command-line arguments
    # @return [Integer] 0 on success, 1 on any error
    def run(args)
      parser = build_parser
      files = parser.parse(args)

      if @options[:help]
        puts parser
        return 0
      end

      if @options[:version]
        puts "tabula #{Tabula::VERSION}"
        return 0
      end

      if files.empty?
        warn 'Error: No PDF file specified'
        warn parser
        return 1
      end

      process_files(files)
    rescue OptionParser::ParseError => e
      # ParseError is the common parent of InvalidOption, MissingArgument,
      # InvalidArgument, ... so every command-line mistake shows the usage text.
      warn "Error: #{e.message}"
      warn parser
      1
    rescue Tabula::FileNotFoundError => e
      warn "Error: #{e.message}"
      warn 'Please check that the file path is correct and the file exists.'
      1
    rescue Tabula::InvalidOptionsError => e
      warn "Error: Invalid option - #{e.message}"
      warn 'Use --help to see available options and their valid values.'
      1
    rescue Tabula::InvalidPDFError => e
      warn "Error: Invalid PDF file - #{e.message}"
      1
    rescue Tabula::PasswordRequiredError => e
      warn "Error: PDF is password protected - #{e.message}"
      warn 'Use the -s/--password option to provide the password.'
      1
    rescue StandardError => e
      warn "Error: #{e.message}"
      warn e.backtrace.first(5).join("\n") if @options[:debug]
      1
    end

    private

    # Initial value of every option before the command line is parsed.
    def default_options
      {
        area: nil,
        columns: nil,
        format: 'CSV',
        guess: false,
        lattice: false,
        stream: false,
        pages: nil,
        output: nil,
        password: nil,
        help: false,
        version: false,
        debug: false
      }
    end

    # Build the OptionParser whose callbacks fill in @options.
    #
    # @return [OptionParser]
    def build_parser
      OptionParser.new do |opts|
        opts.banner = 'Usage: tabula [OPTIONS] <pdf_file> [<pdf_file> ...]'
        opts.separator ''
        opts.separator 'Extract tables from PDF files'
        opts.separator ''
        opts.separator 'Options:'

        opts.on('-a', '--area AREA', 'Extraction area (top,left,bottom,right in points)') do |v|
          @options[:area] = parse_area(v)
        end

        opts.on('-c', '--columns COLUMNS', 'Column boundaries (comma-separated x coordinates)') do |v|
          @options[:columns] = parse_columns(v)
        end

        # FORMAT_ALIASES maps lower-case spellings onto the canonical names;
        # the block receives the mapped value.
        opts.on('-f', '--format FORMAT', FORMATS, FORMAT_ALIASES,
                "Output format: #{FORMATS.join(', ')} (default: CSV)") do |v|
          @options[:format] = v.upcase
        end

        opts.on('-g', '--guess', 'Guess table areas (use detection algorithm)') do
          @options[:guess] = true
        end

        # --lattice and --stream are mutually exclusive; the last one given wins.
        opts.on('-l', '--lattice', 'Force lattice mode (use ruling lines)') do
          @options[:lattice] = true
          @options[:stream] = false
        end

        opts.on('-t', '--stream', 'Force stream mode (use text positions)') do
          @options[:stream] = true
          @options[:lattice] = false
        end

        opts.on('-p', '--pages PAGES', "Pages to extract (e.g., '1,2,3' or '1-5' or 'all')") do |v|
          @options[:pages] = parse_pages(v)
        end

        opts.on('-o', '--output FILE', 'Output file (default: stdout)') do |v|
          @options[:output] = v
        end

        opts.on('-s', '--password PASSWORD', 'PDF password') do |v|
          @options[:password] = v
        end

        opts.on('--debug', 'Show debug information') do
          @options[:debug] = true
        end

        opts.on('-v', '--version', 'Show version') do
          @options[:version] = true
        end

        opts.on('-h', '--help', 'Show this help') do
          @options[:help] = true
        end
      end
    end

    # Parse the --area value.
    #
    # @param value [String] four comma-separated numbers: top,left,bottom,right
    # @return [Array<Float>] the four values in the order given
    # @raise [Tabula::InvalidOptionsError] on a wrong count or a non-numeric part
    def parse_area(value)
      parts = value.split(',').map(&:strip)
      unless parts.size == 4
        raise Tabula::InvalidOptionsError, "Area must have 4 values: top,left,bottom,right (got #{parts.size} values)"
      end

      parts.each_with_index do |part, idx|
        next if part.match?(/\A-?\d+(\.\d+)?\z/)

        raise Tabula::InvalidOptionsError, "Area #{AREA_LABELS[idx]} must be numeric, got '#{part}'"
      end

      parts.map(&:to_f)
    end

    # Parse the --columns value.
    #
    # Uses Float() rather than to_f so that a typo is reported instead of
    # silently becoming a column boundary at 0.0.
    #
    # @param value [String] comma-separated x coordinates
    # @return [Array<Float>]
    # @raise [Tabula::InvalidOptionsError] when a part is not numeric
    def parse_columns(value)
      value.split(',').map do |part|
        begin
          Float(part.strip)
        rescue ArgumentError, TypeError
          raise Tabula::InvalidOptionsError, "Column boundary must be numeric, got '#{part.strip}'"
        end
      end
    end

    # Parse the --pages value.
    #
    # @param value [String] 'all', or a comma-separated mix of page numbers
    #   and inclusive ranges such as '1,3-5'
    # @return [Array<Integer>, nil] sorted unique page numbers, or nil for 'all'
    # @raise [Tabula::InvalidOptionsError] on malformed or non-positive entries
    def parse_pages(value)
      return nil if value.downcase == 'all'

      value.split(',').flat_map { |part| expand_page_spec(part.strip) }.uniq.sort
    end

    # Turn one comma-separated entry (a number or a range) into page numbers.
    def expand_page_spec(part)
      part.include?('-') ? expand_page_range(part) : [parse_page_number(part)]
    end

    # Expand an inclusive 'first-last' range into the page numbers it covers.
    def expand_page_range(part)
      bounds = part.split('-')
      unless bounds.size == 2 && bounds.all? { |bound| bound.match?(/\A\d+\z/) }
        raise Tabula::InvalidOptionsError, "Invalid page range: '#{part}'. Use format like '1-5'"
      end

      first, last = bounds.map(&:to_i)
      if first <= 0 || last <= 0
        raise Tabula::InvalidOptionsError, "Page numbers must be positive integers, got '#{part}'"
      end
      if first > last
        raise Tabula::InvalidOptionsError, "Invalid page range: '#{part}'. Start must be less than or equal to end"
      end

      (first..last).to_a
    end

    # Validate and convert a single page number.
    def parse_page_number(part)
      unless part.match?(/\A\d+\z/)
        raise Tabula::InvalidOptionsError, "Invalid page number: '#{part}'. Page numbers must be positive integers"
      end

      page_num = part.to_i
      raise Tabula::InvalidOptionsError, "Page numbers must be positive integers, got '#{part}'" if page_num <= 0

      page_num
    end

    # Process every file in order, writing to --output or stdout.
    #
    # A missing file is reported and skipped; the remaining files are still
    # processed. Errors raised by process_file are not rescued here.
    #
    # @param files [Array<String>] PDF paths
    # @return [Integer] 1 if any file was missing, otherwise 0
    def process_files(files)
      output_io = @options[:output] ? File.open(@options[:output], 'w') : $stdout
      had_error = false

      begin
        files.each_with_index do |file, idx|
          output_io.puts if idx.positive? # Separate multiple files

          unless File.exist?(file)
            warn "Error: File not found: #{file}"
            warn 'Please check that the file path is correct and the file exists.'
            had_error = true
            next
          end

          process_file(file, output_io)
        end
      ensure
        # Only close what this method opened; never close $stdout.
        output_io.close if @options[:output]
      end

      had_error ? 1 : 0
    end

    # Extract the tables of one file and write them to +output_io+.
    def process_file(file, output_io)
      extraction_options = build_extraction_options

      tables = Tabula.extract(file, **extraction_options)

      if tables.empty?
        warn "No tables found in #{file}" if @options[:debug]
        return
      end

      write_tables(tables, output_io)
    end

    # Translate the parsed CLI options into keyword arguments for Tabula.extract.
    #
    # :pages, :area and :columns are only included when they were given.
    #
    # @return [Hash]
    def build_extraction_options
      options = {
        password: @options[:password],
        guess: @options[:guess]
      }

      # Set extraction method
      options[:method] = if @options[:lattice]
                           :lattice
                         elsif @options[:stream]
                           :stream
                         else
                           :auto
                         end

      # Set pages
      options[:pages] = @options[:pages] if @options[:pages]

      # Set area
      options[:area] = @options[:area] if @options[:area]

      # Set columns
      options[:columns] = @options[:columns] if @options[:columns]

      options
    end

    # Write +tables+ using the writer that matches the selected format.
    def write_tables(tables, output_io)
      case @options[:format]
      when 'CSV'
        Writers::CSVWriter.new.write(tables, output_io)
      when 'TSV'
        Writers::TSVWriter.new.write(tables, output_io)
      when 'JSON'
        Writers::JSONWriter.new(pretty: true).write(tables, output_io)
      when 'MARKDOWN'
        Writers::MarkdownWriter.new.write(tables, output_io)
      end
    end
  end
end
@@ -0,0 +1,119 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # Configuration class for customizable extraction parameters.
  # All thresholds can be adjusted to tune extraction behavior.
  class Configuration
    # --- Ruling Detection ---

    # Tolerance for determining if a ruling is horizontal or vertical (in points)
    # Lines with less than this difference in position are considered aligned
    attr_accessor :orientation_tolerance

    # Tolerance for ruling intersection detection (in points)
    attr_accessor :intersection_tolerance

    # Maximum thickness of a filled rectangle to be treated as a ruling line (in points)
    attr_accessor :ruling_thickness_threshold

    # --- Text Element Merging ---

    # Multiplier for space width when determining word boundaries
    # Lower values = more aggressive word merging
    attr_accessor :word_gap_multiplier

    # Multiplier for determining line boundaries
    attr_accessor :line_gap_multiplier

    # --- Cell Detection ---

    # Minimum number of cells required for a valid table
    attr_accessor :min_cells

    # Minimum dimension (width or height) for a valid table region (in points)
    attr_accessor :min_table_dimension

    # Tolerance for cell corner detection (in points)
    attr_accessor :cell_tolerance

    # --- Table Detection ---

    # Minimum number of rows required for table detection
    attr_accessor :min_rows

    # Overlap threshold for merging duplicate table detections
    attr_accessor :overlap_threshold

    # Threshold for determining if a table has valid row/column ratio
    attr_accessor :tabular_ratio_threshold

    # Tolerance for clustering text edges during detection (in points)
    attr_accessor :edge_clustering_tolerance

    # Padding around detected table areas (in points)
    attr_accessor :detection_padding

    # --- Rectangle Comparison ---

    # Threshold for vertical overlap comparison
    attr_accessor :vertical_comparison_threshold

    # Build a configuration holding the default value of every setting.
    def initialize
      # Ruling detection
      @orientation_tolerance = 1.0
      @intersection_tolerance = 1.0
      @ruling_thickness_threshold = 8.0

      # Text element merging
      @word_gap_multiplier = 0.5
      @line_gap_multiplier = 0.5

      # Cell detection
      @min_cells = 4
      @min_table_dimension = 10.0
      @cell_tolerance = 2.0

      # Table detection
      @min_rows = 2
      @overlap_threshold = 0.9
      @tabular_ratio_threshold = 0.65
      @edge_clustering_tolerance = 8.0
      @detection_padding = 2.0

      # Rectangle comparison
      @vertical_comparison_threshold = 0.4
    end

    # Create a copy with overrides; the receiver is left untouched.
    #
    # Each key is applied through its public writer. public_send is used so
    # that a key can never reach a private or protected method.
    #
    # @param overrides [Hash] values to override, keyed by setting name
    # @return [Configuration]
    # @raise [NoMethodError] when a key has no public writer
    def with(**overrides)
      dup.tap do |config|
        overrides.each { |key, value| config.public_send("#{key}=", value) }
      end
    end
  end

  # Default configuration instance
  @default_configuration = Configuration.new

  class << self
    # Get the default configuration
    # @return [Configuration]
    def configuration
      @default_configuration
    end

    # Set the default configuration
    # @param config [Configuration]
    def configuration=(config)
      @default_configuration = config
    end

    # Configure with a block
    # @yield [Configuration] the current default configuration, for in-place changes
    def configure
      yield configuration
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # A point in the plane; both coordinates are stored as Floats.
  class Point
    attr_accessor :x, :y

    # @param x [#to_f] horizontal coordinate
    # @param y [#to_f] vertical coordinate
    def initialize(x, y)
      @x, @y = x.to_f, y.to_f
    end

    # @return [Array<Float>] the coordinates as [x, y]
    def to_a
      [x, y]
    end

    # Two points are equal when both coordinates are equal; anything that is
    # not a Point is never equal.
    def ==(other)
      other.is_a?(Point) && x == other.x && y == other.y
    end
    alias eql? ==

    # Hash derived from the coordinates, consistent with #==.
    def hash
      [x, y].hash
    end

    # Euclidean distance to +other+.
    def distance_to(other)
      Math.sqrt(distance_squared_to(other))
    end

    # Squared Euclidean distance to +other+ (no square root taken).
    def distance_squared_to(other)
      dx = x - other.x
      dy = y - other.y
      (dx**2) + (dy**2)
    end

    # Component-wise sum with another point.
    def +(other)
      sum_x = x + other.x
      sum_y = y + other.y
      Point.new(sum_x, sum_y)
    end

    # Component-wise difference with another point.
    def -(other)
      diff_x = x - other.x
      diff_y = y - other.y
      Point.new(diff_x, diff_y)
    end

    # Scale both coordinates by the number +other+.
    def *(other)
      scaled = to_a.map { |coord| coord * other }
      Point.new(*scaled)
    end

    # Divide both coordinates by the number +other+.
    def /(other)
      divided = to_a.map { |coord| coord / other }
      Point.new(*divided)
    end

    def to_s
      "Point(#{x}, #{y})"
    end

    def inspect
      to_s
    end
  end
end
@@ -0,0 +1,218 @@
1
+ # frozen_string_literal: true
2
+
3
module Tabula
  # Represents an axis-aligned rectangle stored as top, left, width and height.
  #
  # `bottom` is `top + height` and `right` is `left + width`, so with
  # non-negative dimensions y grows from top toward bottom.
  # NOTE(review): the previous header said "PDF coordinate system (origin at
  # bottom-left)", which does not match `bottom = top + height` - confirm which
  # convention the callers rely on.
  class Rectangle
    include Comparable

    # Threshold for vertical overlap comparison (40% overlap)
    VERTICAL_COMPARISON_THRESHOLD = 0.4

    attr_reader :top, :left, :width, :height

    # @param top [#to_f]
    # @param left [#to_f]
    # @param width [#to_f]
    # @param height [#to_f]
    def initialize(top, left, width, height)
      @top = top.to_f
      @left = left.to_f
      @width = width.to_f
      @height = height.to_f
    end

    # Create rectangle from bounds [top, left, bottom, right]
    def self.from_bounds(top, left, bottom, right)
      new(top, left, right - left, bottom - top)
    end

    # Create rectangle from two points (any two opposite corners)
    def self.from_points(p1, p2)
      top = [p1.y, p2.y].min
      left = [p1.x, p2.x].min
      bottom = [p1.y, p2.y].max
      right = [p1.x, p2.x].max
      from_bounds(top, left, bottom, right)
    end

    # Compute bounding box of multiple rectangles
    # @return [Rectangle, nil] nil when +rectangles+ is empty
    def self.bounding_box_of(rectangles)
      return nil if rectangles.empty?

      top = rectangles.map(&:top).min
      left = rectangles.map(&:left).min
      bottom = rectangles.map(&:bottom).max
      right = rectangles.map(&:right).max

      from_bounds(top, left, bottom, right)
    end

    # The writers coerce with to_f, like #initialize, so the four stored
    # values are always Floats and ratio computations never fall into
    # integer division.
    def top=(value)
      @top = value.to_f
    end

    def left=(value)
      @left = value.to_f
    end

    def width=(value)
      @width = value.to_f
    end

    def height=(value)
      @height = value.to_f
    end

    def bottom
      top + height
    end

    # Move the bottom edge by changing the height; top is unchanged.
    def bottom=(value)
      @height = value.to_f - top
    end

    def right
      left + width
    end

    # Move the right edge by changing the width; left is unchanged.
    def right=(value)
      @width = value.to_f - left
    end

    def x
      left
    end

    def x=(value)
      self.left = value
    end

    def y
      top
    end

    def y=(value)
      self.top = value
    end

    def area
      width * height
    end

    def center
      Point.new(left + (width / 2.0), top + (height / 2.0))
    end

    # @return [Array<Float>] [top, left, bottom, right]
    def bounds
      [top, left, bottom, right]
    end

    # Corner points in the order: (left, top), (right, top), (right, bottom),
    # (left, bottom).
    def points
      [
        Point.new(left, top),
        Point.new(right, top),
        Point.new(right, bottom),
        Point.new(left, bottom)
      ]
    end

    # Calculate vertical overlap with another rectangle (never negative)
    def vertical_overlap(other)
      [0, [bottom, other.bottom].min - [top, other.top].max].max
    end

    # Calculate horizontal overlap with another rectangle (never negative)
    def horizontal_overlap(other)
      [0, [right, other.right].min - [left, other.left].max].max
    end

    # Check if rectangles overlap vertically: the overlap, as a fraction of
    # the smaller height, must reach +threshold+. Returns false when the
    # smaller height is zero.
    def vertically_overlaps?(other, threshold = VERTICAL_COMPARISON_THRESHOLD)
      overlap = vertical_overlap(other)
      min_height = [height, other.height].min
      return false if min_height.zero?

      # fdiv keeps this a float ratio even if +other+ reports Integer sizes.
      overlap.fdiv(min_height) >= threshold
    end

    # Check if rectangles overlap horizontally: the overlap, as a fraction of
    # the smaller width, must exceed +threshold+.
    def horizontally_overlaps?(other, threshold = 0.0)
      overlap = horizontal_overlap(other)
      min_width = [width, other.width].min
      # NOTE(review): for non-negative widths the overlap cannot exceed
      # min_width, so whenever min_width is zero this first guard already
      # returns true regardless of position and the second guard is never
      # reached. Behaviour is kept as-is; confirm whether a zero-width
      # rectangle is really meant to overlap everything.
      return true if min_width.zero? && overlap.zero?
      return false if min_width.zero?

      overlap.fdiv(min_width) > threshold
    end

    # Calculate overlap ratio (intersection area / union area)
    # @return [Float] 0.0 when the rectangles do not overlap
    def overlap_ratio(other)
      intersection_area = vertical_overlap(other) * horizontal_overlap(other)
      return 0.0 if intersection_area.zero?

      union_area = area + other.area - intersection_area
      return 0.0 if union_area.zero?

      intersection_area.fdiv(union_area)
    end

    # Check if this rectangle contains a point (edges included)
    def contains_point?(point)
      point.x.between?(left, right) && point.y >= top && point.y <= bottom
    end

    # Check if this rectangle fully contains another (edges may coincide)
    def contains?(other)
      left <= other.left && right >= other.right && top <= other.top && bottom >= other.bottom
    end

    # Check if this rectangle intersects another (touching edges count)
    def intersects?(other)
      !(other.left > right || other.right < left || other.top > bottom || other.bottom < top)
    end

    # Merge this rectangle with another, returning the bounding box
    def merge(other)
      Rectangle.from_bounds(
        [top, other.top].min,
        [left, other.left].min,
        [bottom, other.bottom].max,
        [right, other.right].max
      )
    end

    # Merge in place
    # @return [self]
    def merge!(other)
      merged = merge(other)
      @top = merged.top
      @left = merged.left
      @width = merged.width
      @height = merged.height
      self
    end

    # Return intersection rectangle, or nil if no intersection
    def intersection(other)
      return nil unless intersects?(other)

      Rectangle.from_bounds(
        [top, other.top].max,
        [left, other.left].max,
        [bottom, other.bottom].min,
        [right, other.right].min
      )
    end

    # Equality on all four stored values. Defined in the class itself, so it
    # takes precedence over Comparable#== (which would only use #<=>).
    def ==(other)
      return false unless other.is_a?(Rectangle)

      top == other.top && left == other.left && width == other.width && height == other.height
    end
    alias eql? ==

    def hash
      [top, left, width, height].hash
    end

    # Always returns a plain Rectangle with the same four values.
    # NOTE(review): if subclasses exist, this drops their class and any extra
    # state - confirm that is intended.
    def dup
      Rectangle.new(top, left, width, height)
    end

    def to_s
      "Rectangle[top=#{top}, left=#{left}, width=#{width}, height=#{height}]"
    end

    def inspect
      to_s
    end

    # Comparator for sorting by position (top to bottom, left to right)
    #
    # @return [Integer, nil] nil when +other+ has no top/left or the values
    #   cannot be compared, as Comparable expects
    def <=>(other)
      return nil unless other.respond_to?(:top) && other.respond_to?(:left)

      by_top = top <=> other.top
      # A nil sub-comparison is passed through rather than calling zero? on it.
      return by_top unless by_top&.zero?

      left <=> other.left
    end
  end
end