tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
@@ -0,0 +1,400 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf-reader'
4
+
5
+ module Tabula
6
# Extracts content from PDF documents.
# Wraps pdf-reader and provides access to pages with text and rulings.
class ObjectExtractor
  attr_reader :pdf_reader

  # Open a PDF file for extraction.
  #
  # With a block, the extractor is yielded, closed when the block exits
  # (normally or through an exception) and the block's value is returned.
  # Without a block the extractor itself is returned.
  #
  # @param path [String] path to PDF file
  # @param password [String, nil] password for encrypted PDFs
  # @yield [ObjectExtractor] yields extractor for block usage
  # @return [ObjectExtractor, Object] extractor or block result
  def self.open(path, password: nil, &block)
    extractor = new(path, password: password)
    return extractor unless block

    begin
      yield extractor
    ensure
      extractor.close
    end
  end

  # @param path [String] path to PDF file
  # @param password [String, nil] password for encrypted PDFs
  # @raise [PasswordRequiredError] when pdf-reader reports an encrypted PDF
  # @raise [InvalidPDFError] when pdf-reader reports a malformed PDF
  def initialize(path, password: nil)
    @path = path
    @password = password
    @pdf_reader = open_pdf
    @closed = false
  end

  # Extract a specific page
  # @param page_number [Integer] page number (1-indexed)
  # @return [Page] extracted page
  # @raise [ArgumentError] when page_number is not an Integer in 1..page_count
  def extract_page(page_number)
    validate_page_number(page_number)

    # PDF::Reader#page builds only the requested page object; indexing
    # PDF::Reader#pages would build the whole page list on every call.
    process_page(@pdf_reader.page(page_number), page_number)
  end

  # Extract all pages
  # @return [PageIterator] iterator over pages
  def extract
    extract_pages(1..page_count)
  end

  # Extract specific pages
  # @param pages [Range, Array<Integer>] page numbers to extract
  # @return [PageIterator] iterator over pages
  def extract_pages(pages)
    PageIterator.new(self, pages.to_a)
  end

  # Get page count
  # @return [Integer]
  def page_count
    @pdf_reader.page_count
  end

  # Get pages iterator. Pages are extracted lazily, one per element consumed.
  # @return [Enumerator::Lazy]
  def pages
    (1..page_count).lazy.map { |n| extract_page(n) }
  end

  # Mark the extractor as closed. Only the flag reported by #closed? changes.
  def close
    @closed = true
  end

  # @return [Boolean] true once #close has been called
  def closed?
    @closed
  end

  private

  # Build the underlying PDF::Reader, translating its errors into Tabula's.
  def open_pdf
    PDF::Reader.new(@path, password: @password)
  rescue PDF::Reader::EncryptedPDFError
    raise PasswordRequiredError, 'PDF is encrypted and requires a password'
  rescue PDF::Reader::MalformedPDFError => e
    raise InvalidPDFError, "Invalid PDF file: #{e.message}"
  end

  # Reject anything that is not an Integer inside 1..page_count, so nil and
  # fractional values fail with ArgumentError instead of slipping through.
  def validate_page_number(page_number)
    return if page_number.is_a?(Integer) && page_number.between?(1, page_count)

    raise ArgumentError, "Page number #{page_number.inspect} out of range (1-#{page_count})"
  end

  # Map a raw /Rotate value onto 0...360 so that equivalent angles such as
  # -90 and 270 are treated alike. A missing value counts as 0.
  def normalize_rotation(value)
    value.to_i % 360
  end

  # Turn a pdf-reader page into a Tabula Page: read the page boxes and
  # rotation, extract text and rulings, shift them into CropBox space when
  # a CropBox is present, and assemble the result with Page::Builder.
  def process_page(pdf_page, page_number)
    attributes = pdf_page.attributes

    # Get page boxes; without a CropBox the MediaBox is used for sizing
    media_box = attributes[:MediaBox]
    crop_box = attributes[:CropBox]
    has_crop_box = !crop_box.nil?
    crop_box ||= media_box

    # MediaBox height (always positive) and whether its Y range is reversed
    media_height = (media_box[3].to_f - media_box[1].to_f).abs
    y_inverted = media_box[3].to_f < media_box[1].to_f

    # CropBox edges, tolerant of corners given in either order
    crop_left, crop_right = [crop_box[0].to_f, crop_box[2].to_f].minmax
    crop_bottom, crop_top = [crop_box[1].to_f, crop_box[3].to_f].minmax

    page_width = crop_right - crop_left
    page_height = crop_top - crop_bottom

    # Quarter turns swap the visible width and height
    rotation = normalize_rotation(attributes[:Rotate])
    page_width, page_height = page_height, page_width if [90, 270].include?(rotation)

    # Extract text
    stripper = TextStripper.new(pdf_page)
    text_elements = stripper.extract

    # Extract rulings
    rulings = extract_rulings(pdf_page, media_height, y_inverted: y_inverted)

    # Shift coordinates into CropBox space whenever the page declares a CropBox
    if has_crop_box
      text_elements = transform_to_crop_space(text_elements, media_height, crop_left, crop_bottom, crop_top,
                                              y_inverted)
      rulings = transform_rulings_to_crop_space(rulings, media_height, crop_left, crop_bottom, crop_top, y_inverted)
    end

    # Build page object
    Page::Builder.new
                 .top(0)
                 .left(0)
                 .width(page_width)
                 .height(page_height)
                 .page_number(page_number)
                 .rotation(rotation)
                 .text_elements(text_elements)
                 .rulings(rulings)
                 .min_char_width(stripper.min_char_width)
                 .min_char_height(stripper.min_char_height)
                 .build
  end

  # Walk the page's content stream with a RulingReceiver and return what it collected.
  def extract_rulings(pdf_page, page_height, y_inverted:)
    receiver = RulingReceiver.new(page_height, y_inverted: y_inverted)
    pdf_page.walk(receiver)
    receiver.rulings
  end

  # Rebuild every text element shifted from MediaBox to CropBox space.
  # _crop_bottom and _y_inverted are accepted but not used.
  def transform_to_crop_space(text_elements, media_height, crop_left, _crop_bottom, crop_top, _y_inverted)
    # In top-left coordinates the crop area starts (media_height - crop_top) below the MediaBox top
    y_offset = media_height - crop_top

    text_elements.map do |te|
      TextElement.new(
        top: te.top - y_offset,
        left: te.left - crop_left,
        width: te.width,
        height: te.height,
        text: te.text,
        font_name: te.font_name,
        font_size: te.font_size,
        width_of_space: te.width_of_space
      )
    end
  end

  # Rebuild every ruling shifted from MediaBox to CropBox space.
  # _crop_bottom and _y_inverted are accepted but not used.
  def transform_rulings_to_crop_space(rulings, media_height, crop_left, _crop_bottom, crop_top, _y_inverted)
    y_offset = media_height - crop_top

    rulings.map do |r|
      Ruling.new(r.x1 - crop_left, r.y1 - y_offset, r.x2 - crop_left, r.y2 - y_offset)
    end
  end

  # Receiver for extracting ruling lines from PDF graphics.
  #
  # Path-construction callbacks accumulate subpaths; a painting callback
  # consumes all of them at once. Points are stored as [y, x] pairs after
  # the current transformation matrix has been applied.
  class RulingReceiver
    IDENTITY_MATRIX = [[1, 0, 0], [0, 1, 0], [0, 0, 1]].map(&:freeze).freeze

    # A filled shape at most this thick in one direction, and longer than
    # this in the other, is treated as a ruling line.
    FILLED_RULING_THRESHOLD = 8.0

    attr_reader :rulings

    # @param page_height [Float] height used to flip Y into top-left coordinates
    # @param y_inverted [Boolean] whether the page's Y range is reversed
    def initialize(page_height, y_inverted: false)
      @page_height = page_height
      @y_inverted = y_inverted
      @rulings = []
      @current_path = []
      @subpaths = [] # Finished subpaths awaiting a painting operator
      @ctm = IDENTITY_MATRIX
      @graphics_state_stack = []
    end

    # Path construction

    def begin_new_subpath(x, y)
      stash_current_path
      @current_path = [[transform_point(x, y), :move]]
    end

    def append_line(x, y)
      @current_path << [transform_point(x, y), :line]
    end

    # A rectangle is a complete subpath of its own. Whatever was being built
    # is stashed first, so a run of rectangles painted by one operator all survive.
    def append_rectangle(x, y, width, height)
      stash_current_path

      first, *others = [
        [x, y],
        [x + width, y],
        [x + width, y + height],
        [x, y + height]
      ].map { |corner_x, corner_y| transform_point(corner_x, corner_y) }

      @current_path = [[first, :move]] + others.map { |pt| [pt, :line] } + [[first, :line]]
    end

    # Append a segment back to the first point of the current subpath.
    def close_path
      return if @current_path.empty?

      @current_path << [@current_path.first[0], :line]
    end
    # Callback name believed to be what pdf-reader sends for the `h`
    # operator; harmless if it is never invoked.
    alias close_subpath close_path

    # Path painting

    # Stroke every pending subpath, keeping horizontal and vertical segments.
    def stroke_path
      consume_pending_subpaths { |path| extract_lines_from_path(path) }
    end

    # Fill every pending subpath, keeping thin fills as rulings.
    def fill_path_with_nonzero
      consume_pending_subpaths { |path| extract_rulings_from_filled_path(path) }
    end

    # The fill rule does not affect ruling detection.
    def fill_path_with_even_odd
      fill_path_with_nonzero
    end

    def close_and_stroke_path
      close_path
      stroke_path
    end

    # Fill-and-stroke operators are handled through their stroked outline.
    def close_fill_stroke
      close_path
      stroke_path
    end
    alias close_fill_stroke_with_even_odd close_fill_stroke

    def fill_stroke
      stroke_path
    end
    alias fill_stroke_with_even_odd fill_stroke

    # Discard the pending path without painting it. Stashed subpaths go
    # too, otherwise they would be painted by a later, unrelated operator.
    def end_path
      @current_path = []
      @subpaths = []
    end

    # CTM operations

    def concatenate_matrix(a, b, c, d, e, f)
      matrix = [[a, b, 0], [c, d, 0], [e, f, 1]]
      @ctm = matrix_multiply(matrix, @ctm)
    end

    def save_graphics_state
      @graphics_state_stack.push(@ctm.map(&:dup))
    end

    # An unbalanced restore falls back to the identity matrix.
    def restore_graphics_state
      @ctm = @graphics_state_stack.pop || IDENTITY_MATRIX
    end

    private

    # Move the path under construction, if any, onto the pending list.
    def stash_current_path
      @subpaths << @current_path if @current_path.any?
      @current_path = []
    end

    # Yield each pending subpath once and leave the receiver with no path.
    def consume_pending_subpaths(&block)
      stash_current_path
      pending = @subpaths
      @subpaths = []
      pending.each(&block)
    end

    # Apply the CTM and convert to a top-left origin.
    # @return [Array(Numeric, Numeric)] the point as [y, x]
    def transform_point(x, y)
      tx = (@ctm[0][0] * x) + (@ctm[1][0] * y) + @ctm[2][0]
      ty = (@ctm[0][1] * x) + (@ctm[1][1] * y) + @ctm[2][1]
      if @y_inverted
        [ty.abs, tx]
      else
        [@page_height - ty, tx]
      end
    end

    # Record each straight segment of the path that is not oblique.
    def extract_lines_from_path(path)
      path.each_cons(2) do |(from, _), (to, kind)|
        next unless kind == :line

        y1, x1 = from
        y2, x2 = to

        # Only keep horizontal and vertical lines
        ruling = Ruling.new(x1, y1, x2, y2)
        @rulings << ruling unless ruling.oblique?
      end
    end

    # Record a ruling along the middle of the path's bounding box when the
    # box is thin in exactly one direction; wider fills are ignored.
    def extract_rulings_from_filled_path(path)
      return if path.size < 4

      min_y, max_y = path.map { |point, _| point[0] }.minmax
      min_x, max_x = path.map { |point, _| point[1] }.minmax

      width = max_x - min_x
      height = max_y - min_y

      if height <= FILLED_RULING_THRESHOLD && width > FILLED_RULING_THRESHOLD
        # Horizontal ruling
        mid_y = (min_y + max_y) / 2.0
        @rulings << Ruling.new(min_x, mid_y, max_x, mid_y)
      elsif width <= FILLED_RULING_THRESHOLD && height > FILLED_RULING_THRESHOLD
        # Vertical ruling
        mid_x = (min_x + max_x) / 2.0
        @rulings << Ruling.new(mid_x, min_y, mid_x, max_y)
      end
    end

    # Product of two 3x3 matrices given as arrays of rows.
    def matrix_multiply(a, b)
      Array.new(3) do |i|
        Array.new(3) do |j|
          (0...3).reduce(0.0) { |sum, k| sum + (a[i][k] * b[k][j]) }
        end
      end
    end
  end
end
378
+
379
# Enumerable view over a list of page numbers that extracts each page
# on demand through the extractor it was given.
class PageIterator
  include Enumerable

  # @param extractor [#extract_page] object used to extract each page
  # @param page_numbers [Array<Integer>] page numbers, in iteration order
  def initialize(extractor, page_numbers)
    @extractor = extractor
    @page_numbers = page_numbers
  end

  # Extract and yield the pages one at a time.
  # Without a block an Enumerator over the same pages is returned.
  def each(&block)
    if block
      @page_numbers.each { |number| block.call(@extractor.extract_page(number)) }
    else
      enum_for(:each)
    end
  end

  # Number of pages that will be visited; no page is extracted.
  # @return [Integer]
  def size
    @page_numbers.size
  end
end
400
+ end
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
# Represents a PDF page with extracted text elements and rulings.
# Provides methods for accessing page content and creating sub-areas.
class Page < Rectangle
  attr_reader :page_number, :rotation, :text_elements, :rulings,
              :min_char_width, :min_char_height, :spatial_index

  # @param top [Float] top coordinate
  # @param left [Float] left coordinate
  # @param width [Float] page width
  # @param height [Float] page height
  # @param page_number [Integer] page number (1-indexed)
  # @param rotation [Integer] page rotation in degrees
  # @param text_elements [Array<TextElement>] extracted text elements
  # @param rulings [Array<Ruling>] extracted ruling lines
  # @param min_char_width [Float] minimum character width
  # @param min_char_height [Float] minimum character height
  def initialize(top:, left:, width:, height:, page_number:, rotation: 0,
                 text_elements: [], rulings: [], min_char_width: nil, min_char_height: nil)
    super(top, left, width, height)
    @page_number = page_number
    @rotation = rotation
    @text_elements = text_elements
    @rulings = rulings
    @min_char_width = min_char_width
    @min_char_height = min_char_height
    @spatial_index = build_spatial_index
    @processed_rulings = nil
  end

  # Get text elements within a rectangular area
  # @param area [Rectangle, nil] area to query; nil returns every element
  # @return [Array<TextElement>]
  def get_text(area = nil)
    return @text_elements if area.nil?

    # Use intersects because text elements may extend beyond cell boundaries
    # (e.g., text with descenders or tall characters), then keep only the
    # elements whose origin (top-left) is within the area.
    @spatial_index.intersects(area).select do |te|
      te.top >= area.top && te.top < area.bottom &&
        te.left >= area.left && te.left < area.right
    end
  end

  # Get the bounding box of all text on the page
  # @return [Rectangle, nil]
  def text_bounds
    Rectangle.bounding_box_of(@text_elements)
  end

  # Create a sub-page for a specific area
  # @param top [Float] area top
  # @param left [Float] area left
  # @param bottom [Float] area bottom
  # @param right [Float] area right
  # @return [Page] sub-page containing only elements in the area
  def get_area(top, left, bottom, right)
    area = Rectangle.from_bounds(top, left, bottom, right)

    Page.new(
      top: top,
      left: left,
      width: right - left,
      height: bottom - top,
      page_number: page_number,
      rotation: rotation,
      text_elements: get_text(area),
      rulings: Ruling.crop_to_area(rulings, area),
      min_char_width: min_char_width,
      min_char_height: min_char_height
    )
  end

  # Get processed ruling lines (collapsed and cleaned).
  #
  # The result is cached in @processed_rulings, the same variable that
  # #add_ruling resets, so rulings added later show up on the next call.
  # @return [Array<Ruling>]
  def get_rulings
    @processed_rulings ||= process_rulings
  end

  # Get horizontal ruling lines
  # @return [Array<Ruling>]
  def horizontal_rulings
    get_rulings.select(&:horizontal?)
  end

  # Get vertical ruling lines
  # @return [Array<Ruling>]
  def vertical_rulings
    get_rulings.select(&:vertical?)
  end

  # Get raw (unprocessed) rulings
  # @return [Array<Ruling>]
  def unprocessed_rulings
    @rulings
  end

  # Add a ruling to the page. Oblique rulings are ignored.
  # @param ruling [Ruling] ruling to add
  def add_ruling(ruling)
    return if ruling.oblique?

    @rulings << ruling
    @processed_rulings = nil # Invalidate cache
  end

  # Check if page has ruling lines
  # @return [Boolean]
  def has_rulings?
    !@rulings.empty?
  end

  # Get text chunks (words) from the page
  # @return [Array<TextChunk>]
  def text_chunks
    TextElement.merge_words(@text_elements, vertical_rulings: vertical_rulings)
  end

  # Get lines of text
  # @return [Array<Line>]
  def text_lines
    TextChunk.group_by_lines(text_chunks)
  end

  def to_s
    "Page[#{page_number}](#{left}, #{top}, #{width}, #{height})"
  end

  def inspect
    to_s
  end

  private

  # Index the page's text elements for the area queries made by #get_text.
  def build_spatial_index
    index = SpatialIndex.new
    index.add_all(@text_elements)
    index
  end

  # Drop oblique rulings, then hand the rest to Ruling.collapse_oriented_rulings.
  def process_rulings
    Ruling.collapse_oriented_rulings(@rulings.reject(&:oblique?))
  end

  # Builder class for constructing Page objects.
  #
  # Each attribute has a chainable setter of the same name, e.g.
  # Builder.new.width(612).height(792).page_number(1).build
  class Builder
    ATTRIBUTES = %i[
      top left width height page_number rotation
      text_elements rulings min_char_width min_char_height
    ].freeze

    def initialize
      # Built per instance so the default arrays are never shared between pages
      @attrs = {
        top: 0,
        left: 0,
        width: 0,
        height: 0,
        page_number: 1,
        rotation: 0,
        text_elements: [],
        rulings: [],
        min_char_width: nil,
        min_char_height: nil
      }
    end

    # Defines top(value), left(value), ... min_char_height(value); each one
    # stores the value and returns the builder for chaining.
    ATTRIBUTES.each do |attribute|
      define_method(attribute) do |value|
        @attrs[attribute] = value
        self
      end
    end

    # @return [Page] page constructed from the collected attributes
    def build
      Page.new(**@attrs)
    end
  end
end
230
+ end