tabula-rb 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +39 -0
  4. data/CHANGELOG.md +59 -0
  5. data/LICENSE +21 -0
  6. data/README.md +176 -0
  7. data/Rakefile +28 -0
  8. data/exe/tabula +7 -0
  9. data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
  10. data/lib/tabula/algorithms/projection_profile.rb +109 -0
  11. data/lib/tabula/cli.rb +271 -0
  12. data/lib/tabula/configuration.rb +119 -0
  13. data/lib/tabula/core/point.rb +60 -0
  14. data/lib/tabula/core/rectangle.rb +218 -0
  15. data/lib/tabula/core/ruling.rb +303 -0
  16. data/lib/tabula/core/spatial_index.rb +120 -0
  17. data/lib/tabula/detectors/detection_algorithm.rb +34 -0
  18. data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
  19. data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
  20. data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
  21. data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
  22. data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
  23. data/lib/tabula/pdf/object_extractor.rb +400 -0
  24. data/lib/tabula/pdf/page.rb +230 -0
  25. data/lib/tabula/pdf/text_stripper.rb +150 -0
  26. data/lib/tabula/table/cell.rb +110 -0
  27. data/lib/tabula/table/table.rb +184 -0
  28. data/lib/tabula/text/line.rb +133 -0
  29. data/lib/tabula/text/text_chunk.rb +185 -0
  30. data/lib/tabula/text/text_element.rb +120 -0
  31. data/lib/tabula/version.rb +5 -0
  32. data/lib/tabula/writers/csv_writer.rb +49 -0
  33. data/lib/tabula/writers/json_writer.rb +41 -0
  34. data/lib/tabula/writers/markdown_writer.rb +71 -0
  35. data/lib/tabula/writers/tsv_writer.rb +35 -0
  36. data/lib/tabula/writers/writer.rb +39 -0
  37. data/lib/tabula.rb +160 -0
  38. data/mise.toml +2 -0
  39. data/tabula-rb.gemspec +44 -0
  40. metadata +115 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 80dbf2efc9afdaa82c43fed84838b003121a9ec2af277621b7be59f3394840a3
4
+ data.tar.gz: 8db614e4185f5fbd5e5b969e1ee76469a6d883542e83942121f18dace8eb80c3
5
+ SHA512:
6
+ metadata.gz: 79e3c2de05740e98a587710dec2426e8d5294e928771136a51f22e81ed9eecb1b1d6018bf2dfc83dfb97c1ac64bbc2e6c5660f863e7cce4252b644ef9b495670
7
+ data.tar.gz: 52280f8b0a1fd27ea842bab9f250274d42854b141e7f056d00e7902eb38024bd77ed498c336ed1810fb94ca253787520408f9d29d82fde52b507bf98246be465
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --require spec_helper
2
+ --format documentation
3
+ --color
data/.rubocop.yml ADDED
@@ -0,0 +1,39 @@
1
+ require:
2
+ - rubocop-rspec
3
+
4
+ AllCops:
5
+ TargetRubyVersion: 3.1
6
+ NewCops: enable
7
+ SuggestExtensions: false
8
+
9
+ Style/Documentation:
10
+ Enabled: false
11
+
12
+ Style/FrozenStringLiteralComment:
13
+ Enabled: true
14
+
15
+ Metrics/ClassLength:
16
+ Max: 200
17
+
18
+ Metrics/MethodLength:
19
+ Max: 30
20
+
21
+ Metrics/AbcSize:
22
+ Max: 30
23
+
24
+ Metrics/BlockLength:
25
+ Exclude:
26
+ - "spec/**/*"
27
+ - "*.gemspec"
28
+
29
+ Layout/LineLength:
30
+ Max: 120
31
+
32
+ RSpec/MultipleExpectations:
33
+ Max: 5
34
+
35
+ RSpec/ExampleLength:
36
+ Max: 15
37
+
38
+ RSpec/NestedGroups:
39
+ Max: 4
data/CHANGELOG.md ADDED
@@ -0,0 +1,59 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.0.0] - 2024
9
+
10
+ Initial stable release of tabula-rb, a pure Ruby port of tabula-java.
11
+
12
+ ### Added
13
+
14
+ - **Table Extraction**: Extract tables from PDF files using three modes:
15
+ - Lattice mode: For PDFs with visible ruling lines/borders
16
+ - Stream mode: For PDFs without visible borders (uses text positioning)
17
+ - Auto mode: Tries lattice first, falls back to stream
18
+
19
+ - **Output Formats**:
20
+ - CSV (with customizable separator and quoting)
21
+ - TSV
22
+ - JSON (with optional pretty-printing and metadata)
23
+ - Markdown (GitHub-flavored, with alignment options)
24
+
25
+ - **Command Line Interface**:
26
+ - Extract tables from multiple PDF files
27
+ - Page selection (individual pages, ranges, or all)
28
+ - Area extraction (specify top, left, bottom, right coordinates)
29
+ - Column boundary specification
30
+ - Auto-detection of table areas (`--guess`)
31
+ - Password-protected PDF support
32
+
33
+ - **Text Handling**:
34
+ - Proper UTF-8 encoding support
35
+ - Right-to-left (RTL) text support (Arabic, Hebrew, etc.)
36
+ - Merged text runs for proper word/phrase extraction
37
+
38
+ - **PDF Features**:
39
+ - Support for rotated pages
40
+ - Password-protected PDF support
41
+ - Ruling line detection from PDF graphics stream
42
+
43
+ - **Core Geometry**:
44
+ - Rectangle, Point, and Ruling primitives
45
+ - Spatial indexing for efficient text lookup
46
+ - Cohen-Sutherland line clipping algorithm
47
+ - Projection profile analysis
48
+
49
+ - **Detection Algorithms**:
50
+ - Spreadsheet detection (ruling-based)
51
+ - Nurminen detection algorithm for table area detection
52
+
53
+ - **Configuration**:
54
+ - Customizable tolerance thresholds for text merging
55
+ - Configurable cell detection parameters
56
+
57
+ ### Notes
58
+
59
+ - PDFs without drawn ruling lines require stream mode (lattice mode needs visible cell borders)
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Tabula Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,176 @@
1
+ # Tabula
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/tabula-rb.svg)](https://rubygems.org/gems/tabula-rb)
4
+
5
+ A Ruby library for extracting tables from PDF files.
6
+
7
+ This is a pure Ruby port of [tabula-java](https://github.com/tabulapdf/tabula-java), the open-source library that powers [Tabula](https://tabula.technology/). It implements the same extraction algorithms and produces compatible output, allowing you to extract tables from PDFs without requiring Java or JRuby.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'tabula-rb'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ ```bash
20
+ bundle install
21
+ ```
22
+
23
+ Or install it directly:
24
+
25
+ ```bash
26
+ gem install tabula-rb
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ### Library
32
+
33
+ ```ruby
34
+ require 'tabula'
35
+
36
+ # Extract all tables from a PDF
37
+ tables = Tabula.extract("document.pdf")
38
+
39
+ # Each table can be converted to different formats
40
+ tables.each do |table|
41
+ puts table.to_a.inspect # Array of arrays
42
+ puts table.to_csv # CSV string
43
+ end
44
+
45
+ # Extract from specific pages
46
+ tables = Tabula.extract("document.pdf", pages: [1, 2, 3])
47
+
48
+ # Use lattice mode (for PDFs with ruling lines/borders)
49
+ tables = Tabula.extract("document.pdf", method: :lattice)
50
+
51
+ # Use stream mode (for PDFs without ruling lines)
52
+ tables = Tabula.extract("document.pdf", method: :stream)
53
+
54
+ # Extract a specific area (top, left, bottom, right in points)
55
+ tables = Tabula.extract("document.pdf", area: [0, 0, 500, 800])
56
+
57
+ # Auto-detect table areas
58
+ tables = Tabula.extract("document.pdf", guess: true)
59
+
60
+ # Password-protected PDFs
61
+ tables = Tabula.extract("document.pdf", password: "secret")
62
+ ```
63
+
64
+ ### Output Formats
65
+
66
+ ```ruby
67
+ tables = Tabula.extract("document.pdf")
68
+
69
+ # CSV
70
+ Tabula::Writers::CSVWriter.to_string(tables)
71
+
72
+ # TSV
73
+ Tabula::Writers::TSVWriter.to_string(tables)
74
+
75
+ # JSON
76
+ Tabula::Writers::JSONWriter.to_string(tables)
77
+ Tabula::Writers::JSONWriter.to_string(tables, pretty: true)
78
+
79
+ # Markdown
80
+ Tabula::Writers::MarkdownWriter.to_string(tables)
81
+ Tabula::Writers::MarkdownWriter.to_string(tables, alignment: :center)
82
+ ```
83
+
84
+ ### Command Line
85
+
86
+ ```bash
87
+ # Basic extraction (outputs CSV to stdout)
88
+ tabula document.pdf
89
+
90
+ # Specify output format
91
+ tabula -f CSV document.pdf
92
+ tabula -f TSV document.pdf
93
+ tabula -f JSON document.pdf
94
+ tabula -f MARKDOWN document.pdf
95
+
96
+ # Write to file
97
+ tabula -o output.csv document.pdf
98
+
99
+ # Extract specific pages
100
+ tabula -p 1,2,3 document.pdf
101
+ tabula -p 1-5 document.pdf
102
+ tabula -p all document.pdf
103
+
104
+ # Force extraction mode
105
+ tabula -l document.pdf # Lattice mode (ruling lines)
106
+ tabula -t document.pdf # Stream mode (text positions)
107
+
108
+ # Extract specific area
109
+ tabula -a 0,0,500,800 document.pdf
110
+
111
+ # Auto-detect table areas
112
+ tabula -g document.pdf
113
+
114
+ # Password-protected PDF
115
+ tabula -s mypassword document.pdf
116
+ ```
117
+
118
+ Full CLI options:
119
+
120
+ ```
121
+ Usage: tabula [OPTIONS] <pdf_file> [<pdf_file> ...]
122
+
123
+ Options:
124
+ -a, --area AREA Extraction area (top,left,bottom,right)
125
+ -c, --columns COLUMNS Column boundaries (comma-separated x coordinates)
126
+ -f, --format FORMAT Output format: CSV, TSV, JSON, MARKDOWN (default: CSV)
127
+ -g, --guess Guess table areas (use detection algorithm)
128
+ -l, --lattice Force lattice mode (use ruling lines)
129
+ -t, --stream Force stream mode (use text positions)
130
+ -p, --pages PAGES Pages to extract (e.g., '1,2,3' or '1-5' or 'all')
131
+ -o, --output FILE Output file (default: stdout)
132
+ -s, --password PASSWORD PDF password
133
+ --debug Show debug information
134
+ -v, --version Show version
135
+ -h, --help Show this help
136
+ ```
137
+
138
+ ## Extraction Modes
139
+
140
+ ### Lattice Mode (`-l` / `:lattice`)
141
+
142
+ Best for tables with visible borders/ruling lines. The algorithm detects cell boundaries by finding intersections of horizontal and vertical lines drawn in the PDF.
143
+
144
+ ### Stream Mode (`-t` / `:stream`)
145
+
146
+ Best for tables without visible borders. The algorithm infers table structure from text positioning, looking for gaps between text elements to determine column boundaries.
147
+
148
+ ### Auto Mode (default)
149
+
150
+ Tries lattice mode first. If no tables are found, falls back to stream mode.
151
+
152
+ ## Requirements
153
+
154
+ - Ruby 3.1+
155
+ - pdf-reader gem (automatically installed as dependency)
156
+
157
+ ## Development
158
+
159
+ After checking out the repo, run:
160
+
161
+ ```bash
162
+ bundle install
163
+ bundle exec rspec
164
+ ```
165
+
166
+ ## License
167
+
168
+ MIT License. See [LICENSE](LICENSE) for details.
169
+
170
+ ## Acknowledgments
171
+
172
+ This gem is a Ruby port of [tabula-java](https://github.com/tabulapdf/tabula-java) by the [Tabula](https://tabula.technology/) team. The extraction algorithms, test fixtures, and expected behaviors are derived from the original Java implementation.
173
+
174
+ Special thanks to:
175
+ - [Manuel Aristarán](https://github.com/jazzido) and the Tabula team for creating the original tabula-java
176
+ - The [pdf-reader](https://github.com/yob/pdf-reader) gem maintainers for the excellent PDF parsing library
data/Rakefile ADDED
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+ require 'rubocop/rake_task'
6
+
7
+ RSpec::Core::RakeTask.new(:spec)
8
+ RuboCop::RakeTask.new
9
+
10
+ task default: %i[spec rubocop] # plain `rake` depends on the spec task, then the rubocop task
11
+
12
+ namespace :spec do # spec:unit and spec:integration select subsets of the suite by file pattern
13
+ RSpec::Core::RakeTask.new(:unit) do |t|
14
+ t.pattern = 'spec/{core,text,table,pdf,extractors,detectors,writers,algorithms}/**/*_spec.rb'
15
+ end
16
+
17
+ RSpec::Core::RakeTask.new(:integration) do |t|
18
+ t.pattern = 'spec/integration/**/*_spec.rb'
19
+ end
20
+ end
21
+
22
+ namespace :doc do
23
+ require 'yard' # NOTE(review): namespace blocks run when the Rakefile loads, so every rake task needs yard installed - confirm it is a dev dependency
24
+ YARD::Rake::YardocTask.new do |t|
25
+ t.files = ['lib/**/*.rb']
26
+ t.options = ['--output-dir', 'doc'] # generated documentation is written to ./doc
27
+ end
28
+ end
data/exe/tabula ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'tabula'
5
+ require 'tabula/cli'
6
+
7
+ exit Tabula::CLI.run(ARGV) # whatever CLI.run returns is handed to exit as the process status
data/lib/tabula/algorithms/cohen_sutherland_clipping.rb ADDED
@@ -0,0 +1,94 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
+ # Cohen-Sutherland line clipping algorithm.
5
+ # Clips a line segment (Ruling) to a rectangular region; points exactly on the rectangle's edges count as inside.
6
+ module CohenSutherlandClipping
7
+ # Region codes: one bit per rectangle side an endpoint lies beyond; bits are OR-ed together, INSIDE means none set
8
+ INSIDE = 0b0000
9
+ LEFT = 0b0001
10
+ RIGHT = 0b0010
11
+ BOTTOM = 0b0100
12
+ TOP = 0b1000
13
+
14
+ class << self
15
+ # Clip a ruling to a rectangular region. Only the ruling's x1/y1/x2/y2 readers are used; it is not modified.
16
+ # @param ruling [Ruling] the line segment to clip
17
+ # @param rect [Rectangle] the clipping region (read via left/right/top/bottom)
18
+ # @return [Ruling, nil] a new Ruling holding the clipped segment, or nil if entirely outside
19
+ def clip(ruling, rect)
20
+ x1 = ruling.x1
21
+ y1 = ruling.y1
22
+ x2 = ruling.x2
23
+ y2 = ruling.y2
24
+
25
+ min_x = rect.left
26
+ max_x = rect.right
27
+ min_y = rect.top # top is used as the smaller y: compute_code flags y < min_y as TOP
28
+ max_y = rect.bottom
29
+
30
+ code1 = compute_code(x1, y1, min_x, max_x, min_y, max_y)
31
+ code2 = compute_code(x2, y2, min_x, max_x, min_y, max_y)
32
+
33
+ loop do # each pass moves one outside endpoint onto a rectangle edge and recomputes its code
34
+ # Both endpoints inside - trivially accept
35
+ return Ruling.new(x1, y1, x2, y2) if (code1 | code2).zero?
36
+
37
+ # Both endpoints share an outside region - trivially reject
38
+ return nil if (code1 & code2).nonzero?
39
+
40
+ # At least one endpoint is outside, select it (endpoint 1 is preferred when both are outside)
41
+ code_out = code1.nonzero? ? code1 : code2
42
+
43
+ # Find intersection point
44
+ x, y = find_intersection(x1, y1, x2, y2, code_out, min_x, max_x, min_y, max_y)
45
+
46
+ # Replace the outside point
47
+ if code_out == code1 # true only when endpoint 1 was selected: equal nonzero codes were rejected above
48
+ x1 = x
49
+ y1 = y
50
+ code1 = compute_code(x1, y1, min_x, max_x, min_y, max_y)
51
+ else
52
+ x2 = x
53
+ y2 = y
54
+ code2 = compute_code(x2, y2, min_x, max_x, min_y, max_y)
55
+ end
56
+ end
57
+ end
58
+
59
+ private
60
+
61
+ def compute_code(x, y, min_x, max_x, min_y, max_y) # region-code bitmask for point (x, y); strict comparisons
62
+ code = INSIDE
63
+ code |= LEFT if x < min_x
64
+ code |= RIGHT if x > max_x
65
+ code |= TOP if y < min_y
66
+ code |= BOTTOM if y > max_y
67
+ code
68
+ end
69
+
70
+ def find_intersection(x1, y1, x2, y2, code_out, min_x, max_x, min_y, max_y) # => [x, y] on one rectangle edge
71
+ x = 0.0 # returned unchanged only if code_out has no side bit set
72
+ y = 0.0
73
+ dx = x2 - x1
74
+ dy = y2 - y1
75
+
76
+ if (code_out & BOTTOM).nonzero? # sides are tested in the order BOTTOM, TOP, RIGHT, LEFT; the first match wins
77
+ x = x1 + (dx * (max_y - y1) / dy) # NOTE(review): truncates if all operands are Integers - presumably coordinates are Floats; confirm
78
+ y = max_y
79
+ elsif (code_out & TOP).nonzero?
80
+ x = x1 + (dx * (min_y - y1) / dy)
81
+ y = min_y
82
+ elsif (code_out & RIGHT).nonzero?
83
+ y = y1 + (dy * (max_x - x1) / dx)
84
+ x = max_x
85
+ elsif (code_out & LEFT).nonzero?
86
+ y = y1 + (dy * (min_x - x1) / dx)
87
+ x = min_x
88
+ end
89
+
90
+ [x, y]
91
+ end
92
+ end
93
+ end
94
+ end
data/lib/tabula/algorithms/projection_profile.rb ADDED
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Tabula
4
+ # Projection profile analysis for detecting table structure.
5
+ # Builds a histogram (bin index => number of elements covering that bin) along one axis and finds empty runs.
6
+ class ProjectionProfile
7
+ attr_reader :min_value, :max_value, :bins
8
+
9
+ # @param elements [Array<Rectangle>] elements to analyze (read via left/right/top/bottom)
10
+ # @param orientation [Symbol] :horizontal bins left..right; any other value bins top..bottom
11
+ # @param bin_size [Float] size of histogram bins
12
+ def initialize(elements, orientation:, bin_size: 1.0)
13
+ @orientation = orientation
14
+ @bin_size = bin_size
15
+ @bins = Hash.new(0) # bin index => count; reading a missing bin yields 0 without storing a key
16
+ @min_value = Float::INFINITY # stays +/-Infinity when elements is empty
17
+ @max_value = -Float::INFINITY
18
+
19
+ compute_profile(elements)
20
+ end
21
+
22
+ # Find interior gaps: runs of empty bins lying between two filled bins (space outside the filled range is ignored)
23
+ # @param min_gap_size [Float] minimum gap size to detect
24
+ # @return [Array<Array<Float>>] array of [start, end] gap ranges, in position units, in ascending order
25
+ def find_gaps(min_gap_size: 3.0)
26
+ return [] if @bins.empty?
27
+
28
+ gaps = []
29
+ gap_start = nil
30
+ last_filled = nil
31
+
32
+ (min_bin..max_bin).each do |bin|
33
+ value = @bins[bin]
34
+
35
+ if value.positive?
36
+ if gap_start && last_filled
37
+ gap_end = bin * @bin_size # lower edge of the first filled bin after the gap
38
+ gap_size = gap_end - gap_start
39
+ gaps << [gap_start, gap_end] if gap_size >= min_gap_size
40
+ end
41
+ gap_start = nil
42
+ last_filled = (bin * @bin_size) + @bin_size # upper edge of this filled bin
43
+ elsif last_filled && gap_start.nil?
44
+ gap_start = last_filled # first empty bin after a filled one opens a gap
45
+ end
46
+ end
47
+
48
+ gaps
49
+ end
50
+
51
+ # Get midpoints of gaps (useful for column detection)
52
+ # @param min_gap_size [Float] minimum gap size
53
+ # @return [Array<Float>] gap midpoint positions
54
+ def gap_midpoints(min_gap_size: 3.0)
55
+ find_gaps(min_gap_size: min_gap_size).map { |start, stop| (start + stop) / 2.0 }
56
+ end
57
+
58
+ # Get value at a specific position
59
+ # @param position [Float] position to query
60
+ # @return [Integer] number of elements covering the bin that contains position (0 if none)
61
+ def [](position)
62
+ bin = (position / @bin_size).floor
63
+ @bins[bin]
64
+ end
65
+
66
+ # Check if a position is in a gap
67
+ # @param position [Float] position to check
68
+ # @param min_gap_size [Float] minimum gap size
69
+ # @return [Boolean] true if position lies within a gap (both gap endpoints inclusive)
70
+ def in_gap?(position, min_gap_size: 3.0)
71
+ find_gaps(min_gap_size: min_gap_size).any? do |gap_start, gap_end| # gaps are recomputed on every call
72
+ position.between?(gap_start, gap_end)
73
+ end
74
+ end
75
+
76
+ private
77
+
78
+ def compute_profile(elements)
79
+ elements.each do |element|
80
+ if @orientation == :horizontal
81
+ # For horizontal profile, we project onto the X axis
82
+ add_range(element.left, element.right)
83
+ @min_value = [@min_value, element.top].min # min/max track the elements' extent on the other (Y) axis
84
+ @max_value = [@max_value, element.bottom].max
85
+ else
86
+ # For vertical profile, we project onto the Y axis
87
+ add_range(element.top, element.bottom)
88
+ @min_value = [@min_value, element.left].min # min/max track the elements' extent on the other (X) axis
89
+ @max_value = [@max_value, element.right].max
90
+ end
91
+ end
92
+ end
93
+
94
+ def add_range(start_pos, end_pos)
95
+ start_bin = (start_pos / @bin_size).floor
96
+ end_bin = (end_pos / @bin_size).floor # inclusive: the bin containing end_pos is counted even if end_pos is its lower edge
97
+
98
+ (start_bin..end_bin).each { |bin| @bins[bin] += 1 }
99
+ end
100
+
101
+ def min_bin
102
+ @bins.keys.min || 0 # 0 when no bin has been filled
103
+ end
104
+
105
+ def max_bin
106
+ @bins.keys.max || 0 # 0 when no bin has been filled
107
+ end
108
+ end
109
+ end