tabula-rb 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +39 -0
- data/CHANGELOG.md +59 -0
- data/LICENSE +21 -0
- data/README.md +176 -0
- data/Rakefile +28 -0
- data/exe/tabula +7 -0
- data/lib/tabula/algorithms/cohen_sutherland_clipping.rb +94 -0
- data/lib/tabula/algorithms/projection_profile.rb +109 -0
- data/lib/tabula/cli.rb +271 -0
- data/lib/tabula/configuration.rb +119 -0
- data/lib/tabula/core/point.rb +60 -0
- data/lib/tabula/core/rectangle.rb +218 -0
- data/lib/tabula/core/ruling.rb +303 -0
- data/lib/tabula/core/spatial_index.rb +120 -0
- data/lib/tabula/detectors/detection_algorithm.rb +34 -0
- data/lib/tabula/detectors/nurminen_detection_algorithm.rb +211 -0
- data/lib/tabula/detectors/spreadsheet_detection_algorithm.rb +142 -0
- data/lib/tabula/extractors/basic_extraction_algorithm.rb +168 -0
- data/lib/tabula/extractors/extraction_algorithm.rb +34 -0
- data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +299 -0
- data/lib/tabula/pdf/object_extractor.rb +400 -0
- data/lib/tabula/pdf/page.rb +230 -0
- data/lib/tabula/pdf/text_stripper.rb +150 -0
- data/lib/tabula/table/cell.rb +110 -0
- data/lib/tabula/table/table.rb +184 -0
- data/lib/tabula/text/line.rb +133 -0
- data/lib/tabula/text/text_chunk.rb +185 -0
- data/lib/tabula/text/text_element.rb +120 -0
- data/lib/tabula/version.rb +5 -0
- data/lib/tabula/writers/csv_writer.rb +49 -0
- data/lib/tabula/writers/json_writer.rb +41 -0
- data/lib/tabula/writers/markdown_writer.rb +71 -0
- data/lib/tabula/writers/tsv_writer.rb +35 -0
- data/lib/tabula/writers/writer.rb +39 -0
- data/lib/tabula.rb +160 -0
- data/mise.toml +2 -0
- data/tabula-rb.gemspec +44 -0
- metadata +115 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 80dbf2efc9afdaa82c43fed84838b003121a9ec2af277621b7be59f3394840a3
|
|
4
|
+
data.tar.gz: 8db614e4185f5fbd5e5b969e1ee76469a6d883542e83942121f18dace8eb80c3
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 79e3c2de05740e98a587710dec2426e8d5294e928771136a51f22e81ed9eecb1b1d6018bf2dfc83dfb97c1ac64bbc2e6c5660f863e7cce4252b644ef9b495670
|
|
7
|
+
data.tar.gz: 52280f8b0a1fd27ea842bab9f250274d42854b141e7f056d00e7902eb38024bd77ed498c336ed1810fb94ca253787520408f9d29d82fde52b507bf98246be465
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
require:
|
|
2
|
+
- rubocop-rspec
|
|
3
|
+
|
|
4
|
+
AllCops:
|
|
5
|
+
TargetRubyVersion: 3.1
|
|
6
|
+
NewCops: enable
|
|
7
|
+
SuggestExtensions: false
|
|
8
|
+
|
|
9
|
+
Style/Documentation:
|
|
10
|
+
Enabled: false
|
|
11
|
+
|
|
12
|
+
Style/FrozenStringLiteralComment:
|
|
13
|
+
Enabled: true
|
|
14
|
+
|
|
15
|
+
Metrics/ClassLength:
|
|
16
|
+
Max: 200
|
|
17
|
+
|
|
18
|
+
Metrics/MethodLength:
|
|
19
|
+
Max: 30
|
|
20
|
+
|
|
21
|
+
Metrics/AbcSize:
|
|
22
|
+
Max: 30
|
|
23
|
+
|
|
24
|
+
Metrics/BlockLength:
|
|
25
|
+
Exclude:
|
|
26
|
+
- "spec/**/*"
|
|
27
|
+
- "*.gemspec"
|
|
28
|
+
|
|
29
|
+
Layout/LineLength:
|
|
30
|
+
Max: 120
|
|
31
|
+
|
|
32
|
+
RSpec/MultipleExpectations:
|
|
33
|
+
Max: 5
|
|
34
|
+
|
|
35
|
+
RSpec/ExampleLength:
|
|
36
|
+
Max: 15
|
|
37
|
+
|
|
38
|
+
RSpec/NestedGroups:
|
|
39
|
+
Max: 4
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [1.0.0] - 2024
|
|
9
|
+
|
|
10
|
+
Initial stable release of tabula-rb, a pure Ruby port of tabula-java.
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **Table Extraction**: Extract tables from PDF files using three modes:
|
|
15
|
+
- Lattice mode: For PDFs with visible ruling lines/borders
|
|
16
|
+
- Stream mode: For PDFs without visible borders (uses text positioning)
|
|
17
|
+
- Auto mode: Tries lattice first, falls back to stream
|
|
18
|
+
|
|
19
|
+
- **Output Formats**:
|
|
20
|
+
- CSV (with customizable separator and quoting)
|
|
21
|
+
- TSV
|
|
22
|
+
- JSON (with optional pretty-printing and metadata)
|
|
23
|
+
- Markdown (GitHub-flavored, with alignment options)
|
|
24
|
+
|
|
25
|
+
- **Command Line Interface**:
|
|
26
|
+
- Extract tables from multiple PDF files
|
|
27
|
+
- Page selection (individual pages, ranges, or all)
|
|
28
|
+
- Area extraction (specify top, left, bottom, right coordinates)
|
|
29
|
+
- Column boundary specification
|
|
30
|
+
- Auto-detection of table areas (`--guess`)
|
|
31
|
+
- Password-protected PDF support
|
|
32
|
+
|
|
33
|
+
- **Text Handling**:
|
|
34
|
+
- Proper UTF-8 encoding support
|
|
35
|
+
- Right-to-left (RTL) text support (Arabic, Hebrew, etc.)
|
|
36
|
+
- Merged text runs for proper word/phrase extraction
|
|
37
|
+
|
|
38
|
+
- **PDF Features**:
|
|
39
|
+
- Support for rotated pages
|
|
40
|
+
- Password-protected PDF support
|
|
41
|
+
- Ruling line detection from PDF graphics stream
|
|
42
|
+
|
|
43
|
+
- **Core Geometry**:
|
|
44
|
+
- Rectangle, Point, and Ruling primitives
|
|
45
|
+
- Spatial indexing for efficient text lookup
|
|
46
|
+
- Cohen-Sutherland line clipping algorithm
|
|
47
|
+
- Projection profile analysis
|
|
48
|
+
|
|
49
|
+
- **Detection Algorithms**:
|
|
50
|
+
- Spreadsheet detection (ruling-based)
|
|
51
|
+
- Nurminen detection algorithm for table area detection
|
|
52
|
+
|
|
53
|
+
- **Configuration**:
|
|
54
|
+
- Customizable tolerance thresholds for text merging
|
|
55
|
+
- Configurable cell detection parameters
|
|
56
|
+
|
|
57
|
+
### Notes
|
|
58
|
+
|
|
59
|
+
- PDFs without drawn ruling lines require stream mode (lattice mode needs visible cell borders)
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Tabula Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# Tabula
|
|
2
|
+
|
|
3
|
+
[tabula-rb on RubyGems](https://rubygems.org/gems/tabula-rb)
|
|
4
|
+
|
|
5
|
+
A Ruby library for extracting tables from PDF files.
|
|
6
|
+
|
|
7
|
+
This is a pure Ruby port of [tabula-java](https://github.com/tabulapdf/tabula-java), the open-source library that powers [Tabula](https://tabula.technology/). It implements the same extraction algorithms and produces compatible output, allowing you to extract tables from PDFs without requiring Java or JRuby.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Add this line to your application's Gemfile:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem 'tabula-rb'
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
And then execute:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
bundle install
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Or install it directly:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
gem install tabula-rb
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
### Library
|
|
32
|
+
|
|
33
|
+
```ruby
|
|
34
|
+
require 'tabula'
|
|
35
|
+
|
|
36
|
+
# Extract all tables from a PDF
|
|
37
|
+
tables = Tabula.extract("document.pdf")
|
|
38
|
+
|
|
39
|
+
# Each table can be converted to different formats
|
|
40
|
+
tables.each do |table|
|
|
41
|
+
puts table.to_a.inspect # Array of arrays
|
|
42
|
+
puts table.to_csv # CSV string
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Extract from specific pages
|
|
46
|
+
tables = Tabula.extract("document.pdf", pages: [1, 2, 3])
|
|
47
|
+
|
|
48
|
+
# Use lattice mode (for PDFs with ruling lines/borders)
|
|
49
|
+
tables = Tabula.extract("document.pdf", method: :lattice)
|
|
50
|
+
|
|
51
|
+
# Use stream mode (for PDFs without ruling lines)
|
|
52
|
+
tables = Tabula.extract("document.pdf", method: :stream)
|
|
53
|
+
|
|
54
|
+
# Extract a specific area (top, left, bottom, right in points)
|
|
55
|
+
tables = Tabula.extract("document.pdf", area: [0, 0, 500, 800])
|
|
56
|
+
|
|
57
|
+
# Auto-detect table areas
|
|
58
|
+
tables = Tabula.extract("document.pdf", guess: true)
|
|
59
|
+
|
|
60
|
+
# Password-protected PDFs
|
|
61
|
+
tables = Tabula.extract("document.pdf", password: "secret")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Output Formats
|
|
65
|
+
|
|
66
|
+
```ruby
|
|
67
|
+
tables = Tabula.extract("document.pdf")
|
|
68
|
+
|
|
69
|
+
# CSV
|
|
70
|
+
Tabula::Writers::CSVWriter.to_string(tables)
|
|
71
|
+
|
|
72
|
+
# TSV
|
|
73
|
+
Tabula::Writers::TSVWriter.to_string(tables)
|
|
74
|
+
|
|
75
|
+
# JSON
|
|
76
|
+
Tabula::Writers::JSONWriter.to_string(tables)
|
|
77
|
+
Tabula::Writers::JSONWriter.to_string(tables, pretty: true)
|
|
78
|
+
|
|
79
|
+
# Markdown
|
|
80
|
+
Tabula::Writers::MarkdownWriter.to_string(tables)
|
|
81
|
+
Tabula::Writers::MarkdownWriter.to_string(tables, alignment: :center)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Command Line
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Basic extraction (outputs CSV to stdout)
|
|
88
|
+
tabula document.pdf
|
|
89
|
+
|
|
90
|
+
# Specify output format
|
|
91
|
+
tabula -f CSV document.pdf
|
|
92
|
+
tabula -f TSV document.pdf
|
|
93
|
+
tabula -f JSON document.pdf
|
|
94
|
+
tabula -f MARKDOWN document.pdf
|
|
95
|
+
|
|
96
|
+
# Write to file
|
|
97
|
+
tabula -o output.csv document.pdf
|
|
98
|
+
|
|
99
|
+
# Extract specific pages
|
|
100
|
+
tabula -p 1,2,3 document.pdf
|
|
101
|
+
tabula -p 1-5 document.pdf
|
|
102
|
+
tabula -p all document.pdf
|
|
103
|
+
|
|
104
|
+
# Force extraction mode
|
|
105
|
+
tabula -l document.pdf # Lattice mode (ruling lines)
|
|
106
|
+
tabula -t document.pdf # Stream mode (text positions)
|
|
107
|
+
|
|
108
|
+
# Extract specific area
|
|
109
|
+
tabula -a 0,0,500,800 document.pdf
|
|
110
|
+
|
|
111
|
+
# Auto-detect table areas
|
|
112
|
+
tabula -g document.pdf
|
|
113
|
+
|
|
114
|
+
# Password-protected PDF
|
|
115
|
+
tabula -s mypassword document.pdf
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Full CLI options:
|
|
119
|
+
|
|
120
|
+
```
|
|
121
|
+
Usage: tabula [OPTIONS] <pdf_file> [<pdf_file> ...]
|
|
122
|
+
|
|
123
|
+
Options:
|
|
124
|
+
-a, --area AREA Extraction area (top,left,bottom,right)
|
|
125
|
+
-c, --columns COLUMNS Column boundaries (comma-separated x coordinates)
|
|
126
|
+
-f, --format FORMAT Output format: CSV, TSV, JSON, MARKDOWN (default: CSV)
|
|
127
|
+
-g, --guess Guess table areas (use detection algorithm)
|
|
128
|
+
-l, --lattice Force lattice mode (use ruling lines)
|
|
129
|
+
-t, --stream Force stream mode (use text positions)
|
|
130
|
+
-p, --pages PAGES Pages to extract (e.g., '1,2,3' or '1-5' or 'all')
|
|
131
|
+
-o, --output FILE Output file (default: stdout)
|
|
132
|
+
-s, --password PASSWORD PDF password
|
|
133
|
+
--debug Show debug information
|
|
134
|
+
-v, --version Show version
|
|
135
|
+
-h, --help Show this help
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Extraction Modes
|
|
139
|
+
|
|
140
|
+
### Lattice Mode (`-l` / `:lattice`)
|
|
141
|
+
|
|
142
|
+
Best for tables with visible borders/ruling lines. The algorithm detects cell boundaries by finding intersections of horizontal and vertical lines drawn in the PDF.
|
|
143
|
+
|
|
144
|
+
### Stream Mode (`-t` / `:stream`)
|
|
145
|
+
|
|
146
|
+
Best for tables without visible borders. The algorithm infers table structure from text positioning, looking for gaps between text elements to determine column boundaries.
|
|
147
|
+
|
|
148
|
+
### Auto Mode (default)
|
|
149
|
+
|
|
150
|
+
Tries lattice mode first. If no tables are found, falls back to stream mode.
|
|
151
|
+
|
|
152
|
+
## Requirements
|
|
153
|
+
|
|
154
|
+
- Ruby 3.1+
|
|
155
|
+
- pdf-reader gem (automatically installed as dependency)
|
|
156
|
+
|
|
157
|
+
## Development
|
|
158
|
+
|
|
159
|
+
After checking out the repo, run:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
bundle install
|
|
163
|
+
bundle exec rspec
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
169
|
+
|
|
170
|
+
## Acknowledgments
|
|
171
|
+
|
|
172
|
+
This gem is a Ruby port of [tabula-java](https://github.com/tabulapdf/tabula-java) by the [Tabula](https://tabula.technology/) team. The extraction algorithms, test fixtures, and expected behaviors are derived from the original Java implementation.
|
|
173
|
+
|
|
174
|
+
Special thanks to:
|
|
175
|
+
- [Manuel Aristarán](https://github.com/jazzido) and the Tabula team for creating the original tabula-java
|
|
176
|
+
- The [pdf-reader](https://github.com/yob/pdf-reader) gem maintainers for the excellent PDF parsing library
|
data/Rakefile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'bundler/gem_tasks'
|
|
4
|
+
require 'rspec/core/rake_task'
|
|
5
|
+
require 'rubocop/rake_task'
|
|
6
|
+
|
|
7
|
+
# `rake spec` runs the whole RSpec suite; `rake rubocop` runs the linter.
RSpec::Core::RakeTask.new(:spec)
RuboCop::RakeTask.new

# Running bare `rake` executes the specs first and then RuboCop.
task default: %i[spec rubocop]

namespace :spec do
  # `rake spec:unit` - specs under the per-component directories only.
  RSpec::Core::RakeTask.new(:unit) do |t|
    t.pattern = 'spec/{core,text,table,pdf,extractors,detectors,writers,algorithms}/**/*_spec.rb'
  end

  # `rake spec:integration` - specs under spec/integration only.
  RSpec::Core::RakeTask.new(:integration) do |t|
    t.pattern = 'spec/integration/**/*_spec.rb'
  end
end

namespace :doc do
  # YARD is loaded lazily here so that a missing yard gem only disables the
  # documentation task instead of making the whole Rakefile fail to load.
  require 'yard'

  # `rake doc:yard` - generate API documentation for lib/ into doc/.
  YARD::Rake::YardocTask.new do |t|
    t.files = ['lib/**/*.rb']
    t.options = ['--output-dir', 'doc']
  end
rescue LoadError
  desc 'Generate YARD documentation (unavailable: the yard gem is not installed)'
  task :yard do
    abort 'YARD is not available. Install it with `gem install yard` and retry.'
  end
end
|
data/exe/tabula
ADDED
data/lib/tabula/algorithms/cohen_sutherland_clipping.rb
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tabula
  # Cohen-Sutherland line clipping algorithm.
  # Clips a line segment to a rectangular region.
  #
  # Each endpoint is classified with a 4-bit region code relative to the
  # rectangle. Segments whose endpoints are both inside are accepted as-is,
  # segments whose endpoints share an outside region are rejected, and any
  # other segment is shortened one boundary at a time until it can be
  # accepted or rejected.
  #
  # The rectangle's +top+ is used as the minimum y and +bottom+ as the
  # maximum y, i.e. y is treated as growing downwards.
  module CohenSutherlandClipping
    # Region codes for Cohen-Sutherland algorithm (one bit per outside side)
    INSIDE = 0b0000
    LEFT   = 0b0001
    RIGHT  = 0b0010
    BOTTOM = 0b0100
    TOP    = 0b1000

    class << self
      # Clip a ruling to a rectangular region
      #
      # @param ruling [Ruling] the line segment to clip (responds to x1, y1, x2, y2)
      # @param rect [Rectangle] the clipping region (responds to left, right, top, bottom)
      # @return [Ruling, nil] a new clipped ruling, or nil if entirely outside
      def clip(ruling, rect)
        x1 = ruling.x1
        y1 = ruling.y1
        x2 = ruling.x2
        y2 = ruling.y2

        # Order matches the trailing parameters of the private helpers.
        bounds = [rect.left, rect.right, rect.top, rect.bottom]

        code1 = compute_code(x1, y1, *bounds)
        code2 = compute_code(x2, y2, *bounds)

        loop do
          # Both endpoints inside - trivially accept
          return Ruling.new(x1, y1, x2, y2) if (code1 | code2).zero?

          # Both endpoints share an outside region - trivially reject
          return nil if (code1 & code2).nonzero?

          # At least one endpoint is outside; prefer the first one
          code_out = code1.zero? ? code2 : code1

          # Move the selected endpoint onto the boundary it violates
          x, y = find_intersection(x1, y1, x2, y2, code_out, *bounds)

          # code1 == code2 is impossible here (both zero was accepted, both
          # equal and non-zero was rejected), so this comparison identifies
          # which endpoint was selected.
          if code_out == code1
            x1 = x
            y1 = y
            code1 = compute_code(x1, y1, *bounds)
          else
            x2 = x
            y2 = y
            code2 = compute_code(x2, y2, *bounds)
          end
        end
      end

      private

      # Compute the region code of a point relative to the clip bounds.
      #
      # @return [Integer] bitwise OR of LEFT/RIGHT/TOP/BOTTOM, or INSIDE (0)
      def compute_code(x, y, min_x, max_x, min_y, max_y)
        code = INSIDE
        code |= LEFT if x < min_x
        code |= RIGHT if x > max_x
        code |= TOP if y < min_y
        code |= BOTTOM if y > max_y
        code
      end

      # Intersect the segment with the first boundary flagged in +code_out+
      # (checked in the order BOTTOM, TOP, RIGHT, LEFT).
      #
      # +fdiv+ is used instead of +/+ so that Integer coordinates are not
      # truncated by integer division; for Float coordinates the result is
      # the same as with +/+.
      #
      # When called from +clip+ the divisor is never zero: a flagged
      # BOTTOM/TOP bit means the endpoints differ in y, and a flagged
      # RIGHT/LEFT bit means they differ in x (otherwise the segment would
      # have been trivially rejected).
      #
      # @return [Array(Numeric, Numeric)] the [x, y] intersection point;
      #   [0.0, 0.0] if +code_out+ has no region bit set
      def find_intersection(x1, y1, x2, y2, code_out, min_x, max_x, min_y, max_y)
        x = 0.0
        y = 0.0
        dx = x2 - x1
        dy = y2 - y1

        if (code_out & BOTTOM).nonzero?
          x = x1 + (dx * (max_y - y1)).fdiv(dy)
          y = max_y
        elsif (code_out & TOP).nonzero?
          x = x1 + (dx * (min_y - y1)).fdiv(dy)
          y = min_y
        elsif (code_out & RIGHT).nonzero?
          y = y1 + (dy * (max_x - x1)).fdiv(dx)
          x = max_x
        elsif (code_out & LEFT).nonzero?
          y = y1 + (dy * (min_x - x1)).fdiv(dx)
          x = min_x
        end

        [x, y]
      end
    end
  end
end
|
data/lib/tabula/algorithms/projection_profile.rb
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Tabula
  # Projection profile analysis for detecting table structure.
  # Computes histograms of text element positions to find gaps.
  #
  # Every element contributes +1 to each histogram bin its extent touches
  # along the projection axis. Runs of untouched bins between touched bins
  # are reported as gaps.
  class ProjectionProfile
    attr_reader :min_value, :max_value, :bins

    # @param elements [Array<Rectangle>] elements to analyze (must respond to
    #   left, right, top and bottom)
    # @param orientation [Symbol] :horizontal projects onto the X axis; any
    #   other value (documented as :vertical) projects onto the Y axis
    # @param bin_size [Float] size of histogram bins
    def initialize(elements, orientation:, bin_size: 1.0)
      @orientation = orientation
      @bin_size = bin_size
      @bins = Hash.new(0)
      @min_value = Float::INFINITY
      @max_value = -Float::INFINITY

      compute_profile(elements)
    end

    # Find gaps in the projection profile
    #
    # Only occupied bins are visited, so the cost depends on the number of
    # occupied bins rather than on the distance between the first and last.
    #
    # @param min_gap_size [Float] minimum gap size to detect
    # @return [Array<Array<Float>>] array of [start, end] gap ranges, in
    #   ascending order; empty when fewer than two bins are occupied
    def find_gaps(min_gap_size: 3.0)
      occupied_bins.each_cons(2).filter_map do |previous_bin, next_bin|
        # Adjacent occupied bins have no empty bin between them.
        next if next_bin - previous_bin < 2

        gap_start = (previous_bin * @bin_size) + @bin_size
        gap_end = next_bin * @bin_size
        [gap_start, gap_end] if gap_end - gap_start >= min_gap_size
      end
    end

    # Get midpoints of gaps (useful for column detection)
    # @param min_gap_size [Float] minimum gap size
    # @return [Array<Float>] gap midpoint positions
    def gap_midpoints(min_gap_size: 3.0)
      find_gaps(min_gap_size: min_gap_size).map { |gap_start, gap_end| (gap_start + gap_end) / 2.0 }
    end

    # Get value at a specific position
    # @param position [Float] position to query
    # @return [Integer] count at that position (0 for an untouched bin)
    def [](position)
      @bins[(position / @bin_size).floor]
    end

    # Check if a position is in a gap
    # @param position [Float] position to check
    # @param min_gap_size [Float] minimum gap size
    # @return [Boolean] true if position is in a gap (both gap edges inclusive)
    def in_gap?(position, min_gap_size: 3.0)
      find_gaps(min_gap_size: min_gap_size).any? do |gap_start, gap_end|
        position.between?(gap_start, gap_end)
      end
    end

    private

    # Sorted indices of bins with a positive count. The filter matters only
    # if the hash exposed through #bins was modified from outside.
    def occupied_bins
      @bins.select { |_bin, count| count.positive? }.keys.sort
    end

    # Fill the histogram and track the min/max extent of the elements.
    #
    # NOTE(review): min_value/max_value are taken from the axis perpendicular
    # to the projection (top/bottom for :horizontal, left/right otherwise).
    # Confirm this is intended rather than the extent along the projected axis.
    def compute_profile(elements)
      elements.each do |element|
        if @orientation == :horizontal
          # For horizontal profile, we project onto the X axis
          add_range(element.left, element.right)
          @min_value = [@min_value, element.top].min
          @max_value = [@max_value, element.bottom].max
        else
          # For vertical profile, we project onto the Y axis
          add_range(element.top, element.bottom)
          @min_value = [@min_value, element.left].min
          @max_value = [@max_value, element.right].max
        end
      end
    end

    # Increment every bin from the one containing start_pos to the one
    # containing end_pos, inclusive.
    #
    # NOTE(review): an end_pos lying exactly on a bin boundary also marks the
    # bin that starts at that boundary - confirm this is the desired edge
    # behaviour.
    def add_range(start_pos, end_pos)
      first_bin = (start_pos / @bin_size).floor
      last_bin = (end_pos / @bin_size).floor

      (first_bin..last_bin).each { |bin| @bins[bin] += 1 }
    end
  end
end
|