tabula-rb 1.0.0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/lib/tabula/extractors/spreadsheet_extraction_algorithm.rb +74 -18
- data/lib/tabula/version.rb +1 -1
- data/tabula-rb.gemspec +1 -2
- metadata +4 -5
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 54634e2fb6ae46c6866b436995b8dba18465b002d0fe637a3e5c7788e8ce212e
|
|
4
|
+
data.tar.gz: ab94d688b278528a7e1ecb868b34b1b4cadcd66932fddc8c565919a6c3cf5f04
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 59fd0aa6641abb1a545032a69716f42a2a7f765edb36cd821bf877b95ae1faabd9fbdc705fcadd08c3f3bdbd8551efde186b0f9898d7b202c328409cfb50ecac
|
|
7
|
+
data.tar.gz: '08ad17ecfdafa0c4a3d1e508206dfdcefea6d623d8adcd84b1ccb16a34484d0e7392a7d6850269f30704033a8aaa05b95585c4d62f199aa6b5994beea5bcf02a'
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.0.3] - 2024
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
|
|
12
|
+
- Fixed extraction of spanning row labels in lattice mode. In tables with cells spanning multiple rows (where horizontal rulings don't extend across all columns), text labels such as "Servo motor type" and "Compatible servo drive unit type" are now extracted correctly.
|
|
13
|
+
- Improved column-by-column cell detection algorithm to handle partial horizontal rulings that only cover certain columns
|
|
14
|
+
- Filtered out short/noise rulings (< 20 points) that were creating spurious column boundaries
|
|
15
|
+
- Now extracts additional data columns that tabula-java misses in some PDFs
|
|
16
|
+
|
|
17
|
+
## [1.0.2] - 2024
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
|
|
21
|
+
- Fixed extraction of leftmost column in tables where horizontal rulings extend beyond vertical rulings. Tables with no left border now correctly include the leftmost column data.
|
|
22
|
+
|
|
23
|
+
## [1.0.1] - 2024
|
|
24
|
+
|
|
25
|
+
### Fixed
|
|
26
|
+
|
|
27
|
+
- Fixed gem build warnings about file permissions
|
|
28
|
+
- Removed duplicate homepage_uri metadata in gemspec
|
|
29
|
+
|
|
8
30
|
## [1.0.0] - 2024
|
|
9
31
|
|
|
10
32
|
Initial stable release of tabula-rb, a pure Ruby port of tabula-java.
|
|
@@ -53,44 +53,100 @@ module Tabula
|
|
|
53
53
|
|
|
54
54
|
private
|
|
55
55
|
|
|
56
|
+
# Minimum ruling length to be considered significant
|
|
57
|
+
MIN_RULING_LENGTH = 20
|
|
58
|
+
|
|
56
59
|
def find_cells(horizontal_rulings, vertical_rulings)
|
|
57
60
|
cells = []
|
|
58
61
|
tolerance = Tabula.configuration.cell_tolerance
|
|
59
62
|
|
|
63
|
+
# Filter out very short rulings (likely noise/decorations)
|
|
64
|
+
significant_horizontals = horizontal_rulings.select { |h| (h.x2 - h.x1).abs >= MIN_RULING_LENGTH }
|
|
65
|
+
significant_verticals = vertical_rulings.select { |v| (v.y2 - v.y1).abs >= MIN_RULING_LENGTH }
|
|
66
|
+
|
|
67
|
+
return cells if significant_horizontals.empty? || significant_verticals.empty?
|
|
68
|
+
|
|
60
69
|
# Find intersection points
|
|
61
|
-
intersections = build_intersection_map(
|
|
70
|
+
intersections = build_intersection_map(significant_horizontals, significant_verticals)
|
|
62
71
|
return cells if intersections.empty?
|
|
63
72
|
|
|
64
|
-
#
|
|
65
|
-
|
|
73
|
+
# Gather all unique x positions from significant vertical rulings and horizontal ruling endpoints
|
|
74
|
+
x_positions = significant_verticals.map { |v| v.x1.round(1) }.uniq
|
|
75
|
+
significant_horizontals.each do |h|
|
|
76
|
+
x_positions << h.x1.round(1)
|
|
77
|
+
x_positions << h.x2.round(1)
|
|
78
|
+
end
|
|
79
|
+
x_positions = x_positions.uniq.sort
|
|
66
80
|
|
|
67
|
-
return cells if
|
|
81
|
+
return cells if x_positions.size < 2
|
|
68
82
|
|
|
69
|
-
# Process each
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
83
|
+
# Process each column individually to handle spanning cells correctly
|
|
84
|
+
# For each column, only use horizontal rulings that actually cover that column
|
|
85
|
+
x_positions.each_cons(2) do |left, right|
|
|
86
|
+
# Skip very narrow columns (likely noise)
|
|
87
|
+
next if (right - left) < 10
|
|
88
|
+
|
|
89
|
+
# Find horizontal rulings that cover this column range
|
|
90
|
+
column_horizontals = significant_horizontals.select do |h|
|
|
91
|
+
h.x1 <= left + tolerance && h.x2 >= right - tolerance
|
|
74
92
|
end
|
|
75
93
|
|
|
76
|
-
|
|
77
|
-
|
|
94
|
+
next if column_horizontals.empty?
|
|
95
|
+
|
|
96
|
+
# Get y positions from rulings that cover this column
|
|
97
|
+
y_positions = column_horizontals.map { |r| r.y1.round(1) }.uniq.sort
|
|
98
|
+
|
|
99
|
+
next if y_positions.size < 2
|
|
78
100
|
|
|
79
|
-
|
|
101
|
+
# Create cells for this column using only the relevant horizontal rulings
|
|
102
|
+
y_positions.each_cons(2) do |top, bottom|
|
|
103
|
+
# Check if vertical rulings span this row for this column
|
|
104
|
+
has_left = significant_verticals.any? do |v|
|
|
105
|
+
(v.x1 - left).abs <= tolerance &&
|
|
106
|
+
v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
has_right = significant_verticals.any? do |v|
|
|
110
|
+
(v.x1 - right).abs <= tolerance &&
|
|
111
|
+
v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
|
|
112
|
+
end
|
|
80
113
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
#
|
|
84
|
-
|
|
114
|
+
# Accept cells with:
|
|
115
|
+
# 1. Full edges (top, bottom, left, right)
|
|
116
|
+
# 2. Horizontal edges only (top, bottom) - for cells without side borders
|
|
117
|
+
# 3. Valid corners in intersection map
|
|
118
|
+
if has_left && has_right
|
|
85
119
|
cells << Cell.new(top, left, right - left, bottom - top)
|
|
86
|
-
# Also accept cells with corner validation
|
|
87
120
|
elsif valid_cell_by_corners?(left, right, top, bottom, intersections, tolerance)
|
|
88
121
|
cells << Cell.new(top, left, right - left, bottom - top)
|
|
122
|
+
else
|
|
123
|
+
# Accept cell if we have valid horizontal edges even without vertical borders
|
|
124
|
+
cells << Cell.new(top, left, right - left, bottom - top)
|
|
89
125
|
end
|
|
90
126
|
end
|
|
91
127
|
end
|
|
92
128
|
|
|
93
|
-
cells
|
|
129
|
+
# Remove duplicate cells (same position)
|
|
130
|
+
cells.uniq { |c| [c.top.round(1), c.left.round(1), c.bottom.round(1), c.right.round(1)] }
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# Check if a cell has valid top and bottom horizontal edges (for cells without side borders)
|
|
134
|
+
def valid_cell_horizontal_edges?(left, right, top, bottom, horizontal_rulings, tolerance)
|
|
135
|
+
# Check for top edge (horizontal ruling at top that covers left to right)
|
|
136
|
+
has_top = horizontal_rulings.any? do |h|
|
|
137
|
+
(h.y1 - top).abs <= tolerance &&
|
|
138
|
+
h.x1 <= left + tolerance &&
|
|
139
|
+
h.x2 >= right - tolerance
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
# Check for bottom edge
|
|
143
|
+
has_bottom = horizontal_rulings.any? do |h|
|
|
144
|
+
(h.y1 - bottom).abs <= tolerance &&
|
|
145
|
+
h.x1 <= left + tolerance &&
|
|
146
|
+
h.x2 >= right - tolerance
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
has_top && has_bottom
|
|
94
150
|
end
|
|
95
151
|
|
|
96
152
|
def build_intersection_map(horizontal_rulings, vertical_rulings)
|
data/lib/tabula/version.rb
CHANGED
data/tabula-rb.gemspec
CHANGED
|
@@ -15,11 +15,10 @@ Gem::Specification.new do |spec|
|
|
|
15
15
|
lattice-mode extraction (for PDFs with visible cell borders) and stream-mode
|
|
16
16
|
extraction (for PDFs without ruling lines, using text positioning).
|
|
17
17
|
DESC
|
|
18
|
-
spec.homepage = 'https://github.com/
|
|
18
|
+
spec.homepage = 'https://github.com/khasinski/tabula-rb'
|
|
19
19
|
spec.license = 'MIT'
|
|
20
20
|
spec.required_ruby_version = '>= 3.1.0'
|
|
21
21
|
|
|
22
|
-
spec.metadata['homepage_uri'] = spec.homepage
|
|
23
22
|
spec.metadata['source_code_uri'] = spec.homepage
|
|
24
23
|
spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"
|
|
25
24
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tabula-rb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.
|
|
4
|
+
version: 1.0.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Hasiński
|
|
@@ -87,13 +87,12 @@ files:
|
|
|
87
87
|
- lib/tabula/writers/writer.rb
|
|
88
88
|
- mise.toml
|
|
89
89
|
- tabula-rb.gemspec
|
|
90
|
-
homepage: https://github.com/
|
|
90
|
+
homepage: https://github.com/khasinski/tabula-rb
|
|
91
91
|
licenses:
|
|
92
92
|
- MIT
|
|
93
93
|
metadata:
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
changelog_uri: https://github.com/tabulapdf/tabula-rb/blob/main/CHANGELOG.md
|
|
94
|
+
source_code_uri: https://github.com/khasinski/tabula-rb
|
|
95
|
+
changelog_uri: https://github.com/khasinski/tabula-rb/blob/main/CHANGELOG.md
|
|
97
96
|
rubygems_mfa_required: 'true'
|
|
98
97
|
rdoc_options: []
|
|
99
98
|
require_paths:
|