tabula-rb 1.0.0 → 1.0.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 80dbf2efc9afdaa82c43fed84838b003121a9ec2af277621b7be59f3394840a3
4
- data.tar.gz: 8db614e4185f5fbd5e5b969e1ee76469a6d883542e83942121f18dace8eb80c3
3
+ metadata.gz: 54634e2fb6ae46c6866b436995b8dba18465b002d0fe637a3e5c7788e8ce212e
4
+ data.tar.gz: ab94d688b278528a7e1ecb868b34b1b4cadcd66932fddc8c565919a6c3cf5f04
5
5
  SHA512:
6
- metadata.gz: 79e3c2de05740e98a587710dec2426e8d5294e928771136a51f22e81ed9eecb1b1d6018bf2dfc83dfb97c1ac64bbc2e6c5660f863e7cce4252b644ef9b495670
7
- data.tar.gz: 52280f8b0a1fd27ea842bab9f250274d42854b141e7f056d00e7902eb38024bd77ed498c336ed1810fb94ca253787520408f9d29d82fde52b507bf98246be465
6
+ metadata.gz: 59fd0aa6641abb1a545032a69716f42a2a7f765edb36cd821bf877b95ae1faabd9fbdc705fcadd08c3f3bdbd8551efde186b0f9898d7b202c328409cfb50ecac
7
+ data.tar.gz: '08ad17ecfdafa0c4a3d1e508206dfdcefea6d623d8adcd84b1ccb16a34484d0e7392a7d6850269f30704033a8aaa05b95585c4d62f199aa6b5994beea5bcf02a'
data/CHANGELOG.md CHANGED
@@ -5,6 +5,28 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.0.3] - 2024
9
+
10
+ ### Fixed
11
+
12
+ - Fixed extraction of spanning row labels in lattice mode. Tables with cells spanning multiple rows (where horizontal rulings don't extend across all columns) now correctly extract text labels like "Servo motor type", "Compatible servo drive unit type", etc.
13
+ - Improved column-by-column cell detection algorithm to handle partial horizontal rulings that only cover certain columns
14
+ - Filtered out short/noise rulings (< 20 points) that were creating spurious column boundaries
15
+ - Now extracts additional data columns that tabula-java misses in some PDFs
16
+
17
+ ## [1.0.2] - 2024
18
+
19
+ ### Fixed
20
+
21
+ - Fixed extraction of leftmost column in tables where horizontal rulings extend beyond vertical rulings. Tables with no left border now correctly include the leftmost column data.
22
+
23
+ ## [1.0.1] - 2024
24
+
25
+ ### Fixed
26
+
27
+ - Fixed gem build warnings about file permissions
28
+ - Removed duplicate homepage_uri metadata in gemspec
29
+
8
30
  ## [1.0.0] - 2024
9
31
 
10
32
  Initial stable release of tabula-rb, a pure Ruby port of tabula-java.
@@ -53,44 +53,100 @@ module Tabula
53
53
 
54
54
  private
55
55
 
56
+ # Minimum ruling length to be considered significant
57
+ MIN_RULING_LENGTH = 20
58
+
56
59
  def find_cells(horizontal_rulings, vertical_rulings)
57
60
  cells = []
58
61
  tolerance = Tabula.configuration.cell_tolerance
59
62
 
63
+ # Filter out very short rulings (likely noise/decorations)
64
+ significant_horizontals = horizontal_rulings.select { |h| (h.x2 - h.x1).abs >= MIN_RULING_LENGTH }
65
+ significant_verticals = vertical_rulings.select { |v| (v.y2 - v.y1).abs >= MIN_RULING_LENGTH }
66
+
67
+ return cells if significant_horizontals.empty? || significant_verticals.empty?
68
+
60
69
  # Find intersection points
61
- intersections = build_intersection_map(horizontal_rulings, vertical_rulings)
70
+ intersections = build_intersection_map(significant_horizontals, significant_verticals)
62
71
  return cells if intersections.empty?
63
72
 
64
- # Get unique y positions from horizontal rulings (row boundaries)
65
- y_positions = horizontal_rulings.map { |r| r.y1.round(1) }.uniq.sort
73
+ # Gather all unique x positions from significant vertical rulings and horizontal ruling endpoints
74
+ x_positions = significant_verticals.map { |v| v.x1.round(1) }.uniq
75
+ significant_horizontals.each do |h|
76
+ x_positions << h.x1.round(1)
77
+ x_positions << h.x2.round(1)
78
+ end
79
+ x_positions = x_positions.uniq.sort
66
80
 
67
- return cells if y_positions.size < 2
81
+ return cells if x_positions.size < 2
68
82
 
69
- # Process each row individually to handle spanning cells
70
- y_positions.each_cons(2) do |top, bottom|
71
- # Find vertical rulings that span this row (intersect with row's Y range)
72
- row_verticals = vertical_rulings.select do |v|
73
- v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
83
+ # Process each column individually to handle spanning cells correctly
84
+ # For each column, only use horizontal rulings that actually cover that column
85
+ x_positions.each_cons(2) do |left, right|
86
+ # Skip very narrow columns (likely noise)
87
+ next if (right - left) < 10
88
+
89
+ # Find horizontal rulings that cover this column range
90
+ column_horizontals = significant_horizontals.select do |h|
91
+ h.x1 <= left + tolerance && h.x2 >= right - tolerance
74
92
  end
75
93
 
76
- # Get unique X positions from vertical rulings only
77
- x_positions = row_verticals.map { |v| v.x1.round(1) }.uniq.sort
94
+ next if column_horizontals.empty?
95
+
96
+ # Get y positions from rulings that cover this column
97
+ y_positions = column_horizontals.map { |r| r.y1.round(1) }.uniq.sort
98
+
99
+ next if y_positions.size < 2
78
100
 
79
- next if x_positions.size < 2
101
+ # Create cells for this column using only the relevant horizontal rulings
102
+ y_positions.each_cons(2) do |top, bottom|
103
+ # Check if vertical rulings span this row for this column
104
+ has_left = significant_verticals.any? do |v|
105
+ (v.x1 - left).abs <= tolerance &&
106
+ v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
107
+ end
108
+
109
+ has_right = significant_verticals.any? do |v|
110
+ (v.x1 - right).abs <= tolerance &&
111
+ v.y1 <= top + tolerance && v.y2 >= bottom - tolerance
112
+ end
80
113
 
81
- # Create cells for this row
82
- x_positions.each_cons(2) do |left, right|
83
- # Verify this cell has valid edges
84
- if valid_cell_by_edges?(left, right, top, bottom, horizontal_rulings, vertical_rulings, tolerance)
114
+ # Accept cells with:
115
+ # 1. Full edges (top, bottom, left, right)
116
+ # 2. Horizontal edges only (top, bottom) - for cells without side borders
117
+ # 3. Valid corners in intersection map
118
+ if has_left && has_right
85
119
  cells << Cell.new(top, left, right - left, bottom - top)
86
- # Also accept cells with corner validation
87
120
  elsif valid_cell_by_corners?(left, right, top, bottom, intersections, tolerance)
88
121
  cells << Cell.new(top, left, right - left, bottom - top)
122
+ else
123
+ # Accept cell if we have valid horizontal edges even without vertical borders
124
+ cells << Cell.new(top, left, right - left, bottom - top)
89
125
  end
90
126
  end
91
127
  end
92
128
 
93
- cells
129
+ # Remove duplicate cells (same position)
130
+ cells.uniq { |c| [c.top.round(1), c.left.round(1), c.bottom.round(1), c.right.round(1)] }
131
+ end
132
+
133
+ # Check if a cell has valid top and bottom horizontal edges (for cells without side borders)
134
+ def valid_cell_horizontal_edges?(left, right, top, bottom, horizontal_rulings, tolerance)
135
+ # Check for top edge (horizontal ruling at top that covers left to right)
136
+ has_top = horizontal_rulings.any? do |h|
137
+ (h.y1 - top).abs <= tolerance &&
138
+ h.x1 <= left + tolerance &&
139
+ h.x2 >= right - tolerance
140
+ end
141
+
142
+ # Check for bottom edge
143
+ has_bottom = horizontal_rulings.any? do |h|
144
+ (h.y1 - bottom).abs <= tolerance &&
145
+ h.x1 <= left + tolerance &&
146
+ h.x2 >= right - tolerance
147
+ end
148
+
149
+ has_top && has_bottom
94
150
  end
95
151
 
96
152
  def build_intersection_map(horizontal_rulings, vertical_rulings)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Tabula
4
- VERSION = '1.0.0'
4
+ VERSION = '1.0.3'
5
5
  end
data/tabula-rb.gemspec CHANGED
@@ -15,11 +15,10 @@ Gem::Specification.new do |spec|
15
15
  lattice-mode extraction (for PDFs with visible cell borders) and stream-mode
16
16
  extraction (for PDFs without ruling lines, using text positioning).
17
17
  DESC
18
- spec.homepage = 'https://github.com/tabulapdf/tabula-rb'
18
+ spec.homepage = 'https://github.com/khasinski/tabula-rb'
19
19
  spec.license = 'MIT'
20
20
  spec.required_ruby_version = '>= 3.1.0'
21
21
 
22
- spec.metadata['homepage_uri'] = spec.homepage
23
22
  spec.metadata['source_code_uri'] = spec.homepage
24
23
  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"
25
24
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tabula-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Hasiński
@@ -87,13 +87,12 @@ files:
87
87
  - lib/tabula/writers/writer.rb
88
88
  - mise.toml
89
89
  - tabula-rb.gemspec
90
- homepage: https://github.com/tabulapdf/tabula-rb
90
+ homepage: https://github.com/khasinski/tabula-rb
91
91
  licenses:
92
92
  - MIT
93
93
  metadata:
94
- homepage_uri: https://github.com/tabulapdf/tabula-rb
95
- source_code_uri: https://github.com/tabulapdf/tabula-rb
96
- changelog_uri: https://github.com/tabulapdf/tabula-rb/blob/main/CHANGELOG.md
94
+ source_code_uri: https://github.com/khasinski/tabula-rb
95
+ changelog_uri: https://github.com/khasinski/tabula-rb/blob/main/CHANGELOG.md
97
96
  rubygems_mfa_required: 'true'
98
97
  rdoc_options: []
99
98
  require_paths: