pdftdx 1.1.8 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0f4d29312cf65b9cdb06d6f67a5a6c00a0dba8ad
4
- data.tar.gz: 53f54c86ef77df038ac25091d771c8d7a32d3ea0
3
+ metadata.gz: 369947ea208604dcb52e6a9f329df412709d0f0f
4
+ data.tar.gz: 14d1803d485efed81cfc14e89d720f8da9e0d5bc
5
5
  SHA512:
6
- metadata.gz: 10f71afdd59fae8a75ccce78955bd5465e72efc1bfdf6e600269e119479a3b09d444e414eb0be83e33c6d7c4cdb98904e1b145e713cd8f0ea2c49bb2a439a68e
7
- data.tar.gz: 3de4e256aa6dec1969c1d0c65e75a09228bb0ab072cc1ec7283f705e29d4e1203d2e1d244b46f7aad5c91dbdb3847728db6f1ec49ff171926d8e910e5908afa9
6
+ metadata.gz: 71965bd9e8648e4be72546718891ea87d3d521080711396adaf50325359a0d90d445b0a56e0c210307698561f49f6c93379b47c4416d6a0fb05ad0211886b9db
7
+ data.tar.gz: bb9f966e063c2a8d3e83b60c9e8af61635ed8e7e7d9e98508ebc456bf2cca3b40e2a243bafb1d3036fc5eb5fab30ea1ce8a4b0437e80119e08432a3416d8aadc
data/lib/pdftdx/parser.rb CHANGED
@@ -152,6 +152,28 @@ module PDFTDX
152
152
  Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
153
153
  end
154
154
 
155
+ # Fix Dupes
156
+ # Shifts duplicate cells (cells whose x-offset equals that of an earlier cell in the row) one unit to the right, so they do not overwrite each other when the row is later keyed by x-offset
157
+ # @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
158
+ # @return [Array] A copy of the row in which the x-offsets of duplicate cells have been incremented; the input row is left untouched
159
+ def self.fix_dupes r
160
+
161
+ # Copy the row and each [xoffset, cell] pair so the caller's row is not mutated (shallow per pair: the cell objects themselves are shared)
162
+ nr = r.collect { |e| e.clone }
163
+
164
+ # Run through Cells
165
+ nr.length.times do |i|
166
+
167
+ # Count the cells after position i whose x-offset equals that of cell i
168
+ dupes = nr.slice(i + 1, nr.length).inject(0) { |a, c| a + (c[0] == nr[i][0] ? 1 : 0) }
169
+
170
+ # Fix Dupes: add 1 to the x-offset of the `dupes` cells directly following cell i (NOTE(review): this appears to assume duplicates are adjacent to cell i in the row - confirm against callers)
171
+ dupes.times { |j| nr[i + j + 1][0] = nr[i + j + 1][0] + 1 }
172
+ end
173
+
174
+ nr
175
+ end
176
+
155
177
  # Touch up Table
156
178
  # Splits Table into multiple headered tables.
157
179
  # Also, strips Left Offset info from Table Cells.
@@ -177,8 +199,8 @@ module PDFTDX
177
199
  # Compute Row Base (Default Columns)
178
200
  row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
179
201
 
180
- # Tables
181
- { head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
202
+ # Re-Build Table
203
+ { head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*((fix_dupes r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }).flatten)]) } }
182
204
  end
183
205
 
184
206
  # Drop Offsets
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '1.1.8'
8
+ VERSION = '1.2.0'
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.8
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse