pdftdx 1.1.8 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0f4d29312cf65b9cdb06d6f67a5a6c00a0dba8ad
4
- data.tar.gz: 53f54c86ef77df038ac25091d771c8d7a32d3ea0
3
+ metadata.gz: 369947ea208604dcb52e6a9f329df412709d0f0f
4
+ data.tar.gz: 14d1803d485efed81cfc14e89d720f8da9e0d5bc
5
5
  SHA512:
6
- metadata.gz: 10f71afdd59fae8a75ccce78955bd5465e72efc1bfdf6e600269e119479a3b09d444e414eb0be83e33c6d7c4cdb98904e1b145e713cd8f0ea2c49bb2a439a68e
7
- data.tar.gz: 3de4e256aa6dec1969c1d0c65e75a09228bb0ab072cc1ec7283f705e29d4e1203d2e1d244b46f7aad5c91dbdb3847728db6f1ec49ff171926d8e910e5908afa9
6
+ metadata.gz: 71965bd9e8648e4be72546718891ea87d3d521080711396adaf50325359a0d90d445b0a56e0c210307698561f49f6c93379b47c4416d6a0fb05ad0211886b9db
7
+ data.tar.gz: bb9f966e063c2a8d3e83b60c9e8af61635ed8e7e7d9e98508ebc456bf2cca3b40e2a243bafb1d3036fc5eb5fab30ea1ce8a4b0437e80119e08432a3416d8aadc
data/lib/pdftdx/parser.rb CHANGED
@@ -152,6 +152,28 @@ module PDFTDX
152
152
  Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
153
153
  end
154
154
 
155
+ # Fix Dupes
156
+ # Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
157
+ # @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
158
+ # @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
159
+ def self.fix_dupes r
160
+
161
+ # Deep-Duplicate Row
162
+ nr = r.collect { |e| e.clone }
163
+
164
+ # Run through Cells
165
+ nr.length.times do |i|
166
+
167
+ # Acquire Duplicate Length
168
+ dupes = nr.slice(i + 1, nr.length).inject(0) { |a, c| a + (c[0] == nr[i][0] ? 1 : 0) }
169
+
170
+ # Fix Dupes
171
+ dupes.times { |j| nr[i + j + 1][0] = nr[i + j + 1][0] + 1 }
172
+ end
173
+
174
+ nr
175
+ end
176
+
155
177
  # Touch up Table
156
178
  # Splits Table into multiple headered tables.
157
179
  # Also, strips Left Offset info from Table Cells.
@@ -177,8 +199,8 @@ module PDFTDX
177
199
  # Compute Row Base (Default Columns)
178
200
  row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
179
201
 
180
- # Tables
181
- { head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
202
+ # Re-Build Table
203
+ { head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*((fix_dupes r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }).flatten)]) } }
182
204
  end
183
205
 
184
206
  # Drop Offsets
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '1.1.8'
8
+ VERSION = '1.2.0'
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.8
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse