pdftdx 1.1.8 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pdftdx/parser.rb +24 -2
- data/lib/pdftdx/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 369947ea208604dcb52e6a9f329df412709d0f0f
|
4
|
+
data.tar.gz: 14d1803d485efed81cfc14e89d720f8da9e0d5bc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71965bd9e8648e4be72546718891ea87d3d521080711396adaf50325359a0d90d445b0a56e0c210307698561f49f6c93379b47c4416d6a0fb05ad0211886b9db
|
7
|
+
data.tar.gz: bb9f966e063c2a8d3e83b60c9e8af61635ed8e7e7d9e98508ebc456bf2cca3b40e2a243bafb1d3036fc5eb5fab30ea1ce8a4b0437e80119e08432a3416d8aadc
|
data/lib/pdftdx/parser.rb
CHANGED
@@ -152,6 +152,28 @@ module PDFTDX
|
|
152
152
|
Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
|
153
153
|
end
|
154
154
|
|
155
|
+
# Fix Dupes
|
156
|
+
# Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
|
157
|
+
# @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
|
158
|
+
# @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
|
159
|
+
def self.fix_dupes r
|
160
|
+
|
161
|
+
# Deep-Duplicate Row
|
162
|
+
nr = r.collect { |e| e.clone }
|
163
|
+
|
164
|
+
# Run through Cells
|
165
|
+
nr.length.times do |i|
|
166
|
+
|
167
|
+
# Count the later cells that share this cell's x-offset
|
168
|
+
dupes = nr.slice(i + 1, nr.length).inject(0) { |a, c| a + (c[0] == nr[i][0] ? 1 : 0) }
|
169
|
+
|
170
|
+
# Shift the next `dupes` cells one x-offset unit to the right
|
171
|
+
dupes.times { |j| nr[i + j + 1][0] = nr[i + j + 1][0] + 1 }
|
172
|
+
end
|
173
|
+
|
174
|
+
nr
|
175
|
+
end
|
176
|
+
|
155
177
|
# Touch up Table
|
156
178
|
# Splits Table into multiple headered tables.
|
157
179
|
# Also, strips Left Offset info from Table Cells.
|
@@ -177,8 +199,8 @@ module PDFTDX
|
|
177
199
|
# Compute Row Base (Default Columns)
|
178
200
|
row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
|
179
201
|
|
180
|
-
#
|
181
|
-
{ head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
|
202
|
+
# Re-Build Table
|
203
|
+
{ head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*((fix_dupes r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }).flatten)]) } }
|
182
204
|
end
|
183
205
|
|
184
206
|
# Drop Offsets
|
data/lib/pdftdx/version.rb
CHANGED