pdftdx 1.0.4 → 1.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pdftdx/parser.rb +47 -9
- data/lib/pdftdx/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b5995c6e0360a17c900a2a6a7cf9f0ad0de4217
|
4
|
+
data.tar.gz: 229ee00d6dfbeecc5b568408fd9133ee41864e5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45d5f8303e371c3a8241896dfc1debb24f79ff3fd3fd10019decd7eb56d91894235fda44a17874ff9e23cf14a89f885ec862ea2d8a453dad8e5777a2696764a1
|
7
|
+
data.tar.gz: c05db7fbf0194cf1d72258706d00a9e493d86baeb613920df9cbacc7f283cd1a0568c40a6975eca27ee1975422c227536aa281c390c43c214d709ec6669fcfbf
|
data/lib/pdftdx/parser.rb
CHANGED
@@ -126,37 +126,75 @@ module PDFTDX
|
|
126
126
|
subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
|
127
127
|
|
128
128
|
# Pull up Sub-tables
|
129
|
-
stables = subtab_titles.collect.with_index
|
129
|
+
stables = subtab_titles.collect.with_index do |t, i|
|
130
|
+
{
|
131
|
+
name: t[:title].gsub(/<\/?b>/, ''), # Extract Sub-Table Name
|
132
|
+
data: htable_data # Extract Sub-Table Data
|
133
|
+
.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)) # Slice Table Data until next Sub-Table
|
134
|
+
.collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } # Reject Table Headers
|
135
|
+
}
|
136
|
+
end
|
130
137
|
|
131
138
|
# Data until first sub-table index is considered 'unsorted'
|
132
139
|
unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
|
133
140
|
|
134
|
-
|
141
|
+
# Insert last part (Unsorted)
|
142
|
+
stables << htable_data.slice(0, unsorted_end) if unsorted_end > 0
|
143
|
+
|
144
|
+
stables
|
145
|
+
end
|
146
|
+
|
147
|
+
# Sort Row
# Sorts Cells according to their x-offset
# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
# @return [Hash] A new Hash holding the same cells, inserted in ascending x-offset order
def self.sort_row r
  # Build the Hash straight from [offset, cell] pairs. Splatting a flattened
  # array would also flatten any cell that is itself an Array and corrupt the pairs.
  Hash[r.sort_by { |offset, _cell| offset }]
end
|
136
154
|
|
137
155
|
# Touch up Table
# Splits Table into multiple headered tables.
# Also, strips Left Offset info from Table Cells.
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }] }]
def self.touch_up table

  # Locate Header Rows: rows in which every cell matches TITLE_CELL_REGEX.
  # Each header cell is kept as a single-pair hash { offset => title } with <b> / </b> tags removed.
  headers = table
    .collect.with_index { |r, i| { idx: i, row: r } }
    .select { |e| e[:row].all? { |_o, c| TITLE_CELL_REGEX =~ c } }
    .collect { |e| { idx: e[:idx], row: e[:row].collect { |o, v| { o => v.gsub(/<\/?b>/, '') } } } }

  # Pull up Headered Tables (rows following each header row; length is decided by htable_length)
  htables = headers.collect.with_index do |h, i|
    { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) }
  end

  # Fix Rows, Drop Offsets and split into Named Sub-Tables
  htables = htables.collect do |t|

    # Acquire Column Offsets (ascending), plus a descending copy used to look up
    # the right-most column starting at or before a given cell offset
    cols = t[:head].collect { |cell| cell.first[0] }.sort
    cols_desc = cols.reverse

    # Compute Row Base (Default Columns): every header column present, empty by default
    row_base = Hash[cols.collect { |c| [c, ''] }]

    rows = t[:data].collect do |r|
      # Snap each cell to its header column; cells left of every column keep their own offset.
      # Pairs are handed to Hash[] directly (no flatten / splat) so cell values are never unpacked.
      snapped = Hash[r.collect { |o, c| [cols_desc.find { |co| co <= o } || o, c] }]
      sort_row(row_base.merge(snapped)).values
    end

    { head: t[:head].collect { |cell| cell.first[1] }, data: sub_tablize(rows) }
  end

  # Data until first Header index is considered 'unsorted'
  unsorted_end = headers.empty? ? table.length : headers.first[:idx]

  # Insert last part (Unsorted), with offsets dropped
  unsorted = table.first(unsorted_end).collect { |r| r.collect { |_o, c| c } }
  htables << sub_tablize(unsorted) unless unsorted.empty?

  htables
end
|
161
199
|
|
162
200
|
# Process
|
data/lib/pdftdx/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eresse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-01-
|
11
|
+
date: 2017-01-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|