pdftdx 1.0.4 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pdftdx/parser.rb +47 -9
- data/lib/pdftdx/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b5995c6e0360a17c900a2a6a7cf9f0ad0de4217
|
4
|
+
data.tar.gz: 229ee00d6dfbeecc5b568408fd9133ee41864e5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45d5f8303e371c3a8241896dfc1debb24f79ff3fd3fd10019decd7eb56d91894235fda44a17874ff9e23cf14a89f885ec862ea2d8a453dad8e5777a2696764a1
|
7
|
+
data.tar.gz: c05db7fbf0194cf1d72258706d00a9e493d86baeb613920df9cbacc7f283cd1a0568c40a6975eca27ee1975422c227536aa281c390c43c214d709ec6669fcfbf
|
data/lib/pdftdx/parser.rb
CHANGED
@@ -126,37 +126,75 @@ module PDFTDX
|
|
126
126
|
subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
|
127
127
|
|
128
128
|
# Pull up Sub-tables
|
129
|
-
stables = subtab_titles.collect.with_index
|
129
|
+
stables = subtab_titles.collect.with_index do |t, i|
|
130
|
+
{
|
131
|
+
name: t[:title].gsub(/<\/?b>/, ''), # Extract Sub-Table Name
|
132
|
+
data: htable_data # Extract Sub-Table Data
|
133
|
+
.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)) # Slice Table Data until next Sub-Table
|
134
|
+
.collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } # Reject Table Headers
|
135
|
+
}
|
136
|
+
end
|
130
137
|
|
131
138
|
# Data until first sub-table index is considered 'unsorted'
|
132
139
|
unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
|
133
140
|
|
134
|
-
|
141
|
+
# Insert last part (Unsorted)
|
142
|
+
stables << htable_data.slice(0, unsorted_end) if unsorted_end > 0
|
143
|
+
|
144
|
+
stables
|
145
|
+
end
|
146
|
+
|
147
|
+
# Sort Row
|
148
|
+
# Sorts Cells according to their x-offset
|
149
|
+
# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
|
150
|
+
# @return [Hash] The same row of data, but sorted according to x-offset
|
151
|
+
def self.sort_row r
|
152
|
+
Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
|
135
153
|
end
|
136
154
|
|
137
155
|
# Touch up Table
|
138
156
|
# Splits Table into multiple headered tables.
|
139
157
|
# Also, strips Left Offset info from Table Cells.
|
140
158
|
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
141
|
-
# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }
|
159
|
+
# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }] }]
|
142
160
|
def self.touch_up table
|
143
161
|
|
144
|
-
# Remove Column Offsets
|
145
|
-
table.collect! { |r| r.collect { |_left, cell| cell } }
|
146
|
-
|
147
162
|
# Split Table into multiple Headered Tables
|
148
|
-
headers = table
|
163
|
+
headers = table
|
164
|
+
.collect.with_index { |r, i| { idx: i, row: r } }
|
165
|
+
.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c[1]) } }
|
166
|
+
.collect { |r| { idx: r[:idx], row: r[:row].collect { |o, v| { o => v.gsub(/<\/?b>/, '') } } } }
|
149
167
|
|
150
168
|
# Pull up Headered Tables
|
151
169
|
htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
|
152
170
|
|
171
|
+
# Fix Rows
|
172
|
+
nh = htables.collect do |t|
|
173
|
+
|
174
|
+
# Acquire Column Offsets
|
175
|
+
cols = t[:head].collect { |o| o.first[0] }.sort
|
176
|
+
|
177
|
+
# Compute Row Base (Default Columns)
|
178
|
+
row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
|
179
|
+
|
180
|
+
# Tables
|
181
|
+
{ head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
|
182
|
+
end
|
183
|
+
|
184
|
+
# Drop Offsets
|
185
|
+
htables = nh.collect { |t| { head: t[:head].collect { |h| h.first[1] }, data: t[:data].collect { |r| r.collect { |_o, c| c } } } }
|
186
|
+
ntable = table.collect { |r| r.collect { |_o, c| c } }
|
187
|
+
|
153
188
|
# Split Headered Tables into multiple Named Sub-Tables
|
154
189
|
htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
|
155
190
|
|
156
191
|
# Data until first Header index is considered 'unsorted'
|
157
|
-
unsorted_end = headers.empty? ?
|
192
|
+
unsorted_end = headers.empty? ? ntable.length : headers[0][:idx]
|
193
|
+
|
194
|
+
# Insert last part (Unsorted)
|
195
|
+
htables << sub_tablize(ntable.slice(0, unsorted_end)) if unsorted_end > 0
|
158
196
|
|
159
|
-
htables
|
197
|
+
htables
|
160
198
|
end
|
161
199
|
|
162
200
|
# Process
|
data/lib/pdftdx/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.4
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eresse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-01-
|
11
|
+
date: 2017-01-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|