pdftdx 1.0.4 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ac8bab6085d60063b15d62cd7c3b23cf706eebad
4
- data.tar.gz: 2aeec796b1e049ed35cad9c89e18c31ed6627c4b
3
+ metadata.gz: 8b5995c6e0360a17c900a2a6a7cf9f0ad0de4217
4
+ data.tar.gz: 229ee00d6dfbeecc5b568408fd9133ee41864e5e
5
5
  SHA512:
6
- metadata.gz: 956a6181277c105d53c1c91c632e067d9bb8223e0fc5388c5fa61f9dffd7b3eaa2b51f4695ff5e0efaa7b5c28a395c354d5a027f09ebcfb75aebb954513fb8b2
7
- data.tar.gz: 21dd314df4f0959edbd0d066f18ab9398626ad165fcf3849f81b4a4ff5e72dac6cf28a543bdadbf0c4258becc989886384ce8cbfe4c56c393a67d603422ebc89
6
+ metadata.gz: 45d5f8303e371c3a8241896dfc1debb24f79ff3fd3fd10019decd7eb56d91894235fda44a17874ff9e23cf14a89f885ec862ea2d8a453dad8e5777a2696764a1
7
+ data.tar.gz: c05db7fbf0194cf1d72258706d00a9e493d86baeb613920df9cbacc7f283cd1a0568c40a6975eca27ee1975422c227536aa281c390c43c214d709ec6669fcfbf
data/lib/pdftdx/parser.rb CHANGED
@@ -126,37 +126,75 @@ module PDFTDX
126
126
  subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
127
127
 
128
128
  # Pull up Sub-tables
129
- stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
129
+ stables = subtab_titles.collect.with_index do |t, i|
130
+ {
131
+ name: t[:title].gsub(/<\/?b>/, ''), # Extract Sub-Table Name
132
+ data: htable_data # Extract Sub-Table Data
133
+ .slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)) # Slice Table Data until next Sub-Table
134
+ .collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } # Reject Table Headers
135
+ }
136
+ end
130
137
 
131
138
  # Data until first sub-table index is considered 'unsorted'
132
139
  unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
133
140
 
134
- stables << htable_data.slice(0, unsorted_end)
141
+ # Insert last part (Unsorted)
142
+ stables << htable_data.slice(0, unsorted_end) if unsorted_end > 0
143
+
144
+ stables
145
+ end
146
+
147
+ # Sort Row
148
+ # Sorts Cells according to their x-offset
149
+ # @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
150
+ # @return [Hash] The same row of data, but sorted according to x-offset
151
+ def self.sort_row r
152
+ Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
135
153
  end
136
154
 
137
155
  # Touch up Table
138
156
  # Splits Table into multiple headered tables.
139
157
  # Also, strips Left Offset info from Table Cells.
140
158
  # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
141
- # @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
159
+ # @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }] }]
142
160
  def self.touch_up table
143
161
 
144
- # Remove Column Offsets
145
- table.collect! { |r| r.collect { |_left, cell| cell } }
146
-
147
162
  # Split Table into multiple Headered Tables
148
- headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
163
+ headers = table
164
+ .collect.with_index { |r, i| { idx: i, row: r } }
165
+ .select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c[1]) } }
166
+ .collect { |r| { idx: r[:idx], row: r[:row].collect { |o, v| { o => v.gsub(/<\/?b>/, '') } } } }
149
167
 
150
168
  # Pull up Headered Tables
151
169
  htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
152
170
 
171
+ # Fix Rows
172
+ nh = htables.collect do |t|
173
+
174
+ # Acquire Column Offsets
175
+ cols = t[:head].collect { |o| o.first[0] }.sort
176
+
177
+ # Compute Row Base (Default Columns)
178
+ row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
179
+
180
+ # Tables
181
+ { head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
182
+ end
183
+
184
+ # Drop Offsets
185
+ htables = nh.collect { |t| { head: t[:head].collect { |h| h.first[1] }, data: t[:data].collect { |r| r.collect { |_o, c| c } } } }
186
+ ntable = table.collect { |r| r.collect { |_o, c| c } }
187
+
153
188
  # Split Headered Tables into multiple Named Sub-Tables
154
189
  htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
155
190
 
156
191
  # Data until first Header index is considered 'unsorted'
157
- unsorted_end = headers.empty? ? table.length : headers[0][:idx]
192
+ unsorted_end = headers.empty? ? ntable.length : headers[0][:idx]
193
+
194
+ # Insert last part (Unsorted)
195
+ htables << sub_tablize(ntable.slice(0, unsorted_end)) if unsorted_end > 0
158
196
 
159
- htables << sub_tablize(table.slice(0, unsorted_end))
197
+ htables
160
198
  end
161
199
 
162
200
  # Process
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '1.0.4'
8
+ VERSION = '1.1.7'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-01-13 00:00:00.000000000 Z
11
+ date: 2017-01-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler