pdftdx 1.0.4 → 1.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ac8bab6085d60063b15d62cd7c3b23cf706eebad
4
- data.tar.gz: 2aeec796b1e049ed35cad9c89e18c31ed6627c4b
3
+ metadata.gz: 8b5995c6e0360a17c900a2a6a7cf9f0ad0de4217
4
+ data.tar.gz: 229ee00d6dfbeecc5b568408fd9133ee41864e5e
5
5
  SHA512:
6
- metadata.gz: 956a6181277c105d53c1c91c632e067d9bb8223e0fc5388c5fa61f9dffd7b3eaa2b51f4695ff5e0efaa7b5c28a395c354d5a027f09ebcfb75aebb954513fb8b2
7
- data.tar.gz: 21dd314df4f0959edbd0d066f18ab9398626ad165fcf3849f81b4a4ff5e72dac6cf28a543bdadbf0c4258becc989886384ce8cbfe4c56c393a67d603422ebc89
6
+ metadata.gz: 45d5f8303e371c3a8241896dfc1debb24f79ff3fd3fd10019decd7eb56d91894235fda44a17874ff9e23cf14a89f885ec862ea2d8a453dad8e5777a2696764a1
7
+ data.tar.gz: c05db7fbf0194cf1d72258706d00a9e493d86baeb613920df9cbacc7f283cd1a0568c40a6975eca27ee1975422c227536aa281c390c43c214d709ec6669fcfbf
data/lib/pdftdx/parser.rb CHANGED
@@ -126,37 +126,75 @@ module PDFTDX
126
126
  subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
127
127
 
128
128
  # Pull up Sub-tables
129
- stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
129
+ stables = subtab_titles.collect.with_index do |t, i|
130
+ {
131
+ name: t[:title].gsub(/<\/?b>/, ''), # Extract Sub-Table Name
132
+ data: htable_data # Extract Sub-Table Data
133
+ .slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)) # Slice Table Data until next Sub-Table
134
+ .collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } # Reject Table Headers
135
+ }
136
+ end
130
137
 
131
138
  # Data until first sub-table index is considered 'unsorted'
132
139
  unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
133
140
 
134
- stables << htable_data.slice(0, unsorted_end)
141
+ # Insert last part (Unsorted)
142
+ stables << htable_data.slice(0, unsorted_end) if unsorted_end > 0
143
+
144
+ stables
145
+ end
146
+
147
+ # Sort Row
148
+ # Sorts Cells according to their x-offset
149
+ # @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
150
+ # @return [Hash] The same row of data, but sorted according to x-offset
151
+ def self.sort_row r
152
+ Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
135
153
  end
136
154
 
137
155
  # Touch up Table
138
156
  # Splits Table into multiple headered tables.
139
157
  # Also, strips Left Offset info from Table Cells.
140
158
  # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
141
- # @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
159
+ # @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }] }]
142
160
  def self.touch_up table
143
161
 
144
- # Remove Column Offsets
145
- table.collect! { |r| r.collect { |_left, cell| cell } }
146
-
147
162
  # Split Table into multiple Headered Tables
148
- headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
163
+ headers = table
164
+ .collect.with_index { |r, i| { idx: i, row: r } }
165
+ .select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c[1]) } }
166
+ .collect { |r| { idx: r[:idx], row: r[:row].collect { |o, v| { o => v.gsub(/<\/?b>/, '') } } } }
149
167
 
150
168
  # Pull up Headered Tables
151
169
  htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
152
170
 
171
+ # Fix Rows
172
+ nh = htables.collect do |t|
173
+
174
+ # Acquire Column Offsets
175
+ cols = t[:head].collect { |o| o.first[0] }.sort
176
+
177
+ # Compute Row Base (Default Columns)
178
+ row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
179
+
180
+ # Tables
181
+ { head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
182
+ end
183
+
184
+ # Drop Offsets
185
+ htables = nh.collect { |t| { head: t[:head].collect { |h| h.first[1] }, data: t[:data].collect { |r| r.collect { |_o, c| c } } } }
186
+ ntable = table.collect { |r| r.collect { |_o, c| c } }
187
+
153
188
  # Split Headered Tables into multiple Named Sub-Tables
154
189
  htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
155
190
 
156
191
  # Data until first Header index is considered 'unsorted'
157
- unsorted_end = headers.empty? ? table.length : headers[0][:idx]
192
+ unsorted_end = headers.empty? ? ntable.length : headers[0][:idx]
193
+
194
+ # Insert last part (Unsorted)
195
+ htables << sub_tablize(ntable.slice(0, unsorted_end)) if unsorted_end > 0
158
196
 
159
- htables << sub_tablize(table.slice(0, unsorted_end))
197
+ htables
160
198
  end
161
199
 
162
200
  # Process
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '1.0.4'
8
+ VERSION = '1.1.7'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-01-13 00:00:00.000000000 Z
11
+ date: 2017-01-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler