RubyGems - pdftdx - Versions diffs - 1.0.4 → 1.1.7 - Mend

pdftdx 1.0.4 → 1.1.7

Files changed (4)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ac8bab6085d60063b15d62cd7c3b23cf706eebad
-  data.tar.gz: 2aeec796b1e049ed35cad9c89e18c31ed6627c4b
+  metadata.gz: 8b5995c6e0360a17c900a2a6a7cf9f0ad0de4217
+  data.tar.gz: 229ee00d6dfbeecc5b568408fd9133ee41864e5e
 SHA512:
-  metadata.gz: 956a6181277c105d53c1c91c632e067d9bb8223e0fc5388c5fa61f9dffd7b3eaa2b51f4695ff5e0efaa7b5c28a395c354d5a027f09ebcfb75aebb954513fb8b2
-  data.tar.gz: 21dd314df4f0959edbd0d066f18ab9398626ad165fcf3849f81b4a4ff5e72dac6cf28a543bdadbf0c4258becc989886384ce8cbfe4c56c393a67d603422ebc89
+  metadata.gz: 45d5f8303e371c3a8241896dfc1debb24f79ff3fd3fd10019decd7eb56d91894235fda44a17874ff9e23cf14a89f885ec862ea2d8a453dad8e5777a2696764a1
+  data.tar.gz: c05db7fbf0194cf1d72258706d00a9e493d86baeb613920df9cbacc7f283cd1a0568c40a6975eca27ee1975422c227536aa281c390c43c214d709ec6669fcfbf

data/lib/pdftdx/parser.rb CHANGED Viewed

@@ -126,37 +126,75 @@ module PDFTDX
 			subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
 			# Pull up Sub-tables
-			stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
+			stables = subtab_titles.collect.with_index do |t, i|
+				{
+					name: t[:title].gsub(/<\/?b>/, ''),                                                             # Extract Sub-Table Name
+					data: htable_data                                                                               # Extract Sub-Table Data
+						.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i))                              # Slice Table Data until next Sub-Table
+						.collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } }           # Reject Table Headers
+				}
+			end
 			# Data until first sub-table index is considered 'unsorted'
 			unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
-			stables << htable_data.slice(0, unsorted_end)
+			# Insert last part (Unsorted)
+			stables << htable_data.slice(0, unsorted_end) if unsorted_end > 0
+			stables
+		end
+		# Sort Row
+		# Sorts Cells according to their x-offset
+		# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
+		# @return [Hash] The same row of data, but sorted according to x-offset
+		def self.sort_row r
+			Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
 		end
 		# Touch up Table
 		# Splits Table into multiple headered tables.
 		# Also, strips Left Offset info from Table Cells.
 		# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
-		# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
+		# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }] }]
 		def self.touch_up table
-			# Remove Column Offsets
-			table.collect! { |r| r.collect { |_left, cell| cell } }
 			# Split Table into multiple Headered Tables
-			headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
+			headers = table
+				.collect.with_index { |r, i| { idx: i, row: r } }
+				.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c[1]) } }
+				.collect { |r| { idx: r[:idx], row: r[:row].collect { |o, v| { o => v.gsub(/<\/?b>/, '') } } } }
 			# Pull up Headered Tables
 			htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
+			# Fix Rows
+			nh = htables.collect do |t|
+				# Acquire Column Offsets
+				cols = t[:head].collect { |o| o.first[0] }.sort
+				# Compute Row Base (Default Columns)
+				row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
+				# Tables
+				{ head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
+			end
+			# Drop Offsets
+			htables = nh.collect { |t| { head: t[:head].collect { |h| h.first[1] }, data: t[:data].collect { |r| r.collect { |_o, c| c } } } }
+			ntable = table.collect { |r| r.collect { |_o, c| c } }
 			# Split Headered Tables into multiple Named Sub-Tables
 			htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
 			# Data until first Header index is considered 'unsorted'
-			unsorted_end = headers.empty? ? table.length : headers[0][:idx]
+			unsorted_end = headers.empty? ? ntable.length : headers[0][:idx]
+			# Insert last part (Unsorted)
+			htables << sub_tablize(ntable.slice(0, unsorted_end)) if unsorted_end > 0
-			htables << sub_tablize(table.slice(0, unsorted_end))
+			htables
 		end
 		# Process

data/lib/pdftdx/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module PDFTDX
 	# Version
-	VERSION = '1.0.4'
+	VERSION = '1.1.7'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdftdx
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.1.7
 platform: ruby
 authors:
 - Eresse
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-01-13 00:00:00.000000000 Z
+date: 2017-01-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler