RubyGems - pdftdx - Versions diffs - 1.0.4 → 1.1.7 - Mend

pdftdx 1.0.4 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ac8bab6085d60063b15d62cd7c3b23cf706eebad
-  data.tar.gz: 2aeec796b1e049ed35cad9c89e18c31ed6627c4b
+  metadata.gz: 8b5995c6e0360a17c900a2a6a7cf9f0ad0de4217
+  data.tar.gz: 229ee00d6dfbeecc5b568408fd9133ee41864e5e
 SHA512:
-  metadata.gz: 956a6181277c105d53c1c91c632e067d9bb8223e0fc5388c5fa61f9dffd7b3eaa2b51f4695ff5e0efaa7b5c28a395c354d5a027f09ebcfb75aebb954513fb8b2
-  data.tar.gz: 21dd314df4f0959edbd0d066f18ab9398626ad165fcf3849f81b4a4ff5e72dac6cf28a543bdadbf0c4258becc989886384ce8cbfe4c56c393a67d603422ebc89
+  metadata.gz: 45d5f8303e371c3a8241896dfc1debb24f79ff3fd3fd10019decd7eb56d91894235fda44a17874ff9e23cf14a89f885ec862ea2d8a453dad8e5777a2696764a1
+  data.tar.gz: c05db7fbf0194cf1d72258706d00a9e493d86baeb613920df9cbacc7f283cd1a0568c40a6975eca27ee1975422c227536aa281c390c43c214d709ec6669fcfbf

data/lib/pdftdx/parser.rb CHANGED Viewed

@@ -126,37 +126,75 @@ module PDFTDX
 			subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
 			# Pull up Sub-tables
-			stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
+			stables = subtab_titles.collect.with_index do |t, i|
+				{
+					name: t[:title].gsub(/<\/?b>/, ''),                                                             # Extract Sub-Table Name
+					data: htable_data                                                                               # Extract Sub-Table Data
+						.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i))                              # Slice Table Data until next Sub-Table
+						.collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } }           # Reject Table Headers
+				}
+			end
 			# Data until first sub-table index is considered 'unsorted'
 			unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
-			stables << htable_data.slice(0, unsorted_end)
+			# Insert last part (Unsorted)
+			stables << htable_data.slice(0, unsorted_end) if unsorted_end > 0
+			stables
+		end
+		# Sort Row
+		# Sorts Cells according to their x-offset
+		# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
+		# @return [Hash] The same row of data, but sorted according to x-offset
+		def self.sort_row r
+			Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
 		end
 		# Touch up Table
 		# Splits Table into multiple headered tables.
 		# Also, strips Left Offset info from Table Cells.
 		# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
-		# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
+		# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }] }]
 		def self.touch_up table
-			# Remove Column Offsets
-			table.collect! { |r| r.collect { |_left, cell| cell } }
 			# Split Table into multiple Headered Tables
-			headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
+			headers = table
+				.collect.with_index { |r, i| { idx: i, row: r } }
+				.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c[1]) } }
+				.collect { |r| { idx: r[:idx], row: r[:row].collect { |o, v| { o => v.gsub(/<\/?b>/, '') } } } }
 			# Pull up Headered Tables
 			htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
+			# Fix Rows
+			nh = htables.collect do |t|
+				# Acquire Column Offsets
+				cols = t[:head].collect { |o| o.first[0] }.sort
+				# Compute Row Base (Default Columns)
+				row_base = Hash[*(cols.collect { |c| [c, ''] }.flatten)]
+				# Tables
+				{ head: t[:head], data: t[:data].collect { |r| sort_row row_base.merge(Hash[*(r.collect { |o, c| [(cols.reverse.find { |co| co <= o }) || o, c] }.flatten)]) } }
+			end
+			# Drop Offsets
+			htables = nh.collect { |t| { head: t[:head].collect { |h| h.first[1] }, data: t[:data].collect { |r| r.collect { |_o, c| c } } } }
+			ntable = table.collect { |r| r.collect { |_o, c| c } }
 			# Split Headered Tables into multiple Named Sub-Tables
 			htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
 			# Data until first Header index is considered 'unsorted'
-			unsorted_end = headers.empty? ? table.length : headers[0][:idx]
+			unsorted_end = headers.empty? ? ntable.length : headers[0][:idx]
+			# Insert last part (Unsorted)
+			htables << sub_tablize(ntable.slice(0, unsorted_end)) if unsorted_end > 0
-			htables << sub_tablize(table.slice(0, unsorted_end))
+			htables
 		end
 		# Process

data/lib/pdftdx/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module PDFTDX
 	# Version
-	VERSION = '1.0.4'
+	VERSION = '1.1.7'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdftdx
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.1.7
 platform: ruby
 authors:
 - Eresse
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-01-13 00:00:00.000000000 Z
+date: 2017-01-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler