RubyGems - pdftdx - Versions diffs - 1.2.1 → 1.2.2 - Mend

pdftdx 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 625c68c5b06fc15adae4491dc2d68d2a041ff07a
-  data.tar.gz: f06e7f4bbefed6d394a023e4208390f79d939aee
+  metadata.gz: a243b5d1cd30ce908060382654ca48b3ae97c242
+  data.tar.gz: ee753466c9b422f8fbcc301e5b93a8f6f61f3168
 SHA512:
-  metadata.gz: 94935c3774765d6a4e7c520ff773b58b8a43c64c29dae883d9709aac84867530d1cb019e0a1f7099c50182c4842a075ed7d82841958b8affa4c6cfda94ad3fc9
-  data.tar.gz: 4097b0496dc13e6c54138bcbacf3f5128d6c44c2a8b3f96e92ef29bdd34fda865e9ac628cc16199ddfa84fda57b047907a8270ed39358bda5917dc81e3726178
+  metadata.gz: 98b3420792f93b5bcdc59b16499be930ef52087014da36b6f63a582f013742f9af6081412958f44502e36e3c9710f6c634762087ff7e795531aaec8c2c67d671
+  data.tar.gz: 3704fb9ff4661dca80a0a6cb5fd744af7373a83cf870937ce8d799f977db0e8f67fb0406505328b4863a6e612f30bcce318d12493ed9f15c54871c24dd56ca05

data/lib/pdftdx.rb CHANGED Viewed

@@ -9,10 +9,12 @@ require 'pdftohtml'
 require 'pdftdx/parser'
 require 'pdftdx/version'
-# PDF TDX Module
+# PDF TDX Module:
+# Root Module for Pdftdx.
 module PDFTDX
-	# Extract Data from PDF
+	# Extract Data from PDF:
+	# Converts a PDF file to HTML format and then extracts anything that looks like tabular data.
 	# @param [String] pdf_file Path to a PDF file
 	# @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
 	def self.extract_data pdf_file

data/lib/pdftdx/parser.rb CHANGED Viewed

@@ -28,7 +28,7 @@ module PDFTDX
 		# Title Cell Regex
 		TITLE_CELL_REGEX = /<b>/
-		# Is All Same Data
+		# Is All Same Data:
 		# Determine whether a row's cells all contain the same data.
 		# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
 		# @return [Boolean] True if all cells contain the same data, False otherwise.
@@ -37,7 +37,7 @@ module PDFTDX
 			row_data.inject(true) { |b, e| b && (e[1] == n) }
 		end
-		# Contains Unusable Data (Empty / Long Strings)
+		# Contains Unusable Data (Empty / Long Strings):
 		# Determines whether a row contains unusable data.
 		# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
 		# @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
@@ -45,7 +45,7 @@ module PDFTDX
 			row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
 		end
-		# HTML Filter
+		# HTML Filter:
 		# Replaces HTML newlines by UNIX-style newlines.
 		# @param [String] s A string of HTML data
 		# @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
@@ -53,7 +53,7 @@ module PDFTDX
 			s.gsub '<br/>', "\n"
 		end
-		# Collect Data
+		# Collect Data:
 		# Extracts table-like chunks of HTML data from a hash of HTML pages.
 		# @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
 		# @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
@@ -73,7 +73,7 @@ module PDFTDX
 			end.flatten
 		end
-		# Build Data Table
+		# Build Data Table:
 		# Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
 		# @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
 		# @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
@@ -83,7 +83,7 @@ module PDFTDX
 			table
 		end
-		# Filter Table Rows
+		# Filter Table Rows:
 		# Filters out rows considered unusable, empty, oversize, footers, etc...
 		# Also, strips Top Offset info from Table Rows.
 		# @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
@@ -94,7 +94,7 @@ module PDFTDX
 				.collect { |_top, r| r }.reject { |r| r.size < 2 }                                                                              # Remove 'top offset' information and re-drop single-element rows
 		end
-		# Determine Headered Table Length
+		# Determine Headered Table Length:
 		# Computes the number of rows to be included in a given headered table.
 		# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
 		# @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
@@ -105,7 +105,7 @@ module PDFTDX
 			(headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
 		end
-		# Sub Table Length
+		# Sub Table Length:
 		# Computes the number of rows to be included in a given sub-table.
 		# @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
 		# @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
@@ -116,7 +116,7 @@ module PDFTDX
 			(stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
 		end
-		# Sub-Tablize
+		# Sub-Tablize:
 		# Splits a table into multiple named tables.
 		# @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
 		# @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
@@ -144,7 +144,7 @@ module PDFTDX
 			stables
 		end
-		# Sort Row
+		# Sort Row:
 		# Sorts Cells according to their x-offset
 		# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
 		# @return [Hash] The same row of data, but sorted according to x-offset
@@ -152,7 +152,7 @@ module PDFTDX
 			Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
 		end
-		# Fix Dupes
+		# Fix Dupes:
 		# Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
 		# @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
 		# @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
@@ -174,7 +174,7 @@ module PDFTDX
 			nr
 		end
-		# Touch up Table
+		# Touch up Table:
 		# Splits Table into multiple headered tables.
 		# Also, strips Left Offset info from Table Cells.
 		# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
@@ -219,7 +219,7 @@ module PDFTDX
 			htables
 		end
-		# Process
+		# Process:
 		# Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
 		# @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
 		# @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]

data/lib/pdftdx/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module PDFTDX
 	# Version
-	VERSION = '1.2.1'
+	VERSION = '1.2.2'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdftdx
 version: !ruby/object:Gem::Version
-  version: 1.2.1
+  version: 1.2.2
 platform: ruby
 authors:
 - Eresse
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-01-28 00:00:00.000000000 Z
+date: 2017-03-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.10
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: Simple PDF Table Data Extractor