RubyGems - pdftdx - Versions diffs - 0.3.1 → 1.0.0 - Mend

pdftdx 0.3.1 → 1.0.0

Files changed (9) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b6ac984f258036c4d20985529d0fdcda9fd3254f
-  data.tar.gz: 9b325fba42e742f0521317f4202a8b2a783114da
+  metadata.gz: 703bd26469ea409ff86c2a9383cdc34be4b7dcb6
+  data.tar.gz: 387ef2d99b28d53f961673780aa72c75941e2be1
 SHA512:
-  metadata.gz: 0ee08da850b0cb3ee593c7bbcbb0dd4f0d1244619e512018a83964fa103797d108ba2a4f5eca28f8e19ff2901c2d26d20bf36ca26d3f4787cd20a6286ef50513
-  data.tar.gz: 8d230f85707d4820ce341222e95a6f71fb553a3d7d57b7622a70b8668a966f8e4626e37538a2fe563855c0c3c50cdd846563ba3f6408318d0c39879c9ab2d710
+  metadata.gz: d802ca74deb4729983dafb4d1d7f5b25f15471e1f2b1007751779658baa86bdc0ec45074a14e98ac78232f05d9e3b3ba42e3e76ac8945575ad58236886d5fd4c
+  data.tar.gz: 4c60551311fd9cf6f4b355c2b4b92aa3e9abbd4d204361f479da88bf91731d061c67c15e6b1f9d8655888a392dc53550b5e68b4f32a86c6d25d57460c2103083

data/.idea/.rakeTasks CHANGED Viewed

@@ -4,4 +4,4 @@ You are allowed to:
 1. Remove rake task
 2. Add existing rake tasks
 To add existing rake tasks automatically delete this file and reload the project.
---><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.1.0.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.1.0 and build and push pdftdx-0.1.0.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
+--><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.3.2.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.3.2 and build and push pdftdx-0.3.2.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="Run tests" fullCmd="test" taksId="test" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>

data/.idea/pdftdx.iml CHANGED Viewed

@@ -1,27 +1,149 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="RUBY_MODULE" version="4">
   <component name="ModuleRunConfigurationManager">
-    <configuration default="false" name="release[remote]: pdftdx" type="RakeRunConfigurationType" factoryName="Rake" temporary="true">
+    <configuration default="false" name="test_filter_rows: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
+      <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
       <module name="pdftdx" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
-      <envs />
-      <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
+      <envs>
+        <env name="JRUBY_OPTS" value="-X+O" />
+      </envs>
+      <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
       <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
       <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
         <COVERAGE_PATTERN ENABLED="true">
           <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
         </COVERAGE_PATTERN>
       </EXTENSION>
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_NAME" VALUE="release" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_ARGS" VALUE="" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_ATTACHED_TEST_FRAMEWORKS" VALUE="" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_TRACE" VALUE="false" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_DRYRUN" VALUE="false" />
-      <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_PREREQS" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_filter_rows" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
+      <method />
+    </configuration>
+    <configuration default="false" name="test_collect_data: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
+      <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
+      <module name="pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
+      <envs>
+        <env name="JRUBY_OPTS" value="-X+O" />
+      </envs>
+      <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
+      <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
+      <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
+        <COVERAGE_PATTERN ENABLED="true">
+          <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
+        </COVERAGE_PATTERN>
+      </EXTENSION>
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_collect_data" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
+      <method />
+    </configuration>
+    <configuration default="false" name="test_build_table: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
+      <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
+      <module name="pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
+      <envs>
+        <env name="JRUBY_OPTS" value="-X+O" />
+      </envs>
+      <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
+      <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
+      <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
+        <COVERAGE_PATTERN ENABLED="true">
+          <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
+        </COVERAGE_PATTERN>
+      </EXTENSION>
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_build_table" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
+      <method />
+    </configuration>
+    <configuration default="false" name="test_process: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
+      <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
+      <module name="pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
+      <envs>
+        <env name="JRUBY_OPTS" value="-X+O" />
+      </envs>
+      <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
+      <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
+      <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
+        <COVERAGE_PATTERN ENABLED="true">
+          <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
+        </COVERAGE_PATTERN>
+      </EXTENSION>
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_process" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
+      <method />
+    </configuration>
+    <configuration default="false" name="All tests in test: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
+      <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
+      <module name="pdftdx" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
+      <envs>
+        <env name="JRUBY_OPTS" value="-X+O" />
+      </envs>
+      <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
+      <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
+      <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
+        <COVERAGE_PATTERN ENABLED="true">
+          <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
+        </COVERAGE_PATTERN>
+      </EXTENSION>
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="$MODULE_DIR$/test" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="**/{*_test,test_*}.rb" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="ALL_IN_FOLDER" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
+      <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
       <method />
     </configuration>
   </component>
@@ -30,7 +152,9 @@
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
-    <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="htmlentities (v4.3.4, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="minitest (v5.10.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.3, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
     <orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
   </component>
 </module>

data/Gemfile.lock CHANGED Viewed

@@ -1,13 +1,17 @@
 PATH
   remote: .
   specs:
-    pdftdx (0.2.0)
+    pdftdx (1.0.0)
+      htmlentities
+      minitest
       pdftohtml
 GEM
   remote: https://rubygems.org/
   specs:
-    pdftohtml (0.2.1)
+    htmlentities (4.3.4)
+    minitest (5.10.1)
+    pdftohtml (0.2.3)
     rake (10.5.0)
 PLATFORMS

data/Rakefile CHANGED Viewed

@@ -1,2 +1,8 @@
 require "bundler/gem_tasks"
+require 'rake/testtask'
 task :default => :spec
+Rake::TestTask.new do |t|
+	t.libs << 'test'
+end

data/lib/pdftdx/parser.rb CHANGED Viewed

@@ -22,61 +22,160 @@ module PDFTDX
 		# Page Offset
 		PAGE_OFF = 10000
-		# Title Cell Regex
-		TITLE_CELL_REGEX = /<bbb>/
+		# Maximum Allowed Offset from Page Top
+		PAGE_MAX_TOP = 1100
-		# Check Same Line
-		def self.same_line data, idx_a, idx_b
-			data[idx_a][:top] == data[idx_b][:top]
-		end
+		# Title Cell Regex
+		TITLE_CELL_REGEX = /<b>/
 		# Is All Same Data
-		def self.is_all_same row_data
+		# Determine whether a row's cells all contain the same data.
+		# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
+		# @return [Boolean] True if all cells contain the same data, False otherwise.
+		def self.is_all_same? row_data
 			n = row_data[row_data.keys[0]]
 			row_data.inject(true) { |b, e| b && (e[1] == n) }
 		end
 		# Contains Unusable Data (Empty / Long Strings)
-		def self.contains_unusable row_data
+		# Determines whether a row contains unusable data.
+		# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
+		# @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
+		def self.contains_unusable? row_data
 			row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
 		end
-		# Process Data
-		def self.process_data data
+		# HTML Filter
+		# Replaces HTML newlines by UNIX-style newlines.
+		# @param [String] s A string of HTML data
+		# @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
+		def self.hfilter s
+			s.gsub '<br/>', "\n"
+		end
+		# Collect Data
+		# Extracts table-like chunks of HTML data from a hash of HTML pages.
+		# @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
+		# @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
+		def self.collect_data data
-			# Build Data Table
+			# Build HTML Entity Decoder
+			coder = HTMLEntities.new
+			# Collect File Data
+			off = 0
+			data.collect do |_idx, page|
+				off = off + PAGE_OFF
+				page
+					.select { |l| LINE_REGEX =~ l }                                                                                             # Collect Table-like data
+					.collect { |l| LINE_REGEX.match l }                                                                                         # Extract Table Element Metadata (Position)
+					.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } }                               # Produce Hash of Raw Table Data
+			end.flatten
+		end
+		# Build Data Table
+		# Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
+		# @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
+		# @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
+		def self.build_table data
 			table = {}
 			data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
+			table
+		end
-			# Filter Table Rows (Remove Lone Elements & Footers)
-			table.reject! { |top, row| row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row) }
+		# Filter Table Rows
+		# Filters out rows considered unusable, empty, oversize, footers, etc...
+		# Also, strips Top Offset info from Table Rows.
+		# @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
+		# @return [Array] An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
+		def self.filter_rows data
+			data
+				.reject { |top, row| row.size < 2 || (top % PAGE_OFF) >= PAGE_MAX_TOP || is_all_same?(row) || contains_unusable?(row) }         # Drop Single-Element Rows, Footer Data, Useless Rows (all cells identical) & Unusable Rows (Empty / Oversize Cells)
+				.collect { |_top, r| r }.reject { |r| r.size < 2 }                                                                              # Remove 'top offset' information and re-drop single-element rows
+		end
-			# Filter Table Cells
-			table = table.collect { |_top, r| r.reject { |_left, d| TITLE_CELL_REGEX =~ d } }.reject { |r| r.size < 1 }
+		# Determine Headered Table Length
+		# Computes the number of rows to be included in a given headered table.
+		# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
+		# @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
+		# @param [Hash] h The current header row (determine htable length from this)
+		# @param [Fixnum] i The current header's index within the *headers* array
+		# @return [Fixnum] The number of rows
+		def self.htable_length table, headers, h, i
+			(headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
+		end
-			# Cleanup Table ( IS THIS NECESSARY ? )
-			table.reject! { |r| r.size < 2 }
+		# Sub Table Length
+		# Computes the number of rows to be included in a given sub-table.
+		# @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
+		# @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
+		# @param [Hash] t The current sub-table title row (determine stable length from this)
+		# @param [Fixnum] i The current sub-table title's index within the *stables* array
+		# @return [Fixnum] The number of rows
+		def self.sub_tab_len table, stables, t, i
+			(stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
+		end
-			# DEBUG
-			puts "=============> #{table}"
+		# Sub-Tablize
+		# Splits a table into multiple named tables.
+		# @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
+		# @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
+		def self.sub_tablize htable_data
-			table
+			# Collect Sub-table Title Rows
+			subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
+			# Pull up Sub-tables
+			stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
+			# Data until first sub-table index is considered 'unsorted'
+			unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
+			stables << htable_data.slice(0, unsorted_end)
 		end
-		# HTML Filter
-		def self.hfilter s
-			s.gsub '<br/>', "\n"
+		# Touch up Table
+		# Splits Table into multiple headered tables.
+		# Also, strips Left Offset info from Table Cells.
+		# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
+		# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
+		def self.touch_up table
+			# Remove Column Offsets
+			table.collect! { |r| r.collect { |_left, cell| cell } }
+			# Split Table into multiple Headered Tables
+			headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
+			# Pull up Headered Tables
+			htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
+			# Split Headered Tables into multiple Named Sub-Tables
+			htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
+			# Data until first Header index is considered 'unsorted'
+			unsorted_end = headers.empty? ? table.length : headers[0][:idx]
+			htables << sub_tablize(table.slice(0, unsorted_end))
 		end
-		# Process Page Files
-		def self.process_page_files page_data
+		# Process
+		# Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
+		# @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
+		# @return [Array] An array of tables. Each headered table is represented as a hash containing its header row (head) and its data: an array of named sub-tables (hashes containing a name and an array of rows), followed by one array holding the remaining unnamed rows. The last element of the result is the same kind of sub-table array, built from the rows found before the first header. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
+		def self.process page_data
-			# Build HTML Entity Decoder
-			coder = HTMLEntities.new
+			# Collect Data
+			data = collect_data page_data
-			# Collect & Process File Data
-			off = 0
-			process_data page_data.collect { |_idx, page| off = off + PAGE_OFF; page.select { |l| LINE_REGEX =~ l }.collect { |l| LINE_REGEX.match l }.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } }.flatten
+			# Build Data Table
+			table = build_table data
+			# Filter Rows
+			table = filter_rows table
+			# Filter Table Cells & Touch up
+			touch_up table
 		end
 	end
 end

data/lib/pdftdx/version.rb CHANGED Viewed

@@ -5,5 +5,5 @@
 module PDFTDX
 	# Version
-	VERSION = '0.3.1'
+	VERSION = '1.0.0'
 end

data/pdftdx.gemspec CHANGED Viewed

@@ -21,5 +21,7 @@ Gem::Specification.new do |spec|
 	spec.add_development_dependency "bundler", "~> 1.12"
 	spec.add_development_dependency "rake", "~> 10.0"
+	spec.add_runtime_dependency "minitest"
+	spec.add_runtime_dependency "htmlentities"
 	spec.add_runtime_dependency "pdftohtml"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdftdx
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 1.0.0
 platform: ruby
 authors:
 - Eresse
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-12-09 00:00:00.000000000 Z
+date: 2016-12-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -38,6 +38,34 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: htmlentities
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: pdftohtml
   requirement: !ruby/object:Gem::Requirement