pdftdx 0.3.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b6ac984f258036c4d20985529d0fdcda9fd3254f
4
- data.tar.gz: 9b325fba42e742f0521317f4202a8b2a783114da
3
+ metadata.gz: 703bd26469ea409ff86c2a9383cdc34be4b7dcb6
4
+ data.tar.gz: 387ef2d99b28d53f961673780aa72c75941e2be1
5
5
  SHA512:
6
- metadata.gz: 0ee08da850b0cb3ee593c7bbcbb0dd4f0d1244619e512018a83964fa103797d108ba2a4f5eca28f8e19ff2901c2d26d20bf36ca26d3f4787cd20a6286ef50513
7
- data.tar.gz: 8d230f85707d4820ce341222e95a6f71fb553a3d7d57b7622a70b8668a966f8e4626e37538a2fe563855c0c3c50cdd846563ba3f6408318d0c39879c9ab2d710
6
+ metadata.gz: d802ca74deb4729983dafb4d1d7f5b25f15471e1f2b1007751779658baa86bdc0ec45074a14e98ac78232f05d9e3b3ba42e3e76ac8945575ad58236886d5fd4c
7
+ data.tar.gz: 4c60551311fd9cf6f4b355c2b4b92aa3e9abbd4d204361f479da88bf91731d061c67c15e6b1f9d8655888a392dc53550b5e68b4f32a86c6d25d57460c2103083
data/.idea/.rakeTasks CHANGED
@@ -4,4 +4,4 @@ You are allowed to:
4
4
  1. Remove rake task
5
5
  2. Add existing rake tasks
6
6
  To add existing rake tasks automatically delete this file and reload the project.
7
- --><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.1.0.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.1.0 and build and push pdftdx-0.1.0.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
7
+ --><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.3.2.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.3.2 and build and push pdftdx-0.3.2.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="Run tests" fullCmd="test" taksId="test" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
data/.idea/pdftdx.iml CHANGED
@@ -1,27 +1,149 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
2
  <module type="RUBY_MODULE" version="4">
3
3
  <component name="ModuleRunConfigurationManager">
4
- <configuration default="false" name="release[remote]: pdftdx" type="RakeRunConfigurationType" factoryName="Rake" temporary="true">
4
+ <configuration default="false" name="test_filter_rows: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
5
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
5
6
  <module name="pdftdx" />
6
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
7
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$" />
8
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
9
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
10
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
11
- <envs />
12
- <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="false" />
7
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
8
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
9
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
10
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
11
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
12
+ <envs>
13
+ <env name="JRUBY_OPTS" value="-X+O" />
14
+ </envs>
15
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
13
16
  <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
14
17
  <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
15
18
  <COVERAGE_PATTERN ENABLED="true">
16
19
  <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
17
20
  </COVERAGE_PATTERN>
18
21
  </EXTENSION>
19
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_NAME" VALUE="release" />
20
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_ARGS" VALUE="" />
21
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_ATTACHED_TEST_FRAMEWORKS" VALUE="" />
22
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_TRACE" VALUE="false" />
23
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_DRYRUN" VALUE="false" />
24
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_PREREQS" VALUE="false" />
22
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
23
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
24
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
25
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_filter_rows" />
26
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
27
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
28
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
29
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
30
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
31
+ <method />
32
+ </configuration>
33
+ <configuration default="false" name="test_collect_data: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
34
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
35
+ <module name="pdftdx" />
36
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
37
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
38
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
39
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
40
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
41
+ <envs>
42
+ <env name="JRUBY_OPTS" value="-X+O" />
43
+ </envs>
44
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
45
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
46
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
47
+ <COVERAGE_PATTERN ENABLED="true">
48
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
49
+ </COVERAGE_PATTERN>
50
+ </EXTENSION>
51
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
52
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
53
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
54
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_collect_data" />
55
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
56
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
57
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
58
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
59
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
60
+ <method />
61
+ </configuration>
62
+ <configuration default="false" name="test_build_table: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
63
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
64
+ <module name="pdftdx" />
65
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
66
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
67
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
68
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
69
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
70
+ <envs>
71
+ <env name="JRUBY_OPTS" value="-X+O" />
72
+ </envs>
73
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
74
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
75
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
76
+ <COVERAGE_PATTERN ENABLED="true">
77
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
78
+ </COVERAGE_PATTERN>
79
+ </EXTENSION>
80
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
81
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
82
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
83
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_build_table" />
84
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
85
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
86
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
87
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
88
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
89
+ <method />
90
+ </configuration>
91
+ <configuration default="false" name="test_process: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
92
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
93
+ <module name="pdftdx" />
94
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
95
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
96
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
97
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
98
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
99
+ <envs>
100
+ <env name="JRUBY_OPTS" value="-X+O" />
101
+ </envs>
102
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
103
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
104
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
105
+ <COVERAGE_PATTERN ENABLED="true">
106
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
107
+ </COVERAGE_PATTERN>
108
+ </EXTENSION>
109
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
110
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
111
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
112
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_process" />
113
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
114
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
115
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
116
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
117
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
118
+ <method />
119
+ </configuration>
120
+ <configuration default="false" name="All tests in test: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
121
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
122
+ <module name="pdftdx" />
123
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
124
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test" />
125
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
126
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
127
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
128
+ <envs>
129
+ <env name="JRUBY_OPTS" value="-X+O" />
130
+ </envs>
131
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
132
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
133
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
134
+ <COVERAGE_PATTERN ENABLED="true">
135
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
136
+ </COVERAGE_PATTERN>
137
+ </EXTENSION>
138
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="$MODULE_DIR$/test" />
139
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="" />
140
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="**/{*_test,test_*}.rb" />
141
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="" />
142
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="ALL_IN_FOLDER" />
143
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
144
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
145
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
146
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
25
147
  <method />
26
148
  </configuration>
27
149
  </component>
@@ -30,7 +152,9 @@
30
152
  <orderEntry type="inheritedJdk" />
31
153
  <orderEntry type="sourceFolder" forTests="false" />
32
154
  <orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
33
- <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
155
+ <orderEntry type="library" scope="PROVIDED" name="htmlentities (v4.3.4, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
156
+ <orderEntry type="library" scope="PROVIDED" name="minitest (v5.10.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
157
+ <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.3, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
34
158
  <orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
35
159
  </component>
36
160
  </module>
data/Gemfile.lock CHANGED
@@ -1,13 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pdftdx (0.2.0)
4
+ pdftdx (1.0.0)
5
+ htmlentities
6
+ minitest
5
7
  pdftohtml
6
8
 
7
9
  GEM
8
10
  remote: https://rubygems.org/
9
11
  specs:
10
- pdftohtml (0.2.1)
12
+ htmlentities (4.3.4)
13
+ minitest (5.10.1)
14
+ pdftohtml (0.2.3)
11
15
  rake (10.5.0)
12
16
 
13
17
  PLATFORMS
data/Rakefile CHANGED
@@ -1,2 +1,8 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
2
4
  task :default => :spec
5
+
6
+ Rake::TestTask.new do |t|
7
+ t.libs << 'test'
8
+ end
data/lib/pdftdx/parser.rb CHANGED
@@ -22,61 +22,160 @@ module PDFTDX
22
22
  # Page Offset
23
23
  PAGE_OFF = 10000
24
24
 
25
- # Title Cell Regex
26
- TITLE_CELL_REGEX = /<bbb>/
25
+ # Maximum Allowed Offset from Page Top
26
+ PAGE_MAX_TOP = 1100
27
27
 
28
- # Check Same Line
29
- def self.same_line data, idx_a, idx_b
30
- data[idx_a][:top] == data[idx_b][:top]
31
- end
28
+ # Title Cell Regex
29
+ TITLE_CELL_REGEX = /<b>/
32
30
 
33
31
  # Is All Same Data
34
- def self.is_all_same row_data
32
+ # Determine whether a row's cells all contain the same data.
33
+ # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
34
+ # @return [Boolean] True if all cells contain the same data, False otherwise.
35
+ def self.is_all_same? row_data
35
36
  n = row_data[row_data.keys[0]]
36
37
  row_data.inject(true) { |b, e| b && (e[1] == n) }
37
38
  end
38
39
 
39
40
  # Contains Unusable Data (Empty / Long Strings)
40
- def self.contains_unusable row_data
41
+ # Determines whether a row contains unusable data.
42
+ # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
43
+ # @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
44
+ def self.contains_unusable? row_data
41
45
  row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
42
46
  end
43
47
 
44
- # Process Data
45
- def self.process_data data
48
+ # HTML Filter
49
+ # Replaces HTML newlines by UNIX-style newlines.
50
+ # @param [String] s A string of HTML data
51
+ # @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
52
+ def self.hfilter s
53
+ s.gsub '<br/>', "\n"
54
+ end
55
+
56
+ # Collect Data
57
+ # Extracts table-like chunks of HTML data from a hash of HTML pages.
58
+ # @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
59
+ # @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
60
+ def self.collect_data data
46
61
 
47
- # Build Data Table
62
+ # Build HTML Entity Decoder
63
+ coder = HTMLEntities.new
64
+
65
+ # Collect File Data
66
+ off = 0
67
+ data.collect do |_idx, page|
68
+ off = off + PAGE_OFF
69
+ page
70
+ .select { |l| LINE_REGEX =~ l } # Collect Table-like data
71
+ .collect { |l| LINE_REGEX.match l } # Extract Table Element Metadata (Position)
72
+ .collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } # Produce Hash of Raw Table Data
73
+ end.flatten
74
+ end
75
+
76
+ # Build Data Table
77
+ # Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
78
+ # @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
79
+ # @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
80
+ def self.build_table data
48
81
  table = {}
49
82
  data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
83
+ table
84
+ end
50
85
 
51
- # Filter Table Rows (Remove Lone Elements & Footers)
52
- table.reject! { |top, row| row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row) }
86
+ # Filter Table Rows
87
+ # Filters out rows considered unusable, empty, oversize, footers, etc...
88
+ # Also, strips Top Offset info from Table Rows.
89
+ # @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
90
+ # @return [Array] An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
91
+ def self.filter_rows data
92
+ data
93
+ .reject { |top, row| row.size < 2 || (top % PAGE_OFF) >= PAGE_MAX_TOP || is_all_same?(row) || contains_unusable?(row) } # Drop Single-Element Rows, Footer Data, Useless Rows (all cells identical) & Unusable Rows (Empty / Oversize Cells)
94
+ .collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
95
+ end
53
96
 
54
- # Filter Table Cells
55
- table = table.collect { |_top, r| r.reject { |_left, d| TITLE_CELL_REGEX =~ d } }.reject { |r| r.size < 1 }
97
+ # Determine Headered Table Length
98
+ # Computes the number of rows to be included in a given headered table.
99
+ # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
100
+ # @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
101
+ # @param [Hash] h The current header row (determine htable length from this)
102
+ # @param [Fixnum] i The current header's index within the *headers* array
103
+ # @return [Fixnum] The number of rows
104
+ def self.htable_length table, headers, h, i
105
+ (headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
106
+ end
56
107
 
57
- # Cleanup Table ( IS THIS NECESSARY ? )
58
- table.reject! { |r| r.size < 2 }
108
+ # Sub Table Length
109
+ # Computes the number of rows to be included in a given sub-table.
110
+ # @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
111
+ # @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
112
+ # @param [Hash] t The current sub-table title row (determine stable length from this)
113
+ # @param [Fixnum] i The current sub-table title's index within the *stables* array
114
+ # @return [Fixnum] The number of rows
115
+ def self.sub_tab_len table, stables, t, i
116
+ (stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
117
+ end
59
118
 
60
- # DEBUG
61
- puts "=============> #{table}"
119
+ # Sub-Tablize
120
+ # Splits a table into multiple named tables.
121
+ # @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
122
+ # @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
123
+ def self.sub_tablize htable_data
62
124
 
63
- table
125
+ # Collect Sub-table Title Rows
126
+ subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
127
+
128
+ # Pull up Sub-tables
129
+ stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
130
+
131
+ # Data until first sub-table index is considered 'unsorted'
132
+ unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
133
+
134
+ stables << htable_data.slice(0, unsorted_end)
64
135
  end
65
136
 
66
- # HTML Filter
67
- def self.hfilter s
68
- s.gsub '<br/>', "\n"
137
+ # Touch up Table
138
+ # Splits Table into multiple headered tables.
139
+ # Also, strips Left Offset info from Table Cells.
140
+ # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
141
+ # @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
142
+ def self.touch_up table
143
+
144
+ # Remove Column Offsets
145
+ table.collect! { |r| r.collect { |_left, cell| cell } }
146
+
147
+ # Split Table into multiple Headered Tables
148
+ headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
149
+
150
+ # Pull up Headered Tables
151
+ htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
152
+
153
+ # Split Headered Tables into multiple Named Sub-Tables
154
+ htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
155
+
156
+ # Data until first Header index is considered 'unsorted'
157
+ unsorted_end = headers.empty? ? table.length : headers[0][:idx]
158
+
159
+ htables << sub_tablize(table.slice(0, unsorted_end))
69
160
  end
70
161
 
71
- # Process Page Files
72
- def self.process_page_files page_data
162
+ # Process
163
+ # Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
164
+ # @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
165
+ # @return [Array] An array of tables, as produced by touch_up. Each headered table is represented as a hash containing a header (head) and table data (data), where data is an array of named sub-tables (hashes containing a name and an array of rows) followed by a single array holding the remaining unnamed rows. The last element of the returned array holds the data found before the first header, in that same sub-table form. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
166
+ def self.process page_data
73
167
 
74
- # Build HTML Entity Decoder
75
- coder = HTMLEntities.new
168
+ # Collect Data
169
+ data = collect_data page_data
76
170
 
77
- # Collect & Process File Data
78
- off = 0
79
- process_data page_data.collect { |_idx, page| off = off + PAGE_OFF; page.select { |l| LINE_REGEX =~ l }.collect { |l| LINE_REGEX.match l }.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } }.flatten
171
+ # Build Data Table
172
+ table = build_table data
173
+
174
+ # Filter Rows
175
+ table = filter_rows table
176
+
177
+ # Filter Table Cells & Touch up
178
+ touch_up table
80
179
  end
81
180
  end
82
181
  end
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '0.3.1'
8
+ VERSION = '1.0.0'
9
9
  end
data/pdftdx.gemspec CHANGED
@@ -21,5 +21,7 @@ Gem::Specification.new do |spec|
21
21
 
22
22
  spec.add_development_dependency "bundler", "~> 1.12"
23
23
  spec.add_development_dependency "rake", "~> 10.0"
24
+ spec.add_runtime_dependency "minitest"
25
+ spec.add_runtime_dependency "htmlentities"
24
26
  spec.add_runtime_dependency "pdftohtml"
25
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-12-09 00:00:00.000000000 Z
11
+ date: 2016-12-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,34 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: htmlentities
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
41
69
  - !ruby/object:Gem::Dependency
42
70
  name: pdftohtml
43
71
  requirement: !ruby/object:Gem::Requirement