pdftdx 0.3.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b6ac984f258036c4d20985529d0fdcda9fd3254f
4
- data.tar.gz: 9b325fba42e742f0521317f4202a8b2a783114da
3
+ metadata.gz: 703bd26469ea409ff86c2a9383cdc34be4b7dcb6
4
+ data.tar.gz: 387ef2d99b28d53f961673780aa72c75941e2be1
5
5
  SHA512:
6
- metadata.gz: 0ee08da850b0cb3ee593c7bbcbb0dd4f0d1244619e512018a83964fa103797d108ba2a4f5eca28f8e19ff2901c2d26d20bf36ca26d3f4787cd20a6286ef50513
7
- data.tar.gz: 8d230f85707d4820ce341222e95a6f71fb553a3d7d57b7622a70b8668a966f8e4626e37538a2fe563855c0c3c50cdd846563ba3f6408318d0c39879c9ab2d710
6
+ metadata.gz: d802ca74deb4729983dafb4d1d7f5b25f15471e1f2b1007751779658baa86bdc0ec45074a14e98ac78232f05d9e3b3ba42e3e76ac8945575ad58236886d5fd4c
7
+ data.tar.gz: 4c60551311fd9cf6f4b355c2b4b92aa3e9abbd4d204361f479da88bf91731d061c67c15e6b1f9d8655888a392dc53550b5e68b4f32a86c6d25d57460c2103083
data/.idea/.rakeTasks CHANGED
@@ -4,4 +4,4 @@ You are allowed to:
4
4
  1. Remove rake task
5
5
  2. Add existing rake tasks
6
6
  To add existing rake tasks automatically delete this file and reload the project.
7
- --><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.1.0.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.1.0 and build and push pdftdx-0.1.0.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
7
+ --><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.3.2.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.3.2 and build and push pdftdx-0.3.2.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="Run tests" fullCmd="test" taksId="test" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
data/.idea/pdftdx.iml CHANGED
@@ -1,27 +1,149 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
2
  <module type="RUBY_MODULE" version="4">
3
3
  <component name="ModuleRunConfigurationManager">
4
- <configuration default="false" name="release[remote]: pdftdx" type="RakeRunConfigurationType" factoryName="Rake" temporary="true">
4
+ <configuration default="false" name="test_filter_rows: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
5
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
5
6
  <module name="pdftdx" />
6
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
7
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$" />
8
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
9
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
10
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
11
- <envs />
12
- <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="false" />
7
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
8
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
9
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
10
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
11
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
12
+ <envs>
13
+ <env name="JRUBY_OPTS" value="-X+O" />
14
+ </envs>
15
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
13
16
  <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
14
17
  <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
15
18
  <COVERAGE_PATTERN ENABLED="true">
16
19
  <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
17
20
  </COVERAGE_PATTERN>
18
21
  </EXTENSION>
19
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_NAME" VALUE="release" />
20
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_ARGS" VALUE="" />
21
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_ATTACHED_TEST_FRAMEWORKS" VALUE="" />
22
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_TRACE" VALUE="false" />
23
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_DRYRUN" VALUE="false" />
24
- <RAKE_RUN_CONFIG_SETTINGS_ID NAME="RAKE_TASK_OPTION_PREREQS" VALUE="false" />
22
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
23
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
24
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
25
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_filter_rows" />
26
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
27
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
28
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
29
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
30
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
31
+ <method />
32
+ </configuration>
33
+ <configuration default="false" name="test_collect_data: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
34
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
35
+ <module name="pdftdx" />
36
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
37
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
38
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
39
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
40
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
41
+ <envs>
42
+ <env name="JRUBY_OPTS" value="-X+O" />
43
+ </envs>
44
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
45
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
46
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
47
+ <COVERAGE_PATTERN ENABLED="true">
48
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
49
+ </COVERAGE_PATTERN>
50
+ </EXTENSION>
51
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
52
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
53
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
54
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_collect_data" />
55
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
56
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
57
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
58
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
59
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
60
+ <method />
61
+ </configuration>
62
+ <configuration default="false" name="test_build_table: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
63
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
64
+ <module name="pdftdx" />
65
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
66
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
67
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
68
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
69
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
70
+ <envs>
71
+ <env name="JRUBY_OPTS" value="-X+O" />
72
+ </envs>
73
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
74
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
75
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
76
+ <COVERAGE_PATTERN ENABLED="true">
77
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
78
+ </COVERAGE_PATTERN>
79
+ </EXTENSION>
80
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
81
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
82
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
83
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_build_table" />
84
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
85
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
86
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
87
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
88
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
89
+ <method />
90
+ </configuration>
91
+ <configuration default="false" name="test_process: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
92
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
93
+ <module name="pdftdx" />
94
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
95
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
96
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
97
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
98
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
99
+ <envs>
100
+ <env name="JRUBY_OPTS" value="-X+O" />
101
+ </envs>
102
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
103
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
104
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
105
+ <COVERAGE_PATTERN ENABLED="true">
106
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
107
+ </COVERAGE_PATTERN>
108
+ </EXTENSION>
109
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
110
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
111
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
112
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_process" />
113
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
114
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
115
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
116
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
117
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
118
+ <method />
119
+ </configuration>
120
+ <configuration default="false" name="All tests in test: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
121
+ <predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
122
+ <module name="pdftdx" />
123
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
124
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test" />
125
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
126
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
127
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
128
+ <envs>
129
+ <env name="JRUBY_OPTS" value="-X+O" />
130
+ </envs>
131
+ <EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
132
+ <EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
133
+ <EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
134
+ <COVERAGE_PATTERN ENABLED="true">
135
+ <PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
136
+ </COVERAGE_PATTERN>
137
+ </EXTENSION>
138
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="$MODULE_DIR$/test" />
139
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="" />
140
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="**/{*_test,test_*}.rb" />
141
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="" />
142
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="ALL_IN_FOLDER" />
143
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
144
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
145
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
146
+ <RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
25
147
  <method />
26
148
  </configuration>
27
149
  </component>
@@ -30,7 +152,9 @@
30
152
  <orderEntry type="inheritedJdk" />
31
153
  <orderEntry type="sourceFolder" forTests="false" />
32
154
  <orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
33
- <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
155
+ <orderEntry type="library" scope="PROVIDED" name="htmlentities (v4.3.4, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
156
+ <orderEntry type="library" scope="PROVIDED" name="minitest (v5.10.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
157
+ <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.3, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
34
158
  <orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
35
159
  </component>
36
160
  </module>
data/Gemfile.lock CHANGED
@@ -1,13 +1,17 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- pdftdx (0.2.0)
4
+ pdftdx (1.0.0)
5
+ htmlentities
6
+ minitest
5
7
  pdftohtml
6
8
 
7
9
  GEM
8
10
  remote: https://rubygems.org/
9
11
  specs:
10
- pdftohtml (0.2.1)
12
+ htmlentities (4.3.4)
13
+ minitest (5.10.1)
14
+ pdftohtml (0.2.3)
11
15
  rake (10.5.0)
12
16
 
13
17
  PLATFORMS
data/Rakefile CHANGED
@@ -1,2 +1,8 @@
1
1
  require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
2
4
  task :default => :spec
5
+
6
+ Rake::TestTask.new do |t|
7
+ t.libs << 'test'
8
+ end
data/lib/pdftdx/parser.rb CHANGED
@@ -22,61 +22,160 @@ module PDFTDX
22
22
  # Page Offset
23
23
  PAGE_OFF = 10000
24
24
 
25
- # Title Cell Regex
26
- TITLE_CELL_REGEX = /<bbb>/
25
+ # Maximum Allowed Offset from Page Top
26
+ PAGE_MAX_TOP = 1100
27
27
 
28
- # Check Same Line
29
- def self.same_line data, idx_a, idx_b
30
- data[idx_a][:top] == data[idx_b][:top]
31
- end
28
+ # Title Cell Regex
29
+ TITLE_CELL_REGEX = /<b>/
32
30
 
33
31
  # Is All Same Data
34
- def self.is_all_same row_data
32
+ # Determine whether a row's cells all contain the same data.
33
+ # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
34
+ # @return [Boolean] True if all cells contain the same data, False otherwise.
35
+ def self.is_all_same? row_data
35
36
  n = row_data[row_data.keys[0]]
36
37
  row_data.inject(true) { |b, e| b && (e[1] == n) }
37
38
  end
38
39
 
39
40
  # Contains Unusable Data (Empty / Long Strings)
40
- def self.contains_unusable row_data
41
+ # Determines whether a row contains unusable data.
42
+ # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
43
+ # @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
44
+ def self.contains_unusable? row_data
41
45
  row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
42
46
  end
43
47
 
44
- # Process Data
45
- def self.process_data data
48
+ # HTML Filter
49
+ # Replaces HTML newlines by UNIX-style newlines.
50
+ # @param [String] s A string of HTML data
51
+ # @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
52
+ def self.hfilter s
53
+ s.gsub '<br/>', "\n"
54
+ end
55
+
56
+ # Collect Data
57
+ # Extracts table-like chunks of HTML data from a hash of HTML pages.
58
+ # @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
59
+ # @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
60
+ def self.collect_data data
46
61
 
47
- # Build Data Table
62
+ # Build HTML Entity Decoder
63
+ coder = HTMLEntities.new
64
+
65
+ # Collect File Data
66
+ off = 0
67
+ data.collect do |_idx, page|
68
+ off = off + PAGE_OFF
69
+ page
70
+ .select { |l| LINE_REGEX =~ l } # Collect Table-like data
71
+ .collect { |l| LINE_REGEX.match l } # Extract Table Element Metadata (Position)
72
+ .collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } # Produce Hash of Raw Table Data
73
+ end.flatten
74
+ end
75
+
76
+ # Build Data Table
77
+ # Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
78
+ # @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
79
+ # @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
80
+ def self.build_table data
48
81
  table = {}
49
82
  data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
83
+ table
84
+ end
50
85
 
51
- # Filter Table Rows (Remove Lone Elements & Footers)
52
- table.reject! { |top, row| row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row) }
86
+ # Filter Table Rows
87
+ # Filters out rows considered unusable, empty, oversize, footers, etc...
88
+ # Also, strips Top Offset info from Table Rows.
89
+ # @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
90
+ # @return [Array] An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
91
+ def self.filter_rows data
92
+ data
93
+ .reject { |top, row| row.size < 2 || (top % PAGE_OFF) >= PAGE_MAX_TOP || is_all_same?(row) || contains_unusable?(row) } # Drop Single-Element Rows, Footer Data, Useless Rows (all cells identical) & Unusable Rows (Empty / Oversize Cells)
94
+ .collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
95
+ end
53
96
 
54
- # Filter Table Cells
55
- table = table.collect { |_top, r| r.reject { |_left, d| TITLE_CELL_REGEX =~ d } }.reject { |r| r.size < 1 }
97
+ # Determine Headered Table Length
98
+ # Computes the number of rows to be included in a given headered table.
99
+ # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
100
+ # @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
101
+ # @param [Hash] h The current header row (determine htable length from this)
102
+ # @param [Fixnum] i The current header's index within the *headers* array
103
+ # @return [Fixnum] The number of rows
104
+ def self.htable_length table, headers, h, i
105
+ (headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
106
+ end
56
107
 
57
- # Cleanup Table ( IS THIS NECESSARY ? )
58
- table.reject! { |r| r.size < 2 }
108
+ # Sub Table Length
109
+ # Computes the number of rows to be included in a given sub-table.
110
+ # @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
111
+ # @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
112
+ # @param [Hash] t The current sub-table title row (determine stable length from this)
113
+ # @param [Fixnum] i The current sub-table title's index within the *stables* array
114
+ # @return [Fixnum] The number of rows
115
+ def self.sub_tab_len table, stables, t, i
116
+ (stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
117
+ end
59
118
 
60
- # DEBUG
61
- puts "=============> #{table}"
119
+ # Sub-Tablize
120
+ # Splits a table into multiple named tables.
121
+ # @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
122
+ # @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
123
+ def self.sub_tablize htable_data
62
124
 
63
- table
125
+ # Collect Sub-table Title Rows
126
+ subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
127
+
128
+ # Pull up Sub-tables
129
+ stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
130
+
131
+ # Data until first sub-table index is considered 'unsorted'
132
+ unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
133
+
134
+ stables << htable_data.slice(0, unsorted_end)
64
135
  end
65
136
 
66
- # HTML Filter
67
- def self.hfilter s
68
- s.gsub '<br/>', "\n"
137
+ # Touch up Table
138
+ # Splits Table into multiple headered tables.
139
+ # Also, strips Left Offset info from Table Cells.
140
+ # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
141
+ # @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
142
+ def self.touch_up table
143
+
144
+ # Remove Column Offsets
145
+ table.collect! { |r| r.collect { |_left, cell| cell } }
146
+
147
+ # Split Table into multiple Headered Tables
148
+ headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
149
+
150
+ # Pull up Headered Tables
151
+ htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
152
+
153
+ # Split Headered Tables into multiple Named Sub-Tables
154
+ htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
155
+
156
+ # Data until first Header index is considered 'unsorted'
157
+ unsorted_end = headers.empty? ? table.length : headers[0][:idx]
158
+
159
+ htables << sub_tablize(table.slice(0, unsorted_end))
69
160
  end
70
161
 
71
- # Process Page Files
72
- def self.process_page_files page_data
162
+ # Process
163
+ # Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
164
+ # @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
165
+ # @return [Array] An array of tables. Each headered table is represented as a hash containing its header row and its table data, where the data is an array of named sub-tables (hashes containing a name and an array of rows) followed by an array of any remaining unnamed rows. The last element is the same kind of sub-table array, built from the rows preceding the first header. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
166
+ def self.process page_data
73
167
 
74
- # Build HTML Entity Decoder
75
- coder = HTMLEntities.new
168
+ # Collect Data
169
+ data = collect_data page_data
76
170
 
77
- # Collect & Process File Data
78
- off = 0
79
- process_data page_data.collect { |_idx, page| off = off + PAGE_OFF; page.select { |l| LINE_REGEX =~ l }.collect { |l| LINE_REGEX.match l }.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } }.flatten
171
+ # Build Data Table
172
+ table = build_table data
173
+
174
+ # Filter Rows
175
+ table = filter_rows table
176
+
177
+ # Filter Table Cells & Touch up
178
+ touch_up table
80
179
  end
81
180
  end
82
181
  end
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '0.3.1'
8
+ VERSION = '1.0.0'
9
9
  end
data/pdftdx.gemspec CHANGED
@@ -21,5 +21,7 @@ Gem::Specification.new do |spec|
21
21
 
22
22
  spec.add_development_dependency "bundler", "~> 1.12"
23
23
  spec.add_development_dependency "rake", "~> 10.0"
24
+ spec.add_runtime_dependency "minitest"
25
+ spec.add_runtime_dependency "htmlentities"
24
26
  spec.add_runtime_dependency "pdftohtml"
25
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-12-09 00:00:00.000000000 Z
11
+ date: 2016-12-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -38,6 +38,34 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: htmlentities
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
41
69
  - !ruby/object:Gem::Dependency
42
70
  name: pdftohtml
43
71
  requirement: !ruby/object:Gem::Requirement