pdftdx 0.3.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.idea/.rakeTasks +1 -1
- data/.idea/pdftdx.iml +139 -15
- data/Gemfile.lock +6 -2
- data/Rakefile +6 -0
- data/lib/pdftdx/parser.rb +129 -30
- data/lib/pdftdx/version.rb +1 -1
- data/pdftdx.gemspec +2 -0
- metadata +30 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 703bd26469ea409ff86c2a9383cdc34be4b7dcb6
|
|
4
|
+
data.tar.gz: 387ef2d99b28d53f961673780aa72c75941e2be1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d802ca74deb4729983dafb4d1d7f5b25f15471e1f2b1007751779658baa86bdc0ec45074a14e98ac78232f05d9e3b3ba42e3e76ac8945575ad58236886d5fd4c
|
|
7
|
+
data.tar.gz: 4c60551311fd9cf6f4b355c2b4b92aa3e9abbd4d204361f479da88bf91731d061c67c15e6b1f9d8655888a392dc53550b5e68b4f32a86c6d25d57460c2103083
|
data/.idea/.rakeTasks
CHANGED
|
@@ -4,4 +4,4 @@ You are allowed to:
|
|
|
4
4
|
1. Remove rake task
|
|
5
5
|
2. Add existing rake tasks
|
|
6
6
|
To add existing rake tasks automatically delete this file and reload the project.
|
|
7
|
-
--><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.
|
|
7
|
+
--><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.3.2.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.3.2 and build and push pdftdx-0.3.2.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="Run tests" fullCmd="test" taksId="test" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
|
data/.idea/pdftdx.iml
CHANGED
|
@@ -1,27 +1,149 @@
|
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
2
|
<module type="RUBY_MODULE" version="4">
|
|
3
3
|
<component name="ModuleRunConfigurationManager">
|
|
4
|
-
<configuration default="false" name="
|
|
4
|
+
<configuration default="false" name="test_filter_rows: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
|
5
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
|
5
6
|
<module name="pdftdx" />
|
|
6
|
-
<
|
|
7
|
-
<
|
|
8
|
-
<
|
|
9
|
-
<
|
|
10
|
-
<
|
|
11
|
-
<envs
|
|
12
|
-
|
|
7
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
|
8
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
|
9
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
|
10
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
|
11
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
|
12
|
+
<envs>
|
|
13
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
|
14
|
+
</envs>
|
|
15
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
|
13
16
|
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
|
14
17
|
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
|
15
18
|
<COVERAGE_PATTERN ENABLED="true">
|
|
16
19
|
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
|
17
20
|
</COVERAGE_PATTERN>
|
|
18
21
|
</EXTENSION>
|
|
19
|
-
<
|
|
20
|
-
<
|
|
21
|
-
<
|
|
22
|
-
<
|
|
23
|
-
<
|
|
24
|
-
<
|
|
22
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
|
23
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
|
24
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
|
25
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_filter_rows" />
|
|
26
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
|
27
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
|
28
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
|
29
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
|
30
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
|
31
|
+
<method />
|
|
32
|
+
</configuration>
|
|
33
|
+
<configuration default="false" name="test_collect_data: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
|
34
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
|
35
|
+
<module name="pdftdx" />
|
|
36
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
|
37
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
|
38
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
|
39
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
|
40
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
|
41
|
+
<envs>
|
|
42
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
|
43
|
+
</envs>
|
|
44
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
|
45
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
|
46
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
|
47
|
+
<COVERAGE_PATTERN ENABLED="true">
|
|
48
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
|
49
|
+
</COVERAGE_PATTERN>
|
|
50
|
+
</EXTENSION>
|
|
51
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
|
52
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
|
53
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
|
54
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_collect_data" />
|
|
55
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
|
56
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
|
57
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
|
58
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
|
59
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
|
60
|
+
<method />
|
|
61
|
+
</configuration>
|
|
62
|
+
<configuration default="false" name="test_build_table: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
|
63
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
|
64
|
+
<module name="pdftdx" />
|
|
65
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
|
66
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
|
67
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
|
68
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
|
69
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
|
70
|
+
<envs>
|
|
71
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
|
72
|
+
</envs>
|
|
73
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
|
74
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
|
75
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
|
76
|
+
<COVERAGE_PATTERN ENABLED="true">
|
|
77
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
|
78
|
+
</COVERAGE_PATTERN>
|
|
79
|
+
</EXTENSION>
|
|
80
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
|
81
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
|
82
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
|
83
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_build_table" />
|
|
84
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
|
85
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
|
86
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
|
87
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
|
88
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
|
89
|
+
<method />
|
|
90
|
+
</configuration>
|
|
91
|
+
<configuration default="false" name="test_process: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
|
92
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
|
93
|
+
<module name="pdftdx" />
|
|
94
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
|
95
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
|
96
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
|
97
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
|
98
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
|
99
|
+
<envs>
|
|
100
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
|
101
|
+
</envs>
|
|
102
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
|
103
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
|
104
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
|
105
|
+
<COVERAGE_PATTERN ENABLED="true">
|
|
106
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
|
107
|
+
</COVERAGE_PATTERN>
|
|
108
|
+
</EXTENSION>
|
|
109
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
|
110
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
|
111
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
|
112
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_process" />
|
|
113
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
|
114
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
|
115
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
|
116
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
|
117
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
|
118
|
+
<method />
|
|
119
|
+
</configuration>
|
|
120
|
+
<configuration default="false" name="All tests in test: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
|
121
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
|
122
|
+
<module name="pdftdx" />
|
|
123
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
|
124
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test" />
|
|
125
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
|
126
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
|
127
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
|
128
|
+
<envs>
|
|
129
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
|
130
|
+
</envs>
|
|
131
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
|
132
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
|
133
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
|
134
|
+
<COVERAGE_PATTERN ENABLED="true">
|
|
135
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
|
136
|
+
</COVERAGE_PATTERN>
|
|
137
|
+
</EXTENSION>
|
|
138
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="$MODULE_DIR$/test" />
|
|
139
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="" />
|
|
140
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="**/{*_test,test_*}.rb" />
|
|
141
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="" />
|
|
142
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="ALL_IN_FOLDER" />
|
|
143
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
|
144
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
|
145
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
|
146
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
|
25
147
|
<method />
|
|
26
148
|
</configuration>
|
|
27
149
|
</component>
|
|
@@ -30,7 +152,9 @@
|
|
|
30
152
|
<orderEntry type="inheritedJdk" />
|
|
31
153
|
<orderEntry type="sourceFolder" forTests="false" />
|
|
32
154
|
<orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
33
|
-
<orderEntry type="library" scope="PROVIDED" name="
|
|
155
|
+
<orderEntry type="library" scope="PROVIDED" name="htmlentities (v4.3.4, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
156
|
+
<orderEntry type="library" scope="PROVIDED" name="minitest (v5.10.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
157
|
+
<orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.3, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
34
158
|
<orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
35
159
|
</component>
|
|
36
160
|
</module>
|
data/Gemfile.lock
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
pdftdx (0.
|
|
4
|
+
pdftdx (1.0.0)
|
|
5
|
+
htmlentities
|
|
6
|
+
minitest
|
|
5
7
|
pdftohtml
|
|
6
8
|
|
|
7
9
|
GEM
|
|
8
10
|
remote: https://rubygems.org/
|
|
9
11
|
specs:
|
|
10
|
-
|
|
12
|
+
htmlentities (4.3.4)
|
|
13
|
+
minitest (5.10.1)
|
|
14
|
+
pdftohtml (0.2.3)
|
|
11
15
|
rake (10.5.0)
|
|
12
16
|
|
|
13
17
|
PLATFORMS
|
data/Rakefile
CHANGED
data/lib/pdftdx/parser.rb
CHANGED
|
@@ -22,61 +22,160 @@ module PDFTDX
|
|
|
22
22
|
# Page Offset
|
|
23
23
|
PAGE_OFF = 10000
|
|
24
24
|
|
|
25
|
-
#
|
|
26
|
-
|
|
25
|
+
# Maximum Allowed Offset from Page Top
|
|
26
|
+
PAGE_MAX_TOP = 1100
|
|
27
27
|
|
|
28
|
-
#
|
|
29
|
-
|
|
30
|
-
data[idx_a][:top] == data[idx_b][:top]
|
|
31
|
-
end
|
|
28
|
+
# Title Cell Regex
|
|
29
|
+
TITLE_CELL_REGEX = /<b>/
|
|
32
30
|
|
|
33
31
|
# Is All Same Data
|
|
34
|
-
|
|
32
|
+
# Determine whether a row's cells all contain the same data.
|
|
33
|
+
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
|
34
|
+
# @return [Boolean] True if all cells contain the same data, False otherwise.
|
|
35
|
+
def self.is_all_same? row_data
|
|
35
36
|
n = row_data[row_data.keys[0]]
|
|
36
37
|
row_data.inject(true) { |b, e| b && (e[1] == n) }
|
|
37
38
|
end
|
|
38
39
|
|
|
39
40
|
# Contains Unusable Data (Empty / Long Strings)
|
|
40
|
-
|
|
41
|
+
# Determines whether a row contains unusable data.
|
|
42
|
+
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
|
43
|
+
# @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
|
|
44
|
+
def self.contains_unusable? row_data
|
|
41
45
|
row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
|
|
42
46
|
end
|
|
43
47
|
|
|
44
|
-
#
|
|
45
|
-
|
|
48
|
+
# HTML Filter
|
|
49
|
+
# Replaces HTML newlines by UNIX-style newlines.
|
|
50
|
+
# @param [String] s A string of HTML data
|
|
51
|
+
# @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
|
|
52
|
+
def self.hfilter s
|
|
53
|
+
s.gsub '<br/>', "\n"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Collect Data
|
|
57
|
+
# Extracts table-like chunks of HTML data from a hash of HTML pages.
|
|
58
|
+
# @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
|
59
|
+
# @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
|
60
|
+
def self.collect_data data
|
|
46
61
|
|
|
47
|
-
# Build
|
|
62
|
+
# Build HTML Entity Decoder
|
|
63
|
+
coder = HTMLEntities.new
|
|
64
|
+
|
|
65
|
+
# Collect File Data
|
|
66
|
+
off = 0
|
|
67
|
+
data.collect do |_idx, page|
|
|
68
|
+
off = off + PAGE_OFF
|
|
69
|
+
page
|
|
70
|
+
.select { |l| LINE_REGEX =~ l } # Collect Table-like data
|
|
71
|
+
.collect { |l| LINE_REGEX.match l } # Extract Table Element Metadata (Position)
|
|
72
|
+
.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } # Produce Hash of Raw Table Data
|
|
73
|
+
end.flatten
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Build Data Table
|
|
77
|
+
# Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
|
|
78
|
+
# @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
|
79
|
+
# @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
|
80
|
+
def self.build_table data
|
|
48
81
|
table = {}
|
|
49
82
|
data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
|
|
83
|
+
table
|
|
84
|
+
end
|
|
50
85
|
|
|
51
|
-
|
|
52
|
-
|
|
86
|
+
# Filter Table Rows
|
|
87
|
+
# Filters out rows considered unusable, empty, oversize, footers, etc...
|
|
88
|
+
# Also, strips Top Offset info from Table Rows.
|
|
89
|
+
# @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
|
90
|
+
# @return [Array] An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
|
91
|
+
def self.filter_rows data
|
|
92
|
+
data
|
|
93
|
+
.reject { |top, row| row.size < 2 || (top % PAGE_OFF) >= PAGE_MAX_TOP || is_all_same?(row) || contains_unusable?(row) } # Drop Single-Element Rows, Footer Data, Useless Rows (all cells identical) & Unusable Rows (Empty / Oversize Cells)
|
|
94
|
+
.collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
|
|
95
|
+
end
|
|
53
96
|
|
|
54
|
-
|
|
55
|
-
|
|
97
|
+
# Determine Headered Table Length
|
|
98
|
+
# Computes the number of rows to be included in a given headered table.
|
|
99
|
+
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
|
100
|
+
# @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
|
|
101
|
+
# @param [Hash] h The current header row (determine htable length from this)
|
|
102
|
+
# @param [Fixnum] i The current header's index within the *headers* array
|
|
103
|
+
# @return [Fixnum] The number of rows
|
|
104
|
+
def self.htable_length table, headers, h, i
|
|
105
|
+
(headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
|
|
106
|
+
end
|
|
56
107
|
|
|
57
|
-
|
|
58
|
-
|
|
108
|
+
# Sub Table Length
|
|
109
|
+
# Computes the number of rows to be included in a given sub-table.
|
|
110
|
+
# @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
|
111
|
+
# @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
|
|
112
|
+
# @param [Hash] t The current sub-table title row (determine stable length from this)
|
|
113
|
+
# @param [Fixnum] i The current sub-table title's index within the *stables* array
|
|
114
|
+
# @return [Fixnum] The number of rows
|
|
115
|
+
def self.sub_tab_len table, stables, t, i
|
|
116
|
+
(stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
|
|
117
|
+
end
|
|
59
118
|
|
|
60
|
-
|
|
61
|
-
|
|
119
|
+
# Sub-Tablize
|
|
120
|
+
# Splits a table into multiple named tables.
|
|
121
|
+
# @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
|
122
|
+
# @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
|
|
123
|
+
def self.sub_tablize htable_data
|
|
62
124
|
|
|
63
|
-
table
|
|
125
|
+
# Collect Sub-table Title Rows
|
|
126
|
+
subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
|
|
127
|
+
|
|
128
|
+
# Pull up Sub-tables
|
|
129
|
+
stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
|
|
130
|
+
|
|
131
|
+
# Data until first sub-table index is considered 'unsorted'
|
|
132
|
+
unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
|
|
133
|
+
|
|
134
|
+
stables << htable_data.slice(0, unsorted_end)
|
|
64
135
|
end
|
|
65
136
|
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
|
|
137
|
+
# Touch up Table
|
|
138
|
+
# Splits Table into multiple headered tables.
|
|
139
|
+
# Also, strips Left Offset info from Table Cells.
|
|
140
|
+
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
|
141
|
+
# @return [Array] An array of tables, each represented as either a single array of rows, or a hash containing a header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, []]
|
|
142
|
+
def self.touch_up table
|
|
143
|
+
|
|
144
|
+
# Remove Column Offsets
|
|
145
|
+
table.collect! { |r| r.collect { |_left, cell| cell } }
|
|
146
|
+
|
|
147
|
+
# Split Table into multiple Headered Tables
|
|
148
|
+
headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
|
|
149
|
+
|
|
150
|
+
# Pull up Headered Tables
|
|
151
|
+
htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
|
|
152
|
+
|
|
153
|
+
# Split Headered Tables into multiple Named Sub-Tables
|
|
154
|
+
htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
|
|
155
|
+
|
|
156
|
+
# Data until first Header index is considered 'unsorted'
|
|
157
|
+
unsorted_end = headers.empty? ? table.length : headers[0][:idx]
|
|
158
|
+
|
|
159
|
+
htables << sub_tablize(table.slice(0, unsorted_end))
|
|
69
160
|
end
|
|
70
161
|
|
|
71
|
-
# Process
|
|
72
|
-
|
|
162
|
+
# Process
|
|
163
|
+
# Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
|
|
164
|
+
# @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
|
165
|
+
# @return [Array] An array of tables, as returned by touch_up. Each headered table is a hash containing a header and its table data, where the data is an array of named sub-tables (hashes with a name and an array of rows) followed by one array of remaining unnamed rows. The last element holds the data found before the first header, in that same sub-table form. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
|
|
166
|
+
def self.process page_data
|
|
73
167
|
|
|
74
|
-
#
|
|
75
|
-
|
|
168
|
+
# Collect Data
|
|
169
|
+
data = collect_data page_data
|
|
76
170
|
|
|
77
|
-
#
|
|
78
|
-
|
|
79
|
-
|
|
171
|
+
# Build Data Table
|
|
172
|
+
table = build_table data
|
|
173
|
+
|
|
174
|
+
# Filter Rows
|
|
175
|
+
table = filter_rows table
|
|
176
|
+
|
|
177
|
+
# Filter Table Cells & Touch up
|
|
178
|
+
touch_up table
|
|
80
179
|
end
|
|
81
180
|
end
|
|
82
181
|
end
|
data/lib/pdftdx/version.rb
CHANGED
data/pdftdx.gemspec
CHANGED
|
@@ -21,5 +21,7 @@ Gem::Specification.new do |spec|
|
|
|
21
21
|
|
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.12"
|
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
|
24
|
+
spec.add_runtime_dependency "minitest"
|
|
25
|
+
spec.add_runtime_dependency "htmlentities"
|
|
24
26
|
spec.add_runtime_dependency "pdftohtml"
|
|
25
27
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pdftdx
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 1.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eresse
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-12-
|
|
11
|
+
date: 2016-12-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -38,6 +38,34 @@ dependencies:
|
|
|
38
38
|
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '10.0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: minitest
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: htmlentities
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - ">="
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0'
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ">="
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0'
|
|
41
69
|
- !ruby/object:Gem::Dependency
|
|
42
70
|
name: pdftohtml
|
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|