pdftdx 0.3.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.idea/.rakeTasks +1 -1
- data/.idea/pdftdx.iml +139 -15
- data/Gemfile.lock +6 -2
- data/Rakefile +6 -0
- data/lib/pdftdx/parser.rb +129 -30
- data/lib/pdftdx/version.rb +1 -1
- data/pdftdx.gemspec +2 -0
- metadata +30 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 703bd26469ea409ff86c2a9383cdc34be4b7dcb6
|
4
|
+
data.tar.gz: 387ef2d99b28d53f961673780aa72c75941e2be1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d802ca74deb4729983dafb4d1d7f5b25f15471e1f2b1007751779658baa86bdc0ec45074a14e98ac78232f05d9e3b3ba42e3e76ac8945575ad58236886d5fd4c
|
7
|
+
data.tar.gz: 4c60551311fd9cf6f4b355c2b4b92aa3e9abbd4d204361f479da88bf91731d061c67c15e6b1f9d8655888a392dc53550b5e68b4f32a86c6d25d57460c2103083
|
data/.idea/.rakeTasks
CHANGED
@@ -4,4 +4,4 @@ You are allowed to:
|
|
4
4
|
1. Remove rake task
|
5
5
|
2. Add existing rake tasks
|
6
6
|
To add existing rake tasks automatically delete this file and reload the project.
|
7
|
-
--><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.
|
7
|
+
--><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.3.2.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.3.2.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.3.2 and build and push pdftdx-0.3.2.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="Run tests" fullCmd="test" taksId="test" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
|
data/.idea/pdftdx.iml
CHANGED
@@ -1,27 +1,149 @@
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8"?>
|
2
2
|
<module type="RUBY_MODULE" version="4">
|
3
3
|
<component name="ModuleRunConfigurationManager">
|
4
|
-
<configuration default="false" name="
|
4
|
+
<configuration default="false" name="test_filter_rows: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
5
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
5
6
|
<module name="pdftdx" />
|
6
|
-
<
|
7
|
-
<
|
8
|
-
<
|
9
|
-
<
|
10
|
-
<
|
11
|
-
<envs
|
12
|
-
|
7
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
8
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
9
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
10
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
11
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
12
|
+
<envs>
|
13
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
14
|
+
</envs>
|
15
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
13
16
|
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
14
17
|
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
15
18
|
<COVERAGE_PATTERN ENABLED="true">
|
16
19
|
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
17
20
|
</COVERAGE_PATTERN>
|
18
21
|
</EXTENSION>
|
19
|
-
<
|
20
|
-
<
|
21
|
-
<
|
22
|
-
<
|
23
|
-
<
|
24
|
-
<
|
22
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
23
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
24
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
25
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_filter_rows" />
|
26
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
27
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
28
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
29
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
30
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
31
|
+
<method />
|
32
|
+
</configuration>
|
33
|
+
<configuration default="false" name="test_collect_data: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
34
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
35
|
+
<module name="pdftdx" />
|
36
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
37
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
38
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
39
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
40
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
41
|
+
<envs>
|
42
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
43
|
+
</envs>
|
44
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
45
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
46
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
47
|
+
<COVERAGE_PATTERN ENABLED="true">
|
48
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
49
|
+
</COVERAGE_PATTERN>
|
50
|
+
</EXTENSION>
|
51
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
52
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
53
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
54
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_collect_data" />
|
55
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
56
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
57
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
58
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
59
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
60
|
+
<method />
|
61
|
+
</configuration>
|
62
|
+
<configuration default="false" name="test_build_table: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
63
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
64
|
+
<module name="pdftdx" />
|
65
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
66
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
67
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
68
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
69
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
70
|
+
<envs>
|
71
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
72
|
+
</envs>
|
73
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
74
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
75
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
76
|
+
<COVERAGE_PATTERN ENABLED="true">
|
77
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
78
|
+
</COVERAGE_PATTERN>
|
79
|
+
</EXTENSION>
|
80
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
81
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
82
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
83
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_build_table" />
|
84
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
85
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
86
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
87
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
88
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
89
|
+
<method />
|
90
|
+
</configuration>
|
91
|
+
<configuration default="false" name="test_process: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
92
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
93
|
+
<module name="pdftdx" />
|
94
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
95
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test/pdftdx" />
|
96
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
97
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
98
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
99
|
+
<envs>
|
100
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
101
|
+
</envs>
|
102
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
103
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
104
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
105
|
+
<COVERAGE_PATTERN ENABLED="true">
|
106
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
107
|
+
</COVERAGE_PATTERN>
|
108
|
+
</EXTENSION>
|
109
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="" />
|
110
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="$MODULE_DIR$/test/pdftdx/test_parser.rb" />
|
111
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="" />
|
112
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="test_process" />
|
113
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="TEST_METHOD" />
|
114
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
115
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
116
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
117
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
118
|
+
<method />
|
119
|
+
</configuration>
|
120
|
+
<configuration default="false" name="All tests in test: pdftdx" type="TestUnitRunConfigurationType" factoryName="Test::Unit/Shoulda/Minitest" temporary="true">
|
121
|
+
<predefined_log_file id="RUBY_TESTUNIT" enabled="true" />
|
122
|
+
<module name="pdftdx" />
|
123
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUBY_ARGS" VALUE="-e $stdout.sync=true;$stderr.sync=true;load($0=ARGV.shift)" />
|
124
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="WORK DIR" VALUE="$MODULE_DIR$/test" />
|
125
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SHOULD_USE_SDK" VALUE="false" />
|
126
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ALTERN_SDK_NAME" VALUE="" />
|
127
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="myPassParentEnvs" VALUE="true" />
|
128
|
+
<envs>
|
129
|
+
<env name="JRUBY_OPTS" value="-X+O" />
|
130
|
+
</envs>
|
131
|
+
<EXTENSION ID="BundlerRunConfigurationExtension" bundleExecEnabled="true" />
|
132
|
+
<EXTENSION ID="JRubyRunConfigurationExtension" NailgunExecEnabled="false" />
|
133
|
+
<EXTENSION ID="RubyCoverageRunConfigurationExtension" enabled="false" sample_coverage="true" track_test_folders="true" runner="rcov">
|
134
|
+
<COVERAGE_PATTERN ENABLED="true">
|
135
|
+
<PATTERN REGEXPS="/.rvm/" INCLUDED="false" />
|
136
|
+
</COVERAGE_PATTERN>
|
137
|
+
</EXTENSION>
|
138
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TESTS_FOLDER_PATH" VALUE="$MODULE_DIR$/test" />
|
139
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_SCRIPT_PATH" VALUE="" />
|
140
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_FILE_MASK" VALUE="**/{*_test,test_*}.rb" />
|
141
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_METHOD_NAME" VALUE="" />
|
142
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="TEST_TEST_TYPE" VALUE="ALL_IN_FOLDER" />
|
143
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="DRB" VALUE="false" />
|
144
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="ZEUS" VALUE="false" />
|
145
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="SPRING" VALUE="false" />
|
146
|
+
<RTEST_RUN_CONFIG_SETTINGS_ID NAME="RUNNER_OPTIONS" VALUE="" />
|
25
147
|
<method />
|
26
148
|
</configuration>
|
27
149
|
</component>
|
@@ -30,7 +152,9 @@
|
|
30
152
|
<orderEntry type="inheritedJdk" />
|
31
153
|
<orderEntry type="sourceFolder" forTests="false" />
|
32
154
|
<orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
33
|
-
<orderEntry type="library" scope="PROVIDED" name="
|
155
|
+
<orderEntry type="library" scope="PROVIDED" name="htmlentities (v4.3.4, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
156
|
+
<orderEntry type="library" scope="PROVIDED" name="minitest (v5.10.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
157
|
+
<orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.3, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
34
158
|
<orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
35
159
|
</component>
|
36
160
|
</module>
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,17 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
pdftdx (0.
|
4
|
+
pdftdx (1.0.0)
|
5
|
+
htmlentities
|
6
|
+
minitest
|
5
7
|
pdftohtml
|
6
8
|
|
7
9
|
GEM
|
8
10
|
remote: https://rubygems.org/
|
9
11
|
specs:
|
10
|
-
|
12
|
+
htmlentities (4.3.4)
|
13
|
+
minitest (5.10.1)
|
14
|
+
pdftohtml (0.2.3)
|
11
15
|
rake (10.5.0)
|
12
16
|
|
13
17
|
PLATFORMS
|
data/Rakefile
CHANGED
data/lib/pdftdx/parser.rb
CHANGED
@@ -22,61 +22,160 @@ module PDFTDX
|
|
22
22
|
# Page Offset
|
23
23
|
PAGE_OFF = 10000
|
24
24
|
|
25
|
-
#
|
26
|
-
|
25
|
+
# Maximum Allowed Offset from Page Top
|
26
|
+
PAGE_MAX_TOP = 1100
|
27
27
|
|
28
|
-
#
|
29
|
-
|
30
|
-
data[idx_a][:top] == data[idx_b][:top]
|
31
|
-
end
|
28
|
+
# Title Cell Regex
|
29
|
+
TITLE_CELL_REGEX = /<b>/
|
32
30
|
|
33
31
|
# Is All Same Data
|
34
|
-
|
32
|
+
# Determine whether a row's cells all contain the same data.
|
33
|
+
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
34
|
+
# @return [Boolean] True if all cells contain the same data, False otherwise.
|
35
|
+
def self.is_all_same? row_data
|
35
36
|
n = row_data[row_data.keys[0]]
|
36
37
|
row_data.inject(true) { |b, e| b && (e[1] == n) }
|
37
38
|
end
|
38
39
|
|
39
40
|
# Contains Unusable Data (Empty / Long Strings)
|
40
|
-
|
41
|
+
# Determines whether a row contains unusable data.
|
42
|
+
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
43
|
+
# @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
|
44
|
+
def self.contains_unusable? row_data
|
41
45
|
row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
|
42
46
|
end
|
43
47
|
|
44
|
-
#
|
45
|
-
|
48
|
+
# HTML Filter
|
49
|
+
# Replaces HTML newlines by UNIX-style newlines.
|
50
|
+
# @param [String] s A string of HTML data
|
51
|
+
# @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
|
52
|
+
def self.hfilter s
|
53
|
+
s.gsub '<br/>', "\n"
|
54
|
+
end
|
55
|
+
|
56
|
+
# Collect Data
|
57
|
+
# Extracts table-like chunks of HTML data from a hash of HTML pages.
|
58
|
+
# @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
59
|
+
# @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
60
|
+
def self.collect_data data
|
46
61
|
|
47
|
-
# Build
|
62
|
+
# Build HTML Entity Decoder
|
63
|
+
coder = HTMLEntities.new
|
64
|
+
|
65
|
+
# Collect File Data
|
66
|
+
off = 0
|
67
|
+
data.collect do |_idx, page|
|
68
|
+
off = off + PAGE_OFF
|
69
|
+
page
|
70
|
+
.select { |l| LINE_REGEX =~ l } # Collect Table-like data
|
71
|
+
.collect { |l| LINE_REGEX.match l } # Extract Table Element Metadata (Position)
|
72
|
+
.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } # Produce Hash of Raw Table Data
|
73
|
+
end.flatten
|
74
|
+
end
|
75
|
+
|
76
|
+
# Build Data Table
|
77
|
+
# Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
|
78
|
+
# @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
79
|
+
# @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
80
|
+
def self.build_table data
|
48
81
|
table = {}
|
49
82
|
data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
|
83
|
+
table
|
84
|
+
end
|
50
85
|
|
51
|
-
|
52
|
-
|
86
|
+
# Filter Table Rows
|
87
|
+
# Filters out rows considered unusable, empty, oversize, footers, etc...
|
88
|
+
# Also, strips Top Offset info from Table Rows.
|
89
|
+
# @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
90
|
+
# @return [Array] An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
91
|
+
def self.filter_rows data
|
92
|
+
data
|
93
|
+
.reject { |top, row| row.size < 2 || (top % PAGE_OFF) >= PAGE_MAX_TOP || is_all_same?(row) || contains_unusable?(row) } # Drop Single-Element Rows, Footer Data, Useless Rows (all cells identical) & Unusable Rows (Empty / Oversize Cells)
|
94
|
+
.collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
|
95
|
+
end
|
53
96
|
|
54
|
-
|
55
|
-
|
97
|
+
# Determine Headered Table Length
|
98
|
+
# Computes the number of rows to be included in a given headered table.
|
99
|
+
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
100
|
+
# @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
|
101
|
+
# @param [Hash] h The current header row (determine htable length from this)
|
102
|
+
# @param [Fixnum] i The current header's index within the *headers* array
|
103
|
+
# @return [Fixnum] The number of rows
|
104
|
+
def self.htable_length table, headers, h, i
|
105
|
+
(headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
|
106
|
+
end
|
56
107
|
|
57
|
-
|
58
|
-
|
108
|
+
# Sub Table Length
|
109
|
+
# Computes the number of rows to be included in a given sub-table.
|
110
|
+
# @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
111
|
+
# @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
|
112
|
+
# @param [Hash] t The current sub-table title row (determine stable length from this)
|
113
|
+
# @param [Fixnum] i The current sub-table title's index within the *stables* array
|
114
|
+
# @return [Fixnum] The number of rows
|
115
|
+
def self.sub_tab_len table, stables, t, i
|
116
|
+
(stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
|
117
|
+
end
|
59
118
|
|
60
|
-
|
61
|
-
|
119
|
+
# Sub-Tablize
|
120
|
+
# Splits a table into multiple named tables.
|
121
|
+
# @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
122
|
+
# @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
|
123
|
+
def self.sub_tablize htable_data
|
62
124
|
|
63
|
-
table
|
125
|
+
# Collect Sub-table Title Rows
|
126
|
+
subtab_titles = htable_data.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| TITLE_CELL_REGEX =~ e[:row][0] }.collect { |e| { title: e[:row][0], idx: e[:idx] } }
|
127
|
+
|
128
|
+
# Pull up Sub-tables
|
129
|
+
stables = subtab_titles.collect.with_index { |t, i| { name: t[:title].gsub(/<\/?b>/, ''), data: htable_data.slice(t[:idx], sub_tab_len(htable_data, subtab_titles, t, i)).collect { |e| e.reject.with_index { |c, ii| ii == 0 && TITLE_CELL_REGEX =~ c } } } }
|
130
|
+
|
131
|
+
# Data until first sub-table index is considered 'unsorted'
|
132
|
+
unsorted_end = subtab_titles.empty? ? htable_data.length : subtab_titles[0][:idx]
|
133
|
+
|
134
|
+
stables << htable_data.slice(0, unsorted_end)
|
64
135
|
end
|
65
136
|
|
66
|
-
#
|
67
|
-
|
68
|
-
|
137
|
+
# Touch up Table
|
138
|
+
# Splits Table into multiple headered tables.
|
139
|
+
# Also, strips Left Offset info from Table Cells.
|
140
|
+
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
141
|
+
# @return [Array] An array of tables. Each headered table is a hash containing its header row and its data, where the data is an array of named sub-tables (hashes containing a name and an array of rows) followed by one array of unnamed rows. The last element holds, in that same sub-table array form, the rows found before the first header. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
|
142
|
+
def self.touch_up table
|
143
|
+
|
144
|
+
# Remove Column Offsets
|
145
|
+
table.collect! { |r| r.collect { |_left, cell| cell } }
|
146
|
+
|
147
|
+
# Split Table into multiple Headered Tables
|
148
|
+
headers = table.collect.with_index { |r, i| { idx: i, row: r } }.select { |e| e[:row].inject(true) { |b, c| b && (TITLE_CELL_REGEX =~ c) } }.collect { |r| { idx: r[:idx], row: r[:row].collect { |v| v.gsub /<\/?b>/, '' } } }
|
149
|
+
|
150
|
+
# Pull up Headered Tables
|
151
|
+
htables = headers.collect.with_index { |h, i| { head: h[:row], data: table.slice(h[:idx] + 1, htable_length(table, headers, h, i) - 1) } }
|
152
|
+
|
153
|
+
# Split Headered Tables into multiple Named Sub-Tables
|
154
|
+
htables.collect! { |ht| { head: ht[:head], data: sub_tablize(ht[:data]) } }
|
155
|
+
|
156
|
+
# Data until first Header index is considered 'unsorted'
|
157
|
+
unsorted_end = headers.empty? ? table.length : headers[0][:idx]
|
158
|
+
|
159
|
+
htables << sub_tablize(table.slice(0, unsorted_end))
|
69
160
|
end
|
70
161
|
|
71
|
-
# Process
|
72
|
-
|
162
|
+
# Process
|
163
|
+
# Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
|
164
|
+
# @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
165
|
+
# @return [Array] An array of tables. Each headered table is a hash containing its header row and its data, where the data is an array of named sub-tables (hashes containing a name and an array of rows) followed by one array of unnamed rows. The last element holds, in that same sub-table array form, the rows found before the first header. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, []] }, [[]]]
|
166
|
+
def self.process page_data
|
73
167
|
|
74
|
-
#
|
75
|
-
|
168
|
+
# Collect Data
|
169
|
+
data = collect_data page_data
|
76
170
|
|
77
|
-
#
|
78
|
-
|
79
|
-
|
171
|
+
# Build Data Table
|
172
|
+
table = build_table data
|
173
|
+
|
174
|
+
# Filter Rows
|
175
|
+
table = filter_rows table
|
176
|
+
|
177
|
+
# Filter Table Cells & Touch up
|
178
|
+
touch_up table
|
80
179
|
end
|
81
180
|
end
|
82
181
|
end
|
data/lib/pdftdx/version.rb
CHANGED
data/pdftdx.gemspec
CHANGED
@@ -21,5 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.12"
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
|
+
spec.add_runtime_dependency "minitest"
|
25
|
+
spec.add_runtime_dependency "htmlentities"
|
24
26
|
spec.add_runtime_dependency "pdftohtml"
|
25
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eresse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-12-
|
11
|
+
date: 2016-12-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,6 +38,34 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: minitest
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: htmlentities
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
70
|
name: pdftohtml
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|