pdftdx 1.2.1 → 1.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pdftdx.rb +4 -2
- data/lib/pdftdx/parser.rb +13 -13
- data/lib/pdftdx/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a243b5d1cd30ce908060382654ca48b3ae97c242
|
4
|
+
data.tar.gz: ee753466c9b422f8fbcc301e5b93a8f6f61f3168
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98b3420792f93b5bcdc59b16499be930ef52087014da36b6f63a582f013742f9af6081412958f44502e36e3c9710f6c634762087ff7e795531aaec8c2c67d671
|
7
|
+
data.tar.gz: 3704fb9ff4661dca80a0a6cb5fd744af7373a83cf870937ce8d799f977db0e8f67fb0406505328b4863a6e612f30bcce318d12493ed9f15c54871c24dd56ca05
|
data/lib/pdftdx.rb
CHANGED
@@ -9,10 +9,12 @@ require 'pdftohtml'
|
|
9
9
|
require 'pdftdx/parser'
|
10
10
|
require 'pdftdx/version'
|
11
11
|
|
12
|
-
# PDF TDX Module
|
12
|
+
# PDF TDX Module:
|
13
|
+
# Root Module for Pdftdx.
|
13
14
|
module PDFTDX
|
14
15
|
|
15
|
-
# Extract Data from PDF
|
16
|
+
# Extract Data from PDF:
|
17
|
+
# Converts a PDF file to HTML format and then extracts anything that looks like tabular data.
|
16
18
|
# @param [String] pdf_file Path to a PDF file
|
17
19
|
# @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
|
18
20
|
def self.extract_data pdf_file
|
data/lib/pdftdx/parser.rb
CHANGED
@@ -28,7 +28,7 @@ module PDFTDX
|
|
28
28
|
# Title Cell Regex
|
29
29
|
TITLE_CELL_REGEX = /<b>/
|
30
30
|
|
31
|
-
# Is All Same Data
|
31
|
+
# Is All Same Data:
|
32
32
|
# Determine whether a row's cells all contain the same data.
|
33
33
|
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
34
34
|
# @return [Boolean] True if all cells contain the same data, False otherwise.
|
@@ -37,7 +37,7 @@ module PDFTDX
|
|
37
37
|
row_data.inject(true) { |b, e| b && (e[1] == n) }
|
38
38
|
end
|
39
39
|
|
40
|
-
# Contains Unusable Data (Empty / Long Strings)
|
40
|
+
# Contains Unusable Data (Empty / Long Strings):
|
41
41
|
# Determines whether a row contains unusable data.
|
42
42
|
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
43
43
|
# @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
|
@@ -45,7 +45,7 @@ module PDFTDX
|
|
45
45
|
row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
|
46
46
|
end
|
47
47
|
|
48
|
-
# HTML Filter
|
48
|
+
# HTML Filter:
|
49
49
|
# Replaces HTML newlines by UNIX-style newlines.
|
50
50
|
# @param [String] s A string of HTML data
|
51
51
|
# @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
|
@@ -53,7 +53,7 @@ module PDFTDX
|
|
53
53
|
s.gsub '<br/>', "\n"
|
54
54
|
end
|
55
55
|
|
56
|
-
# Collect Data
|
56
|
+
# Collect Data:
|
57
57
|
# Extracts table-like chunks of HTML data from a hash of HTML pages.
|
58
58
|
# @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
59
59
|
# @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
@@ -73,7 +73,7 @@ module PDFTDX
|
|
73
73
|
end.flatten
|
74
74
|
end
|
75
75
|
|
76
|
-
# Build Data Table
|
76
|
+
# Build Data Table:
|
77
77
|
# Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
|
78
78
|
# @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
79
79
|
# @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
@@ -83,7 +83,7 @@ module PDFTDX
|
|
83
83
|
table
|
84
84
|
end
|
85
85
|
|
86
|
-
# Filter Table Rows
|
86
|
+
# Filter Table Rows:
|
87
87
|
# Filters out rows considered unusable, empty, oversize, footers, etc...
|
88
88
|
# Also, strips Top Offset info from Table Rows.
|
89
89
|
# @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
@@ -94,7 +94,7 @@ module PDFTDX
|
|
94
94
|
.collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
|
95
95
|
end
|
96
96
|
|
97
|
-
# Determine Headered Table Length
|
97
|
+
# Determine Headered Table Length:
|
98
98
|
# Computes the number of rows to be included in a given headered table.
|
99
99
|
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
100
100
|
# @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
|
@@ -105,7 +105,7 @@ module PDFTDX
|
|
105
105
|
(headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
|
106
106
|
end
|
107
107
|
|
108
|
-
# Sub Table Length
|
108
|
+
# Sub Table Length:
|
109
109
|
# Computes the number of rows to be included in a given sub-table.
|
110
110
|
# @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
111
111
|
# @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
|
@@ -116,7 +116,7 @@ module PDFTDX
|
|
116
116
|
(stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
|
117
117
|
end
|
118
118
|
|
119
|
-
# Sub-Tablize
|
119
|
+
# Sub-Tablize:
|
120
120
|
# Splits a table into multiple named tables.
|
121
121
|
# @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
122
122
|
# @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
|
@@ -144,7 +144,7 @@ module PDFTDX
|
|
144
144
|
stables
|
145
145
|
end
|
146
146
|
|
147
|
-
# Sort Row
|
147
|
+
# Sort Row:
|
148
148
|
# Sorts Cells according to their x-offset
|
149
149
|
# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
|
150
150
|
# @return [Hash] The same row of data, but sorted according to x-offset
|
@@ -152,7 +152,7 @@ module PDFTDX
|
|
152
152
|
Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
|
153
153
|
end
|
154
154
|
|
155
|
-
# Fix Dupes
|
155
|
+
# Fix Dupes:
|
156
156
|
# Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
|
157
157
|
# @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
|
158
158
|
# @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
|
@@ -174,7 +174,7 @@ module PDFTDX
|
|
174
174
|
nr
|
175
175
|
end
|
176
176
|
|
177
|
-
# Touch up Table
|
177
|
+
# Touch up Table:
|
178
178
|
# Splits Table into multiple headered tables.
|
179
179
|
# Also, strips Left Offset info from Table Cells.
|
180
180
|
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
@@ -219,7 +219,7 @@ module PDFTDX
|
|
219
219
|
htables
|
220
220
|
end
|
221
221
|
|
222
|
-
# Process
|
222
|
+
# Process:
|
223
223
|
# Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
|
224
224
|
# @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
225
225
|
# @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
|
data/lib/pdftdx/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eresse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-01
|
11
|
+
date: 2017-03-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
116
|
version: '0'
|
117
117
|
requirements: []
|
118
118
|
rubyforge_project:
|
119
|
-
rubygems_version: 2.
|
119
|
+
rubygems_version: 2.5.1
|
120
120
|
signing_key:
|
121
121
|
specification_version: 4
|
122
122
|
summary: Simple PDF Table Data Extractor
|