pdftdx 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pdftdx.rb +4 -2
- data/lib/pdftdx/parser.rb +13 -13
- data/lib/pdftdx/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a243b5d1cd30ce908060382654ca48b3ae97c242
|
4
|
+
data.tar.gz: ee753466c9b422f8fbcc301e5b93a8f6f61f3168
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98b3420792f93b5bcdc59b16499be930ef52087014da36b6f63a582f013742f9af6081412958f44502e36e3c9710f6c634762087ff7e795531aaec8c2c67d671
|
7
|
+
data.tar.gz: 3704fb9ff4661dca80a0a6cb5fd744af7373a83cf870937ce8d799f977db0e8f67fb0406505328b4863a6e612f30bcce318d12493ed9f15c54871c24dd56ca05
|
data/lib/pdftdx.rb
CHANGED
@@ -9,10 +9,12 @@ require 'pdftohtml'
|
|
9
9
|
require 'pdftdx/parser'
|
10
10
|
require 'pdftdx/version'
|
11
11
|
|
12
|
-
# PDF TDX Module
|
12
|
+
# PDF TDX Module:
|
13
|
+
# Root Module for Pdftdx.
|
13
14
|
module PDFTDX
|
14
15
|
|
15
|
-
# Extract Data from PDF
|
16
|
+
# Extract Data from PDF:
|
17
|
+
# Converts a PDF file to HTML format and then extracts anything that looks like tabular data.
|
16
18
|
# @param [String] pdf_file Path to a PDF file
|
17
19
|
# @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
|
18
20
|
def self.extract_data pdf_file
|
data/lib/pdftdx/parser.rb
CHANGED
@@ -28,7 +28,7 @@ module PDFTDX
|
|
28
28
|
# Title Cell Regex
|
29
29
|
TITLE_CELL_REGEX = /<b>/
|
30
30
|
|
31
|
-
# Is All Same Data
|
31
|
+
# Is All Same Data:
|
32
32
|
# Determine whether a row's cells all contain the same data.
|
33
33
|
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
34
34
|
# @return [Boolean] True if all cells contain the same data, False otherwise.
|
@@ -37,7 +37,7 @@ module PDFTDX
|
|
37
37
|
row_data.inject(true) { |b, e| b && (e[1] == n) }
|
38
38
|
end
|
39
39
|
|
40
|
-
# Contains Unusable Data (Empty / Long Strings)
|
40
|
+
# Contains Unusable Data (Empty / Long Strings):
|
41
41
|
# Determines whether a row contains unusable data.
|
42
42
|
# @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
|
43
43
|
# @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
|
@@ -45,7 +45,7 @@ module PDFTDX
|
|
45
45
|
row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
|
46
46
|
end
|
47
47
|
|
48
|
-
# HTML Filter
|
48
|
+
# HTML Filter:
|
49
49
|
# Replaces HTML newlines by UNIX-style newlines.
|
50
50
|
# @param [String] s A string of HTML data
|
51
51
|
# @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
|
@@ -53,7 +53,7 @@ module PDFTDX
|
|
53
53
|
s.gsub '<br/>', "\n"
|
54
54
|
end
|
55
55
|
|
56
|
-
# Collect Data
|
56
|
+
# Collect Data:
|
57
57
|
# Extracts table-like chunks of HTML data from a hash of HTML pages.
|
58
58
|
# @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
59
59
|
# @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
@@ -73,7 +73,7 @@ module PDFTDX
|
|
73
73
|
end.flatten
|
74
74
|
end
|
75
75
|
|
76
|
-
# Build Data Table
|
76
|
+
# Build Data Table:
|
77
77
|
# Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
|
78
78
|
# @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
|
79
79
|
# @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
@@ -83,7 +83,7 @@ module PDFTDX
|
|
83
83
|
table
|
84
84
|
end
|
85
85
|
|
86
|
-
# Filter Table Rows
|
86
|
+
# Filter Table Rows:
|
87
87
|
# Filters out rows considered unusable, empty, oversize, footers, etc...
|
88
88
|
# Also, strips Top Offset info from Table Rows.
|
89
89
|
# @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
|
@@ -94,7 +94,7 @@ module PDFTDX
|
|
94
94
|
.collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
|
95
95
|
end
|
96
96
|
|
97
|
-
# Determine Headered Table Length
|
97
|
+
# Determine Headered Table Length:
|
98
98
|
# Computes the number of rows to be included in a given headered table.
|
99
99
|
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
100
100
|
# @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
|
@@ -105,7 +105,7 @@ module PDFTDX
|
|
105
105
|
(headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
|
106
106
|
end
|
107
107
|
|
108
|
-
# Sub Table Length
|
108
|
+
# Sub Table Length:
|
109
109
|
# Computes the number of rows to be included in a given sub-table.
|
110
110
|
# @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
111
111
|
# @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
|
@@ -116,7 +116,7 @@ module PDFTDX
|
|
116
116
|
(stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
|
117
117
|
end
|
118
118
|
|
119
|
-
# Sub-Tablize
|
119
|
+
# Sub-Tablize:
|
120
120
|
# Splits a table into multiple named tables.
|
121
121
|
# @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
|
122
122
|
# @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
|
@@ -144,7 +144,7 @@ module PDFTDX
|
|
144
144
|
stables
|
145
145
|
end
|
146
146
|
|
147
|
-
# Sort Row
|
147
|
+
# Sort Row:
|
148
148
|
# Sorts Cells according to their x-offset
|
149
149
|
# @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
|
150
150
|
# @return [Hash] The same row of data, but sorted according to x-offset
|
@@ -152,7 +152,7 @@ module PDFTDX
|
|
152
152
|
Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
|
153
153
|
end
|
154
154
|
|
155
|
-
# Fix Dupes
|
155
|
+
# Fix Dupes:
|
156
156
|
# Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
|
157
157
|
# @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
|
158
158
|
# @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
|
@@ -174,7 +174,7 @@ module PDFTDX
|
|
174
174
|
nr
|
175
175
|
end
|
176
176
|
|
177
|
-
# Touch up Table
|
177
|
+
# Touch up Table:
|
178
178
|
# Splits Table into multiple headered tables.
|
179
179
|
# Also, strips Left Offset info from Table Cells.
|
180
180
|
# @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
|
@@ -219,7 +219,7 @@ module PDFTDX
|
|
219
219
|
htables
|
220
220
|
end
|
221
221
|
|
222
|
-
# Process
|
222
|
+
# Process:
|
223
223
|
# Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
|
224
224
|
# @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
|
225
225
|
# @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
|
data/lib/pdftdx/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdftdx
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1
|
4
|
+
version: 1.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eresse
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-01
|
11
|
+
date: 2017-03-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
116
|
version: '0'
|
117
117
|
requirements: []
|
118
118
|
rubyforge_project:
|
119
|
-
rubygems_version: 2.
|
119
|
+
rubygems_version: 2.5.1
|
120
120
|
signing_key:
|
121
121
|
specification_version: 4
|
122
122
|
summary: Simple PDF Table Data Extractor
|