pdftdx 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 625c68c5b06fc15adae4491dc2d68d2a041ff07a
4
- data.tar.gz: f06e7f4bbefed6d394a023e4208390f79d939aee
3
+ metadata.gz: a243b5d1cd30ce908060382654ca48b3ae97c242
4
+ data.tar.gz: ee753466c9b422f8fbcc301e5b93a8f6f61f3168
5
5
  SHA512:
6
- metadata.gz: 94935c3774765d6a4e7c520ff773b58b8a43c64c29dae883d9709aac84867530d1cb019e0a1f7099c50182c4842a075ed7d82841958b8affa4c6cfda94ad3fc9
7
- data.tar.gz: 4097b0496dc13e6c54138bcbacf3f5128d6c44c2a8b3f96e92ef29bdd34fda865e9ac628cc16199ddfa84fda57b047907a8270ed39358bda5917dc81e3726178
6
+ metadata.gz: 98b3420792f93b5bcdc59b16499be930ef52087014da36b6f63a582f013742f9af6081412958f44502e36e3c9710f6c634762087ff7e795531aaec8c2c67d671
7
+ data.tar.gz: 3704fb9ff4661dca80a0a6cb5fd744af7373a83cf870937ce8d799f977db0e8f67fb0406505328b4863a6e612f30bcce318d12493ed9f15c54871c24dd56ca05
data/lib/pdftdx.rb CHANGED
@@ -9,10 +9,12 @@ require 'pdftohtml'
9
9
  require 'pdftdx/parser'
10
10
  require 'pdftdx/version'
11
11
 
12
- # PDF TDX Module
12
+ # PDF TDX Module:
13
+ # Root Module for Pdftdx.
13
14
  module PDFTDX
14
15
 
15
- # Extract Data from PDF
16
+ # Extract Data from PDF:
17
+ # Converts a PDF file to HTML format and then extracts anything that looks like tabular data.
16
18
  # @param [String] pdf_file Path to a PDF file
17
19
  # @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
18
20
  def self.extract_data pdf_file
data/lib/pdftdx/parser.rb CHANGED
@@ -28,7 +28,7 @@ module PDFTDX
28
28
  # Title Cell Regex
29
29
  TITLE_CELL_REGEX = /<b>/
30
30
 
31
- # Is All Same Data
31
+ # Is All Same Data:
32
32
  # Determine whether a row's cells all contain the same data.
33
33
  # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
34
34
  # @return [Boolean] True if all cells contain the same data, False otherwise.
@@ -37,7 +37,7 @@ module PDFTDX
37
37
  row_data.inject(true) { |b, e| b && (e[1] == n) }
38
38
  end
39
39
 
40
- # Contains Unusable Data (Empty / Long Strings)
40
+ # Contains Unusable Data (Empty / Long Strings):
41
41
  # Determines whether a row contains unusable data.
42
42
  # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
43
43
  # @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
@@ -45,7 +45,7 @@ module PDFTDX
45
45
  row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
46
46
  end
47
47
 
48
- # HTML Filter
48
+ # HTML Filter:
49
49
  # Replaces HTML newlines by UNIX-style newlines.
50
50
  # @param [String] s A string of HTML data
51
51
  # @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
@@ -53,7 +53,7 @@ module PDFTDX
53
53
  s.gsub '<br/>', "\n"
54
54
  end
55
55
 
56
- # Collect Data
56
+ # Collect Data:
57
57
  # Extracts table-like chunks of HTML data from a hash of HTML pages.
58
58
  # @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
59
59
  # @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
@@ -73,7 +73,7 @@ module PDFTDX
73
73
  end.flatten
74
74
  end
75
75
 
76
- # Build Data Table
76
+ # Build Data Table:
77
77
  # Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
78
78
  # @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
79
79
  # @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
@@ -83,7 +83,7 @@ module PDFTDX
83
83
  table
84
84
  end
85
85
 
86
- # Filter Table Rows
86
+ # Filter Table Rows:
87
87
  # Filters out rows considered unusable, empty, oversize, footers, etc...
88
88
  # Also, strips Top Offset info from Table Rows.
89
89
  # @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
@@ -94,7 +94,7 @@ module PDFTDX
94
94
  .collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
95
95
  end
96
96
 
97
- # Determine Headered Table Length
97
+ # Determine Headered Table Length:
98
98
  # Computes the number of rows to be included in a given headered table.
99
99
  # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
100
100
  # @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
@@ -105,7 +105,7 @@ module PDFTDX
105
105
  (headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
106
106
  end
107
107
 
108
- # Sub Table Length
108
+ # Sub Table Length:
109
109
  # Computes the number of rows to be included in a given sub-table.
110
110
  # @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
111
111
  # @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
@@ -116,7 +116,7 @@ module PDFTDX
116
116
  (stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
117
117
  end
118
118
 
119
- # Sub-Tablize
119
+ # Sub-Tablize:
120
120
  # Splits a table into multiple named tables.
121
121
  # @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
122
122
  # @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
@@ -144,7 +144,7 @@ module PDFTDX
144
144
  stables
145
145
  end
146
146
 
147
- # Sort Row
147
+ # Sort Row:
148
148
  # Sorts Cells according to their x-offset
149
149
  # @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
150
150
  # @return [Hash] The same row of data, but sorted according to x-offset
@@ -152,7 +152,7 @@ module PDFTDX
152
152
  Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
153
153
  end
154
154
 
155
- # Fix Dupes
155
+ # Fix Dupes:
156
156
  # Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
157
157
  # @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
158
158
  # @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
@@ -174,7 +174,7 @@ module PDFTDX
174
174
  nr
175
175
  end
176
176
 
177
- # Touch up Table
177
+ # Touch up Table:
178
178
  # Splits Table into multiple headered tables.
179
179
  # Also, strips Left Offset info from Table Cells.
180
180
  # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
@@ -219,7 +219,7 @@ module PDFTDX
219
219
  htables
220
220
  end
221
221
 
222
- # Process
222
+ # Process:
223
223
  # Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
224
224
  # @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
225
225
  # @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '1.2.1'
8
+ VERSION = '1.2.2'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-01-28 00:00:00.000000000 Z
11
+ date: 2017-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
116
  version: '0'
117
117
  requirements: []
118
118
  rubyforge_project:
119
- rubygems_version: 2.6.10
119
+ rubygems_version: 2.5.1
120
120
  signing_key:
121
121
  specification_version: 4
122
122
  summary: Simple PDF Table Data Extractor