pdftdx 1.2.1 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 625c68c5b06fc15adae4491dc2d68d2a041ff07a
4
- data.tar.gz: f06e7f4bbefed6d394a023e4208390f79d939aee
3
+ metadata.gz: a243b5d1cd30ce908060382654ca48b3ae97c242
4
+ data.tar.gz: ee753466c9b422f8fbcc301e5b93a8f6f61f3168
5
5
  SHA512:
6
- metadata.gz: 94935c3774765d6a4e7c520ff773b58b8a43c64c29dae883d9709aac84867530d1cb019e0a1f7099c50182c4842a075ed7d82841958b8affa4c6cfda94ad3fc9
7
- data.tar.gz: 4097b0496dc13e6c54138bcbacf3f5128d6c44c2a8b3f96e92ef29bdd34fda865e9ac628cc16199ddfa84fda57b047907a8270ed39358bda5917dc81e3726178
6
+ metadata.gz: 98b3420792f93b5bcdc59b16499be930ef52087014da36b6f63a582f013742f9af6081412958f44502e36e3c9710f6c634762087ff7e795531aaec8c2c67d671
7
+ data.tar.gz: 3704fb9ff4661dca80a0a6cb5fd744af7373a83cf870937ce8d799f977db0e8f67fb0406505328b4863a6e612f30bcce318d12493ed9f15c54871c24dd56ca05
data/lib/pdftdx.rb CHANGED
@@ -9,10 +9,12 @@ require 'pdftohtml'
9
9
  require 'pdftdx/parser'
10
10
  require 'pdftdx/version'
11
11
 
12
- # PDF TDX Module
12
+ # PDF TDX Module:
13
+ # Root Module for Pdftdx.
13
14
  module PDFTDX
14
15
 
15
- # Extract Data from PDF
16
+ # Extract Data from PDF:
17
+ # Converts a PDF file to HTML format and then extracts anything that looks like tabular data.
16
18
  # @param [String] pdf_file Path to a PDF file
17
19
  # @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
18
20
  def self.extract_data pdf_file
data/lib/pdftdx/parser.rb CHANGED
@@ -28,7 +28,7 @@ module PDFTDX
28
28
  # Title Cell Regex
29
29
  TITLE_CELL_REGEX = /<b>/
30
30
 
31
- # Is All Same Data
31
+ # Is All Same Data:
32
32
  # Determine whether a row's cells all contain the same data.
33
33
  # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
34
34
  # @return [Boolean] True if all cells contain the same data, False otherwise.
@@ -37,7 +37,7 @@ module PDFTDX
37
37
  row_data.inject(true) { |b, e| b && (e[1] == n) }
38
38
  end
39
39
 
40
- # Contains Unusable Data (Empty / Long Strings)
40
+ # Contains Unusable Data (Empty / Long Strings):
41
41
  # Determines whether a row contains unusable data.
42
42
  # @param [Hash] row_data A hash of table cells, mapped by their offset from the left. Example: { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }
43
43
  # @return [Boolean] True if at least one cell is unusable (empty, oversize), False otherwise
@@ -45,7 +45,7 @@ module PDFTDX
45
45
  row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
46
46
  end
47
47
 
48
- # HTML Filter
48
+ # HTML Filter:
49
49
  # Replaces HTML newlines by UNIX-style newlines.
50
50
  # @param [String] s A string of HTML data
51
51
  # @return [String] The same string of HTML data, with all newlines (<br/> tags) converted to UNIX newlines.
@@ -53,7 +53,7 @@ module PDFTDX
53
53
  s.gsub '<br/>', "\n"
54
54
  end
55
55
 
56
- # Collect Data
56
+ # Collect Data:
57
57
  # Extracts table-like chunks of HTML data from a hash of HTML pages.
58
58
  # @param [Hash] data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
59
59
  # @return [Array] An array of HTML chunks, each represented as a hash containing the chunk position and data. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
@@ -73,7 +73,7 @@ module PDFTDX
73
73
  end.flatten
74
74
  end
75
75
 
76
- # Build Data Table
76
+ # Build Data Table:
77
77
  # Produces an organized Table (in the form of a 2-level nested hash) from an array of HTML chunks.
78
78
  # @param [Array] data An array of document chunks, each represented as a hash containing the position and body of the chunk. Example: [{ top: 10, left: 100, data: 'Machine OS' }, { top: 10, left: 220, data: 'Win32' }, { top: 10, left: 340, data: 'Linux' }, { top: 10, left: 460, data: 'MacOS' }]
79
79
  # @return [Hash] A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
@@ -83,7 +83,7 @@ module PDFTDX
83
83
  table
84
84
  end
85
85
 
86
- # Filter Table Rows
86
+ # Filter Table Rows:
87
87
  # Filters out rows considered unusable, empty, oversize, footers, etc...
88
88
  # Also, strips Top Offset info from Table Rows.
89
89
  # @param [Hash] data A hash of table rows, mapped by their offset from the top, where each row is represented as a hash of table cells, mapped by their offset from the left. Example: { 10 => { 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, 35 => { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' } }
@@ -94,7 +94,7 @@ module PDFTDX
94
94
  .collect { |_top, r| r }.reject { |r| r.size < 2 } # Remove 'top offset' information and re-drop single-element rows
95
95
  end
96
96
 
97
- # Determine Headered Table Length
97
+ # Determine Headered Table Length:
98
98
  # Computes the number of rows to be included in a given headered table.
99
99
  # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
100
100
  # @param [Array] headers An array of header rows, each represented as a hash containing the header row's index within the *table* array, and the actual row data. Example: [{ idx: 0, row: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'] }]
@@ -105,7 +105,7 @@ module PDFTDX
105
105
  (headers[i + 1] ? headers[i + 1][:idx] : table.length) - h[:idx]
106
106
  end
107
107
 
108
- # Sub Table Length
108
+ # Sub Table Length:
109
109
  # Computes the number of rows to be included in a given sub-table.
110
110
  # @param [Array] table An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
111
111
  # @param [Array] stables An array of named tables, each represented as a hash containing the name and its starting index within the *table* array. Example: [{ title: '<b>System Info</b>', idx: 0 }]
@@ -116,7 +116,7 @@ module PDFTDX
116
116
  (stables[i + 1] ? stables[i + 1][:idx] : table.length) - t[:idx]
117
117
  end
118
118
 
119
- # Sub-Tablize
119
+ # Sub-Tablize:
120
120
  # Splits a table into multiple named tables.
121
121
  # @param [Array] htable_data An array of table rows, each represented as an array of table cells. Example: [['<b>System</b>', 'Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']]
122
122
  # @return [Array] An array of named tables, each represented as a hash containing the name and the table itself. May also contain a single array, containing all remaining table data (unnamed). Example: [{ name: 'System', data: [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] }, [['32.40 $', '34.00 $', '88.40 $'], ['21.40 km', '12.00 km', '99.10 km']]]
@@ -144,7 +144,7 @@ module PDFTDX
144
144
  stables
145
145
  end
146
146
 
147
- # Sort Row
147
+ # Sort Row:
148
148
  # Sorts Cells according to their x-offset
149
149
  # @param [Hash] r A row of data in the form { xoffset => cell } (Example: { 120 => 'cell 0', 200 => 'cell 1', 280 => 'cell 2' })
150
150
  # @return [Hash] The same row of data, but sorted according to x-offset
@@ -152,7 +152,7 @@ module PDFTDX
152
152
  Hash[*(r.to_a.sort { |a, b| ((a[0] == b[0]) ? 0 : (a[0] > b[0] ? 1 : -1)) }.flatten)]
153
153
  end
154
154
 
155
- # Fix Dupes
155
+ # Fix Dupes:
156
156
  # Shifts Duplicate Cells (Cells which share their x-offset with others) to the right (so they don't get overwritten)
157
157
  # @param [Array] r A row of data in the form [[xoffset, cell]] (Example: [[120, 'cell 0'], [200, 'cell 1'], [280, 'cell 2']])
158
158
  # @return [Array] The same row of data, but with duplicate cells shifted so that no x-offset-collisions occur
@@ -174,7 +174,7 @@ module PDFTDX
174
174
  nr
175
175
  end
176
176
 
177
- # Touch up Table
177
+ # Touch up Table:
178
178
  # Splits Table into multiple headered tables.
179
179
  # Also, strips Left Offset info from Table Cells.
180
180
  # @param [Array] table An array of table rows, each represented as a hash of table cells, mapped by their offset from the left. Example: [{ 100 => 'Machine OS', 220 => 'Win32', 340 => 'Linux', 460 => 'MacOS' }, { 100 => 'IP Address', 220 => '10.0.232.48', 340 => '10.0.232.134', 460 => '10.0.232.108' }]
@@ -219,7 +219,7 @@ module PDFTDX
219
219
  htables
220
220
  end
221
221
 
222
- # Process
222
+ # Process:
223
223
  # Transforms a hash of page data (as produced by _pdftohtml_) into a usable information table tree structure.
224
224
  # @param [Hash] page_data A hash of document pages, mapped by their page index. Each page is an array of chomp'd lines of HTML data. Example: { 1 => ['<h1>Hello World!</h1>', 'This is page one.'], 2 => ['Wow, another page of data !', '<b>Important stuff</b>', "That's it for page 2 !"] }
225
225
  # @return [Array] An array of tables, each represented as a hash containing an optional header and table data, in the form of either one single array of rows, or a hash of sub-tables (arrays of rows) mapped by name. Table rows are represented as an array of table cells. Example: [{ head: ['trauma.eresse.net', 'durjaya.dooba.io', 'suessmost.eresse.net'], data: { 'System' => [['Machine OS', 'Win32', 'Linux', 'MacOS'], ['IP Address', '10.0.232.48', '10.0.232.134', '10.0.232.108']] } }]
@@ -5,5 +5,5 @@
5
5
  module PDFTDX
6
6
 
7
7
  # Version
8
- VERSION = '1.2.1'
8
+ VERSION = '1.2.2'
9
9
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdftdx
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eresse
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-01-28 00:00:00.000000000 Z
11
+ date: 2017-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
116
  version: '0'
117
117
  requirements: []
118
118
  rubyforge_project:
119
- rubygems_version: 2.6.10
119
+ rubygems_version: 2.5.1
120
120
  signing_key:
121
121
  specification_version: 4
122
122
  summary: Simple PDF Table Data Extractor