pdftdx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 87f8818e7d617b36b12f835b98aa2b93acf47a1d
4
+ data.tar.gz: 743387ff99d01c64d164c864ebef401802529c63
5
+ SHA512:
6
+ metadata.gz: 0742c4e3b4ccc06926a1ff2a71721ed5c545078f54b67b0554364d8c405970ea45788c37f33c1d6426700099feedd4ab7bc8dd6a67f5383c136ffefc3ae59a2c
7
+ data.tar.gz: 4f85e5f1bcb20146fc1223afa80c6993e2fc379193e246bcf7b34c2422620ce9e8674e4262a55e9edfe2afb59cdc572685b351a091f64056450570e1b4ffe5d7
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .idea
data/.idea/.rakeTasks ADDED
@@ -0,0 +1,7 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <Settings><!--This file was automatically generated by Ruby plugin.
3
+ You are allowed to:
4
+ 1. Remove rake task
5
+ 2. Add existing rake tasks
6
+ To add existing rake tasks automatically delete this file and reload the project.
7
+ --><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.1.0.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.1.0 and build and push pdftdx-0.1.0.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
@@ -0,0 +1,10 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
5
+ <option name="processCode" value="true" />
6
+ <option name="processLiterals" value="true" />
7
+ <option name="processComments" value="true" />
8
+ </inspection_tool>
9
+ </profile>
10
+ </component>
@@ -0,0 +1,7 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="PROJECT_PROFILE" value="Project Default" />
4
+ <option name="USE_PROJECT_PROFILE" value="true" />
5
+ <version value="1.0" />
6
+ </settings>
7
+ </component>
data/.idea/misc.xml ADDED
@@ -0,0 +1,14 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectLevelVcsManager" settingsEditedManually="false">
4
+ <OptionsSetting value="true" id="Add" />
5
+ <OptionsSetting value="true" id="Remove" />
6
+ <OptionsSetting value="true" id="Checkout" />
7
+ <OptionsSetting value="true" id="Update" />
8
+ <OptionsSetting value="true" id="Status" />
9
+ <OptionsSetting value="true" id="Edit" />
10
+ <ConfirmationsSetting value="0" id="Add" />
11
+ <ConfirmationsSetting value="0" id="Remove" />
12
+ </component>
13
+ <component name="ProjectRootManager" version="2" project-jdk-name="RVM: ruby-2.3.0 [global]" project-jdk-type="RUBY_SDK" />
14
+ </project>
data/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/pdftdx.iml" filepath="$PROJECT_DIR$/.idea/pdftdx.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
data/.idea/pdftdx.iml ADDED
@@ -0,0 +1,11 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="RUBY_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
8
+ <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
9
+ <orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
10
+ </component>
11
+ </module>
data/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pdftdx (0.1.0)
5
+ pdftohtml
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ pdftohtml (0.2.1)
11
+ rake (10.5.0)
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ bundler (~> 1.12)
18
+ pdftdx!
19
+ rake (~> 10.0)
20
+
21
+ BUNDLED WITH
22
+ 1.12.5
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Paul Duncan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # PDF Table Data Extractor
2
+
3
+ Simple tool to extract Table Data from PDFs
4
+
5
+ ## License
6
+
7
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ task :default => :spec
data/bin/pdftdx ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # PDF Table Data Extractor
4
+ # by Eresse <eresse@eresse.net>
5
+
6
+ # Internal Includes
7
+ require 'pdftdx'
8
+
9
+ # Usage: print the expected command line to stdout and exit with status 1
10
+ def usage
11
+ puts "Usage: #{$0} <PDF_FILE>"
12
+ exit 1
13
+ end
14
+
15
+ # Main: expects args to hold exactly one element, the path of the PDF file
16
+ def main args
17
+
18
+ # Check Args (anything other than exactly one argument prints usage and exits)
19
+ usage unless args.length == 1
20
+
21
+ # Extract Data from provided PDF File
22
+ PDFTDX.extract_data args[0]
23
+ end
24
+
25
+ # Call Main
26
+ main ARGV
@@ -0,0 +1,80 @@
1
+ # PDF Table Data Extractor
2
+ # by Eresse <eresse@eresse.net>
3
+
4
+ # External Includes
5
+ require 'htmlentities'
6
+
7
+ # Internal Includes
8
+ require 'pdftdx/version'
9
+
10
+ # PDF TDX Module
11
+ module PDFTDX
12
+
13
+ # Parser Module
14
+ module Parser
15
+
16
+ # Line Regex
17
+ LINE_REGEX = /^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/
18
+
19
+ # Maximum Cell Length (to be considered usable data)
20
+ MAX_CELL_LEN = 100
21
+
22
+ # Page Offset
23
+ PAGE_OFF = 10000
24
+
25
+ # Title Cell Regex
26
+ TITLE_CELL_REGEX = /<bbb>/
27
+
28
+ # Check Same Line: true when the entries at idx_a and idx_b of data have equal :top values
29
+ def self.same_line data, idx_a, idx_b
30
+ data[idx_a][:top] == data[idx_b][:top]
31
+ end
32
+
33
+ # Is All Same Data: true when every value in row_data equals the value stored under its first key
34
+ def self.is_all_same row_data
35
+ n = row_data[row_data.keys[0]] # reference value: the first cell of the row
36
+ row_data.inject(true) { |b, e| b && (e[1] == n) } # e is a [key, value] pair
37
+ end
38
+
39
+ # Contains Unusable Data: true when any value in row_data is empty or longer than MAX_CELL_LEN
40
+ def self.contains_unusable row_data
41
+ row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
42
+ end
43
+
44
+ # Process Data: turn a flat list of { top:, left:, data: } cells into filtered table rows and print them
45
+ def self.process_data data
46
+
47
+ # Build Data Table (rows keyed by :top, cells within a row keyed by :left)
48
+ table = {}
49
+ data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
50
+
51
+ # Filter Table Rows (drop lone elements, rows at page-relative top >= 1110 treated as footers, uniform rows, rows with empty / over-long cells)
52
+ table.reject! { |top, row| row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row) }
53
+
54
+ # Filter Table Cells (drop cells matching TITLE_CELL_REGEX, then rows left empty; table becomes an Array of row hashes)
55
+ table = table.collect { |_top, r| r.reject { |_left, d| TITLE_CELL_REGEX =~ d } }.reject { |r| r.size < 1 }
56
+
57
+ # Cleanup Table (rows reduced to a single cell by the cell filter above are dropped here)
58
+ table.reject! { |r| r.size < 2 }
59
+
60
+ # DEBUG (prints the result; puts returns nil, so this method returns nil)
61
+ puts "=============> #{table}"
62
+ end
63
+
64
+ # HTML Filter: return a copy of s with every '<br/>' replaced by a newline
65
+ def self.hfilter s
66
+ s.gsub '<br/>', "\n"
67
+ end
68
+
69
+ # Process Page Files
70
+ def self.process_page_files page_data
71
+
72
+ # Build HTML Entity Decoder
73
+ coder = HTMLEntities.new
74
+
75
+ # Collect & Process File Data
76
+ off = 0
77
+ process page_data.collect { |_idx, page| off = off + PAGE_OFF; page.select { |l| LINE_REGEX =~ l }.collect { |l| LINE_REGEX.match l }.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } }.flatten
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,9 @@
1
+ # PDF Table Data Extractor
2
+ # by Eresse <eresse@eresse.net>
3
+
4
+ # PDF TDX Module
5
+ module PDFTDX
6
+
7
+ # Version (read by pdftdx.gemspec as PDFTDX::VERSION)
8
+ VERSION = '0.1.0'
9
+ end
data/lib/pdftdx.rb ADDED
@@ -0,0 +1,25 @@
1
+ # PDF Table Data Extractor
2
+ # by Eresse <eresse@eresse.net>
3
+
4
+ # External Includes
5
+ require 'htmlentities'
6
+ require 'pdftohtml'
7
+
8
+ # Internal Includes
9
+ require 'pdftdx/dumper' # NOTE(review): lib/pdftdx/dumper.rb is not in the gem's packaged file list - confirm it ships, otherwise this raises LoadError
10
+ require 'pdftdx/parser'
11
+ require 'pdftdx/version'
12
+
13
+ # PDF TDX Module
14
+ module PDFTDX
15
+
16
+ # Extract Data from PDF: convert pdf_file with Pdftohtml and run the parser on the result
17
+ def self.extract_data pdf_file
18
+
19
+ # Dump PDF Data (presumably one collection of HTML lines per page - verify against Pdftohtml.convert)
20
+ page_data = Pdftohtml.convert pdf_file
21
+
22
+ # Process Page Data (returns whatever Parser.process_page_files returns)
23
+ PDFTDX::Parser.process_page_files page_data
24
+ end
25
+ end
data/pdftdx.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'pdftdx/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "pdftdx"
8
+ spec.version = PDFTDX::VERSION
9
+ spec.authors = ["Eresse"]
10
+ spec.email = ["eresse@eresse.net"]
11
+
12
+ spec.summary = "Simple PDF Table Data Extractor"
13
+ spec.description = "Simple tool to extract Table Data from PDFs"
14
+ spec.homepage = "http://redmine.eresse.net/projects/pdftdx"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_development_dependency "bundler", "~> 1.12"
23
+ spec.add_development_dependency "rake", "~> 10.0"
24
+ spec.add_runtime_dependency "pdftohtml"
25
+ end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdftdx
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Eresse
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-12-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.12'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.12'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pdftohtml
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Simple tool to extract Table Data from PDFs
56
+ email:
57
+ - eresse@eresse.net
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".idea/.rakeTasks"
64
+ - ".idea/inspectionProfiles/Project_Default.xml"
65
+ - ".idea/inspectionProfiles/profiles_settings.xml"
66
+ - ".idea/misc.xml"
67
+ - ".idea/modules.xml"
68
+ - ".idea/pdftdx.iml"
69
+ - ".idea/vcs.xml"
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/pdftdx
76
+ - lib/pdftdx.rb
77
+ - lib/pdftdx/parser.rb
78
+ - lib/pdftdx/version.rb
79
+ - pdftdx.gemspec
80
+ homepage: http://redmine.eresse.net/projects/pdftdx
81
+ licenses:
82
+ - MIT
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.5.1
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: Simple PDF Table Data Extractor
104
+ test_files: []