pdftdx 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 87f8818e7d617b36b12f835b98aa2b93acf47a1d
4
+ data.tar.gz: 743387ff99d01c64d164c864ebef401802529c63
5
+ SHA512:
6
+ metadata.gz: 0742c4e3b4ccc06926a1ff2a71721ed5c545078f54b67b0554364d8c405970ea45788c37f33c1d6426700099feedd4ab7bc8dd6a67f5383c136ffefc3ae59a2c
7
+ data.tar.gz: 4f85e5f1bcb20146fc1223afa80c6993e2fc379193e246bcf7b34c2422620ce9e8674e4262a55e9edfe2afb59cdc572685b351a091f64056450570e1b4ffe5d7
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .idea
data/.idea/.rakeTasks ADDED
@@ -0,0 +1,7 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <Settings><!--This file was automatically generated by Ruby plugin.
3
+ You are allowed to:
4
+ 1. Remove rake task
5
+ 2. Add existing rake tasks
6
+ To add existing rake tasks automatically delete this file and reload the project.
7
+ --><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.1.0.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.1.0 and build and push pdftdx-0.1.0.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
@@ -0,0 +1,10 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
5
+ <option name="processCode" value="true" />
6
+ <option name="processLiterals" value="true" />
7
+ <option name="processComments" value="true" />
8
+ </inspection_tool>
9
+ </profile>
10
+ </component>
@@ -0,0 +1,7 @@
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="PROJECT_PROFILE" value="Project Default" />
4
+ <option name="USE_PROJECT_PROFILE" value="true" />
5
+ <version value="1.0" />
6
+ </settings>
7
+ </component>
data/.idea/misc.xml ADDED
@@ -0,0 +1,14 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectLevelVcsManager" settingsEditedManually="false">
4
+ <OptionsSetting value="true" id="Add" />
5
+ <OptionsSetting value="true" id="Remove" />
6
+ <OptionsSetting value="true" id="Checkout" />
7
+ <OptionsSetting value="true" id="Update" />
8
+ <OptionsSetting value="true" id="Status" />
9
+ <OptionsSetting value="true" id="Edit" />
10
+ <ConfirmationsSetting value="0" id="Add" />
11
+ <ConfirmationsSetting value="0" id="Remove" />
12
+ </component>
13
+ <component name="ProjectRootManager" version="2" project-jdk-name="RVM: ruby-2.3.0 [global]" project-jdk-type="RUBY_SDK" />
14
+ </project>
data/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/pdftdx.iml" filepath="$PROJECT_DIR$/.idea/pdftdx.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
data/.idea/pdftdx.iml ADDED
@@ -0,0 +1,11 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="RUBY_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
8
+ <orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
9
+ <orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
10
+ </component>
11
+ </module>
data/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pdftdx (0.1.0)
5
+ pdftohtml
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ pdftohtml (0.2.1)
11
+ rake (10.5.0)
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ bundler (~> 1.12)
18
+ pdftdx!
19
+ rake (~> 10.0)
20
+
21
+ BUNDLED WITH
22
+ 1.12.5
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Paul Duncan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,7 @@
1
+ # PDF Table Data Extractor
2
+
3
+ A simple tool to extract table data from PDF files.
4
+
5
+ ## License
6
+
7
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ task :default => :spec
data/bin/pdftdx ADDED
@@ -0,0 +1,26 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # PDF Table Data Extractor
4
+ # by Eresse <eresse@eresse.net>
5
+
6
+ # Internal Includes
7
+ require 'pdftdx'
8
+
9
+ # Usage
10
+ def usage
11
+ puts "Usage: #{$0} <PDF_FILE>"
12
+ exit 1
13
+ end
14
+
15
+ # Main
16
+ def main args
17
+
18
+ # Check Args
19
+ usage unless args.length == 1
20
+
21
+ # Extract Data from provided PDF File\
22
+ PDFTDX.extract_data args[0]
23
+ end
24
+
25
+ # Call Main
26
+ main ARGV
@@ -0,0 +1,80 @@
1
+ # PDF Table Data Extractor
2
+ # by Eresse <eresse@eresse.net>
3
+
4
+ # External Includes
5
+ require 'htmlentities'
6
+
7
+ # Internal Includes
8
+ require 'pdftdx/version'
9
+
10
+ # PDF TDX Module
11
+ module PDFTDX
12
+
13
+ # Parser Module
14
+ module Parser
15
+
16
+ # Line Regex
17
+ LINE_REGEX = /^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/
18
+
19
+ # Maximum Cell Length (to be considered usable data)
20
+ MAX_CELL_LEN = 100
21
+
22
+ # Page Offset
23
+ PAGE_OFF = 10000
24
+
25
+ # Title Cell Regex
26
+ TITLE_CELL_REGEX = /<bbb>/
27
+
28
+ # Check Same Line: true when the entries at idx_a and idx_b of data have equal :top values
29
+ def self.same_line data, idx_a, idx_b
30
+ data[idx_a][:top] == data[idx_b][:top]
31
+ end
32
+
33
+ # Is All Same Data
34
+ def self.is_all_same row_data
35
+ n = row_data[row_data.keys[0]]
36
+ row_data.inject(true) { |b, e| b && (e[1] == n) }
37
+ end
38
+
39
+ # Contains Unusable Data (Empty / Long Strings)
40
+ def self.contains_unusable row_data
41
+ row_data.inject(false) { |b, e| b || (e[1].length == 0) || (e[1].length > MAX_CELL_LEN) }
42
+ end
43
+
44
+ # Process Data
45
+ def self.process_data data
46
+
47
+ # Build Data Table
48
+ table = {}
49
+ data.each { |d| table[d[:top]] ||= {}; table[d[:top]][d[:left]] = d[:data] }
50
+
51
+ # Filter Table Rows (Remove Lone Elements & Footers)
52
+ table.reject! { |top, row| row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row) }
53
+
54
+ # Filter Table Cells
55
+ table = table.collect { |_top, r| r.reject { |_left, d| TITLE_CELL_REGEX =~ d } }.reject { |r| r.size < 1 }
56
+
57
+ # Cleanup Table ( IS THIS NECESSARY ? )
58
+ table.reject! { |r| r.size < 2 }
59
+
60
+ # DEBUG
61
+ puts "=============> #{table}"
62
+ end
63
+
64
+ # HTML Filter
65
+ def self.hfilter s
66
+ s.gsub '<br/>', "\n"
67
+ end
68
+
69
+ # Process Page Files
70
+ def self.process_page_files page_data
71
+
72
+ # Build HTML Entity Decoder
73
+ coder = HTMLEntities.new
74
+
75
+ # Collect & Process File Data
76
+ off = 0
77
+ process page_data.collect { |_idx, page| off = off + PAGE_OFF; page.select { |l| LINE_REGEX =~ l }.collect { |l| LINE_REGEX.match l }.collect { |d| { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) } } }.flatten
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,9 @@
1
+ # PDF Table Data Extractor
2
+ # by Eresse <eresse@eresse.net>
3
+
4
+ # PDF TDX Module
5
+ module PDFTDX
6
+
7
+ # Version
8
+ VERSION = '0.1.0'
9
+ end
data/lib/pdftdx.rb ADDED
@@ -0,0 +1,25 @@
1
+ # PDF Table Data Extractor
2
+ # by Eresse <eresse@eresse.net>
3
+
4
+ # External Includes
5
+ require 'htmlentities'
6
+ require 'pdftohtml'
7
+
8
+ # Internal Includes (NOTE(review): lib/pdftdx/dumper.rb is not in the packaged file list - confirm it ships)
9
+ require 'pdftdx/dumper'
10
+ require 'pdftdx/parser'
11
+ require 'pdftdx/version'
12
+
13
+ # PDF TDX Module
14
+ module PDFTDX
15
+
16
+ # Extract Data from PDF
17
+ def self.extract_data pdf_file
18
+
19
+ # Dump PDF Data
20
+ page_data = Pdftohtml.convert pdf_file
21
+
22
+ # Process Page Data
23
+ PDFTDX::Parser.process_page_files page_data
24
+ end
25
+ end
data/pdftdx.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'pdftdx/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "pdftdx"
8
+ spec.version = PDFTDX::VERSION
9
+ spec.authors = ["Eresse"]
10
+ spec.email = ["eresse@eresse.net"]
11
+
12
+ spec.summary = "Simple PDF Table Data Extractor"
13
+ spec.description = "Simple tool to extract Table Data from PDFs"
14
+ spec.homepage = "http://redmine.eresse.net/projects/pdftdx"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_development_dependency "bundler", "~> 1.12"
23
+ spec.add_development_dependency "rake", "~> 10.0"
24
+ spec.add_runtime_dependency "pdftohtml"
25
+ end
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdftdx
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Eresse
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-12-09 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.12'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.12'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pdftohtml
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Simple tool to extract Table Data from PDFs
56
+ email:
57
+ - eresse@eresse.net
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".idea/.rakeTasks"
64
+ - ".idea/inspectionProfiles/Project_Default.xml"
65
+ - ".idea/inspectionProfiles/profiles_settings.xml"
66
+ - ".idea/misc.xml"
67
+ - ".idea/modules.xml"
68
+ - ".idea/pdftdx.iml"
69
+ - ".idea/vcs.xml"
70
+ - Gemfile
71
+ - Gemfile.lock
72
+ - LICENSE.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/pdftdx
76
+ - lib/pdftdx.rb
77
+ - lib/pdftdx/parser.rb
78
+ - lib/pdftdx/version.rb
79
+ - pdftdx.gemspec
80
+ homepage: http://redmine.eresse.net/projects/pdftdx
81
+ licenses:
82
+ - MIT
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.5.1
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: Simple PDF Table Data Extractor
104
+ test_files: []