pdftdx 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.idea/.rakeTasks +7 -0
- data/.idea/inspectionProfiles/Project_Default.xml +10 -0
- data/.idea/inspectionProfiles/profiles_settings.xml +7 -0
- data/.idea/misc.xml +14 -0
- data/.idea/modules.xml +8 -0
- data/.idea/pdftdx.iml +11 -0
- data/.idea/vcs.xml +6 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +7 -0
- data/Rakefile +2 -0
- data/bin/pdftdx +26 -0
- data/lib/pdftdx/parser.rb +80 -0
- data/lib/pdftdx/version.rb +9 -0
- data/lib/pdftdx.rb +25 -0
- data/pdftdx.gemspec +25 -0
- metadata +104 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 87f8818e7d617b36b12f835b98aa2b93acf47a1d
|
|
4
|
+
data.tar.gz: 743387ff99d01c64d164c864ebef401802529c63
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 0742c4e3b4ccc06926a1ff2a71721ed5c545078f54b67b0554364d8c405970ea45788c37f33c1d6426700099feedd4ab7bc8dd6a67f5383c136ffefc3ae59a2c
|
|
7
|
+
data.tar.gz: 4f85e5f1bcb20146fc1223afa80c6993e2fc379193e246bcf7b34c2422620ce9e8674e4262a55e9edfe2afb59cdc572685b351a091f64056450570e1b4ffe5d7
|
data/.gitignore
ADDED
data/.idea/.rakeTasks
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<Settings><!--This file was automatically generated by Ruby plugin.
|
|
3
|
+
You are allowed to:
|
|
4
|
+
1. Remove rake task
|
|
5
|
+
2. Add existing rake tasks
|
|
6
|
+
To add existing rake tasks automatically delete this file and reload the project.
|
|
7
|
+
--><RakeGroup description="" fullCmd="" taksId="rake"><RakeTask description="Build pdftdx-0.1.0.gem into the pkg directory" fullCmd="build" taksId="build" /><RakeTask description="Remove any temporary products" fullCmd="clean" taksId="clean" /><RakeTask description="Remove any generated files" fullCmd="clobber" taksId="clobber" /><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems" fullCmd="install" taksId="install" /><RakeGroup description="" fullCmd="" taksId="install"><RakeTask description="Build and install pdftdx-0.1.0.gem into system gems without network access" fullCmd="install:local" taksId="local" /></RakeGroup><RakeTask description="Create tag v0.1.0 and build and push pdftdx-0.1.0.gem to Rubygems" fullCmd="release[remote]" taksId="release[remote]" /><RakeTask description="" fullCmd="default" taksId="default" /><RakeTask description="" fullCmd="release" taksId="release" /><RakeGroup description="" fullCmd="" taksId="release"><RakeTask description="" fullCmd="release:guard_clean" taksId="guard_clean" /><RakeTask description="" fullCmd="release:rubygem_push" taksId="rubygem_push" /><RakeTask description="" fullCmd="release:source_control_push" taksId="source_control_push" /></RakeGroup></RakeGroup></Settings>
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
<component name="InspectionProjectProfileManager">
|
|
2
|
+
<profile version="1.0">
|
|
3
|
+
<option name="myName" value="Project Default" />
|
|
4
|
+
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
|
5
|
+
<option name="processCode" value="true" />
|
|
6
|
+
<option name="processLiterals" value="true" />
|
|
7
|
+
<option name="processComments" value="true" />
|
|
8
|
+
</inspection_tool>
|
|
9
|
+
</profile>
|
|
10
|
+
</component>
|
data/.idea/misc.xml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<project version="4">
|
|
3
|
+
<component name="ProjectLevelVcsManager" settingsEditedManually="false">
|
|
4
|
+
<OptionsSetting value="true" id="Add" />
|
|
5
|
+
<OptionsSetting value="true" id="Remove" />
|
|
6
|
+
<OptionsSetting value="true" id="Checkout" />
|
|
7
|
+
<OptionsSetting value="true" id="Update" />
|
|
8
|
+
<OptionsSetting value="true" id="Status" />
|
|
9
|
+
<OptionsSetting value="true" id="Edit" />
|
|
10
|
+
<ConfirmationsSetting value="0" id="Add" />
|
|
11
|
+
<ConfirmationsSetting value="0" id="Remove" />
|
|
12
|
+
</component>
|
|
13
|
+
<component name="ProjectRootManager" version="2" project-jdk-name="RVM: ruby-2.3.0 [global]" project-jdk-type="RUBY_SDK" />
|
|
14
|
+
</project>
|
data/.idea/modules.xml
ADDED
data/.idea/pdftdx.iml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<module type="RUBY_MODULE" version="4">
|
|
3
|
+
<component name="NewModuleRootManager">
|
|
4
|
+
<content url="file://$MODULE_DIR$" />
|
|
5
|
+
<orderEntry type="inheritedJdk" />
|
|
6
|
+
<orderEntry type="sourceFolder" forTests="false" />
|
|
7
|
+
<orderEntry type="library" scope="PROVIDED" name="bundler (v1.12.5, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
8
|
+
<orderEntry type="library" scope="PROVIDED" name="pdftohtml (v0.2.1, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
9
|
+
<orderEntry type="library" scope="PROVIDED" name="rake (v10.5.0, RVM: ruby-2.3.0 [global]) [gem]" level="application" />
|
|
10
|
+
</component>
|
|
11
|
+
</module>
|
data/.idea/vcs.xml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
pdftdx (0.1.0)
|
|
5
|
+
pdftohtml
|
|
6
|
+
|
|
7
|
+
GEM
|
|
8
|
+
remote: https://rubygems.org/
|
|
9
|
+
specs:
|
|
10
|
+
pdftohtml (0.2.1)
|
|
11
|
+
rake (10.5.0)
|
|
12
|
+
|
|
13
|
+
PLATFORMS
|
|
14
|
+
ruby
|
|
15
|
+
|
|
16
|
+
DEPENDENCIES
|
|
17
|
+
bundler (~> 1.12)
|
|
18
|
+
pdftdx!
|
|
19
|
+
rake (~> 10.0)
|
|
20
|
+
|
|
21
|
+
BUNDLED WITH
|
|
22
|
+
1.12.5
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2016 Paul Duncan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
data/Rakefile
ADDED
data/bin/pdftdx
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
# PDF Table Data Extractor
|
|
4
|
+
# by Eresse <eresse@eresse.net>
|
|
5
|
+
|
|
6
|
+
# Internal Includes
|
|
7
|
+
require 'pdftdx'
|
|
8
|
+
|
|
9
|
+
# Usage
|
|
10
|
+
# Prints the expected command line to standard output, then terminates
# the process with exit status 1.
def usage
  $stdout.puts("Usage: #{$PROGRAM_NAME} <PDF_FILE>")
  exit(1)
end
|
|
14
|
+
|
|
15
|
+
# Main
|
|
16
|
+
# Entry point of the command line tool.
# args - the command line arguments; exactly one is expected (the PDF file path).
def main args

  # Anything other than exactly one argument prints the usage text and exits
  usage if args.length != 1

  # Extract Data from provided PDF File
  pdf_file = args[0]
  PDFTDX.extract_data pdf_file
end
|
|
24
|
+
|
|
25
|
+
# Call Main
|
|
26
|
+
main(ARGV)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# PDF Table Data Extractor
|
|
2
|
+
# by Eresse <eresse@eresse.net>
|
|
3
|
+
|
|
4
|
+
# External Includes
|
|
5
|
+
require 'htmlentities'
|
|
6
|
+
|
|
7
|
+
# Internal Includes
|
|
8
|
+
require 'pdftdx/version'
|
|
9
|
+
|
|
10
|
+
# PDF TDX Module
|
|
11
|
+
module PDFTDX

  # Parser Module
  # Turns the HTML lines of a converted PDF into table rows, using the
  # absolute `top` / `left` pixel positions found on each <p> element.
  module Parser

    # Line Regex
    # Captures: 1 = top (px), 2 = left (px), 3 = inner HTML of the <p> element.
    LINE_REGEX = /^<p style[^>]+top:([0-9]+)px[^>]+left:([0-9]+)px[^>]+>(.*)<\/p>/

    # Maximum Cell Length (to be considered usable data)
    MAX_CELL_LEN = 100

    # Page Offset
    # Added once per page to every `top` value, so rows of different pages
    # land under different keys as long as a page's own `top` stays below it.
    PAGE_OFF = 10000

    # Title Cell Regex
    # NOTE(review): matches the literal text "<bbb>" - confirm this is the intended marker.
    TITLE_CELL_REGEX = /<bbb>/

    # Check Same Line
    # Returns true when the entries at idx_a and idx_b share the same :top value.
    def self.same_line data, idx_a, idx_b
      data[idx_a][:top] == data[idx_b][:top]
    end

    # Is All Same Data
    # Returns true when every cell of the row equals the first cell
    # (an empty row counts as "all same").
    def self.is_all_same row_data
      first = row_data[row_data.keys[0]]
      row_data.all? { |_left, cell| cell == first }
    end

    # Contains Unusable Data (Empty / Long Strings)
    # Returns true when at least one cell is empty or longer than MAX_CELL_LEN.
    def self.contains_unusable row_data
      row_data.any? { |_left, cell| cell.length == 0 || cell.length > MAX_CELL_LEN }
    end

    # Process Data
    # data - Array of hashes shaped like { top: Integer, left: Integer, data: String }.
    # Returns an Array of rows, each row a Hash of left => cell text.
    # The resulting rows are also printed as debug output.
    def self.process_data data

      # Build Data Table (top => { left => cell text })
      table = {}
      data.each { |d| (table[d[:top]] ||= {})[d[:left]] = d[:data] }

      # Filter Table Rows (Remove Lone Elements & Footers)
      table.reject! { |top, row| row.size < 2 || (top % PAGE_OFF) >= 1110 || is_all_same(row) || contains_unusable(row) }

      # Filter Table Cells (drop title cells)
      rows = table.collect { |_top, r| r.reject { |_left, d| TITLE_CELL_REGEX =~ d } }

      # Cleanup Table (rows that lost cells above may now hold fewer than two)
      rows.reject! { |r| r.size < 2 }

      # DEBUG
      puts "=============> #{rows}"

      # Hand the rows back to the caller instead of the nil returned by puts
      rows
    end

    # HTML Filter
    # Replaces every '<br/>' with a newline.
    def self.hfilter s
      s.gsub '<br/>', "\n"
    end

    # Process Page Files
    # page_data - enumerable yielding [index, lines] pairs, one per page.
    # Returns the rows produced by process_data for all pages together.
    def self.process_page_files page_data

      # Build HTML Entity Decoder
      coder = HTMLEntities.new

      # Collect & Process File Data
      off = 0
      cells = page_data.collect do |_idx, page|

        # Every page gets its own offset so `top` values stay distinct across pages
        off = off + PAGE_OFF

        # Match each line once; lines that do not match yield nil and are dropped
        page.collect { |l| LINE_REGEX.match l }.compact.collect do |d|
          { top: off + d[1].to_i, left: d[2].to_i, data: hfilter(coder.decode(d[3])) }
        end
      end.flatten

      process_data cells
    end
  end
end
|
data/lib/pdftdx.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# PDF Table Data Extractor
|
|
2
|
+
# by Eresse <eresse@eresse.net>
|
|
3
|
+
|
|
4
|
+
# External Includes
|
|
5
|
+
require 'htmlentities'
|
|
6
|
+
require 'pdftohtml'
|
|
7
|
+
|
|
8
|
+
# Internal Includes
|
|
9
|
+
# NOTE(review): lib/pdftdx/dumper.rb is not among the files packaged with this gem version, so this require presumably raises LoadError - confirm the file is shipped.
require 'pdftdx/dumper'
|
|
10
|
+
require 'pdftdx/parser'
|
|
11
|
+
require 'pdftdx/version'
|
|
12
|
+
|
|
13
|
+
# PDF TDX Module
|
|
14
|
+
module PDFTDX

  # Extract Data from PDF
  # pdf_file - the PDF file handed to Pdftohtml.convert.
  # Returns whatever PDFTDX::Parser.process_page_files returns for the converted pages.
  def self.extract_data pdf_file

    # Dump the PDF and feed the result straight into the parser
    PDFTDX::Parser.process_page_files(Pdftohtml.convert(pdf_file))
  end
end
|
data/pdftdx.gemspec
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
|
+
require 'pdftdx/version'
|
|
5
|
+
|
|
6
|
+
# Gem specification for pdftdx.
Gem::Specification.new do |spec|
  spec.name          = "pdftdx"
  spec.version       = PDFTDX::VERSION
  spec.authors       = ["Eresse"]
  spec.email         = ["eresse@eresse.net"]

  spec.summary       = "Simple PDF Table Data Extractor"
  spec.description   = "Simple tool to extract Table Data from PDFs"
  spec.homepage      = "http://redmine.eresse.net/projects/pdftdx"
  spec.license       = "MIT"

  # Package every git-tracked file except tests
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }

  # The command line tool is shipped as bin/pdftdx, so bindir and the
  # executables filter must both point at bin/ for it to be installed.
  spec.bindir        = "bin"
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.12"
  spec.add_development_dependency "rake", "~> 10.0"

  # The library requires both of these at load time
  spec.add_runtime_dependency "htmlentities"
  spec.add_runtime_dependency "pdftohtml"
end
|
metadata
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: pdftdx
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Eresse
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: exe
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2016-12-09 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.12'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.12'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - "~>"
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '10.0'
|
|
34
|
+
type: :development
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - "~>"
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: '10.0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: pdftohtml
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
description: Simple tool to extract Table Data from PDFs
|
|
56
|
+
email:
|
|
57
|
+
- eresse@eresse.net
|
|
58
|
+
executables: []
|
|
59
|
+
extensions: []
|
|
60
|
+
extra_rdoc_files: []
|
|
61
|
+
files:
|
|
62
|
+
- ".gitignore"
|
|
63
|
+
- ".idea/.rakeTasks"
|
|
64
|
+
- ".idea/inspectionProfiles/Project_Default.xml"
|
|
65
|
+
- ".idea/inspectionProfiles/profiles_settings.xml"
|
|
66
|
+
- ".idea/misc.xml"
|
|
67
|
+
- ".idea/modules.xml"
|
|
68
|
+
- ".idea/pdftdx.iml"
|
|
69
|
+
- ".idea/vcs.xml"
|
|
70
|
+
- Gemfile
|
|
71
|
+
- Gemfile.lock
|
|
72
|
+
- LICENSE.txt
|
|
73
|
+
- README.md
|
|
74
|
+
- Rakefile
|
|
75
|
+
- bin/pdftdx
|
|
76
|
+
- lib/pdftdx.rb
|
|
77
|
+
- lib/pdftdx/parser.rb
|
|
78
|
+
- lib/pdftdx/version.rb
|
|
79
|
+
- pdftdx.gemspec
|
|
80
|
+
homepage: http://redmine.eresse.net/projects/pdftdx
|
|
81
|
+
licenses:
|
|
82
|
+
- MIT
|
|
83
|
+
metadata: {}
|
|
84
|
+
post_install_message:
|
|
85
|
+
rdoc_options: []
|
|
86
|
+
require_paths:
|
|
87
|
+
- lib
|
|
88
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
89
|
+
requirements:
|
|
90
|
+
- - ">="
|
|
91
|
+
- !ruby/object:Gem::Version
|
|
92
|
+
version: '0'
|
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
|
+
requirements:
|
|
95
|
+
- - ">="
|
|
96
|
+
- !ruby/object:Gem::Version
|
|
97
|
+
version: '0'
|
|
98
|
+
requirements: []
|
|
99
|
+
rubyforge_project:
|
|
100
|
+
rubygems_version: 2.5.1
|
|
101
|
+
signing_key:
|
|
102
|
+
specification_version: 4
|
|
103
|
+
summary: Simple PDF Table Data Extractor
|
|
104
|
+
test_files: []
|