pdf_extractor 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.gitlab-ci.yml +44 -0
- data/.rubocop.yml +15 -0
- data/.rubocop/rubocop-all.yml +10 -0
- data/.rubocop/rubocop-bundler.yml +3 -0
- data/.rubocop/rubocop-custom.yml +76 -0
- data/.rubocop/rubocop-gemspec.yml +5 -0
- data/.rubocop/rubocop-layout.yml +227 -0
- data/.rubocop/rubocop-lint.yml +198 -0
- data/.rubocop/rubocop-metrics.yml +46 -0
- data/.rubocop/rubocop-naming.yml +48 -0
- data/.rubocop/rubocop-performance.yml +55 -0
- data/.rubocop/rubocop-rails.yml +97 -0
- data/.rubocop/rubocop-rspec.yml +143 -0
- data/.rubocop/rubocop-security.yml +46 -0
- data/.rubocop/rubocop-style.yml +319 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +61 -0
- data/LICENSE.txt +21 -0
- data/README.md +62 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/pdf_extractor.rb +25 -0
- data/lib/pdf_extractor/output_parser.rb +63 -0
- data/lib/pdf_extractor/pdf_tk.rb +20 -0
- data/lib/pdf_extractor/version.rb +5 -0
- data/pdf_extractor.gemspec +36 -0
- data/script/changelog.sh +41 -0
- data/script/dependencies.sh +17 -0
- data/script/publish.sh +6 -0
- data/script/release.sh +29 -0
- metadata +203 -0
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.4.0
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
pdf_extractor (0.1.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ansi (1.5.0)
|
10
|
+
ast (2.4.0)
|
11
|
+
builder (3.2.3)
|
12
|
+
docile (1.3.2)
|
13
|
+
jaro_winkler (1.5.3)
|
14
|
+
jaro_winkler (1.5.3-java)
|
15
|
+
json (2.2.0)
|
16
|
+
json (2.2.0-java)
|
17
|
+
minitest (5.11.3)
|
18
|
+
minitest-reporters (1.3.6)
|
19
|
+
ansi
|
20
|
+
builder
|
21
|
+
minitest (>= 5.0)
|
22
|
+
ruby-progressbar
|
23
|
+
minitest-reporters-json_reporter (1.0.0)
|
24
|
+
minitest-reporters (~> 1.1, >= 1.1.8)
|
25
|
+
nexus (1.4.0)
|
26
|
+
parallel (1.17.0)
|
27
|
+
parser (2.6.3.0)
|
28
|
+
ast (~> 2.4.0)
|
29
|
+
rainbow (3.0.0)
|
30
|
+
rake (10.5.0)
|
31
|
+
rubocop (0.71.0)
|
32
|
+
jaro_winkler (~> 1.5.1)
|
33
|
+
parallel (~> 1.10)
|
34
|
+
parser (>= 2.6)
|
35
|
+
rainbow (>= 2.2.2, < 4.0)
|
36
|
+
ruby-progressbar (~> 1.7)
|
37
|
+
unicode-display_width (>= 1.4.0, < 1.7)
|
38
|
+
ruby-progressbar (1.10.1)
|
39
|
+
simplecov (0.16.1)
|
40
|
+
docile (~> 1.1)
|
41
|
+
json (>= 1.8, < 3)
|
42
|
+
simplecov-html (~> 0.10.0)
|
43
|
+
simplecov-html (0.10.2)
|
44
|
+
unicode-display_width (1.6.0)
|
45
|
+
|
46
|
+
PLATFORMS
|
47
|
+
java
|
48
|
+
ruby
|
49
|
+
|
50
|
+
DEPENDENCIES
|
51
|
+
bundler (~> 2.0)
|
52
|
+
minitest (~> 5.0)
|
53
|
+
minitest-reporters-json_reporter (~> 1.0, >= 1.0.0)
|
54
|
+
nexus (~> 1.4, >= 1.4.0)
|
55
|
+
pdf_extractor!
|
56
|
+
rake (~> 10.0)
|
57
|
+
rubocop (~> 0.58, >= 0.58.2)
|
58
|
+
simplecov (~> 0.16, >= 0.16.1)
|
59
|
+
|
60
|
+
BUNDLED WITH
|
61
|
+
2.0.2
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 Jindrich Skupa
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# PdfExtractor
|
2
|
+
|
3
|
+
Extracts PDF information via [PDFtk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
|
4
|
+
|
5
|
+
## Dependencies
|
6
|
+
|
7
|
+
* [PDFtk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
```ruby
|
14
|
+
gem 'pdf_extractor'
|
15
|
+
```
|
16
|
+
|
17
|
+
And then execute:
|
18
|
+
|
19
|
+
$ bundle
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
$ gem install pdf_extractor
|
24
|
+
|
25
|
+
## Usage
|
26
|
+
|
27
|
+
```ruby
|
28
|
+
require 'pdf_extractor'
|
29
|
+
form = PdfExtractor.dump_data_fields("~/Downloads/testing_pdf.pdf")
|
30
|
+
puts form.first.inspect
|
31
|
+
# => {"FieldType"=>"Text", "FieldName"=>"customerNumber", "FieldNameAlt"=>"Customer ID number", "FieldFlags"=>"12582912", "FieldValue"=>"ID0000011", "FieldJustification"=>"Left"}
|
32
|
+
```
|
33
|
+
|
34
|
+
or
|
35
|
+
|
36
|
+
```ruby
|
37
|
+
form = PdfExtractor.dump_data_fields_key_value("test/fixtures/testing_pdf.pdf")
|
38
|
+
puts form.inspect
|
39
|
+
# => {"customerNumber"=>"ID0000011"}
|
40
|
+
```
|
41
|
+
|
42
|
+
or
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
meta = PdfExtractor.dump_data("test/fixtures/testing_pdf.pdf")
|
46
|
+
puts meta.inspect
|
47
|
+
# => {"Creator"=>"PDFescape Online - https://www.pdfescape.com", "Title"=>"testing_pdf", "Producer"=>"RAD PDF 3.9.6.0 - https://www.radpdf.com", "ModDate"=>"D:20190625142342Z", "CreationDate"=>"D:20190625142056Z", "PdfID0"=>"fd1f5ca5bedbac4d46fa47241f696430", "PdfID1"=>"fd1f5ca5bedbac4d46fa47241f696430", "NumberOfPages"=>"1", "PageMediaNumber"=>"1", "PageMediaRotation"=>"0", "PageMediaRect"=>"0 0 595.28 841.89", "PageMediaDimensions"=>"595.28 841.89"}
|
48
|
+
```
|
49
|
+
|
50
|
+
## Development
|
51
|
+
|
52
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
53
|
+
|
54
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
55
|
+
|
56
|
+
## Contributing
|
57
|
+
|
58
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/jindrichskupa/pdf_extractor.
|
59
|
+
|
60
|
+
## License
|
61
|
+
|
62
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Interactive console with the gem already loaded, for manual experiments.
require 'bundler/setup'
require 'pdf_extractor'
require 'irb'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console works too; for Pry, add pry to the
# Gemfile and start it instead of IRB:
#   require 'pry'
#   Pry.start

IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pdf_extractor/version'
|
4
|
+
require 'pdf_extractor/pdf_tk'
|
5
|
+
require 'pdf_extractor/output_parser'
|
6
|
+
|
7
|
+
# PdfExtractor module to extract PDF form data
|
8
|
+
# Entry points of the gem: each method runs PDFtk through PdfTk and passes
# the raw output lines to the matching OutputParser method.
module PdfExtractor
  # Gem-specific error class (a StandardError subclass).
  class Error < StandardError; end

  class << self
    # Dumps the form fields of +file+ via PdfTk.dump_data_fields and returns
    # whatever OutputParser.dump_data_fields makes of that output.
    def dump_data_fields(file = nil)
      OutputParser.dump_data_fields(PdfTk.dump_data_fields(file))
    end

    # Same PDFtk call as dump_data_fields, but the output is parsed by
    # OutputParser.dump_data_fields_key_value.
    def dump_data_fields_key_value(file = nil)
      OutputParser.dump_data_fields_key_value(PdfTk.dump_data_fields(file))
    end

    # Dumps the document data of +file+ via PdfTk.dump_data and returns the
    # result of OutputParser.dump_data.
    def dump_data(file = nil)
      OutputParser.dump_data(PdfTk.dump_data(file))
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module PdfExtractor
  # Parses the line-oriented text printed by PDFtk's `dump_data` and
  # `dump_data_fields` operations into plain hashes and arrays.
  #
  # Every parser takes an enumerable of lines (for example the result of
  # IO#readlines) and returns nil when given nil.
  class OutputParser
    # Separator between a key and its value on an output line.
    SEPARATOR = ': '
    # Lines that merely open a section and carry no data themselves.
    SECTION_MARKERS = %w[InfoBegin PageMediaBegin].freeze
    # Line that separates two field records in `dump_data_fields` output.
    RECORD_DELIMITER = '---'

    # Parses `dump_data` output into a single Hash.
    #
    # An `InfoKey: k` / `InfoValue: v` pair (in either order) is stored as
    # `k => v`; every other `key: value` line is stored as-is, a repeated key
    # overwriting the earlier entry. Blank lines are ignored.
    #
    # @param output [Enumerable<String>, nil] raw output lines
    # @return [Hash{String => String, nil}, nil] nil when +output+ is nil
    def self.dump_data(output = nil)
      return if output.nil?

      fields = {}
      pending_key = pending_value = nil
      output.each do |line|
        key, value = split_line(line)
        next if key.nil? || SECTION_MARKERS.include?(key)

        case key
        when 'InfoKey' then pending_key = value
        when 'InfoValue' then pending_value = value
        else fields[key] = value
        end
        # Only an InfoKey/InfoValue line can complete a pair; once stored the
        # pair is reset so it cannot leak into the next Info section.
        next unless both_not_nil?(pending_key, pending_value)

        fields[pending_key] = pending_value
        pending_key = pending_value = nil
      end
      fields
    end

    # Parses `dump_data_fields` output into one Hash per field record.
    #
    # Records are separated by `---` lines; empty records are dropped. A line
    # without a value yields an empty string instead of raising.
    #
    # @param output [Enumerable<String>, nil] raw output lines
    # @return [Array<Hash{String => String}>, nil] nil when +output+ is nil
    def self.dump_data_fields(output = nil)
      return if output.nil?

      fields = []
      field = {}
      output.each do |line|
        if line.strip == RECORD_DELIMITER
          fields << field unless field.empty?
          field = {}
        else
          key, value = split_line(line)
          field[key] = value.to_s unless key.nil?
        end
      end
      fields << field unless field.empty?
      fields
    end

    # Reduces `dump_data_fields` output to a `FieldName => FieldValue` Hash.
    # When a field name occurs more than once the last record wins.
    #
    # @param output [Enumerable<String>, nil] raw output lines
    # @return [Hash{String => String, nil}, nil] nil when +output+ is nil or
    #   contains no field records
    def self.dump_data_fields_key_value(output = nil)
      field_array = dump_data_fields(output)
      return if field_array.nil? || field_array.empty?

      field_array.each_with_object({}) do |field, form|
        form[field['FieldName']] = field['FieldValue']
      end
    end

    # @return [Boolean] true when neither argument is nil
    def self.both_not_nil?(a = nil, b = nil)
      !a.nil? && !b.nil?
    end

    # Splits a line at the FIRST separator only, so values that themselves
    # contain ": " stay intact, and strips surrounding whitespace.
    #
    # @return [Array(String, String), Array(String, nil), nil] key and value
    #   (value is nil when the line has no separator); nil for blank lines
    def self.split_line(line)
      key, value = line.split(SEPARATOR, 2).map(&:strip)
      return if key.nil? || key.empty?

      [key, value]
    end
    private_class_method :split_line
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module PdfExtractor
  # Runs the `pdftk` executable (which must be on PATH) and returns what it
  # prints on standard output.
  class PdfTk
    # Runs `pdftk <file> dump_data_fields`.
    #
    # @param file [String, #to_s, nil] path of the PDF file
    # @return [Array<String>, nil] output lines; nil when +file+ is nil
    def self.dump_data_fields(file = nil)
      run(file, 'dump_data_fields')
    end

    # Runs `pdftk <file> dump_data`.
    #
    # @param file [String, #to_s, nil] path of the PDF file
    # @return [Array<String>, nil] output lines; nil when +file+ is nil
    def self.dump_data(file = nil)
      run(file, 'dump_data')
    end

    # Invokes pdftk with +operation+ on +file+ and reads all output lines.
    #
    # The command is passed as an argument array, so no shell is involved:
    # file names containing spaces or shell metacharacters are handed to
    # pdftk verbatim and cannot inject commands. Because the shell no longer
    # expands `~`, the path is expanded here instead. The block form closes
    # the pipe once the output has been read.
    def self.run(file, operation)
      return if file.nil?

      IO.popen(['pdftk', File.expand_path(file.to_s), operation], &:readlines)
    end
    private_class_method :run
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Gem specification for pdf_extractor.
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'pdf_extractor/version'

Gem::Specification.new do |spec|
  spec.name = 'pdf_extractor'
  spec.version = PdfExtractor::VERSION
  spec.authors = ['Jindrich Skupa']
  spec.email = ['jindrich.skupa@gmail.com']

  spec.summary = 'PDFTk wrapper to extract form fields'
  spec.description = 'PDFTk wrapper to extract form fields'
  # NOTE(review): the README links to .../jindrichskupa/pdf_extractor
  # (underscore) - confirm which repository URL is the right one.
  spec.homepage = 'https://github.com/jindrichskupa/pdf-extractor'
  spec.license = 'MIT'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; test, spec and feature
  # files are left out of the packaged gem.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 2.0'
  spec.add_development_dependency 'minitest', '~> 5.0'
  spec.add_development_dependency 'minitest-reporters-json_reporter', '~> 1.0', '>= 1.0.0'
  spec.add_development_dependency 'nexus', '~> 1.4', '>= 1.4.0'
  spec.add_development_dependency 'rubocop', '~> 0.58', '>= 0.58.2'
  spec.add_development_dependency 'simplecov', '~> 0.16', '>= 0.16.1'
  spec.add_development_dependency 'rake', '~> 10.0'
end
|
data/script/changelog.sh
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/bin/bash
# Prepends a release section (changes and authors since the latest tag) to
# CHANGELOG.MD and prints the version the release should be tagged with.
#
# Usage: changelog.sh <major|minor|patch|none> [special]
#   special  optional suffix, appended to the new version as "-<special>"

change_type="$1"
special="$2"

# Reject an unknown type up front; otherwise the new version would be empty.
case "$change_type" in
  major|minor|patch|none) ;;
  *) echo "missing type: major, minor, patch, none"; exit 1 ;;
esac

old_tag="$(git describe --tags --abbrev=0)"

[ -z "$old_tag" ] && { echo "missing previous tag"; exit 1; }

# Lowercase letters (such as a leading "v") are removed for the version
# arithmetic only; the git ranges below need the tag name exactly as it is.
old_version=$(echo "$old_tag" | tr -d '[:lower:]')

new_version=$(echo "$old_version" | awk -F. -v type="$change_type" -v special="$special" '
{
  if (special != "")
    special = "-" special
  if (type == "patch")
    print $1 "." $2 "." ($3 + 1) special
  else if (type == "minor")
    print $1 "." ($2 + 1) ".0" special
  else if (type == "major")
    print ($1 + 1) ".0.0" special
  else if (type == "none")
    print $1 "." $2 "." $3 special
}')

change_log_changes=$(git log "$old_tag"..HEAD --oneline --no-merges --format=' * %s' | sort | uniq)
change_log_authors=$(git log "$old_tag"..HEAD --oneline --format='* %aN' | sort -u)

[ -f CHANGELOG.MD ] || touch CHANGELOG.MD

# printf '%s' keeps backslashes in commit subjects and author names literal.
{
  printf '# PDF Extractor (%s) RELEASED\n\n' "$new_version"
  printf '%s\n' "$change_log_changes"
  printf '\nAuthors\n\n'
  printf '%s\n\n' "$change_log_authors"
} | cat - CHANGELOG.MD > CHANGELOG.work.MD && mv CHANGELOG.work.MD CHANGELOG.MD

echo "Check updated CHANGELOG.MD: $old_version -> $new_version"
echo "Tag this version with $new_version"
|
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/bin/bash
# Writes the bundled gems and their versions to libraries.json and POSTs that
# file to the package-versions API.
# Reads KRAKEN_APP_TOKEN and CI_COMMIT_TAG from the environment.

{
  echo '{'
  echo '"libraries":['
  separator=""
  # `bundle show` prints a header line, then "  * name (version)" per gem.
  bundle show | tr -d '()' | sed 1d | awk '{print $2" "$3}' | while read gem version; do
    printf ' %s{"bundle": "%s", "version_code": "%s", "type": "Gem"}\n' "$separator" "$gem" "$version"
    separator=","
  done
  echo ']'
  echo '}'
} > libraries.json

curl -H "X-App-Token: ${KRAKEN_APP_TOKEN}" \
  -H "X-App-Version: ${CI_COMMIT_TAG}" \
  -H "Content-Type: application/json" \
  -X POST -d @libraries.json https://versions.eman.cz/api/packages
|
17
|
+
|
data/script/publish.sh
ADDED
data/script/release.sh
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/bin/bash
# Publishes release notes for the tag being built to GitLab: creates the
# release and, when GitLab answers 409 (already exists), updates it instead.
# Reads CI_COMMIT_TAG, CI_PROJECT_NAME, CI_PROJECT_ID and GL_PRIVATE_TOKEN
# from the environment.

current_tag=${CI_COMMIT_TAG}
# -F -x matches the tag literally and as a whole line, so "0.1.1" neither
# matches "0.1.10" nor treats "." as a wildcard. For the very first tag there
# is no line before the match, so prev_tag equals current_tag.
prev_tag=$(git tag | sort --version-sort | grep -F -x -B 1 -- "$current_tag" | head -1)

change_log_header="# ${CI_PROJECT_NAME} (${current_tag}) RELEASED\n"
# The notes are embedded in a JSON string: quotes are removed and real
# newlines are replaced by a literal "\n" sequence.
change_log_changes=$(git log "${prev_tag}..${current_tag}" --oneline --no-merges --format=' * %s' | sort | uniq | sed 's/$/\\n/' | tr -d \"\'\\n)
change_log_authors=$(git log "${prev_tag}..${current_tag}" --format='* %aN\n' | sort -u | tr -d \"\'\\n)

cat << EOF > release.json
{
"tag_name": "$current_tag",
"description": "$change_log_header\n$change_log_changes\nAuthors\n\n$change_log_authors"
}
EOF

release_url="https://gitlab.eman.cz/api/v4/projects/${CI_PROJECT_ID}/repository/tags/${CI_COMMIT_TAG}/release"

# Sends release.json with the given HTTP method and prints the status code.
send_release() {
  curl -s -o /dev/null -w "%{http_code}" -X "$1" \
    -H "PRIVATE-TOKEN: $GL_PRIVATE_TOKEN" \
    -H 'Content-type: application/json' \
    -d @release.json \
    "$release_url"
}

RET=$(send_release POST)

[ "$RET" -eq 409 ] && RET=$(send_release PUT)

# The script always exits 0, whatever status GitLab returned.
exit 0
|