pdf_extractor 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
+ 2.4.0
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.4.0
7
+ before_install: gem install bundler -v 2.0.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in pdf_extractor.gemspec
4
+ gemspec
@@ -0,0 +1,61 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pdf_extractor (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ ansi (1.5.0)
10
+ ast (2.4.0)
11
+ builder (3.2.3)
12
+ docile (1.3.2)
13
+ jaro_winkler (1.5.3)
14
+ jaro_winkler (1.5.3-java)
15
+ json (2.2.0)
16
+ json (2.2.0-java)
17
+ minitest (5.11.3)
18
+ minitest-reporters (1.3.6)
19
+ ansi
20
+ builder
21
+ minitest (>= 5.0)
22
+ ruby-progressbar
23
+ minitest-reporters-json_reporter (1.0.0)
24
+ minitest-reporters (~> 1.1, >= 1.1.8)
25
+ nexus (1.4.0)
26
+ parallel (1.17.0)
27
+ parser (2.6.3.0)
28
+ ast (~> 2.4.0)
29
+ rainbow (3.0.0)
30
+ rake (10.5.0)
31
+ rubocop (0.71.0)
32
+ jaro_winkler (~> 1.5.1)
33
+ parallel (~> 1.10)
34
+ parser (>= 2.6)
35
+ rainbow (>= 2.2.2, < 4.0)
36
+ ruby-progressbar (~> 1.7)
37
+ unicode-display_width (>= 1.4.0, < 1.7)
38
+ ruby-progressbar (1.10.1)
39
+ simplecov (0.16.1)
40
+ docile (~> 1.1)
41
+ json (>= 1.8, < 3)
42
+ simplecov-html (~> 0.10.0)
43
+ simplecov-html (0.10.2)
44
+ unicode-display_width (1.6.0)
45
+
46
+ PLATFORMS
47
+ java
48
+ ruby
49
+
50
+ DEPENDENCIES
51
+ bundler (~> 2.0)
52
+ minitest (~> 5.0)
53
+ minitest-reporters-json_reporter (~> 1.0, >= 1.0.0)
54
+ nexus (~> 1.4, >= 1.4.0)
55
+ pdf_extractor!
56
+ rake (~> 10.0)
57
+ rubocop (~> 0.58, >= 0.58.2)
58
+ simplecov (~> 0.16, >= 0.16.1)
59
+
60
+ BUNDLED WITH
61
+ 2.0.2
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Jindrich Skupa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,62 @@
1
+ # PdfExtractor
2
+
3
+ Extracts PDF information via [PDFtk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
4
+
5
+ ## Dependencies
6
+
7
+ * [PDFtk](https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/)
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'pdf_extractor'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install pdf_extractor
24
+
25
+ ## Usage
26
+
27
+ ```ruby
28
+ require 'pdf_extractor'
29
+ form = PdfExtractor.dump_data_fields("~/Downloads/testing_pdf.pdf")
30
+ puts form.first.inspect
31
+ # => {"FieldType"=>"Text", "FieldName"=>"customerNumber", "FieldNameAlt"=>"Customer ID number", "FieldFlags"=>"12582912", "FieldValue"=>"ID0000011", "FieldJustification"=>"Left"}
32
+ ```
33
+
34
+ or
35
+
36
+ ```ruby
37
+ form = PdfExtractor.dump_data_fields_key_value("test/fixtures/testing_pdf.pdf")
38
+ puts form.inspect
39
+ # => {"customerNumber"=>"ID0000011"}
40
+ ```
41
+
42
+ or
43
+
44
+ ```ruby
45
+ meta = PdfExtractor.dump_data("test/fixtures/testing_pdf.pdf")
46
+ puts meta.inspect
47
+ # => {"Creator"=>"PDFescape Online - https://www.pdfescape.com", "Title"=>"testing_pdf", "Producer"=>"RAD PDF 3.9.6.0 - https://www.radpdf.com", "ModDate"=>"D:20190625142342Z", "CreationDate"=>"D:20190625142056Z", "PdfID0"=>"fd1f5ca5bedbac4d46fa47241f696430", "PdfID1"=>"fd1f5ca5bedbac4d46fa47241f696430", "NumberOfPages"=>"1", "PageMediaNumber"=>"1", "PageMediaRotation"=>"0", "PageMediaRect"=>"0 0 595.28 841.89", "PageMediaDimensions"=>"595.28 841.89"}
48
+ ```
49
+
50
+ ## Development
51
+
52
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
53
+
54
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
55
+
56
+ ## Contributing
57
+
58
+ Bug reports and pull requests are welcome on GitHub at https://github.com/jindrichskupa/pdf_extractor.
59
+
60
+ ## License
61
+
62
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Defines the `test` task: puts the test/ and lib/ directories on the load
# path and runs every file matching test/**/*_test.rb.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# Running `rake` without arguments runs the test task.
task default: :test
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Opens an IRB session with the gem's bundle set up and pdf_extractor loaded.

require "bundler/setup"
require "pdf_extractor"
require "irb"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap: installs the gems listed in the Gemfile.

# Abort on errors, unset variables and failures inside pipelines.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only, never on spaces.
IFS=$'\n\t'
# Echo each line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pdf_extractor/version'
4
+ require 'pdf_extractor/pdf_tk'
5
+ require 'pdf_extractor/output_parser'
6
+
7
+ # PdfExtractor module to extract PDF form data
8
# Entry points of the gem. Each method hands the given file to PdfTk and
# passes whatever lines it returns straight to the matching OutputParser
# method, returning the parser's result.
module PdfExtractor
  # Error class namespaced under the gem.
  class Error < StandardError; end

  class << self
    # Form fields of +file+ as parsed by OutputParser.dump_data_fields.
    def dump_data_fields(file = nil)
      OutputParser.dump_data_fields(PdfTk.dump_data_fields(file))
    end

    # Form fields of +file+ as parsed by
    # OutputParser.dump_data_fields_key_value.
    def dump_data_fields_key_value(file = nil)
      OutputParser.dump_data_fields_key_value(PdfTk.dump_data_fields(file))
    end

    # Document data of +file+ as parsed by OutputParser.dump_data.
    def dump_data(file = nil)
      OutputParser.dump_data(PdfTk.dump_data(file))
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module PdfExtractor
  # Turns the line-oriented text printed by PDFtk into plain Ruby hashes.
  #
  # Every public parser takes the PDFtk output as an enumerable of lines (for
  # example the array returned by IO#readlines) and returns nil when it is
  # given nil.
  class OutputParser
    # Lines that only open a section in `dump_data` output and carry no value.
    SECTION_MARKERS = %w[InfoBegin PageMediaBegin].freeze

    # Line that separates two records in `dump_data_fields` output.
    FIELD_DELIMITER = '---'

    # Parses `dump_data` output into a single Hash.
    #
    # An "InfoKey: k" / "InfoValue: v" pair becomes the entry k => v; any
    # other "Key: value" line is stored under its own key, later lines
    # overwriting earlier ones with the same key. Blank lines and section
    # markers are ignored.
    #
    # @param output [Enumerable<String>, nil] lines printed by PDFtk
    # @return [Hash{String => String, nil}, nil] parsed data, nil for nil input
    def self.dump_data(output = nil)
      return if output.nil?

      fields = {}
      pending_key = pending_value = nil
      output.each do |line|
        key, value = parse_line(line)
        next if key.nil? || key.empty? || SECTION_MARKERS.include?(key)

        case key
        when 'InfoKey' then pending_key = value
        when 'InfoValue' then pending_value = value
        else
          fields[key] = value
          next
        end
        # Key and value arrive on separate lines; store the pair once both
        # halves have been seen.
        next unless both_not_nil?(pending_key, pending_value)

        fields[pending_key] = pending_value
        pending_key = pending_value = nil
      end
      fields
    end

    # Parses `dump_data_fields` output into one Hash per form field.
    #
    # Records are separated by "---" lines; empty records are dropped. A line
    # without a value is stored with a nil value instead of raising.
    #
    # @param output [Enumerable<String>, nil] lines printed by PDFtk
    # @return [Array<Hash{String => String, nil}>, nil] nil for nil input
    def self.dump_data_fields(output = nil)
      return if output.nil?

      fields = []
      field = {}
      output.each do |line|
        if line.strip == FIELD_DELIMITER
          fields << field unless field.empty?
          field = {}
        else
          key, value = parse_line(line)
          field[key] = value unless key.nil? || key.empty?
        end
      end
      fields << field unless field.empty?
      fields
    end

    # Reduces `dump_data_fields` output to a FieldName => FieldValue Hash.
    #
    # @param output [Enumerable<String>, nil] lines printed by PDFtk
    # @return [Hash{String => String, nil}, nil] nil when the input is nil or
    #   contains no fields
    def self.dump_data_fields_key_value(output = nil)
      field_array = dump_data_fields(output)
      return if field_array.nil? || field_array.empty?

      field_array.each_with_object({}) do |field, form|
        form[field['FieldName']] = field['FieldValue']
      end
    end

    # @return [Boolean] true when neither argument is nil
    def self.both_not_nil?(a = nil, b = nil)
      !a.nil? && !b.nil?
    end

    # Splits a "Key: value" line at the first colon that is followed by
    # whitespace or ends the line, so values may themselves contain ": ".
    #
    # @return [Array(String, String), Array(String, nil), Array(nil, nil)]
    #   stripped key and value; value is nil when the line has no separator
    def self.parse_line(line)
      key, value = line.chomp.split(/:(?:\s|\z)/, 2)
      [key&.strip, value&.strip]
    end
    private_class_method :parse_line
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module PdfExtractor
  # Runs the `pdftk` command-line tool and returns the lines it prints.
  class PdfTk
    # Runs `pdftk <file> dump_data_fields`.
    #
    # @param file [String, nil] path of the PDF; a leading "~" is expanded
    # @return [Array<String>, nil] output lines, nil when file is nil
    def self.dump_data_fields(file = nil)
      run(file, 'dump_data_fields')
    end

    # Runs `pdftk <file> dump_data`.
    #
    # @param file [String, nil] path of the PDF; a leading "~" is expanded
    # @return [Array<String>, nil] output lines, nil when file is nil
    def self.dump_data(file = nil)
      run(file, 'dump_data')
    end

    # Executes pdftk with the given operation and reads its standard output.
    #
    # The command is passed as an argument array, so no shell is involved:
    # paths containing spaces or shell metacharacters reach pdftk unchanged
    # and cannot inject commands. File.expand_path does the "~" expansion the
    # shell used to do. The block form closes the pipe once it has been read.
    def self.run(file, operation)
      return if file.nil?

      IO.popen(['pdftk', File.expand_path(file), operation], &:readlines)
    end
    private_class_method :run
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module PdfExtractor
  # Version string of the gem; the gemspec reads it as the gem version.
  VERSION = '0.1.1'
end
@@ -0,0 +1,36 @@
1
+ lib = File.expand_path("lib", __dir__)
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
3
+ require "pdf_extractor/version"
4
+
5
# Gem specification: metadata, packaged files and development dependencies.
Gem::Specification.new do |spec|
  spec.name = 'pdf_extractor'
  spec.version = PdfExtractor::VERSION
  spec.authors = ['Jindrich Skupa']
  spec.email = ['jindrich.skupa@gmail.com']

  spec.summary = 'PDFTk wrapper to extract form fields'
  spec.description = 'PDFTk wrapper to extract form fields'
  # NOTE(review): the README links to .../pdf_extractor (underscore) while
  # this URL uses a hyphen - confirm which repository name is correct.
  spec.homepage = 'https://github.com/jindrichskupa/pdf-extractor'
  spec.license = 'MIT'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # Files under test/, spec/ and features/ are left out of the package.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 2.0'
  spec.add_development_dependency 'minitest', '~> 5.0'
  spec.add_development_dependency 'minitest-reporters-json_reporter', '~> 1.0', '>= 1.0.0'
  spec.add_development_dependency 'nexus', '~> 1.4', '>= 1.4.0'
  spec.add_development_dependency 'rubocop', '~> 0.58', '>= 0.58.2'
  spec.add_development_dependency 'simplecov', '~> 0.16', '>= 0.16.1'
  spec.add_development_dependency 'rake', '~> 10.0'
end
@@ -0,0 +1,41 @@
1
#!/bin/bash
# Prepends a release section to CHANGELOG.MD, built from the commits made
# since the most recent git tag, and prints the version to tag next.
#
# Arguments:
#   $1  version component to bump: major, minor, patch or none
#   $2  optional suffix, appended to the new version as "-<suffix>"

case "$1" in
  major|minor|patch|none) ;;
  # An unknown type would make awk print nothing and yield an empty version.
  *) echo "missing type: major, minor, patch, none"; exit 1 ;;
esac

old_tag="$(git describe --tags --abbrev=0)"

[ -z "$old_tag" ] && { echo "missing previous tag"; exit 1; }

# Drop lowercase letters (such as a leading "v") to get the numeric version.
# The untouched tag name is kept for the git ranges below, because the
# stripped string is not a valid ref when the tag contains letters.
old_version=$(echo "$old_tag" | tr -d '[:lower:]')

new_version=$(echo "$old_version" | awk -F. -v CHANGE="$1" -v SPECIAL="$2" '
BEGIN{type=CHANGE;special=SPECIAL}
{
  if (special != "")
    special="-"special
  if (type=="patch")
    print $1"."$2"."$3+1""special ;
  else if (type == "minor")
    print $1"."$2+1".0"special ;
  else if (type == "major")
    print $1+1".0.0"special ;
  else if (type == "none")
    print $1"."$2"."$3""special
}')

change_log_header="# PDF Extractor ($new_version) RELEASED\n"
change_log_changes=$(git log "$old_tag"..HEAD --oneline --no-merges --format=' * %s' | sort | uniq)
change_log_authors=$(git log "$old_tag"..HEAD --oneline --format='* %aN' | sort -u)

[ -f CHANGELOG.MD ] || touch CHANGELOG.MD

{
  echo -e "$change_log_header"
  # printf keeps backslashes in commit subjects and author names literal.
  printf '%s\n' "$change_log_changes"
  echo -e "\nAuthors\n"
  printf '%s\n\n' "$change_log_authors"
} | cat - CHANGELOG.MD > CHANGELOG.work.MD && mv CHANGELOG.work.MD CHANGELOG.MD

echo "Check updated CHANGELOG.MD: $old_version -> $new_version"
echo "Tag this version with $new_version"
@@ -0,0 +1,17 @@
1
#!/bin/bash
# Writes the gems reported by `bundle show` to libraries.json and POSTs the
# file to the package-version service.
# Reads KRAKEN_APP_TOKEN and CI_COMMIT_TAG from the environment.

{
  echo '{'
  echo '"libraries":['
  comma=""
  # `bundle show` prints a heading line followed by "* name (version)" lines:
  # drop the parentheses and the heading, keep the name and version columns.
  # `read -r` keeps any backslash in the input literal.
  bundle show | tr -d '()' | sed 1d | awk '{print $2" "$3}' | while read -r gem version; do
    echo " $comma{\"bundle\": \"$gem\", \"version_code\": \"$version\", \"type\": \"Gem\"}"
    # Every entry after the first is preceded by a comma.
    comma=","
  done
  echo ']'
  echo '}'
} > libraries.json

curl -H "X-App-Token: ${KRAKEN_APP_TOKEN}" \
  -H "X-App-Version: ${CI_COMMIT_TAG}" \
  -H "Content-Type: application/json" \
  -X POST -d @libraries.json https://versions.eman.cz/api/packages
17
+
@@ -0,0 +1,6 @@
1
#!/bin/bash
# Builds the gem, installs it locally and uploads it to the Nexus repository.
# Reads CI_COMMIT_TAG, NEXUS_USER and NEXUS_PASSWORD from the environment.

# Stop at the first failing step so a broken build is never uploaded, and
# treat unset variables as errors.
set -euo pipefail

# NOTE(review): `gem build` names the file after the version in the gemspec,
# so this assumes the tag equals that version exactly - confirm.
gem_file="pdf_extractor-${CI_COMMIT_TAG:?CI_COMMIT_TAG must be set}.gem"

bundle install
gem build pdf_extractor.gemspec
gem install "$gem_file"
gem nexus "$gem_file" --url https://nexus.eman.cz/repository/eman-gems/ --credential "$NEXUS_USER:$NEXUS_PASSWORD"
@@ -0,0 +1,29 @@
1
#!/bin/bash
# Creates (or, when the POST answers 409, updates) the GitLab release notes
# of the tag being built from the commits since the previous tag.
# Reads CI_COMMIT_TAG, CI_PROJECT_NAME, CI_PROJECT_ID and GL_PRIVATE_TOKEN
# from the environment.

current_tag=${CI_COMMIT_TAG}
# Tag that sorts directly before the current one. -x -F matches the tag as a
# whole literal line, so "1.0.1" no longer also matches "1.0.10" or "1x0y1".
prev_tag=$(git tag | sort --version-sort | grep -B 1 -x -F -- "$current_tag" | head -1)

# For the very first tag there is no predecessor; list the whole history up
# to the tag instead of the empty range "tag..tag".
if [ -z "$prev_tag" ] || [ "$prev_tag" = "$current_tag" ]; then
  range="$current_tag"
else
  range="${prev_tag}..${current_tag}"
fi

change_log_header="# ${CI_PROJECT_NAME} (${current_tag}) RELEASED\n"
# Each entry gets a literal "\n" appended (a JSON newline escape); quotes and
# real newlines are removed so the text fits into one JSON string.
change_log_changes=$(git log "$range" --oneline --no-merges --format=' * %s' | sort | uniq | sed 's/$/\\n/' | tr -d "\"'\n")
change_log_authors=$(git log "$range" --format='* %aN\n' | sort -u | tr -d "\"'\n")

cat << EOF > release.json
{
  "tag_name": "$current_tag",
  "description": "$change_log_header\n$change_log_changes\nAuthors\n\n$change_log_authors"
}
EOF

RET=$(curl -s -o /dev/null -w "%{http_code}" -XPOST \
  -H "PRIVATE-TOKEN: $GL_PRIVATE_TOKEN" \
  -H 'Content-type: application/json' \
  -d @release.json \
  "https://gitlab.eman.cz/api/v4/projects/${CI_PROJECT_ID}/repository/tags/${CI_COMMIT_TAG}/release")

# On HTTP 409 from the POST, send the same payload again with PUT.
[ "$RET" -eq 409 ] && RET=$(curl -s -o /dev/null -w "%{http_code}" -XPUT \
  -H "PRIVATE-TOKEN: $GL_PRIVATE_TOKEN" \
  -H 'Content-type: application/json' \
  -d @release.json \
  "https://gitlab.eman.cz/api/v4/projects/${CI_PROJECT_ID}/repository/tags/${CI_COMMIT_TAG}/release")

# The script always reports success, whatever the HTTP status was.
exit 0