pdf-extract-meta 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1ba7ccb34a06cf140cff5fd67f51d1791f7db824
4
+ data.tar.gz: 83569f79d1c7f119fdbc841c51f8b8a76feeb4f0
5
+ SHA512:
6
+ metadata.gz: 8d3673261b65898ebdca05b6086858165c19d7796b82de0a08507605bd2b944733a0ac2df8637c45f07549b347a621d52bae84460d47c4945bd46ddf5564c117
7
+ data.tar.gz: d6edd539c7a50d77062bb752a8f2e78bad968d0a947e2cb46fdab3f2e15b637777e96bf73a12c5664737fc6897d36c6a50fdedbe173051d5a2fda3a48698d1d9
data/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/local/
8
+ /spec/reports/
9
+ /tmp/
10
+
11
+ .byebug_history
12
+ # rspec failure tracking
13
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.tool-versions ADDED
@@ -0,0 +1 @@
1
+ ruby 2.4.5
data/.travis.yml ADDED
@@ -0,0 +1,9 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.4.1
7
+ - 2.4.5
8
+ - 2.5.1
9
+ before_install: gem install bundler -v 2.0.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in pdf-extract.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,55 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ pdf-extract-meta (0.1.0)
5
+ commander (~> 4.4)
6
+ oj (~> 3.0)
7
+ pdf-reader (~> 2.2)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ Ascii85 (1.0.3)
13
+ afm (0.2.2)
14
+ byebug (11.0.0)
15
+ commander (4.4.7)
16
+ highline (~> 2.0.0)
17
+ diff-lcs (1.3)
18
+ hashery (2.1.2)
19
+ highline (2.0.1)
20
+ oj (3.7.9)
21
+ pdf-reader (2.2.0)
22
+ Ascii85 (~> 1.0.0)
23
+ afm (~> 0.2.1)
24
+ hashery (~> 2.0)
25
+ ruby-rc4
26
+ ttfunk
27
+ rake (10.5.0)
28
+ rspec (3.8.0)
29
+ rspec-core (~> 3.8.0)
30
+ rspec-expectations (~> 3.8.0)
31
+ rspec-mocks (~> 3.8.0)
32
+ rspec-core (3.8.0)
33
+ rspec-support (~> 3.8.0)
34
+ rspec-expectations (3.8.2)
35
+ diff-lcs (>= 1.2.0, < 2.0)
36
+ rspec-support (~> 3.8.0)
37
+ rspec-mocks (3.8.0)
38
+ diff-lcs (>= 1.2.0, < 2.0)
39
+ rspec-support (~> 3.8.0)
40
+ rspec-support (3.8.0)
41
+ ruby-rc4 (0.1.5)
42
+ ttfunk (1.5.1)
43
+
44
+ PLATFORMS
45
+ ruby
46
+
47
+ DEPENDENCIES
48
+ bundler (~> 2.0)
49
+ byebug (~> 11.0)
50
+ pdf-extract-meta!
51
+ rake (~> 10.0)
52
+ rspec (~> 3.0)
53
+
54
+ BUNDLED WITH
55
+ 2.0.1
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2019 Access Marketing Communications LLC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # PDF::Extract
2
+
3
+ [![Build Status](https://travis-ci.org/Scrimmage/pdf-extract-meta.svg)](https://travis-ci.org/Scrimmage/pdf-extract-meta)
4
+
5
+ [![Code Climate](https://codeclimate.com/github/Scrimmage/pdf-extract-meta.png)](https://codeclimate.com/github/Scrimmage/pdf-extract-meta)
6
+
7
+ This gem provides a command line interface to extract field and annotation metadata from a PDF.
8
+
9
+ ```
10
+ bin/pdf-extract fields spec/data/field-examples/text.pdf
11
+ [{"name":"Sample Text Field","value":"Hello"},{"name":"Sample Text Field (required)","value":null}]
12
+ ```
13
+
14
+ ```
15
+ bin/pdf-extract annotations spec/data/annotation-examples/note.pdf
16
+ [{"name":null,"contents":"Hello"},{"name":null,"contents":"Hello"}]
17
+ ```
18
+
19
+ ## Installation
20
+
21
+ Add this line to your application's Gemfile:
22
+
23
+ ```ruby
24
+ gem 'pdf-extract-meta'
25
+ ```
26
+
27
+ And then execute:
28
+
29
+ $ bundle
30
+
31
+ Or install it yourself as:
32
+
33
+ $ gem install pdf-extract-meta
34
+
35
+ ## Usage
36
+
37
+ Run `bin/pdf-extract --help` for usage.
38
+
39
+ From within Ruby:
40
+ ```ruby
41
+ Bundler.with_clean_env do
42
+ JSON.parse(`pdf-extract fields '#{pdf_path}'`)
43
+ end
44
+ ```
45
+
46
+ ## Development
47
+
48
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
49
+
50
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version and push git commits and tags.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Defines the `spec` task that runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the specs.
task default: :spec
data/bin/console ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "pdf/extract"
5
+ require "irb"
6
+
7
+ IRB.start(__FILE__)
data/bin/pdf-extract ADDED
@@ -0,0 +1,6 @@
#!/usr/bin/env ruby

# Append the sibling lib/ directory to the load path so the requires below
# resolve relative to this script's location.
$LOAD_PATH.push(File.expand_path("../../lib", __FILE__))

require "pdf/extract"
require "pdf/extract/commands"
data/bin/setup ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
@@ -0,0 +1,17 @@
1
+ $:.push File.join(File.expand_path("../", __FILE__), "extract")
2
+
3
+ require "base64"
4
+ require "oj"
5
+ require "pdf-reader"
6
+
7
+ require "pdf/extract/annotation"
8
+ require "pdf/extract/document"
9
+ require "pdf/extract/field"
10
+ require "pdf/extract/reference_resolver"
11
+ require "pdf/extract/version"
12
+
module PDF
  module Extract
    # Base error class for failures raised by this library.
    class Error < StandardError
    end
  end
end
@@ -0,0 +1,43 @@
module PDF
  module Extract
    # Read-only view over one PDF annotation dictionary.
    class Annotation
      # Keys emitted by #as_json, in output order. Each one is also the name
      # of the reader method that supplies its value.
      JSON_KEYS = %w[name contents subtype].freeze

      attr_reader :data

      # @param data [Hash, nil] the annotation dictionary; nil becomes an
      #   empty hash so every reader simply returns nil.
      def initialize(data)
        @data = data ? data : {}
      end

      # PDF Reference 6th Edition, Version 1.7, November 2006 page 606
      # NM: the annotation name, a text string uniquely identifying it among
      # all the annotations on its page.
      def name
        data[:NM]
      end

      # PDF Reference 6th Edition, Version 1.7, November 2006 page 606
      # Contents: text to be displayed for the annotation or, if this type of
      # annotation does not display text, an alternate human-readable
      # description of its contents (see Section 10.8.2, "Alternate
      # Descriptions", and Section 8.4.5, "Annotation Types").
      def contents
        data[:Contents]
      end

      # The value stored under the dictionary's :Subtype key.
      def subtype
        data[:Subtype]
      end

      # @return [Hash{String => Object}] "name", "contents" and "subtype"
      #   mapped to the corresponding reader values.
      def as_json
        JSON_KEYS.each_with_object({}) do |key, json|
          json[key] = public_send(key)
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
+ require "commands/init"
2
+ require "commands/annotations"
3
+ require "commands/fields"
@@ -0,0 +1,16 @@
# Registers the `annotations` sub-command: prints the annotations of the PDF
# at <path> as a JSON array on standard output.
command :annotations do |c|
  STDOUT.sync = true

  c.syntax = "pdf-extract annotations <path>"

  c.action do |args, _options|
    path = args.pop

    # Abort explicitly instead of chaining `say_error ... and abort`, which
    # only aborts when say_error happens to return a truthy value.
    unless path
      say_error "Unspecified file"
      abort
    end

    pdf = PDF::Extract::Document.new(path: path)
    puts Oj.dump(pdf.annotations.map(&:as_json))
  end
end
@@ -0,0 +1,16 @@
# Registers the `fields` sub-command: prints the form fields of the PDF at
# <path> as a JSON array on standard output.
command :fields do |c|
  STDOUT.sync = true

  c.syntax = "pdf-extract fields <path>"

  c.action do |args, _options|
    path = args.pop

    # Abort explicitly instead of chaining `say_error ... and abort`, which
    # only aborts when say_error happens to return a truthy value.
    unless path
      say_error "Unspecified file"
      abort
    end

    pdf = PDF::Extract::Document.new(path: path)
    puts Oj.dump(pdf.fields.map(&:as_json))
  end
end
@@ -0,0 +1,9 @@
require "commander/import"

# Program-level metadata for the command line interface.
program :description, "Extract data from PDF files"
program :help_formatter, :compact
program :version, PDF::Extract::VERSION

# -v / --verbose sets the $verbose global.
global_option("-v", "--verbose") { $verbose = true }

# The only commands this gem registers are `annotations` and `fields`; there
# is no `extract` command, so fall back to the built-in help when no command
# is given.
default_command :help
@@ -0,0 +1,57 @@
module PDF
  module Extract
    # Entry point for reading annotation and form-field metadata from a PDF.
    # Wraps a PDF::Reader instance and turns the dictionaries it resolves
    # into Annotation and Field objects.
    class Document
      # Key of the page attribute that holds the page's annotations.
      PAGE_ANNOTATIONS_KEY = :Annots

      attr_reader :document

      # @param path passed unchanged to PDF::Reader.new
      def initialize(path:)
        @document = ::PDF::Reader.new(path)
      end

      # @return [Array<Annotation>] one Annotation per resolved entry of every
      #   page's :Annots attribute.
      def annotations
        reference_resoler.lookup(annotation_references).map do |attributes|
          Annotation.new(attributes)
        end
      end

      # @return [Array<Field>] one Field per resolved entry of the interactive
      #   form dictionary's :Fields list.
      def fields
        reference_resoler.lookup(field_references).map do |attributes|
          Field.new(attributes, reference_resoler)
        end
      end

      private

      def annotations_for_page(page)
        page.attributes[PAGE_ANNOTATIONS_KEY]
      end

      # Annotation entries of all pages as one flat list; pages without an
      # :Annots attribute contribute nothing.
      def annotation_references
        pages.map { |page| annotations_for_page(page) }.flatten.compact
      end

      # PDF Reference 6th Edition, Version 1.7, November 2006 page 672
      # Interactive Form Dictionary: taken to be the first object in the
      # document that has a :Fields key.
      def field_references
        form = objects.values.find do |object|
          object.respond_to?(:keys) && object.keys.include?(:Fields)
        end || {}

        form[:Fields] || []
      end

      def objects
        document.objects || {}
      end

      def pages
        document.pages
      end

      # Memoized so that all fields share one resolver instead of a new one
      # being built on every call.
      def reference_resoler
        @reference_resoler ||= ReferenceResolver.new(document: document)
      end
    end
  end
end
@@ -0,0 +1,76 @@
module PDF
  module Extract
    # Read-only view over one interactive form field dictionary.
    class Field
      attr_reader :data, :reference_resoler

      # @param data [Hash, nil] the field dictionary; nil becomes an empty
      #   hash so the readers return nil instead of raising.
      # @param reference_resoler [#lookup] resolves PDF::Reader::Reference
      #   values to the objects they point at.
      def initialize(data, reference_resoler)
        @data = data || {}
        @reference_resoler = reference_resoler
      end

      # PDF Reference 6th Edition, Version 1.7, November 2006 page 675
      # The partial field name
      def name
        data[:T]
      end

      # PDF Reference 6th Edition, Version 1.7, November 2006 page 675
      # The type of field that this dictionary describes.
      def type
        data[:FT]
      end

      # PDF Reference 6th Edition, Version 1.7, November 2006 page 676
      # The field's value, whose format varies depending on the field type.
      def value
        data[:V]
      end

      # Base64 encoding of the data of the image stream found by following
      # MK -> I -> Resources -> XObject -> Im1, or nil when any link of that
      # chain is missing.
      #
      # Each link may be stored inline or as an indirect reference; only
      # references are handed to the resolver.
      #
      # @return [String, nil]
      def image
        # PDF Reference 6th Edition, Version 1.7, November 2006 page 641
        # MK: An appearance characteristics dictionary (see Table 8.40) to be
        # used in constructing a dynamic appearance stream specifying the
        # annotation's visual presentation on the page.
        #
        # PDF Reference 6th Edition, Version 1.7, November 2006 page 1118
        # Implementation Notes: if the MK entry is present in the field's
        # widget annotation dictionary (see Table 8.39), Acrobat viewers
        # regenerate the entire XObject appearance stream. If MK is not
        # present, the contents of the stream outside /Tx BMC ... EMC are
        # preserved.
        appearance = resolve(data[:MK]) || {}

        # PDF Reference 6th Edition, Version 1.7, November 2006 page 642
        # I: A form XObject defining the widget annotation's normal icon,
        # displayed when it is not interacting with the user.
        icon = resolve(appearance[:I])

        # Every Ruby object responds to #hash (Object#hash returns an
        # Integer), so only read the dictionary off an actual stream.
        icon_dictionary = icon.is_a?(PDF::Reader::Stream) ? icon.hash : {}

        # PDF Reference 6th Edition, Version 1.7, November 2006 page 358
        # form dictionary
        resources = resolve(icon_dictionary[:Resources]) || {}
        xobjects = resolve(resources[:XObject]) || {}
        image_stream = resolve(xobjects[:Im1])

        raw = image_stream.data if image_stream.respond_to?(:data)
        raw ? Base64.encode64(raw) : nil
      end

      # @return [Hash{String => Object}] "name" and "value", plus "image"
      #   when #image returns a value.
      def as_json
        json = {
          "name" => name,
          "value" => value
        }

        encoded_image = image
        json["image"] = encoded_image if encoded_image

        json
      end

      private

      # Dereferences value when it is an indirect reference and returns it
      # unchanged otherwise (inline object or nil).
      def resolve(value)
        value.is_a?(PDF::Reader::Reference) ? reference_resoler.lookup(value) : value
      end
    end
  end
end
@@ -0,0 +1,30 @@
module PDF
  module Extract
    # Resolves references against a document's object table, following
    # arrays of references recursively.
    class ReferenceResolver
      attr_reader :objects

      # @param document [#objects] source of the object table used for lookups
      def initialize(document:)
        @objects = document.objects
      end

      # Resolves one reference or an array of them.
      #
      # @param reference a single reference, an inline dictionary (Hash), or
      #   an Array of either
      # @return the resolved object for a single reference; a flat Array of
      #   resolved objects when an Array is given or when the reference
      #   resolves to an Array
      # @raise [PDF::Extract::Error] when resolution recurses until the stack
      #   overflows
      def lookup(reference)
        reference.is_a?(Array) ? lookup_multiple(reference) : lookup_single(reference)
      rescue SystemStackError
        raise Error, "map contains infinite recursion"
      end

      private

      def lookup_multiple(references)
        references.map { |ref| lookup(ref) }.flatten
      end

      def lookup_single(reference)
        # An inline dictionary is already the object itself; return it rather
        # than using it as a key into the object table.
        return reference if reference.is_a?(Hash)

        object = objects[reference]
        object.is_a?(Array) ? lookup_multiple(object) : object
      end
    end
  end
end
@@ -0,0 +1,5 @@
module PDF
  module Extract
    # Version string of this gem.
    VERSION = "0.1.0"
  end
end
metadata ADDED
@@ -0,0 +1,165 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pdf-extract-meta
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew Chadwick
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-03-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: commander
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.4'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.4'
27
+ - !ruby/object:Gem::Dependency
28
+ name: oj
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pdf-reader
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.2'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2.2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: bundler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: byebug
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '11.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '11.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '3.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '3.0'
111
+ description:
112
+ email:
113
+ - matthew@wescrimmage.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".gitignore"
119
+ - ".rspec"
120
+ - ".tool-versions"
121
+ - ".travis.yml"
122
+ - Gemfile
123
+ - Gemfile.lock
124
+ - LICENSE
125
+ - README.md
126
+ - Rakefile
127
+ - bin/console
128
+ - bin/pdf-extract
129
+ - bin/setup
130
+ - lib/pdf/extract.rb
131
+ - lib/pdf/extract/annotation.rb
132
+ - lib/pdf/extract/commands.rb
133
+ - lib/pdf/extract/commands/annotations.rb
134
+ - lib/pdf/extract/commands/fields.rb
135
+ - lib/pdf/extract/commands/init.rb
136
+ - lib/pdf/extract/document.rb
137
+ - lib/pdf/extract/field.rb
138
+ - lib/pdf/extract/reference_resolver.rb
139
+ - lib/pdf/extract/version.rb
140
+ homepage: https://github.com/Scrimmage/pdf-extract
141
+ licenses:
142
+ - MIT
143
+ metadata: {}
144
+ post_install_message:
145
+ rdoc_options: []
146
+ require_paths:
147
+ - lib
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ required_rubygems_version: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - ">="
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ requirements: []
159
+ rubyforge_project:
160
+ rubygems_version: 2.6.14.3
161
+ signing_key:
162
+ specification_version: 4
163
+ summary: A command line utility for extracting annotation and field metadata from
164
+ a PDF in JSON format.
165
+ test_files: []