RubyGems - pdf-extract-meta - Versions diffs - 0.1.0 - Mend

pdf-extract-meta 0.1.0

Files changed (24) hide show

checksums.yaml +7 -0
data/.gitignore +13 -0
data/.rspec +3 -0
data/.tool-versions +1 -0
data/.travis.yml +9 -0
data/Gemfile +4 -0
data/Gemfile.lock +55 -0
data/LICENSE +21 -0
data/README.md +50 -0
data/Rakefile +6 -0
data/bin/console +7 -0
data/bin/pdf-extract +6 -0
data/bin/setup +6 -0
data/lib/pdf/extract.rb +17 -0
data/lib/pdf/extract/annotation.rb +43 -0
data/lib/pdf/extract/commands.rb +3 -0
data/lib/pdf/extract/commands/annotations.rb +16 -0
data/lib/pdf/extract/commands/fields.rb +16 -0
data/lib/pdf/extract/commands/init.rb +9 -0
data/lib/pdf/extract/document.rb +57 -0
data/lib/pdf/extract/field.rb +76 -0
data/lib/pdf/extract/reference_resolver.rb +30 -0
data/lib/pdf/extract/version.rb +5 -0
metadata +165 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 1ba7ccb34a06cf140cff5fd67f51d1791f7db824
+  data.tar.gz: 83569f79d1c7f119fdbc841c51f8b8a76feeb4f0
+SHA512:
+  metadata.gz: 8d3673261b65898ebdca05b6086858165c19d7796b82de0a08507605bd2b944733a0ac2df8637c45f07549b347a621d52bae84460d47c4945bd46ddf5564c117
+  data.tar.gz: d6edd539c7a50d77062bb752a8f2e78bad968d0a947e2cb46fdab3f2e15b637777e96bf73a12c5664737fc6897d36c6a50fdedbe173051d5a2fda3a48698d1d9

data/.gitignore ADDED Viewed

@@ -0,0 +1,13 @@
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/local/
+/spec/reports/
+/tmp/
+.byebug_history
+# rspec failure tracking
+.rspec_status

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--format documentation
+--color
+--require spec_helper

data/.tool-versions ADDED Viewed

@@ -0,0 +1 @@
+ruby 2.4.5

data/.travis.yml ADDED Viewed

@@ -0,0 +1,9 @@
+---
+sudo: false
+language: ruby
+cache: bundler
+rvm:
+  - 2.4.1
+  - 2.4.5
+  - 2.5.1
+before_install: gem install bundler -v 2.0.1

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source "https://rubygems.org"
+# Specify your gem's dependencies in pdf-extract.gemspec
+gemspec

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,55 @@
+PATH
+  remote: .
+  specs:
+    pdf-extract-meta (0.1.0)
+      commander (~> 4.4)
+      oj (~> 3.0)
+      pdf-reader (~> 2.2)
+GEM
+  remote: https://rubygems.org/
+  specs:
+    Ascii85 (1.0.3)
+    afm (0.2.2)
+    byebug (11.0.0)
+    commander (4.4.7)
+      highline (~> 2.0.0)
+    diff-lcs (1.3)
+    hashery (2.1.2)
+    highline (2.0.1)
+    oj (3.7.9)
+    pdf-reader (2.2.0)
+      Ascii85 (~> 1.0.0)
+      afm (~> 0.2.1)
+      hashery (~> 2.0)
+      ruby-rc4
+      ttfunk
+    rake (10.5.0)
+    rspec (3.8.0)
+      rspec-core (~> 3.8.0)
+      rspec-expectations (~> 3.8.0)
+      rspec-mocks (~> 3.8.0)
+    rspec-core (3.8.0)
+      rspec-support (~> 3.8.0)
+    rspec-expectations (3.8.2)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.8.0)
+    rspec-mocks (3.8.0)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.8.0)
+    rspec-support (3.8.0)
+    ruby-rc4 (0.1.5)
+    ttfunk (1.5.1)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 2.0)
+  byebug (~> 11.0)
+  pdf-extract-meta!
+  rake (~> 10.0)
+  rspec (~> 3.0)
+BUNDLED WITH
+   2.0.1

data/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2019 Access Marketing Communications LLC
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,50 @@
+# PDF::Extract
+[![Build Status](https://travis-ci.org/Scrimmage/pdf-extract-meta.svg)](https://travis-ci.org/Scrimmage/pdf-extract-meta)
+[![Code Climate](https://codeclimate.com/github/Scrimmage/pdf-extract-meta.png)](https://codeclimate.com/github/Scrimmage/pdf-extract-meta)
+This gem provides a command line interface to extract field and annotation metadata from a PDF.
+```
+bin/pdf-extract fields spec/data/field-examples/text.pdf
+[{"name":"Sample Text Field","value":"Hello"},{"name":"Sample Text Field (required)","value":null}]
+```
+```
+bin/pdf-extract annotations spec/data/annotation-examples/note.pdf
+[{"name":null,"contents":"Hello"},{"name":null,"contents":"Hello"}]
+```
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'pdf-extract-meta'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install pdf-extract-meta
+## Usage
+Run `bin/pdf-extract --help` for usage.
+From within Ruby:
+```
+Bundler.with_clean_env do
+  JSON.parse(`pdf-extract fields '#{pdf_path}'`)
+end
+```
+## Development
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version and push git commits and tags.

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+# Rake entry point: loads Bundler's gem tasks and defines the RSpec task.
+require "bundler/gem_tasks"
+require "rspec/core/rake_task"
+# Defines the `spec` task.
+RSpec::Core::RakeTask.new(:spec)
+# Running plain `rake` runs the `spec` task.
+task :default => :spec

data/bin/console ADDED Viewed

@@ -0,0 +1,7 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "pdf/extract"
+require "irb"
+IRB.start(__FILE__)

data/bin/pdf-extract ADDED Viewed

@@ -0,0 +1,6 @@
+#!/usr/bin/env ruby
+$:.push File.expand_path("../../lib", __FILE__)
+require "pdf/extract"
+require "pdf/extract/commands"

data/bin/setup ADDED Viewed

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install

data/lib/pdf/extract.rb ADDED Viewed

@@ -0,0 +1,17 @@
+$:.push File.join(File.expand_path("../", __FILE__), "extract")
+require "base64"
+require "oj"
+require "pdf-reader"
+require "pdf/extract/annotation"
+require "pdf/extract/document"
+require "pdf/extract/field"
+require "pdf/extract/reference_resolver"
+require "pdf/extract/version"
+module PDF
+  # Namespace for the pdf-extract-meta classes (Annotation, Document, Field, ReferenceResolver).
+  module Extract
+    # Error type raised by this library (ReferenceResolver#lookup raises it when reference
+    # resolution overflows the stack).
+    class Error < StandardError; end
+  end
+end

data/lib/pdf/extract/annotation.rb ADDED Viewed

@@ -0,0 +1,43 @@
+module PDF
+  module Extract
+    class Annotation
+      attr_reader :data
+      def initialize(data)
+        @data = data || {}
+      end
+      # PDF Reference 6th Edition, Version 1.7, November 2006 page 606
+      # The annotation name, a text string uniquely identifying it among all the annotations on its
+      # page.
+      def name
+        data[:NM]
+      end
+      # PDF Reference 6th Edition, Version 1.7, November 2006 page 606
+      # Text to be displayed for the annotation or, if this type of annotation does not display
+      # text, an alternate description of the annotation’s contents in human-readable form. In
+      # either case, this text is useful when extracting the document’s contents in support of
+      # accessibility to users with disabilities or for other purposes (see Section 10.8.2,
+      # “Alternate Descriptions”). See Section 8.4.5, “Annotation Types” for more details on the
+      # meaning of this entry for each annotation type.
+      def contents
+        data[:Contents]
+      end
+      def subtype
+        data[:Subtype]
+      end
+      def as_json
+        {
+          "name" => name,
+          "contents" => contents,
+          "subtype" => subtype,
+        }
+      end
+    end
+  end
+end

data/lib/pdf/extract/commands.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require "commands/init"
+require "commands/annotations"
+require "commands/fields"

data/lib/pdf/extract/commands/annotations.rb ADDED Viewed

@@ -0,0 +1,16 @@
+command :annotations do |c|
+  STDOUT.sync = true
+  c.syntax = "pdf-extract annotations <path>"
+  c.action do |args, options|
+    path = args.pop
+    say_error "Unspecified file" and abort if !path
+    pdf = PDF::Extract::Document.new(path: path)
+    data = pdf.annotations.map { |f| f.as_json }
+    puts Oj.dump(data)
+  end
+end

data/lib/pdf/extract/commands/fields.rb ADDED Viewed

@@ -0,0 +1,16 @@
+command :fields do |c|
+  STDOUT.sync = true
+  c.syntax = "pdf-extract fields <path>"
+  c.action do |args, options|
+    path = args.pop
+    say_error "Unspecified file" and abort if !path
+    pdf = PDF::Extract::Document.new(path: path)
+    data = pdf.fields.map { |f| f.as_json }
+    puts Oj.dump(data)
+  end
+end

data/lib/pdf/extract/commands/init.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require "commander/import"
+program :description, "Extract data from PDF files"
+program :help_formatter, :compact
+program :version, PDF::Extract::VERSION
+global_option("-v", "--verbose") { $verbose = true }
+default_command :extract

data/lib/pdf/extract/document.rb ADDED Viewed

@@ -0,0 +1,57 @@
+module PDF
+  module Extract
+    class Document
+      PAGE_ANNOTATIONS_KEY = :Annots
+      attr_reader :document
+      def initialize(path:)
+        @document = ::PDF::Reader.new(path)
+      end
+      def annotations
+        reference_resoler.lookup(annotation_references).map { |h|
+          Annotation.new(h)
+        }
+      end
+      def fields
+        reference_resoler.lookup(field_references).map { |h|
+          Field.new(h, reference_resoler)
+        }
+      end
+      private
+      def annotations_for_page(page)
+        page.attributes[PAGE_ANNOTATIONS_KEY]
+      end
+      def annotation_references
+        pages.map { |page| annotations_for_page(page) }.flatten.compact
+      end
+      def field_references
+        # PDF Reference 6th Edition, Version 1.7, November 2006 page 672
+        # Interactive Form Dictionary
+        ifd = objects.values.select { |x| x.respond_to?(:keys) && x.keys.include?(:Fields) }.first || {}
+        refs = ifd[:Fields]
+        refs || []
+      end
+      def objects
+        document.objects || {}
+      end
+      def pages
+        document.pages
+      end
+      def reference_resoler
+        ReferenceResolver.new(document: document)
+      end
+    end
+  end
+end

data/lib/pdf/extract/field.rb ADDED Viewed

@@ -0,0 +1,76 @@
+module PDF
+  module Extract
+    class Field
+      attr_reader :data, :reference_resoler
+      def initialize(data, reference_resoler)
+        @data = data
+        @reference_resoler = reference_resoler
+      end
+      # PDF Reference 6th Edition, Version 1.7, November 2006 page 675
+      # The partial field name
+      def name
+        data[:T]
+      end
+      # PDF Reference 6th Edition, Version 1.7, November 2006 page 675
+      # The type of field that this dictionary describes.
+      def type
+        data[:FT]
+      end
+      # PDF Reference 6th Edition, Version 1.7, November 2006 page 676
+      # The field’s value, whose format varies depending on the field type.
+      def value
+        data[:V]
+      end
+      def image
+        # PDF Reference 6th Edition, Version 1.7, November 2006 page 641
+        # MK: An appearance characteristics dictionary (see Table 8.40) to be used in constructing
+        # a dynamic appearance stream specifying the annotation’s visual presentation on the page.
+        # The name MK for this entry is of historical significance only and has no direct meaning.
+        #
+        # PDF Reference 6th Edition, Version 1.7, November 2006 page 1118
+        # Implementation Notes
+        # If the MK entry is present in the field’s widget annotation dictionary (see Table 8.39),
+        # Acrobat viewers regenerate the entire XObject appearance stream. If MK is not present,
+        # the contents of the stream outside /Tx BMC ... EMC are preserved.
+        mk = data[:MK] || {}
+        mk = mk.is_a?(PDF::Reader::Reference) ? reference_resoler.lookup(mk) : mk
+        # PDF Reference 6th Edition, Version 1.7, November 2006 page 642
+        # I: A form XObject defining the widget annotation’s normal icon, displayed when it is not
+        # interacting with the user.
+        stream = reference_resoler.lookup(mk[:I])&.hash || {}
+        # PDF Reference 6th Edition, Version 1.7, November 2006 page 358
+        # form dictionary
+        resources = reference_resoler.lookup(stream[:Resources]) || {}
+        xobject = resources[:XObject] || {}
+        stream = reference_resoler.lookup(xobject[:Im1])
+        data = stream&.data
+        data ? Base64.encode64(data) : nil
+      end
+      def as_json
+        h = {
+          "name" => name,
+          "value" => value
+        }
+        image.tap { |i| h["image"] = i if i }
+        h
+      end
+    end
+  end
+end

data/lib/pdf/extract/reference_resolver.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module PDF
+  module Extract
+    # Resolves indirect references (or arrays of them) against a document's object table.
+    class ReferenceResolver
+      attr_reader :objects
+      # document - any object responding to #objects; the returned table is indexed with #[].
+      def initialize(document:)
+        @objects = document.objects
+      end
+      # Resolves a single reference, or every element of an Array of references.
+      #
+      # Returns the resolved object for a single reference. For an Array (or a reference that
+      # resolves to an Array) returns a flattened Array of resolved objects. A direct dictionary
+      # (Hash) is returned unchanged.
+      # Raises PDF::Extract::Error when resolution recurses until the stack overflows.
+      def lookup(reference)
+        reference.is_a?(Array) ? lookup_mutiple(reference) : lookup_single(reference)
+      rescue SystemStackError
+        raise PDF::Extract::Error.new("map contains infinite recursion")
+      end
+      private
+      # Resolves each element and flattens nested result arrays into one Array.
+      def lookup_mutiple(references)
+        references.map { |ref| lookup(ref) }.flatten
+      end
+      # Resolves one value. A Hash is already a direct dictionary, not an object id, so it is
+      # passed through instead of being used as a key into the object table.
+      def lookup_single(reference)
+        return reference if reference.is_a?(Hash)
+        object = objects[reference]
+        object.is_a?(Array) ? lookup_mutiple(object) : object
+      end
+    end
+  end
+end

data/lib/pdf/extract/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module PDF
+  module Extract
+    # Version string of the gem; commands/init.rb passes it to Commander as the program version.
+    VERSION = "0.1.0"
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,165 @@
+--- !ruby/object:Gem::Specification
+name: pdf-extract-meta
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Matthew Chadwick
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2019-03-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: commander
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '4.4'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '4.4'
+- !ruby/object:Gem::Dependency
+  name: oj
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+- !ruby/object:Gem::Dependency
+  name: pdf-reader
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.2'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '11.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '11.0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.0'
+description:
+email:
+- matthew@wescrimmage.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".tool-versions"
+- ".travis.yml"
+- Gemfile
+- Gemfile.lock
+- LICENSE
+- README.md
+- Rakefile
+- bin/console
+- bin/pdf-extract
+- bin/setup
+- lib/pdf/extract.rb
+- lib/pdf/extract/annotation.rb
+- lib/pdf/extract/commands.rb
+- lib/pdf/extract/commands/annotations.rb
+- lib/pdf/extract/commands/fields.rb
+- lib/pdf/extract/commands/init.rb
+- lib/pdf/extract/document.rb
+- lib/pdf/extract/field.rb
+- lib/pdf/extract/reference_resolver.rb
+- lib/pdf/extract/version.rb
+homepage: https://github.com/Scrimmage/pdf-extract
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.6.14.3
+signing_key:
+specification_version: 4
+summary: A command line utility for extracting annotation and field metadata from
+  a PDF in JSON format.
+test_files: []