RubyGems - fireinc-pdf-reader - Versions diffs - 0.11.0.alpha - Mend

fireinc-pdf-reader 0.11.0.alpha

Files changed (54) hide show

data/CHANGELOG +168 -0
data/MIT-LICENSE +21 -0
data/README.rdoc +137 -0
data/Rakefile +34 -0
data/TODO +45 -0
data/bin/pdf_list_callbacks +15 -0
data/bin/pdf_object +48 -0
data/bin/pdf_text +15 -0
data/examples/callbacks.rb +21 -0
data/examples/extract_bates.rb +49 -0
data/examples/extract_images.rb +108 -0
data/examples/hash.rb +12 -0
data/examples/metadata.rb +25 -0
data/examples/page_counter_improved.rb +23 -0
data/examples/page_counter_naive.rb +24 -0
data/examples/rspec.rb +57 -0
data/examples/text.rb +40 -0
data/examples/version.rb +25 -0
data/lib/pdf/hash.rb +15 -0
data/lib/pdf/reader/abstract_strategy.rb +81 -0
data/lib/pdf/reader/buffer.rb +346 -0
data/lib/pdf/reader/cmap.rb +138 -0
data/lib/pdf/reader/encoding.rb +190 -0
data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
data/lib/pdf/reader/encodings/standard.txt +47 -0
data/lib/pdf/reader/encodings/symbol.txt +154 -0
data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
data/lib/pdf/reader/error.rb +53 -0
data/lib/pdf/reader/filter.rb +219 -0
data/lib/pdf/reader/font.rb +133 -0
data/lib/pdf/reader/form_xobject.rb +83 -0
data/lib/pdf/reader/glyphlist.txt +4322 -0
data/lib/pdf/reader/lzw.rb +123 -0
data/lib/pdf/reader/metadata_strategy.rb +56 -0
data/lib/pdf/reader/object_cache.rb +85 -0
data/lib/pdf/reader/object_hash.rb +289 -0
data/lib/pdf/reader/object_stream.rb +51 -0
data/lib/pdf/reader/page.rb +185 -0
data/lib/pdf/reader/page_text_receiver.rb +278 -0
data/lib/pdf/reader/pages_strategy.rb +475 -0
data/lib/pdf/reader/parser.rb +225 -0
data/lib/pdf/reader/print_receiver.rb +18 -0
data/lib/pdf/reader/reference.rb +66 -0
data/lib/pdf/reader/register_receiver.rb +95 -0
data/lib/pdf/reader/stream.rb +69 -0
data/lib/pdf/reader/text_receiver.rb +264 -0
data/lib/pdf/reader/token.rb +41 -0
data/lib/pdf/reader/xref.rb +220 -0
data/lib/pdf/reader.rb +296 -0
data/lib/pdf-reader.rb +1 -0
metadata +211 -0

data/lib/pdf/reader/xref.rb ADDED Viewed

@@ -0,0 +1,220 @@
+################################################################################
+#
+# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+class PDF::Reader
+  ################################################################################
+  # An internal PDF::Reader class that represents the XRef table in a PDF file as a
+  # hash-like object.
+  #
+  # An Xref table is a map of object identifiers and byte offsets. Any time a particular
+  # object needs to be found, the Xref table is used to find where it is stored in the
+  # file.
+  #
+  # Hash keys are object ids, values are either:
+  #
+  # * a byte offset where the object starts (regular PDF objects)
+  # * a PDF::Reader::Reference instance that points to a stream that contains the
+  #   desired object (PDF objects embedded in an object stream)
+  #
+  # The class behaves much like a standard Ruby hash, including the use of
+  # the Enumerable mixin. The key difference is no []= method - the hash
+  # is read only.
+  #
+  class XRef
+    include Enumerable
+    attr_reader :trailer
+    ################################################################################
+    # create a new Xref table based on the contents of the supplied io object
+    #
+    # io - must be an IO object, generally either a file or a StringIO
+    #
+    def initialize (io) # io is kept in @io; every buffer built by new_buffer wraps it
+      @io = io
+      @xref = {} # object id => { generation => byte offset or Reference }, filled by store
+      @trailer = load_offsets # loading the offsets also returns the trailer, exposed via attr_reader
+    end
+    ################################################################################
+    # return the number of objects in this file. Objects with multiple generations are
+    # only counted once.
+    def size # number of distinct object ids held in @xref
+      @xref.size
+    end
+    ################################################################################
+    # returns the byte offset for the specified PDF object.
+    #
+    # ref - a PDF::Reader::Reference object containing an object ID and revision number
+    def [](ref) # ref must respond to #id and #gen
+      @xref[ref.id][ref.gen] # nil when the id is known but this generation is not
+    rescue # bare rescue: any StandardError (e.g. unknown id, so nil[...]) becomes InvalidObjectError
+      raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
+    end
+    ################################################################################
+    # iterate over each object in the xref table
+    def each(&block) # the block is invoked through yield; the &block argument itself is unused
+      ids = @xref.keys.sort # visit object ids in ascending order
+      ids.each do |id|
+        gen = @xref[id].keys.sort[-1] # highest generation number recorded for this id
+        yield PDF::Reader::Reference.new(id, gen)
+      end
+    end
+    ################################################################################
+    private
+    ################################################################################
+    # Read a xref table from the underlying buffer.
+    #
+    # If offset is specified the table will be loaded from there, otherwise the
+    # default offset will be located and used.
+    #
+    # After seeking to the offset, processing is handed off to either load_xref_table()
+    # or load_xref_stream() based on what we find there.
+    #
+    def load_offsets(offset = nil) # returns the trailer produced by whichever loader runs
+      offset ||= new_buffer.find_first_xref_offset
+      buf = new_buffer(offset)
+      tok_one = buf.token
+      return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref" # "ref" is accepted as well as "xref"
+      tok_two   = buf.token
+      tok_three = buf.token
+      if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj" # looks like an "<id> <gen> obj" header
+        buf = new_buffer(offset) # fresh buffer at the same offset; presumably so the parser sees the three tokens read above
+        stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
+        return load_xref_stream(stream)
+      end
+      raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
+    end
+    ################################################################################
+    # Assumes the underlying buffer is positioned at the start of a traditional
+    # Xref table and processes it into memory.
+    def load_xref_table(buf)
+      params = []
+      while !params.include?("trailer") && !params.include?(nil)
+        if params.size == 2
+          objid, count = params[0].to_i, params[1].to_i
+          count.times do
+            offset = buf.token.to_i
+            generation = buf.token.to_i
+            state = buf.token
+            store(objid, generation, offset) if state == "n"
+            objid += 1
+            params.clear
+          end
+        end
+        params << buf.token
+      end
+      trailer = Parser.new(buf, self).parse_token
+      raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
+      load_offsets(trailer[:XRefStm])   if trailer.has_key?(:XRefStm)
+      load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
+      trailer
+    end
+    ################################################################################
+    # Read an XRef stream from the underlying buffer instead of a traditional xref table.
+    #
+    def load_xref_stream(stream)
+      unless stream.hash[:Type] == :XRef
+        raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
+      end
+      trailer = Hash[stream.hash.select { |key, value|
+        [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
+      }]
+      widths       = stream.hash[:W]
+      entry_length = widths.inject(0) { |s, w| s + w }
+      raw_data     = StringIO.new(stream.unfiltered_data)
+      if stream.hash[:Index]
+        index = stream.hash[:Index]
+      else
+        index = [0, stream.hash[:Size]]
+      end
+      index.each_slice(2) do |start_id, size|
+        obj_ids = (start_id..(start_id+(size-1)))
+        obj_ids.each do |objid|
+          entry = raw_data.read(entry_length) || ""
+          f1    = unpack_bytes(entry[0,widths[0]])
+          f2    = unpack_bytes(entry[widths[0],widths[1]])
+          f3    = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
+          if f1 == 1 && f2 > 0
+            store(objid, f3, f2)
+          elsif f1 == 2 && f2 > 0
+            store(objid, 0, PDF::Reader::Reference.new(f2, 0))
+          end
+        end
+      end
+      load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
+      trailer
+    end
+    ################################################################################
+    # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
+    # bytes they need to be converted to an int in different ways.
+    #
+    def unpack_bytes(bytes) # bytes may be nil or an empty string; both give 0
+      if bytes.to_s.size == 0
+        0
+      elsif bytes.size == 1
+        bytes.unpack("C")[0] # 8-bit unsigned
+      elsif bytes.size == 2
+        bytes.unpack("n")[0] # 16-bit unsigned, big-endian
+      elsif bytes.size == 3
+        ("\x00" + bytes).unpack("N")[0] # pad to 4 bytes so the 32-bit directive can be used
+      elsif bytes.size == 4
+        bytes.unpack("N")[0] # 32-bit unsigned, big-endian
+      else
+        raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
+      end
+    end
+    ################################################################################
+    # Wrap the io stream we're working with in a buffer that can tokenise it for us.
+    #
+    # We create multiple buffers so we can be tokenising multiple sections of the file
+    # at the same time without worrying about clearing the buffer's contents.
+    #
+    def new_buffer(offset = 0) # every call builds a separate Buffer over the same @io
+      PDF::Reader::Buffer.new(@io, :seek => offset)
+    end
+    ################################################################################
+    # Stores an offset value for a particular PDF object ID and revision number
+    #
+    def store (id, gen, offset) # offset may also be a Reference (load_xref_stream passes one)
+      (@xref[id] ||= {})[gen] ||= offset # ||= keeps the first value stored for an (id, gen) pair
+    end
+  end
+  ################################################################################
+end
+################################################################################

data/lib/pdf/reader.rb ADDED Viewed

@@ -0,0 +1,296 @@
+################################################################################
+#
+# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+# Copyright (C) 2011 James Healy
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+require 'stringio'
+require 'zlib'
+require 'ascii85'
+module PDF
+  ################################################################################
+  # The Reader class serves as an entry point for parsing a PDF file.
+  #
+  # PDF is a page based file format. There is some data associated with the
+  # document (metadata, bookmarks, etc) but all visible content is stored
+  # under a Page object.
+  #
+  # In most use cases for extracting and examining the contents of a PDF it
+  # makes sense to traverse the information using page based iteration.
+  #
+  # In addition to the documentation here, check out the
+  # PDF::Reader::Page class.
+  #
+  # == File Metadata
+  #
+  #   reader = PDF::Reader.new("somefile.pdf")
+  #
+  #   puts reader.pdf_version
+  #   puts reader.info
+  #   puts reader.metadata
+  #   puts reader.page_count
+  #
+  # == Iterating over page content
+  #
+  #   reader = PDF::Reader.new("somefile.pdf")
+  #
+  #   reader.pages.each do |page|
+  #     puts page.fonts
+  #     puts page.images
+  #     puts page.text
+  #   end
+  #
+  # == Extracting all text
+  #
+  #   reader = PDF::Reader.new("somefile.pdf")
+  #
+  #   reader.pages.map(&:text)
+  #
+  # == Extracting content from a single page
+  #
+  #   reader = PDF::Reader.new("somefile.pdf")
+  #
+  #   page = reader.page(1)
+  #   puts page.fonts
+  #   puts page.images
+  #   puts page.text
+  #
+  # == Low level callbacks (ala current version of PDF::Reader)
+  #
+  #   reader = PDF::Reader.new("somefile.pdf")
+  #
+  #   page = reader.page(1)
+  #   page.walk(receiver)
+  #
+  class Reader
+    # lowlevel hash-like access to all objects in the underlying PDF
+    attr_reader :objects
+    attr_reader :page_count, :pdf_version, :info, :metadata
+    # creates a new document reader for the provided PDF.
+    #
+    # input can be an IO-ish object (StringIO, File, etc) containing a PDF
+    # or a filename
+    #
+    #   reader = PDF::Reader.new("somefile.pdf")
+    #
+    #   File.open("somefile.pdf","rb") do |file|
+    #     reader = PDF::Reader.new(file)
+    #   end
+    #
+    def initialize(input = nil) # with no input nothing is loaded and the attr_readers stay nil
+      if input # support the deprecated Reader API
+        @objects = PDF::Reader::ObjectHash.new(input)
+        @page_count  = get_page_count
+        @pdf_version = @objects.pdf_version
+        @info        = @objects.deref(@objects.trailer[:Info])
+        @metadata    = get_metadata
+      end
+    end
+    # syntactic sugar for opening a PDF file. Accepts the same arguments
+    # as new().
+    #
+    #   PDF::Reader.open("somefile.pdf") do |reader|
+    #     puts reader.pdf_version
+    #   end
+    #
+    def self.open(input, &block) # the block is called through yield and its result is returned
+      yield PDF::Reader.new(input)
+    end
+    # DEPRECATED: this method was deprecated in version 0.11.0 and will
+    #             eventually be removed
+    #
+    #
+    # Parse the file with the given name, sending events to the given receiver.
+    #
+    def self.file(name, receivers, opts = {}) # returns whatever parse returns
+      File.open(name,"rb") do |f| # binary mode; the block form closes the file afterwards
+        new.parse(f, receivers, opts)
+      end
+    end
+    # DEPRECATED: this method was deprecated in version 0.11.0 and will
+    #             eventually be removed
+    #
+    # Parse the given string, sending events to the given receiver.
+    #
+    def self.string(str, receivers, opts = {}) # returns whatever parse returns
+      StringIO.open(str) do |s| # wraps str so parse can treat it as an IO
+        new.parse(s, receivers, opts)
+      end
+    end
+    # DEPRECATED: this method was deprecated in version 0.11.0 and will
+    #             eventually be removed
+    #
+    # Parse the file with the given name, returning an unmarshalled ruby version of
+    # the requested pdf object
+    #
+    def self.object_file(name, id, gen = 0) # id and gen are coerced with to_i
+      File.open(name,"rb") { |f| # binary mode; the block form closes the file afterwards
+        new.object(f, id.to_i, gen.to_i)
+      }
+    end
+    # DEPRECATED: this method was deprecated in version 0.11.0 and will
+    #             eventually be removed
+    #
+    # Parse the given string, returning an unmarshalled ruby version of
+    # the requested pdf object
+    #
+    def self.object_string(str, id, gen = 0) # id and gen are coerced with to_i
+      StringIO.open(str) { |s| # wraps str so object can treat it as an IO
+        new.object(s, id.to_i, gen.to_i)
+      }
+    end
+    # returns an array of PDF::Reader::Page objects, one for each
+    # page in the source PDF.
+    #
+    #   reader = PDF::Reader.new("somefile.pdf")
+    #
+    #   reader.pages.each do |page|
+    #     puts page.fonts
+    #     puts page.images
+    #     puts page.text
+    #   end
+    #
+    # See the docs for PDF::Reader::Page to read more about the
+    # methods available on each page
+    #
+    def pages # builds new Page objects on every call; nothing is cached here
+      (1..@page_count).map { |num| # page numbers start at 1
+        PDF::Reader::Page.new(@objects, num)
+      }
+    end
+    # returns a single PDF::Reader::Page for the specified page.
+    # Use this instead of pages method when you need to access just a single
+    # page
+    #
+    #   reader = PDF::Reader.new("somefile.pdf")
+    #   page   = reader.page(10)
+    #
+    #   puts page.text
+    #
+    # See the docs for PDF::Reader::Page to read more about the
+    # methods available on each page
+    #
+    def page(num) # num is coerced with to_i before the range check
+      num = num.to_i
+      raise ArgumentError, "valid pages are 1 .. #{@page_count}" if num < 1 || num > @page_count
+      PDF::Reader::Page.new(@objects, num)
+    end
+    # DEPRECATED: this method was deprecated in version 0.11.0 and will
+    #             eventually be removed
+    #
+    # Given an IO object that contains PDF data, parse it.
+    #
+    def parse(io, receivers, opts = {}) # returns self
+      ohash    = ObjectHash.new(io)
+      if ohash.trailer[:Encrypt] # an :Encrypt entry in the trailer is rejected up front
+        raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
+      end
+      options = {:pages => true, :raw_text => false, :metadata => true} # defaults
+      options.merge!(opts) # caller-supplied opts override the defaults
+      strategies.each do |s|
+        s.new(ohash, receivers, options).process
+      end
+      self
+    end
+    # DEPRECATED: this method was deprecated in version 0.11.0 and will
+    #             eventually be removed
+    #
+    # Given an IO object that contains PDF data, return the contents of a single object
+    #
+    def object (io, id, gen) # replaces @objects with a new ObjectHash built from io
+      @objects = ObjectHash.new(io)
+      @objects.deref(Reference.new(id, gen))
+    end
+    private
+    def strategies # strategy classes in the order parse runs them; memoised in @strategies
+      @strategies ||= [
+        ::PDF::Reader::MetadataStrategy,
+        ::PDF::Reader::PagesStrategy
+      ]
+    end
+    def root
+      root ||= @objects.deref(@objects.trailer[:Root])
+    end
+    def get_metadata # nil when dereferencing root[:Metadata] yields nothing
+      stream = @objects.deref(root[:Metadata])
+      stream ? stream.unfiltered_data : nil
+    end
+    def get_page_count # the :Count entry of the dereferenced root[:Pages]
+      pages = @objects.deref(root[:Pages])
+      pages[:Count]
+    end
+  end
+end
+################################################################################
+require 'pdf/reader/abstract_strategy'
+require 'pdf/reader/buffer'
+require 'pdf/reader/cmap'
+require 'pdf/reader/encoding'
+require 'pdf/reader/error'
+require 'pdf/reader/filter'
+require 'pdf/reader/font'
+require 'pdf/reader/form_xobject'
+require 'pdf/reader/lzw'
+require 'pdf/reader/metadata_strategy'
+require 'pdf/reader/object_cache'
+require 'pdf/reader/object_hash'
+require 'pdf/reader/object_stream'
+require 'pdf/reader/pages_strategy'
+require 'pdf/reader/parser'
+require 'pdf/reader/print_receiver'
+require 'pdf/reader/reference'
+require 'pdf/reader/register_receiver'
+require 'pdf/reader/stream'
+require 'pdf/reader/text_receiver'
+require 'pdf/reader/page_text_receiver'
+require 'pdf/reader/token'
+require 'pdf/reader/xref'
+require 'pdf/reader/page'
+require 'pdf/hash'

data/lib/pdf-reader.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require "pdf/reader"