RubyGems - pdf-reader - Versions diffs - 0.10.1 → 0.11.0.alpha - Mend

pdf-reader 0.10.1 → 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

data/CHANGELOG +1 -4
data/README.rdoc +30 -21
data/bin/pdf_text +5 -35
data/examples/callbacks.rb +9 -4
data/examples/extract_bates.rb +15 -29
data/lib/pdf/reader.rb +150 -37
data/lib/pdf/reader/abstract_strategy.rb +2 -0
data/lib/pdf/reader/buffer.rb +12 -13
data/lib/pdf/reader/font.rb +56 -0
data/lib/pdf/reader/glyphlist.txt +40 -1
data/lib/pdf/reader/metadata_strategy.rb +3 -0
data/lib/pdf/reader/object_cache.rb +85 -0
data/lib/pdf/reader/object_hash.rb +19 -5
data/lib/pdf/reader/page.rb +172 -0
data/lib/pdf/reader/page_text_receiver.rb +253 -0
data/lib/pdf/reader/pages_strategy.rb +3 -11
data/lib/pdf/reader/text_receiver.rb +3 -0
data/lib/pdf/reader/xref.rb +3 -4
metadata +41 -35

data/lib/pdf/reader/abstract_strategy.rb CHANGED

@@ -2,6 +2,8 @@
 class PDF::Reader
+  # DEPRECATED: this class was deprecated in version 0.11.0 and will
+  #             eventually be removed
   class AbstractStrategy # :nodoc:
     def initialize(ohash, receivers, options = {})

data/lib/pdf/reader/buffer.rb CHANGED

@@ -74,24 +74,23 @@ class PDF::Reader
     #
     # options:
     #
-    #   :skip_eol - if true, the IO stream is advanced past any LF or CR
-    #               bytes before it reads any data. This is to handle
-    #               content streams, which have a CRLF or LF after the stream
-    #               token.
+    #   :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
+    #               is sitting under the io cursor.
     #
     def read(bytes, opts = {})
       reset_pos
       if opts[:skip_eol]
-        done = false
-        while !done
-          chr = @io.read(1)
-          if chr.nil?
-            return nil
-          elsif chr != "\n" && chr != "\r"
-            @io.seek(-1, IO::SEEK_CUR)
-            done = true
-          end
+        @io.seek(-1, IO::SEEK_CUR)
+        str = @io.read(2)
+        if str.nil?
+          return nil
+        elsif str == "\r\n"
+          # do nothing
+        elsif str[0,1] == "\n"
+          @io.seek(-1, IO::SEEK_CUR)
+        else
+          @io.seek(-2, IO::SEEK_CUR)
         end
       end

data/lib/pdf/reader/font.rb CHANGED

@@ -26,8 +26,21 @@
 class PDF::Reader
   class Font
     attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
+    attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
     attr_reader :basefont
+    def initialize(ohash = nil, obj = nil)
+      if ohash.nil? || obj.nil?
+        $stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
+        return
+      end
+      @ohash = ohash
+      extract_base_info(obj)
+      extract_descriptor(obj)
+      extract_descendants(obj)
+    end
     # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
     # a text file supplied by Adobe at:
     # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
@@ -73,5 +86,48 @@ class PDF::Reader
         params
       end
     end
+    def glyph_width(c)
+      @missing_width ||= 0
+      if @widths.nil?
+        0
+      else
+        @widths.fetch(c.codepoints.first - @first_char, @missing_width)
+      end
+    end
+    private
+    def extract_base_info(obj)
+      @subtype  = @ohash.object(obj[:Subtype])
+      @basefont = @ohash.object(obj[:BaseFont])
+      @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
+      @widths   = @ohash.object(obj[:Widths])
+      @first_char = @ohash.object(obj[:FirstChar])
+      if obj[:ToUnicode]
+        stream = @ohash.object(obj[:ToUnicode])
+        @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+      end
+    end
+    def extract_descriptor(obj)
+      return unless obj[:FontDescriptor]
+      fd       = @ohash.object(obj[:FontDescriptor])
+      @ascent  = @ohash.object(fd[:Ascent])
+      @descent = @ohash.object(fd[:Descent])
+      @missing_width = @ohash.object(fd[:MissingWidth])
+      @bbox    = @ohash.object(fd[:FontBBox])
+    end
+    def extract_descendants(obj)
+      return unless obj[:DescendantFonts]
+      descendants = @ohash.object(obj[:DescendantFonts])
+      @descendantfonts = descendants.map { |desc|
+        PDF::Reader::Font.new(@ohash, @ohash.object(desc))
+      }
+    end
   end
 end

data/lib/pdf/reader/glyphlist.txt CHANGED

@@ -1,4 +1,43 @@
-# This file maps glyph names to unicode codepoints
+# ###################################################################################
+# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this documentation file to use, copy, publish, distribute,
+# sublicense, and/or sell copies of the documentation, and to permit
+# others to do the same, provided that:
+# - No modification, editing or other alteration of this document is
+# allowed; and
+# - The above copyright notice and this permission notice shall be
+# included in all copies of the documentation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this documentation file, to create their own derivative works
+# from the content of this document to use, copy, publish, distribute,
+# sublicense, and/or sell the derivative works, and to permit others to do
+# the same, provided that the derived work is not represented as being a
+# copy or version of this document.
+#
+# Adobe shall not be liable to any party for any loss of revenue or profit
+# or for indirect, incidental, special, consequential, or other similar
+# damages, whether based on tort (including without limitation negligence
+# or strict liability), contract or other legal or equitable grounds even
+# if Adobe has been advised or had reason to know of the possibility of
+# such damages. The Adobe materials are provided on an "AS IS" basis.
+# Adobe specifically disclaims all express, statutory, or implied
+# warranties relating to the Adobe materials, including but not limited to
+# those concerning merchantability or fitness for a particular purpose or
+# non-infringement of any third party rights regarding the Adobe
+# materials.
+# ###################################################################################
+# Name:          Adobe Glyph List
+# Table version: 2.0
+# Date:          September 20, 2002
+#
+# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
+#
+# Format: Semicolon-delimited fields:
+#            (1) glyph name
+#            (2) Unicode scalar value
 A;0041
 AE;00C6
 AEacute;01FC

data/lib/pdf/reader/metadata_strategy.rb CHANGED

@@ -2,6 +2,9 @@
 class PDF::Reader
+  # DEPRECATED: this class was deprecated in version 0.11.0 and will
+  #             eventually be removed
+  #
   class MetadataStrategy < AbstractStrategy # :nodoc:
     def self.to_sym

data/lib/pdf/reader/object_cache.rb ADDED

@@ -0,0 +1,85 @@
+# coding: utf-8
+class PDF::Reader
+  # A Hash-like object for caching commonly used objects from a PDF file.
+  #
+  # This is an internal class used by PDF::Reader::ObjectHash
+  #
+  class ObjectCache # nodoc
+    # These object types use little memory and are accessed a heap of times as
+    # part of random page access, so we'll cache the unmarshalled objects and
+    # avoid lots of repetitive (and expensive) tokenising
+    CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
+    def initialize
+      @objects = {}
+    end
+    def [](key)
+      @objects[key]
+    end
+    def []=(key, value)
+      @objects[key] = value if cacheable?(value)
+    end
+    def fetch(key, local_default = nil)
+      @objects.fetch(key, local_default)
+    end
+    def each(&block)
+      @objects.each(&block)
+    end
+    alias :each_pair :each
+    def each_key(&block)
+      @objects.each_key(&block)
+    end
+    def each_value(&block)
+      @objects.each_value(&block)
+    end
+    def size
+      @objects.size
+    end
+    alias :length :size
+    def empty?
+      @objects.empty?
+    end
+    def has_key?(key)
+      @objects.has_key?(key)
+    end
+    alias :include? :has_key?
+    alias :key? :has_key?
+    alias :member? :has_key?
+    def has_value?(value)
+      @objects.has_value?(value)
+    end
+    def to_s
+      "<PDF::Reader::ObjectCache size: #{self.size}>"
+    end
+    def keys
+      @objects.keys
+    end
+    def values
+      @objects.values
+    end
+    private
+    def cacheable?(obj)
+      obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
+    end
+  end
+end

data/lib/pdf/reader/object_hash.rb CHANGED

@@ -5,7 +5,7 @@ class PDF::Reader
   # object.
   #
   # A PDF file can be viewed as a large hash map. It is a series of objects
-  # stored at an exact byte offsets, and a table that maps object IDs to byte
+  # stored at precise byte offsets, and a table that maps object IDs to byte
   # offsets. Given an object ID, looking up an object is an O(1) operation.
   #
   # Each PDF object can be mapped to a ruby object, so by passing an object
@@ -28,6 +28,8 @@ class PDF::Reader
   class ObjectHash
     include Enumerable
+    CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
     attr_accessor :default
     attr_reader :trailer, :pdf_version
@@ -50,6 +52,11 @@ class PDF::Reader
       @pdf_version = read_version
       @xref        = PDF::Reader::XRef.new(@io)
       @trailer     = @xref.trailer
+      @cache       = PDF::Reader::ObjectCache.new
+      if trailer[:Encrypt]
+        raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
+      end
     end
     # returns the type of object a ref points to
@@ -81,25 +88,32 @@ class PDF::Reader
         unless key.kind_of?(PDF::Reader::Reference)
           key = PDF::Reader::Reference.new(key.to_i, 0)
         end
-        if xref[key].is_a?(Fixnum)
+        if @cache.has_key?(key)
+          @cache[key]
+        elsif xref[key].is_a?(Fixnum)
           buf = new_buffer(xref[key])
-          Parser.new(buf, self).object(key.id, key.gen)
+          @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
         elsif xref[key].is_a?(PDF::Reader::Reference)
           container_key = xref[key]
           object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
-          object_streams[container_key][key.id]
+          @cache[key] = object_streams[container_key][key.id]
         end
       rescue InvalidObjectError
         return default
       end
     end
+    def cacheable?(obj)
+      obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
+    end
     # If key is a PDF::Reader::Reference object, lookup the corresponding
     # object in the PDF and return it. Otherwise return key untouched.
     #
     def object(key)
       key.is_a?(PDF::Reader::Reference) ? self[key] : key
     end
+    alias :deref :object
     # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
     # object.
@@ -192,7 +206,7 @@ class PDF::Reader
     alias :value? :has_key?
     def to_s
-      "<PDF::Reader::ObejctHash size: #{self.size}>"
+      "<PDF::Reader::ObjectHash size: #{self.size}>"
     end
     # return an array of all keys in the file

data/lib/pdf/reader/page.rb ADDED

@@ -0,0 +1,172 @@
+# coding: utf-8
+module PDF
+  class Reader
+    # high level representation of a single PDF page. Ties together the various
+    # low level classes in PDF::Reader and provides access to the various
+    # components of the page (text, images, fonts, etc) in convenient formats.
+    #
+    # If you require access to the raw PDF objects for this page, you can access
+    # the Page dictionary via the page_object accessor. You will need to use the
+    # objects accessor to help walk the page dictionary in any useful way.
+    #
+    class Page
+      # lowlevel hash-like access to all objects in the underlying PDF
+      attr_reader :objects
+      # the raw PDF object that defines this page
+      attr_reader :page_object
+      # creates a new page wrapper.
+      #
+      # * objects - an ObjectHash instance that wraps a PDF file
+      # * pagenum - an int specifying the page number to expose. 1 indexed.
+      #
+      def initialize(objects, pagenum)
+        @objects, @pagenum = objects, pagenum
+        @page_object = objects.deref(objects.page_references[pagenum - 1])
+        unless @page_object.is_a?(::Hash)
+          raise ArgumentError, "invalid page: #{pagenum}"
+        end
+      end
+      # return the number of this page within the full document
+      #
+      def number
+        @pagenum
+      end
+      # return a friendly string representation of this page
+      #
+      def inspect
+        "<PDF::Reader::Page page: #{@pagenum}>"
+      end
+      # Returns the attributes that accompany this page. Includes
+      # attributes inherited from parents.
+      #
+      def attributes
+        hash = {}
+        page_with_ancestors.reverse.each do |obj|
+          hash.merge!(@objects.deref(obj))
+        end
+        hash
+      end
+      # Returns the resources that accompany this page. Includes
+      # resources inherited from parents.
+      #
+      def resources
+        @resources ||= @objects.deref(attributes[:Resources]) || {}
+      end
+      # return a hash of fonts used on this page.
+      #
+      # The keys are the font labels used within the page content stream.
+      #
+      # The values are PDF::Reader::Font instances that provide access
+      # to most available metrics for each font.
+      #
+      def fonts
+        raw_fonts = objects.deref(resources[:Font] || {})
+        ::Hash[raw_fonts.map { |label, font|
+          [label, PDF::Reader::Font.new(objects, objects.deref(font))]
+        }]
+      end
+      # returns the plain text content of this page encoded as UTF-8. Any
+      # characters that can't be translated will be returned as a ▯
+      #
+      def text
+        text_receiver = PageTextReceiver.new(fonts)
+        walk(text_receiver)
+        text_receiver.content
+      end
+      alias :to_s :text
+      # processes the raw content stream for this page in sequential order and
+      # passes callbacks to the receiver objects.
+      #
+      # This is mostly low level and you can probably ignore it unless you need
+      # access to something like the raw encoded text. For an example of how
+      # this can be used as a basis for higher level functionality, see the
+      # text() method
+      #
+      def walk(*receivers)
+        callback(receivers, :page=, [self])
+        content_stream(receivers, raw_content)
+      end
+      # returns the raw content stream for this page. This is plumbing, nothing to
+      # see here unless you're a PDF nerd like me.
+      #
+      def raw_content
+        contents = objects.deref(@page_object[:Contents])
+        [contents].flatten.compact.map { |obj|
+          objects.deref(obj)
+        }.map { |obj|
+          obj.unfiltered_data
+        }.join
+      end
+      private
+      def root
+        root ||= objects.deref(@objects.trailer[:Root])
+      end
+      def xobjects
+        resources[:XObject] || {}
+      end
+      def content_stream(receivers, instructions)
+        buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
+        parser       = Parser.new(buffer, @objects)
+        params       = []
+        while (token = parser.parse_token(PagesStrategy::OPERATORS))
+          if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
+            callback(receivers, PagesStrategy::OPERATORS[token], params)
+            params.clear
+          else
+            params << token
+          end
+        end
+      rescue EOFError => e
+        raise MalformedPDFError, "End Of File while processing a content stream"
+      end
+      # calls the name callback method on the receiver class with params as the arguments
+      #
+      def callback (receivers, name, params=[])
+        receivers.each do |receiver|
+          receiver.send(name, *params) if receiver.respond_to?(name)
+        end
+      end
+      def page_with_ancestors(obj = nil)
+        obj = objects.deref(obj)
+        if obj.nil?
+          [@page_object] + page_with_ancestors(@page_object[:Parent])
+        elsif obj[:Parent]
+          [select_inheritable(obj)] + page_with_ancestors(obj[:Parent])
+        else
+          [select_inheritable(obj)]
+        end
+      end
+      # select the elements from a Pages dictionary that can be inherited by
+      # child Page dictionaries.
+      #
+      def select_inheritable(obj)
+        ::Hash[obj.select { |key, value|
+          [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
+        }]
+      end
+    end
+  end
+end