RubyGems - pdf-reader - Versions diffs - 0.11.0.alpha → 0.12.0.alpha - Mend

pdf-reader 0.11.0.alpha → 0.12.0.alpha

Files changed (28) hide show

data/CHANGELOG +17 -1
data/README.rdoc +31 -1
data/bin/pdf_list_callbacks +2 -0
data/examples/callbacks.rb +2 -1
data/examples/extract_bates.rb +3 -2
data/examples/extract_images.rb +146 -23
data/examples/hash.rb +5 -5
data/examples/metadata.rb +5 -16
data/examples/page_count.rb +13 -0
data/examples/rspec.rb +17 -41
data/examples/text.rb +4 -29
data/examples/version.rb +3 -15
data/lib/pdf/reader.rb +45 -27
data/lib/pdf/reader/encoding.rb +3 -3
data/lib/pdf/reader/error.rb +1 -0
data/lib/pdf/reader/filter.rb +64 -9
data/lib/pdf/reader/font.rb +0 -17
data/lib/pdf/reader/form_xobject.rb +83 -0
data/lib/pdf/reader/glyph_hash.rb +88 -0
data/lib/pdf/reader/glyphlist.txt +1 -1
data/lib/pdf/reader/object_hash.rb +42 -12
data/lib/pdf/reader/page.rb +63 -17
data/lib/pdf/reader/page_text_receiver.rb +38 -4
data/lib/pdf/reader/standard_security_handler.rb +186 -0
data/lib/pdf/reader/stream.rb +2 -2
metadata +39 -9
data/examples/page_counter_improved.rb +0 -23
data/examples/page_counter_naive.rb +0 -24

data/examples/text.rb CHANGED Viewed

@@ -6,35 +6,10 @@
 require 'rubygems'
 require 'pdf/reader'
-class PageTextReceiver
-  attr_accessor :content
+filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
-  def initialize
-    @content = []
-  end
-  # Called when page parsing starts
-  def begin_page(arg = nil)
-    @content << ""
-  end
-  # record text that is drawn on the page
-  def show_text(string, *params)
-    @content.last << string.strip
-  end
-  # there's a few text callbacks, so make sure we process them all
-  alias :super_show_text :show_text
-  alias :move_to_next_line_and_show_text :show_text
-  alias :set_spacing_next_line_show_text :show_text
-  # this final text callback takes slightly different arguments
-  def show_text_with_positioning(*params)
-    params = params.first
-    params.each { |str| show_text(str) if str.kind_of?(String)}
+PDF::Reader.open(filename) do |reader|
+  reader.pages.each do |page|
+    puts page.text
   end
 end
-receiver = PageTextReceiver.new
-pdf = PDF::Reader.file("somefile.pdf", receiver)
-puts receiver.content.inspect

data/examples/version.rb CHANGED Viewed

@@ -6,20 +6,8 @@
 require 'rubygems'
 require 'pdf/reader'
-class VersionReceiver
-  attr_accessor :version
-  def initialize
-    @version = nil
-  end
-  # Called when document parsing starts
-  def pdf_version(arg = nil)
-    @version = arg
-  end
+filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
+PDF::Reader.open(filename) do |reader|
+  puts reader.pdf_version
 end
-receiver = VersionReceiver.new
-pdf = PDF::Reader.file(ARGV.shift, receiver)
-puts receiver.version

data/lib/pdf/reader.rb CHANGED Viewed

@@ -84,13 +84,18 @@ module PDF
   #   page = reader.page(1)
   #   page.walk(receiver)
   #
+  # == Encrypted Files
+  #
+  # Depending on the algorithm it may be possible to parse an encrypted file.
+  # For standard PDF encryption you'll need the :password option
+  #
+  #   reader = PDF::Reader.new("somefile.pdf", :password => "apples")
+  #
   class Reader
     # lowlevel hash-like access to all objects in the underlying PDF
     attr_reader :objects
-    attr_reader :page_count, :pdf_version, :info, :metadata
     # creates a new document reader for the provided PDF.
     #
     # input can be an IO-ish object (StringIO, File, etc) containing a PDF
@@ -102,16 +107,34 @@ module PDF
     #     reader = PDF::Reader.new(file)
     #   end
     #
-    def initialize(input = nil)
+    # If the source file is encrypted you can provide a password for decrypting
+    #
+    #   reader = PDF::Reader.new("somefile.pdf", :password => "apples")
+    #
+    def initialize(input = nil, opts = {})
       if input # support the deprecated Reader API
-        @objects = PDF::Reader::ObjectHash.new(input)
-        @page_count  = get_page_count
-        @pdf_version = @objects.pdf_version
-        @info        = @objects.deref(@objects.trailer[:Info])
-        @metadata    = get_metadata
+        @objects = PDF::Reader::ObjectHash.new(input, opts)
       end
     end
+    def info
+      @objects.deref(@objects.trailer[:Info])
+    end
+    def metadata
+      stream = @objects.deref(root[:Metadata])
+      stream ? stream.unfiltered_data : nil
+    end
+    def page_count
+      pages = @objects.deref(root[:Pages])
+      @page_count ||= pages[:Count]
+    end
+    def pdf_version
+      @objects.pdf_version
+    end
     # syntactic sugar for opening a PDF file. Accepts the same arguments
     # as new().
     #
@@ -119,8 +142,14 @@ module PDF
     #     puts reader.pdf_version
     #   end
     #
-    def self.open(input, &block)
-      yield PDF::Reader.new(input)
+    # or
+    #
+    #   PDF::Reader.open("somefile.pdf", :password => "apples") do |reader|
+    #     puts reader.pdf_version
+    #   end
+    #
+    def self.open(input, opts = {}, &block)
+      yield PDF::Reader.new(input, opts)
     end
     # DEPRECATED: this method was deprecated in version 0.11.0 and will
@@ -185,7 +214,7 @@ module PDF
     # methods available on each page
     #
     def pages
-      (1..@page_count).map { |num|
+      (1..self.page_count).map { |num|
         PDF::Reader::Page.new(@objects, num)
       }
     end
@@ -204,7 +233,7 @@ module PDF
     #
     def page(num)
       num = num.to_i
-      raise ArgumentError, "valid pages are 1 .. #{@page_count}" if num < 1 || num > @page_count
+      raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if num < 1 || num > self.page_count
       PDF::Reader::Page.new(@objects, num)
     end
@@ -217,10 +246,6 @@ module PDF
     def parse(io, receivers, opts = {})
       ohash    = ObjectHash.new(io)
-      if ohash.trailer[:Encrypt]
-        raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
-      end
       options = {:pages => true, :raw_text => false, :metadata => true}
       options.merge!(opts)
@@ -252,17 +277,7 @@ module PDF
     end
     def root
-      root ||= @objects.deref(@objects.trailer[:Root])
-    end
-    def get_metadata
-      stream = @objects.deref(root[:Metadata])
-      stream ? stream.unfiltered_data : nil
-    end
-    def get_page_count
-      pages = @objects.deref(root[:Pages])
-      pages[:Count]
+      @root ||= @objects.deref(@objects.trailer[:Root])
     end
   end
@@ -276,6 +291,8 @@ require 'pdf/reader/encoding'
 require 'pdf/reader/error'
 require 'pdf/reader/filter'
 require 'pdf/reader/font'
+require 'pdf/reader/form_xobject'
+require 'pdf/reader/glyph_hash'
 require 'pdf/reader/lzw'
 require 'pdf/reader/metadata_strategy'
 require 'pdf/reader/object_cache'
@@ -286,6 +303,7 @@ require 'pdf/reader/parser'
 require 'pdf/reader/print_receiver'
 require 'pdf/reader/reference'
 require 'pdf/reader/register_receiver'
+require 'pdf/reader/standard_security_handler'
 require 'pdf/reader/stream'
 require 'pdf/reader/text_receiver'
 require 'pdf/reader/page_text_receiver'

data/lib/pdf/reader/encoding.rb CHANGED Viewed

@@ -97,7 +97,7 @@ class PDF::Reader
       }.map { |num|
         original_codepoint_to_unicode(num, tounicode)
       }.map { |c|
-        glyphnames[c] || c
+        names_to_unicode[c] || c
       }.map { |c|
         if c.nil? || !c.is_a?(Fixnum)
           PDF::Reader::Encoding::UNKNOWN_CHAR
@@ -170,8 +170,8 @@ class PDF::Reader
       mapping.size > 0
     end
-    def glyphnames
-      @glyphnames ||= PDF::Reader::Font.glyphnames
+    def names_to_unicode
+      @names_to_unicode ||= PDF::Reader::GlyphHash.new
     end
     def load_mapping(file)

data/lib/pdf/reader/error.rb CHANGED Viewed

@@ -49,5 +49,6 @@ class PDF::Reader
   class MalformedPDFError < RuntimeError; end
   class InvalidObjectError < MalformedPDFError; end
   class UnsupportedFeatureError < RuntimeError; end
+  class EncryptedPDFError < UnsupportedFeatureError; end
 end
 ################################################################################

data/lib/pdf/reader/filter.rb CHANGED Viewed

@@ -31,6 +31,7 @@ class PDF::Reader
   # content.
   #
   class Filter # :nodoc:
     ################################################################################
     # creates a new filter for decoding content.
     #
@@ -41,14 +42,16 @@ class PDF::Reader
       @options = options
       case name.to_sym
-      when :ASCII85Decode  then @filter = :ascii85
-      when :ASCIIHexDecode then @filter = :asciihex
-      when :CCITTFaxDecode then @filter = nil
-      when :DCTDecode      then @filter = nil
-      when :FlateDecode    then @filter = :flate
-      when :JBIG2Decode    then @filter = nil
-      when :LZWDecode      then @filter = :lzw
-      else                 raise UnsupportedFeatureError, "Unknown filter: #{name}"
+      when :ASCII85Decode   then @filter = :ascii85
+      when :ASCIIHexDecode  then @filter = :asciihex
+      when :CCITTFaxDecode  then @filter = nil
+      when :DCTDecode       then @filter = nil
+      when :FlateDecode     then @filter = :flate
+      when :JBIG2Decode     then @filter = nil
+      when :LZWDecode       then @filter = :lzw
+      when :RunLengthDecode then @filter = :runlength
+      else
+        raise UnsupportedFeatureError, "Unknown filter: #{name}"
       end
     end
     ################################################################################
@@ -117,6 +120,36 @@ class PDF::Reader
       depredict(data, @options)
     end
     ################################################################################
+    # Decode the specified data with the RunLengthDecode compression algorithm
+    def runlength(data)
+      pos = 0
+      out = ""
+      while pos < data.length
+        length = data.getbyte(pos)
+        pos += 1
+        case
+        when length == 128
+          break
+        when length < 128
+          # When the length is < 128, we copy the following length+1 bytes
+          # literally.
+          out << data[pos, length + 1]
+          pos += length
+        else
+          # When the length is > 128, we copy the next byte (257 - length)
+          # times; e.g., "\xFA\x00" ([250, 0]) will expand to
+          # "\x00\x00\x00\x00\x00\x00\x00".
+          out << data[pos, 1] * (257 - length)
+        end
+        pos += 1
+      end
+      out
+    end
+    ################################################################################
     def depredict(data, opts = {})
       predictor = (opts || {})[:Predictor].to_i
@@ -133,7 +166,29 @@ class PDF::Reader
     end
     ################################################################################
     def tiff_depredict(data, opts = {})
-      raise UnsupportedFeatureError, "TIFF predictor not supported"
+      data        = data.unpack("C*")
+      unfiltered  = []
+      bpc         = opts[:BitsPerComponent] || 8
+      pixel_bits  = bpc * opts[:Colors]
+      pixel_bytes = pixel_bits / 8
+      line_len    = (pixel_bytes * opts[:Columns])
+      pos         = 0
+      if bpc != 8
+        raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
+      end
+      until pos > data.size
+        row_data = data[pos, line_len]
+        row_data.each_with_index do |byte, index|
+          left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+          row_data[index] = (byte + left) % 256
+        end
+        unfiltered += row_data
+        pos += line_len
+      end
+      unfiltered.pack("C*")
     end
     ################################################################################
     def png_depredict(data, opts = {})

data/lib/pdf/reader/font.rb CHANGED Viewed

@@ -41,23 +41,6 @@ class PDF::Reader
       extract_descendants(obj)
     end
-    # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
-    # a text file supplied by Adobe at:
-    # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
-    def self.glyphnames
-      glyphs = {}
-      RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
-      File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
-        f.each do |l|
-          m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
-          glyphs[name.to_sym] = "0x#{code}".hex if name
-        end
-      end
-      glyphs
-    end
     def basefont=(font)
       # setup a default encoding for the selected font. It can always be overridden
       # with encoding= if required

data/lib/pdf/reader/form_xobject.rb ADDED Viewed

@@ -0,0 +1,83 @@
+# coding: utf-8
+module PDF
+  class Reader
+    # High level representation of a single PDF form xobject. Form xobjects
+    # are contained pieces of content that can be inserted onto multiple
+    # pages. They're generally used as a space-efficient way to store
+    # repetitive content (like logos, headers, footers, etc).
+    #
+    # This behaves and looks much like a limited PDF::Reader::Page class.
+    #
+    class FormXObject
+      def initialize(page, xobject)
+        @page    = page
+        @objects = page.objects
+        @xobject = @objects.deref(xobject)
+      end
+      # Returns the resources that accompany this form.
+      #
+      def resources
+        @resources ||= @objects.deref(@xobject.hash[:Resources]) || {}
+      end
+      # return a hash of fonts used on this form.
+      #
+      # The keys are the font labels used within the form content stream.
+      #
+      # The values are PDF::Reader::Font instances that provide access
+      # to most available metrics for each font.
+      #
+      def fonts
+        raw_fonts = @objects.deref(resources[:Font] || {})
+        ::Hash[raw_fonts.map { |label, font|
+          [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
+        }]
+      end
+      # processes the raw content stream for this form in sequential order and
+      # passes callbacks to the receiver objects.
+      #
+      # See the comments on PDF::Reader::Page#walk for more detail.
+      #
+      def walk(*receivers)
+        content_stream(receivers, raw_content)
+      end
+      # returns the raw content stream for this form. This is plumbing, nothing to
+      # see here unless you're a PDF nerd like me.
+      #
+      def raw_content
+        @xobject.unfiltered_data
+      end
+      private
+      def callback(receivers, name, params=[])
+        receivers.each do |receiver|
+          receiver.send(name, *params) if receiver.respond_to?(name)
+        end
+      end
+      def content_stream(receivers, instructions)
+        buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
+        parser       = Parser.new(buffer, @objects)
+        params       = []
+        while (token = parser.parse_token(PagesStrategy::OPERATORS))
+          if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
+            callback(receivers, PagesStrategy::OPERATORS[token], params)
+            params.clear
+          else
+            params << token
+          end
+        end
+      rescue EOFError => e
+        raise MalformedPDFError, "End Of File while processing a content stream"
+      end
+    end
+  end
+end

data/lib/pdf/reader/glyph_hash.rb ADDED Viewed

@@ -0,0 +1,88 @@
+################################################################################
+#
+# Copyright (C) 2011 James Healy (jimmy@deefa.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+class PDF::Reader
+  class GlyphHash # :nodoc:
+    def initialize
+      @adobe = load_adobe_glyph_mapping
+    end
+    # attempt to convert a PDF Name to a unicode codepoint. Returns nil
+    # if no conversion is possible.
+    #
+    #   h = GlyphHash.new
+    #
+    #   h[:A]
+    #   => 65
+    #
+    #   h[:Euro]
+    #   => 8364
+    #
+    #   h[:G30]
+    #   => 48
+    #
+    #   h[:34]
+    #
+    def [](name)
+      return nil unless name.is_a?(Symbol)
+      str = name.to_s
+      if @adobe.has_key?(name)
+        @adobe[name]
+      elsif str.match(/\Auni[A-F\d]{4}\Z/)
+        "0x#{str[3,4]}".hex
+      elsif str.match(/\Au[A-F\d]{4,6}\Z/)
+        "0x#{str[1,6]}".hex
+      elsif str.match(/\A[A-Za-z]\d{2,4}\Z/)
+        str[1,4].to_i
+      elsif str.match(/\A[A-Za-z]{2}\d{2,4}\Z/)
+        str[2,4].to_i
+      else
+        nil
+      end
+    end
+    private
+    # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
+    # a text file supplied by Adobe at:
+    # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
+    def load_adobe_glyph_mapping
+      glyphs = {}
+      RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
+      File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
+        f.each do |l|
+          m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
+          glyphs[name.to_sym] = "0x#{code}".hex if name
+        end
+      end
+      glyphs
+    end
+  end
+end