RubyGems - pdf-reader - Versions diffs - 0.7.1 → 0.7.2 - Mend

pdf-reader 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

data/CHANGELOG CHANGED

@@ -1,3 +1,10 @@
+v0.7.2 (20th May 2008)
+- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
+- Correctly handle page content instruction sets with trailing whitespace
+- Represent PDF Streams with a new object, PDF::Reader::Stream
+  - there really wasn't any point in separating the stream content from its associated dict. You need both
+    parts to correctly interpret the content
 v0.7.1 (6th May 2008)
 - Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
 - Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied

data/README CHANGED

@@ -48,6 +48,9 @@ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't curren
 support. Again, we welcome submissions of PDF files that exhibit these features to help
 us with future code improvements.
+Any other exceptions should be considered bugs and should be reported (unless they originate
+inside your receiver, in which case you're on your own)
 = Maintainers
 - Peter Jones <mailto:pjones@pmade.com>
@@ -229,9 +232,9 @@ layout of the file, not the order objects are displayed to the user. As a
 consequence of this it is highly unlikely that text will be completely in
 order.
-Occasionally some text cannot be extracted properly due to the way it has been stored, or the use
-of invalid bytes. In these cases PDF::Reader will output a little UTF-8 friendly box to indicate
-an unrecognisable character.
+Occasionally some text cannot be extracted properly due to the way it has been
+stored, or the use of invalid bytes. In these cases PDF::Reader will output a
+little UTF-8 friendly box to indicate an unrecognisable character.
 = Resources

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.7.1"
+PKG_VERSION = "0.7.2"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
@@ -65,6 +65,9 @@ spec = Gem::Specification.new do |spec|
                       ["Rakefile"]
   spec.require_path = "lib"
+  spec.bindir = "bin"
+  spec.executables << "pdf_text"
+  spec.executables << "pdf_list_callbacks"
 	spec.has_rdoc = true
 	spec.extra_rdoc_files = %w{README TODO CHANGELOG}
 	spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<

data/bin/pdf_list_callbacks ADDED

@@ -0,0 +1,17 @@
+#!/usr/bin/env ruby
+$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
+require 'pdf/reader'
+receiver = PDF::Reader::RegisterReceiver.new
+if ARGV.empty?
+  PDF::Reader.new.parse($stdin, receiver)
+else
+  PDF::Reader.file(ARGV[0], receiver)
+end
+receiver.callbacks.each do |callback|
+  puts "#{callback[:name]} - #{callback[:args].inspect}"
+end

data/bin/pdf_text ADDED

@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
+require 'pdf/reader'
+class PageTextReceiver
+  attr_accessor :content
+  # Called when page parsing ends; prints the text collected so far
+  def end_page(arg = nil)
+    if @content
+      puts @content
+      puts
+    end
+  end
+  def show_text(string, *params)
+    @content = "" if @content.nil?
+    @content << string
+  end
+  # there's a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+receiver = PageTextReceiver.new
+if ARGV.empty?
+  PDF::Reader.new.parse($stdin, receiver)
+else
+  PDF::Reader.file(ARGV[0], receiver)
+end

data/lib/pdf/reader.rb CHANGED

@@ -98,6 +98,7 @@ require 'pdf/reader/font'
 require 'pdf/reader/parser'
 require 'pdf/reader/reference'
 require 'pdf/reader/register_receiver'
+require 'pdf/reader/stream'
 require 'pdf/reader/text_receiver'
 require 'pdf/reader/token'
 require 'pdf/reader/xref'
@@ -119,8 +120,9 @@ class PDF::Reader
     options.merge!(opts)
     trailer = @xref.load
-    @content.metadata(@xref.object(trailer[:Info]).first) if options[:metadata]
-    @content.document(@xref.object(trailer[:Root]).first) if options[:pages]
+    raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
+    @content.metadata(@xref.object(trailer[:Info])) if options[:metadata]
+    @content.document(@xref.object(trailer[:Root])) if options[:pages]
     self
   end
   ################################################################################

data/lib/pdf/reader/buffer.rb CHANGED

@@ -77,6 +77,7 @@ class PDF::Reader
     # returns true if the underlying IO object is at end and the internal buffer
     # is empty
     def eof?
+      ready_token
       if @buffer
         @buffer.empty? && @io.eof?
       else
@@ -91,7 +92,7 @@ class PDF::Reader
     # PDF files are processed by tokenising the content into a series of objects and commands.
     # This prepares the buffer for use by reading the next line of tokens into memory.
     def ready_token (with_strip=true, skip_blanks=true)
-      while @buffer.nil? or @buffer.empty?
+      while (@buffer.nil? or @buffer.empty?) && !@io.eof?
         @buffer = @io.readline
         @buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
         #@buffer.sub!(/%.*$/, '') if strip_comments

data/lib/pdf/reader/content.rb CHANGED

@@ -260,11 +260,10 @@ class PDF::Reader
     # Begin processing the document
     def document (root)
       if root[:Metadata]
-        obj, stream = @xref.object(root[:Metadata])
-        callback(:xml_metadata,stream)
+        callback(:xml_metadata,@xref.object(root[:Metadata]))
       end
       callback(:begin_document, [root])
-      walk_pages(@xref.object(root[:Pages]).first)
+      walk_pages(@xref.object(root[:Pages]))
       callback(:end_document)
     end
     ################################################################################
@@ -280,26 +279,24 @@ class PDF::Reader
       # extract page content
       if page[:Type] == :Pages
         callback(:begin_page_container, [page])
-        walk_resources(@xref.object(res).first) if res
-        page[:Kids].each {|child| walk_pages(@xref.object(child).first)}
+        walk_resources(@xref.object(res)) if res
+        page[:Kids].each {|child| walk_pages(@xref.object(child))}
         callback(:end_page_container)
       elsif page[:Type] == :Page
         callback(:begin_page, [page])
-        walk_resources(@xref.object(res).first) if res
+        walk_resources(@xref.object(res)) if res
         @page = page
         @params = []
-        if page[:Contents].kind_of?(Array)
-          contents = page[:Contents]
-        elsif @xref.obj_type(page[:Contents]) == :Array
-          contents, stream = @xref.object(page[:Contents])
+        if @xref.object(page[:Contents]).kind_of?(Array)
+          contents = @xref.object(page[:Contents])
         else
           contents = [page[:Contents]]
         end
         contents.each do |content|
-          obj, stream = @xref.object(content)
-          content_stream(stream)
+          obj = @xref.object(content)
+          content_stream(obj)
         end if page.has_key?(:Contents) and page[:Contents]
         callback(:end_page)
@@ -356,42 +353,41 @@ class PDF::Reader
       # extract any xobject information
       if resources[:XObject]
-        @xref.object(resources[:XObject]).first.each do |name, val|
-          obj, stream = @xref.object(val)
-          callback(:resource_xobject, [name, obj, stream])
+        @xref.object(resources[:XObject]).each do |name, val|
+          callback(:resource_xobject, [name, @xref.object(val)])
         end
       end
       # extract any extgstate information
       if resources[:ExtGState]
-        @xref.object(resources[:ExtGState]).first.each do |name, val|
-          callback(:resource_extgstate, [name, @xref.object(val).first])
+        @xref.object(resources[:ExtGState]).each do |name, val|
+          callback(:resource_extgstate, [name, @xref.object(val)])
         end
       end
       # extract any colorspace information
       if resources[:ColorSpace]
-        @xref.object(resources[:ColorSpace]).first.each do |name, val|
-          callback(:resource_colorspace, [name, @xref.object(val).first])
+        @xref.object(resources[:ColorSpace]).each do |name, val|
+          callback(:resource_colorspace, [name, @xref.object(val)])
         end
       end
       # extract any pattern information
       if resources[:Pattern]
-        @xref.object(resources[:Pattern]).first.each do |name, val|
-          callback(:resource_pattern, [name, @xref.object(val).first])
+        @xref.object(resources[:Pattern]).each do |name, val|
+          callback(:resource_pattern, [name, @xref.object(val)])
         end
       end
       # extract any font information
       if resources[:Font]
-        @xref.object(resources[:Font]).first.each do |label, desc|
-          desc = @xref.object(desc).first
+        @xref.object(resources[:Font]).each do |label, desc|
+          desc = @xref.object(desc)
           @fonts[label] = PDF::Reader::Font.new
           @fonts[label].label = label
           @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
           @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
-          @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]).first)
+          @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]))
           @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
           if desc[:ToUnicode]
             # this stream is a cmap
@@ -409,13 +405,11 @@ class PDF::Reader
     # Convert any PDF::Reader::Resource objects into a real object
     def resolve_references(obj)
       case obj
+      when PDF::Reader::Stream then
+        obj.hash = resolve_references(obj.hash)
+        obj
       when PDF::Reader::Reference then
-        obj, stream = @xref.object(obj)
-        if stream
-          stream
-        else
-          resolve_references(obj)
-        end
+        resolve_references(@xref.object(obj))
       when Hash                   then obj.each { |key,val| obj[key] = resolve_references(val) }
       when Array                  then obj.collect { |item| resolve_references(item) }
       else

data/lib/pdf/reader/parser.rb CHANGED

@@ -174,8 +174,8 @@ class PDF::Reader
       obj = parse_token
       post_obj = parse_token
       case post_obj
-      when "endobj"   then return [obj,nil]
-      when "stream"   then return [obj, stream(obj)]
+      when "endobj"   then return obj
+      when "stream"   then return stream(obj)
       else            raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
       end
     end
@@ -183,7 +183,7 @@ class PDF::Reader
     # Decodes the contents of a PDF Stream and returns it as a PDF::Reader::Stream.
     def stream (dict)
       raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
-      data = @buffer.read(@xref.object(dict[:Length]).first)
+      data = @buffer.read(@xref.object(dict[:Length]))
       Error.str_assert(parse_token, "endstream")
       Error.str_assert(parse_token, "endobj")
@@ -200,7 +200,7 @@ class PDF::Reader
         end
       end
-      data
+      PDF::Reader::Stream.new(dict, data)
     end
     ################################################################################
   end

data/lib/pdf/reader/stream.rb ADDED

@@ -0,0 +1,43 @@
+################################################################################
+#
+# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+class PDF::Reader
+  ################################################################################
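+  # An internal PDF::Reader class that represents a single stream from a PDF file.
+  #
+  # Behaves like a Ruby String holding the stream content, and additionally
+  # exposes the stream's associated dictionary via #hash (note that this
+  # accessor replaces the #hash method inherited from String).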
+  class Stream < String
+    attr_accessor :hash
+    ################################################################################
+    # Creates a new stream from its dictionary (hash) and its content (val)
+    def initialize (hash, val)
+      @hash = hash
+      super val
+    end
+    ################################################################################
+  end
+  ################################################################################
+end
+################################################################################

data/lib/pdf/reader/xref.rb CHANGED

@@ -61,15 +61,11 @@ class PDF::Reader
     #
     # If the object is a stream, it is returned as a PDF::Reader::Stream
     def object (ref, save_pos = true)
-      return ref, nil unless ref.kind_of?(Reference)
+      return ref unless ref.kind_of?(Reference)
       pos = @buffer.pos if save_pos
-      obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
+      obj = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
       @buffer.seek(pos) if save_pos
-      if stream
-        return [obj, stream]
-      else
-        return [obj, nil]
-      end
+      return obj
     end
     ################################################################################
     # Assumes the underlying buffer is positioned at the start of an Xref table and
@@ -112,7 +108,7 @@ class PDF::Reader
     end
     # returns the type of object a ref points to
     def obj_type(ref)
-      obj, stream = object(ref)
+      obj = object(ref)
       obj.class.to_s.to_sym
     end
     # returns true if the supplied reference points to an object with a stream

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.7.1
+  version: 0.7.2
 platform: ruby
 authors:
 - Peter Jones
@@ -9,14 +9,15 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-05-06 00:00:00 +10:00
+date: 2008-05-20 00:00:00 +10:00
 default_executable:
 dependencies: []
 description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
 email: pjones@pmade.com
-executables: []
+executables:
+- pdf_text
+- pdf_list_callbacks
 extensions: []
 extra_rdoc_files:
@@ -41,6 +42,7 @@ files:
 - lib/pdf/reader/register_receiver.rb
 - lib/pdf/reader/font.rb
 - lib/pdf/reader/glyphlist.txt
+- lib/pdf/reader/stream.rb
 - lib/pdf/reader/parser.rb.rej
 - lib/pdf/reader.rb
 - Rakefile
@@ -73,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: pdf-reader
-rubygems_version: 1.0.1
+rubygems_version: 1.1.1
 signing_key:
 specification_version: 2
 summary: A library for accessing the content of PDF files