RubyGems - pdf-reader - Versions diffs - 0.7.6 → 0.7.7 - Mend

pdf-reader 0.7.6 → 0.7.7

Files changed (13) hide show

data/CHANGELOG +5 -0
data/README.rdoc +1 -171
data/Rakefile +1 -1
data/TODO +2 -0
data/examples/callbacks.rb +16 -0
data/examples/extract_bates.rb +63 -0
data/examples/metadata.rb +25 -0
data/examples/page_counter_improved.rb +23 -0
data/examples/page_counter_naive.rb +24 -0
data/examples/rspec.rb +57 -0
data/examples/text.rb +37 -0
data/lib/pdf/reader/content.rb +58 -18
metadata +10 -3

data/CHANGELOG CHANGED

@@ -1,3 +1,8 @@
+v0.7.7 (11th September 2009)
+- Trigger callbacks contained in Form XObjects when we encounter them in a
+  content stream
+- Fix inheritance of page resources to comply with section 3.6.2 of the PDF spec
 v0.7.6 (28th August 2009)
 - Various bug fixes that increase the files we can successfully parse
   - Treat float and integer tokens differently (thanks Neil)

data/README.rdoc CHANGED

@@ -88,177 +88,7 @@ http://groups.google.com/group/pdf-reader
 = Examples
 The easiest way to explain how this works in practice is to show some examples.
-== Naïve Page Counter
-A simple app to count the number of pages in a PDF File.
-  require 'rubygems'
-  require 'pdf/reader'
-  class PageReceiver
-    attr_accessor :counter
-    def initialize
-      @counter = 0
-    end
-    # Called when page parsing ends
-    def end_page
-      @counter += 1
-    end
-  end
-  receiver = PageReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver)
-  puts "#{receiver.counter} pages"
-== List all callbacks generated by a single PDF
-WARNING: this will generate a *lot* of output, so you probably want to pipe
-it through less or to a text file.
-  require 'rubygems'
-  require 'pdf/reader'
-  receiver = PDF::Reader::RegisterReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver)
-  receiver.callbacks.each do |cb|
-    puts cb
-  end
-== Extract all text from a single PDF
-  class PageTextReceiver
-    attr_accessor :content
-    def initialize
-      @content = []
-    end
-    # Called when page parsing starts
-    def begin_page(arg = nil)
-      @content << ""
-    end
-    # record text that is drawn on the page
-    def show_text(string, *params)
-      @content.last << string.strip
-    end
-    # there's a few text callbacks, so make sure we process them all
-    alias :super_show_text :show_text
-    alias :move_to_next_line_and_show_text :show_text
-    alias :set_spacing_next_line_show_text :show_text
-    # this final text callback takes slightly different arguments
-    def show_text_with_positioning(*params)
-      params = params.first
-      params.each { |str| show_text(str) if str.kind_of?(String)}
-    end
-  end
-  receiver = PageTextReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver)
-  puts receiver.content.inspect
-== Extract metadata only
-  require 'rubygems'
-  require 'pdf/reader'
-  class MetaDataReceiver
-    attr_accessor :regular
-    attr_accessor :xml
-    def metadata(data)
-      @regular = data
-    end
-    def metadata_xml(data)
-      @xml = data
-    end
-  end
-  receiver = MetaDataReceiver.new
-  pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
-  puts receiver.regular.inspect
-  puts receiver.xml.inspect
-== Improved Page Counter
-A simple app to display the number of pages in a PDF File.
-  require 'rubygems'
-  require 'pdf/reader'
-  class PageReceiver
-    attr_accessor :pages
-    # Called when page parsing ends
-    def page_count(arg)
-      @pages = arg
-    end
-  end
-  receiver = PageReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
-  puts "#{receiver.pages} pages"
-== Basic RSpec of a generated PDF
-  require 'rubygems'
-  require 'pdf/reader'
-  require 'pdf/writer'
-  require 'spec'
-  class PageTextReceiver
-    attr_accessor :content
-    def initialize
-      @content = []
-    end
-    # Called when page parsing starts
-    def begin_page(arg = nil)
-      @content << ""
-    end
-    def show_text(string, *params)
-      @content.last << string.strip
-    end
-    # there's a few text callbacks, so make sure we process them all
-    alias :super_show_text :show_text
-    alias :move_to_next_line_and_show_text :show_text
-    alias :set_spacing_next_line_show_text :show_text
-    def show_text_with_positioning(*params)
-      params = params.first
-      params.each { |str| show_text(str) if str.kind_of?(String)}
-    end
-  end
-  context "My generated PDF" do
-    specify "should have the correct text on 2 pages" do
-      # generate our PDF
-      pdf = PDF::Writer.new
-      pdf.text "Chunky", :font_size => 32, :justification => :center
-      pdf.start_new_page
-      pdf.text "Bacon", :font_size => 32, :justification => :center
-      pdf.save_as("chunkybacon.pdf")
-      # process the PDF
-      receiver = PageTextReceiver.new
-      PDF::Reader.file("chunkybacon.pdf", receiver)
-      # confirm the text appears on the correct pages
-      receiver.content.size.should eql(2)
-      receiver.content[0].should eql("Chunky")
-      receiver.content[1].should eql("Bacon")
-    end
-  end
+Check out the examples/ directory for a few sample scripts.
 = Known Limitations

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.7.6"
+PKG_VERSION = "0.7.7"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/TODO CHANGED

@@ -16,6 +16,8 @@ v0.8
 - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
 - Improve interpretation of non content stream data (ie metadata). recognise dates, etc
 - Support Cross Reference Streams (spec 3.4.7)
+- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
+  are inheritable. See table 3.27 in the spec
 v0.9
 - Add a way to extract raster images

data/examples/callbacks.rb ADDED

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# List all callbacks generated by a single PDF
+#
+# WARNING: this will generate a *lot* of output, so you probably want to pipe
+#          it through less or to a text file.
+require 'rubygems'
+require 'pdf/reader'
+receiver = PDF::Reader::RegisterReceiver.new
+pdf = PDF::Reader.file("somefile.pdf", receiver)
+receiver.callbacks.each do |cb|
+  puts cb
+end

data/examples/extract_bates.rb ADDED

@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# A sample script that attempts to extract bates numbers from a PDF file.
+# Bates numbers are often used to markup documents being used in legal
+# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
+#
+# Acrobat 9 introduced a markup syntax that directly specifies the bates
+# number for each page. For earlier versions, the easiest way to find
+# the number is to look for words that match a pattern.
+#
+# This example attempts to extract numbers using the Acrobat 9 syntax.
+# As a fall back, you can provide a regular expression that will be
+# used to look for words that look like the numbers you expect in the
+# page content.
+require 'rubygems'
+require 'pdf/reader'
+class BatesReceiver
+  def initialize(regexp = nil)
+    @numbers = []
+    @backup  = []
+    @regexp  = regexp
+  end
+  def numbers
+    @numbers.size > 0 ? @numbers : @backup
+  end
+  # Called when a marked content sequence begins; records any bates number it carries
+  def begin_marked_content(*args)
+    return unless args.size >= 2
+    return unless args.first == :Artifact
+    return unless args[1][:Subtype] == :BatesN
+    @numbers << args[1][:Contents]
+  end
+  alias :begin_marked_content_with_pl :begin_marked_content
+  # record text that is drawn on the page
+  def show_text(string, *params)
+    return if @regexp.nil?
+    string.scan(@regexp).each { |m| @backup << m }
+  end
+  # there are a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  # this final text callback takes slightly different arguments
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+receiver = BatesReceiver.new(/CC.+/)
+PDF::Reader.file("bates.pdf", receiver)
+puts receiver.numbers.inspect

data/examples/metadata.rb ADDED

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Extract metadata only
+require 'rubygems'
+require 'pdf/reader'
+class MetaDataReceiver
+  attr_accessor :regular
+  attr_accessor :xml
+  def metadata(data)
+    @regular = data
+  end
+  def metadata_xml(data)
+    @xml = data
+  end
+end
+receiver = MetaDataReceiver.new
+pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
+puts receiver.regular.inspect
+puts receiver.xml.inspect

data/examples/page_counter_improved.rb ADDED

@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Improved Page Counter
+#
+# A simple app to display the number of pages in a PDF File.
+#
+  require 'rubygems'
+  require 'pdf/reader'
+  class PageReceiver
+    attr_accessor :pages
+    # Called with the number of pages in the PDF
+    def page_count(arg)
+      @pages = arg
+    end
+  end
+  receiver = PageReceiver.new
+  pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
+  puts "#{receiver.pages} pages"

data/examples/page_counter_naive.rb ADDED

@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# A simple app to count the number of pages in a PDF File.
+require 'rubygems'
+require 'pdf/reader'
+class PageReceiver
+  attr_accessor :counter
+  def initialize
+    @counter = 0
+  end
+  # Called when page parsing ends
+  def end_page
+    @counter += 1
+  end
+end
+receiver = PageReceiver.new
+pdf = PDF::Reader.file("somefile.pdf", receiver)
+puts "#{receiver.counter} pages"

data/examples/rspec.rb ADDED

@@ -0,0 +1,57 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+#  Basic RSpec of a generated PDF
+require 'rubygems'
+require 'pdf/reader'
+require 'pdf/writer'
+require 'spec'
+class PageTextReceiver
+  attr_accessor :content
+  def initialize
+    @content = []
+  end
+  # Called when page parsing starts
+  def begin_page(arg = nil)
+    @content << ""
+  end
+  def show_text(string, *params)
+    @content.last << string.strip
+  end
+  # there are a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+context "My generated PDF" do
+  specify "should have the correct text on 2 pages" do
+    # generate our PDF
+    pdf = PDF::Writer.new
+    pdf.text "Chunky", :font_size => 32, :justification => :center
+    pdf.start_new_page
+    pdf.text "Bacon", :font_size => 32, :justification => :center
+    pdf.save_as("chunkybacon.pdf")
+    # process the PDF
+    receiver = PageTextReceiver.new
+    PDF::Reader.file("chunkybacon.pdf", receiver)
+    # confirm the text appears on the correct pages
+    receiver.content.size.should eql(2)
+    receiver.content[0].should eql("Chunky")
+    receiver.content[1].should eql("Bacon")
+  end
+end

data/examples/text.rb ADDED

@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Extract all text from a single PDF
+class PageTextReceiver
+  attr_accessor :content
+  def initialize
+    @content = []
+  end
+  # Called when page parsing starts
+  def begin_page(arg = nil)
+    @content << ""
+  end
+  # record text that is drawn on the page
+  def show_text(string, *params)
+    @content.last << string.strip
+  end
+  # there are a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  # this final text callback takes slightly different arguments
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+receiver = PageTextReceiver.new
+pdf = PDF::Reader.file("somefile.pdf", receiver)
+puts receiver.content.inspect

data/lib/pdf/reader/content.rb CHANGED

@@ -147,18 +147,14 @@ class PDF::Reader
   # - metadata
   # - xml_metadata
   # - page_count
+  # - begin_form_xobject
+  # - end_form_xobject
   #
   # == Resource Callbacks
   #
-  # Each page and page_container can contain a range of resources required for the page,
+  # Each page can contain (or inherit) a range of resources required for the page,
   # including things like fonts and images. The following callbacks may appear
-  # after begin_page_container and begin_page if the relevant resources exist
-  # on a page:
-  #
-  # In most cases, these callbacks associate a name with each resource, allowing it
-  # to be referred to by name in the page content. For example, an XObject can hold an image.
-  # If it gets mapped to the name "IM1", then it can be placed on the page using
-  # invoke_xobject "IM1".
+  # after begin_page if the relevant resources exist on a page:
   #
   # - resource_procset
   # - resource_xobject
@@ -166,6 +162,12 @@ class PDF::Reader
   # - resource_colorspace
   # - resource_pattern
   # - resource_font
+  #
+  # In most cases, these callbacks associate a name with each resource, allowing it
+  # to be referred to by name in the page content. For example, an XObject can hold an image.
+  # If it gets mapped to the name "IM1", then it can be placed on the page using
+  # invoke_xobject "IM1".
+  #
   class Content
     OPERATORS = {
       'b'   => :close_fill_stroke,
@@ -284,22 +286,19 @@ class PDF::Reader
     # its content
     def walk_pages (page)
-      if page[:Resources]
-        res = page[:Resources]
-        page.delete(:Resources)
-      end
       # extract page content
       if page[:Type] == :Pages
         callback(:begin_page_container, [page])
-        walk_resources(@xref.object(res)) if res
+        res = @xref.object(page[:Resources])
+        resources.push res if res
         @xref.object(page[:Kids]).each {|child| walk_pages(@xref.object(child))}
+        resources.pop if res
         callback(:end_page_container)
       elsif page[:Type] == :Page
         callback(:begin_page, [page])
-        walk_resources(@xref.object(res)) if res
-        @page = page
-        @params = []
+        res = @xref.object(page[:Resources])
+        resources.push res if res
+        walk_resources(current_resources)
         if @xref.object(page[:Contents]).kind_of?(Array)
           contents = @xref.object(page[:Contents])
@@ -312,10 +311,38 @@ class PDF::Reader
           content_stream(obj)
         end if page.has_key?(:Contents) and page[:Contents]
+        resources.pop if res
         callback(:end_page)
       end
     end
     ################################################################################
+    # Retrieve the XObject for the supplied label and, if it's a Form, walk it
+    # like a regular page content stream.
+    #
+    def walk_xobject_form(label)
+      xobjects = current_resources[:XObject] || {}
+      xobject  = @xref.object(xobjects[label])
+      if xobject && xobject.hash[:Subtype] == :Form
+        callback(:begin_form_xobject)
+        resources = @xref.object(xobject.hash[:Resources])
+        walk_resources(resources) if resources
+        content_stream(xobject.to_s)
+        callback(:end_form_xobject)
+      end
+    end
+    ################################################################################
+    # Return a merged hash of all resources that are currently in scope (Pages, Page and XObject)
+    #
+    def current_resources
+      hash = {}
+      resources.each do |res|
+        hash.merge!(res)
+      end
+      hash
+    end
+    ################################################################################
     # Reads a PDF content stream and calls all the appropriate callback methods for the operators
     # it contains
     def content_stream (instructions)
@@ -341,8 +368,16 @@ class PDF::Reader
             # read the raw image data from the buffer without tokenising
             @params << @buffer.read_until("EI")
           end
           callback(OPERATORS[token], @params)
-          @params.clear
+          if OPERATORS[token] == :invoke_xobject
+            xobject_label = @params.first
+            @params.clear
+            walk_xobject_form(xobject_label)
+          else
+            @params.clear
+          end
         else
           @params << token
         end
@@ -352,6 +387,8 @@ class PDF::Reader
     end
     ################################################################################
     def walk_resources(resources)
+      return unless resources.respond_to?(:[])
       resources = resolve_references(resources)
       # extract any procset information
@@ -446,6 +483,9 @@ class PDF::Reader
         obj
       end
     end
+    def resources
+      @resources ||= []
+    end
   end
   ################################################################################
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.7.6
+  version: 0.7.7
 platform: ruby
 authors:
 - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-28 00:00:00 +10:00
+date: 2009-09-11 00:00:00 +10:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -36,6 +36,13 @@ extra_rdoc_files:
 - CHANGELOG
 - MIT-LICENSE
 files:
+- examples/extract_bates.rb
+- examples/text.rb
+- examples/page_counter_naive.rb
+- examples/callbacks.rb
+- examples/metadata.rb
+- examples/page_counter_improved.rb
+- examples/rspec.rb
 - lib/pdf/reader.rb
 - lib/pdf/reader/buffer.rb
 - lib/pdf/reader/cmap.rb
@@ -94,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: pdf-reader
-rubygems_version: 1.3.4
+rubygems_version: 1.3.5
 signing_key:
 specification_version: 3
 summary: A library for accessing the content of PDF files