RubyGems - pdf-reader - Versions diffs - 0.7.6 → 0.7.7 - Mend

pdf-reader 0.7.6 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

data/CHANGELOG +5 -0
data/README.rdoc +1 -171
data/Rakefile +1 -1
data/TODO +2 -0
data/examples/callbacks.rb +16 -0
data/examples/extract_bates.rb +63 -0
data/examples/metadata.rb +25 -0
data/examples/page_counter_improved.rb +23 -0
data/examples/page_counter_naive.rb +24 -0
data/examples/rspec.rb +57 -0
data/examples/text.rb +37 -0
data/lib/pdf/reader/content.rb +58 -18
metadata +10 -3

data/CHANGELOG CHANGED

@@ -1,3 +1,8 @@
+v0.7.7 (11th September 2009)
+- Trigger callbacks contained in Form XObjects when we encounter them in a
+  content stream
+- Fix inheritance of page resources to comply with section 3.6.2
 v0.7.6 (28th August 2009)
 - Various bug fixes that increase the files we can successfully parse
   - Treat float and integer tokens differently (thanks Neil)

data/README.rdoc CHANGED

@@ -88,177 +88,7 @@ http://groups.google.com/group/pdf-reader
 = Examples
 The easiest way to explain how this works in practice is to show some examples.
-== Naïve Page Counter
-A simple app to count the number of pages in a PDF File.
-  require 'rubygems'
-  require 'pdf/reader'
-  class PageReceiver
-    attr_accessor :counter
-    def initialize
-      @counter = 0
-    end
-    # Called when page parsing ends
-    def end_page
-      @counter += 1
-    end
-  end
-  receiver = PageReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver)
-  puts "#{receiver.counter} pages"
-== List all callbacks generated by a single PDF
-WARNING: this will generate a *lot* of output, so you probably want to pipe
-it through less or to a text file.
-  require 'rubygems'
-  require 'pdf/reader'
-  receiver = PDF::Reader::RegisterReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver)
-  receiver.callbacks.each do |cb|
-    puts cb
-  end
-== Extract all text from a single PDF
-  class PageTextReceiver
-    attr_accessor :content
-    def initialize
-      @content = []
-    end
-    # Called when page parsing starts
-    def begin_page(arg = nil)
-      @content << ""
-    end
-    # record text that is drawn on the page
-    def show_text(string, *params)
-      @content.last << string.strip
-    end
-    # there's a few text callbacks, so make sure we process them all
-    alias :super_show_text :show_text
-    alias :move_to_next_line_and_show_text :show_text
-    alias :set_spacing_next_line_show_text :show_text
-    # this final text callback takes slightly different arguments
-    def show_text_with_positioning(*params)
-      params = params.first
-      params.each { |str| show_text(str) if str.kind_of?(String)}
-    end
-  end
-  receiver = PageTextReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver)
-  puts receiver.content.inspect
-== Extract metadata only
-  require 'rubygems'
-  require 'pdf/reader'
-  class MetaDataReceiver
-    attr_accessor :regular
-    attr_accessor :xml
-    def metadata(data)
-      @regular = data
-    end
-    def metadata_xml(data)
-      @xml = data
-    end
-  end
-  receiver = MetaDataReceiver.new
-  pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
-  puts receiver.regular.inspect
-  puts receiver.xml.inspect
-== Improved Page Counter
-A simple app to display the number of pages in a PDF File.
-  require 'rubygems'
-  require 'pdf/reader'
-  class PageReceiver
-    attr_accessor :pages
-    # Called when page parsing ends
-    def page_count(arg)
-      @pages = arg
-    end
-  end
-  receiver = PageReceiver.new
-  pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
-  puts "#{receiver.pages} pages"
-== Basic RSpec of a generated PDF
-  require 'rubygems'
-  require 'pdf/reader'
-  require 'pdf/writer'
-  require 'spec'
-  class PageTextReceiver
-    attr_accessor :content
-    def initialize
-      @content = []
-    end
-    # Called when page parsing starts
-    def begin_page(arg = nil)
-      @content << ""
-    end
-    def show_text(string, *params)
-      @content.last << string.strip
-    end
-    # there's a few text callbacks, so make sure we process them all
-    alias :super_show_text :show_text
-    alias :move_to_next_line_and_show_text :show_text
-    alias :set_spacing_next_line_show_text :show_text
-    def show_text_with_positioning(*params)
-      params = params.first
-      params.each { |str| show_text(str) if str.kind_of?(String)}
-    end
-  end
-  context "My generated PDF" do
-    specify "should have the correct text on 2 pages" do
-      # generate our PDF
-      pdf = PDF::Writer.new
-      pdf.text "Chunky", :font_size => 32, :justification => :center
-      pdf.start_new_page
-      pdf.text "Bacon", :font_size => 32, :justification => :center
-      pdf.save_as("chunkybacon.pdf")
-      # process the PDF
-      receiver = PageTextReceiver.new
-      PDF::Reader.file("chunkybacon.pdf", receiver)
-      # confirm the text appears on the correct pages
-      receiver.content.size.should eql(2)
-      receiver.content[0].should eql("Chunky")
-      receiver.content[1].should eql("Bacon")
-    end
-  end
+Check out the examples/ directory for a few files.
 = Known Limitations

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.7.6"
+PKG_VERSION = "0.7.7"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/TODO CHANGED

@@ -16,6 +16,8 @@ v0.8
 - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
 - Improve interpretation of non content stream data (ie metadata). recognise dates, etc
 - Support Cross Reference Streams (spec 3.4.7)
+- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
+  are inheritable. See table 3.2.7 in the spec
 v0.9
 - Add a way to extract raster images

data/examples/callbacks.rb ADDED

@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# List all callbacks generated by a single PDF
+#
+# WARNING: this will generate a *lot* of output, so you probably want to pipe
+#          it through less or to a text file.
+require 'rubygems'
+require 'pdf/reader'
+receiver = PDF::Reader::RegisterReceiver.new
+pdf = PDF::Reader.file("somefile.pdf", receiver)
+receiver.callbacks.each do |cb|
+  puts cb
+end

data/examples/extract_bates.rb ADDED

@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# A sample script that attempts to extract bates numbers from a PDF file.
+# Bates numbers are often used to markup documents being used in legal
+# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
+#
+# Acrobat 9 introduced a markup syntax that directly specifies the bates
+# number for each page. For earlier versions, the easiest way to find
+# the number is to look for words that match a pattern.
+#
+# This example attempts to extract numbers using the Acrobat 9 syntax.
+# As a fall back, you can provide a regular expression that will be
+# used to look for words that look like the numbers you expect in the
+# page content.
+require 'rubygems'
+require 'pdf/reader'
+class BatesReceiver
+  def initialize(regexp = nil)
+    @numbers = []
+    @backup  = []
+    @regexp  = regexp
+  end
+  def numbers
+    @numbers.size > 0 ? @numbers : @backup
+  end
+  # Called when a marked content sequence begins
+  def begin_marked_content(*args)
+    return unless args.size >= 2
+    return unless args.first == :Artifact
+    return unless args[1][:Subtype] == :BatesN
+    @numbers << args[1][:Contents]
+  end
+  alias :begin_marked_content_with_pl :begin_marked_content
+  # record text that is drawn on the page
+  def show_text(string, *params)
+    return if @regexp.nil?
+    string.scan(@regexp).each { |m| @backup << m }
+  end
+  # there's a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  # this final text callback takes slightly different arguments
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+receiver = BatesReceiver.new(/CC.+/)
+PDF::Reader.file("bates.pdf", receiver)
+puts receiver.numbers.inspect

data/examples/metadata.rb ADDED

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Extract metadata only
+require 'rubygems'
+require 'pdf/reader'
+class MetaDataReceiver
+  attr_accessor :regular
+  attr_accessor :xml
+  def metadata(data)
+    @regular = data
+  end
+  def metadata_xml(data)
+    @xml = data
+  end
+end
+receiver = MetaDataReceiver.new
+pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
+puts receiver.regular.inspect
+puts receiver.xml.inspect

data/examples/page_counter_improved.rb ADDED

@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Improved Page Counter
+#
+# A simple app to display the number of pages in a PDF File.
+#
+  require 'rubygems'
+  require 'pdf/reader'
+  class PageReceiver
+    attr_accessor :pages
+    # Called with the total number of pages in the document
+    def page_count(arg)
+      @pages = arg
+    end
+  end
+  receiver = PageReceiver.new
+  pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
+  puts "#{receiver.pages} pages"

data/examples/page_counter_naive.rb ADDED

@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# A simple app to count the number of pages in a PDF File.
+require 'rubygems'
+require 'pdf/reader'
+class PageReceiver
+  attr_accessor :counter
+  def initialize
+    @counter = 0
+  end
+  # Called when page parsing ends
+  def end_page
+    @counter += 1
+  end
+end
+receiver = PageReceiver.new
+pdf = PDF::Reader.file("somefile.pdf", receiver)
+puts "#{receiver.counter} pages"

data/examples/rspec.rb ADDED

@@ -0,0 +1,57 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+#  Basic RSpec of a generated PDF
+require 'rubygems'
+require 'pdf/reader'
+require 'pdf/writer'
+require 'spec'
+class PageTextReceiver
+  attr_accessor :content
+  def initialize
+    @content = []
+  end
+  # Called when page parsing starts
+  def begin_page(arg = nil)
+    @content << ""
+  end
+  def show_text(string, *params)
+    @content.last << string.strip
+  end
+  # there's a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+context "My generated PDF" do
+  specify "should have the correct text on 2 pages" do
+    # generate our PDF
+    pdf = PDF::Writer.new
+    pdf.text "Chunky", :font_size => 32, :justification => :center
+    pdf.start_new_page
+    pdf.text "Bacon", :font_size => 32, :justification => :center
+    pdf.save_as("chunkybacon.pdf")
+    # process the PDF
+    receiver = PageTextReceiver.new
+    PDF::Reader.file("chunkybacon.pdf", receiver)
+    # confirm the text appears on the correct pages
+    receiver.content.size.should eql(2)
+    receiver.content[0].should eql("Chunky")
+    receiver.content[1].should eql("Bacon")
+  end
+end

data/examples/text.rb ADDED

@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Extract all text from a single PDF
+class PageTextReceiver
+  attr_accessor :content
+  def initialize
+    @content = []
+  end
+  # Called when page parsing starts
+  def begin_page(arg = nil)
+    @content << ""
+  end
+  # record text that is drawn on the page
+  def show_text(string, *params)
+    @content.last << string.strip
+  end
+  # there's a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  # this final text callback takes slightly different arguments
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+receiver = PageTextReceiver.new
+pdf = PDF::Reader.file("somefile.pdf", receiver)
+puts receiver.content.inspect

data/lib/pdf/reader/content.rb CHANGED

@@ -147,18 +147,14 @@ class PDF::Reader
   # - metadata
   # - xml_metadata
   # - page_count
+  # - begin_form_xobject
+  # - end_form_xobject
   #
   # == Resource Callbacks
   #
-  # Each page and page_container can contain a range of resources required for the page,
+  # Each page can contain (or inherit) a range of resources required for the page,
   # including things like fonts and images. The following callbacks may appear
-  # after begin_page_container and begin_page if the relevant resources exist
-  # on a page:
-  #
-  # In most cases, these callbacks associate a name with each resource, allowing it
-  # to be referred to by name in the page content. For example, an XObject can hold an image.
-  # If it gets mapped to the name "IM1", then it can be placed on the page using
-  # invoke_xobject "IM1".
+  # after begin_page if the relevant resources exist on a page:
   #
   # - resource_procset
   # - resource_xobject
@@ -166,6 +162,12 @@ class PDF::Reader
   # - resource_colorspace
   # - resource_pattern
   # - resource_font
+  #
+  # In most cases, these callbacks associate a name with each resource, allowing it
+  # to be referred to by name in the page content. For example, an XObject can hold an image.
+  # If it gets mapped to the name "IM1", then it can be placed on the page using
+  # invoke_xobject "IM1".
+  #
   class Content
     OPERATORS = {
       'b'   => :close_fill_stroke,
@@ -284,22 +286,19 @@ class PDF::Reader
     # its content
     def walk_pages (page)
-      if page[:Resources]
-        res = page[:Resources]
-        page.delete(:Resources)
-      end
       # extract page content
       if page[:Type] == :Pages
         callback(:begin_page_container, [page])
-        walk_resources(@xref.object(res)) if res
+        res = @xref.object(page[:Resources])
+        resources.push res if res
         @xref.object(page[:Kids]).each {|child| walk_pages(@xref.object(child))}
+        resources.pop if res
         callback(:end_page_container)
       elsif page[:Type] == :Page
         callback(:begin_page, [page])
-        walk_resources(@xref.object(res)) if res
-        @page = page
-        @params = []
+        res = @xref.object(page[:Resources])
+        resources.push res if res
+        walk_resources(current_resources)
         if @xref.object(page[:Contents]).kind_of?(Array)
           contents = @xref.object(page[:Contents])
@@ -312,10 +311,38 @@ class PDF::Reader
           content_stream(obj)
         end if page.has_key?(:Contents) and page[:Contents]
+        resources.pop if res
         callback(:end_page)
       end
     end
     ################################################################################
+    # Retrieve the XObject for the supplied label and if it's a Form, walk it
+    # like a regular page content stream.
+    #
+    def walk_xobject_form(label)
+      xobjects = current_resources[:XObject] || {}
+      xobject  = @xref.object(xobjects[label])
+      if xobject && xobject.hash[:Subtype] == :Form
+        callback(:begin_form_xobject)
+        resources = @xref.object(xobject.hash[:Resources])
+        walk_resources(resources) if resources
+        content_stream(xobject.to_s)
+        callback(:end_form_xobject)
+      end
+    end
+    ################################################################################
+    # Return a merged hash of all resources that are current. Pages, page and xobject
+    #
+    def current_resources
+      hash = {}
+      resources.each do |res|
+        hash.merge!(res)
+      end
+      hash
+    end
+    ################################################################################
     # Reads a PDF content stream and calls all the appropriate callback methods for the operators
     # it contains
     def content_stream (instructions)
@@ -341,8 +368,16 @@ class PDF::Reader
             # read the raw image data from the buffer without tokenising
             @params << @buffer.read_until("EI")
           end
           callback(OPERATORS[token], @params)
-          @params.clear
+          if OPERATORS[token] == :invoke_xobject
+            xobject_label = @params.first
+            @params.clear
+            walk_xobject_form(xobject_label)
+          else
+            @params.clear
+          end
         else
           @params << token
         end
@@ -352,6 +387,8 @@ class PDF::Reader
     end
     ################################################################################
     def walk_resources(resources)
+      return unless resources.respond_to?(:[])
       resources = resolve_references(resources)
       # extract any procset information
@@ -446,6 +483,9 @@ class PDF::Reader
         obj
       end
     end
+    def resources
+      @resources ||= []
+    end
   end
   ################################################################################
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 0.7.6
+  version: 0.7.7
 platform: ruby
 authors:
 - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-08-28 00:00:00 +10:00
+date: 2009-09-11 00:00:00 +10:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -36,6 +36,13 @@ extra_rdoc_files:
 - CHANGELOG
 - MIT-LICENSE
 files:
+- examples/extract_bates.rb
+- examples/text.rb
+- examples/page_counter_naive.rb
+- examples/callbacks.rb
+- examples/metadata.rb
+- examples/page_counter_improved.rb
+- examples/rspec.rb
 - lib/pdf/reader.rb
 - lib/pdf/reader/buffer.rb
 - lib/pdf/reader/cmap.rb
@@ -94,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: pdf-reader
-rubygems_version: 1.3.4
+rubygems_version: 1.3.5
 signing_key:
 specification_version: 3
 summary: A library for accessing the content of PDF files