RubyGems - pdf-reader - Versions diffs - 0.8.6 → 0.9.0 - Mend

pdf-reader 0.8.6 → 0.9.0

Files changed (32) hide show

data/CHANGELOG +17 -0
data/README.rdoc +7 -15
data/Rakefile +10 -63
data/TODO +6 -8
data/bin/pdf_object +3 -0
data/bin/pdf_text +4 -2
data/examples/extract_images.rb +108 -0
data/examples/hash.rb +1 -1
data/examples/text.rb +3 -0
data/lib/pdf/hash.rb +8 -225
data/lib/pdf/reader.rb +79 -55
data/lib/pdf/reader/abstract_strategy.rb +77 -0
data/lib/pdf/reader/buffer.rb +61 -40
data/lib/pdf/reader/cmap.rb +11 -10
data/lib/pdf/reader/encoding.rb +85 -79
data/lib/pdf/reader/error.rb +1 -2
data/lib/pdf/reader/filter.rb +109 -6
data/lib/pdf/reader/font.rb +11 -11
data/lib/pdf/reader/lzw.rb +123 -0
data/lib/pdf/reader/metadata_strategy.rb +53 -0
data/lib/pdf/reader/object_hash.rb +275 -0
data/lib/pdf/reader/object_stream.rb +51 -0
data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
data/lib/pdf/reader/parser.rb +74 -37
data/lib/pdf/reader/print_receiver.rb +0 -1
data/lib/pdf/reader/register_receiver.rb +21 -0
data/lib/pdf/reader/stream.rb +5 -1
data/lib/pdf/reader/text_receiver.rb +3 -1
data/lib/pdf/reader/token.rb +1 -1
data/lib/pdf/reader/xref.rb +126 -64
metadata +61 -13
data/lib/pdf/reader/explore.rb +0 -116

data/CHANGELOG CHANGED

@@ -1,3 +1,20 @@
+v0.9.0 (19th November 2010)
+- support for pdf 1.5+ files that use object and xref streams
+- support streams that use a flate filter with the predictor option
+- ensure all content instructions are parsed when split over multiple streams
+  - thanks to Jack Rusher for reporting
+- Various string parsing bugs
+  - some character conversions to utf-8 were failing (thanks Andrea Barisani)
+  - hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
+  - escaping bug in tokenising of literal strings (thanks David Westerink)
+- Fix a bug that prevented PDFs with white space after the EOF marker from loading
+  - thanks to Solomon White for reporting the issue
+- Add support for de-filtering some LZW compressed streams
+  - thanks to Jose Ignacio Rubio Iradi for the patch
+- some small speed improvements
+- API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
+  - having a class named Hash was confusing for users
 v0.8.6 (27th August 2010)
 - new method: hash#page_references
   - returns references to all page objects, gives rapid access to objects

data/README.rdoc CHANGED

@@ -8,17 +8,6 @@ The PDF 1.7 specification is a weighty document and not all aspects are
 currently supported. I welcome submission of PDF files that exhibit
 unsupported aspects of the spec to assist with improving our support.
-= Development Status
-I adopted this library in 2007 when I was learning the fundamentals of the PDF
-spec. I do not currently use it in my day to day work and I just don't have the
-spare time to dedicate to adding new features.
-The code as it is works fairly well, and I offer it "as is". All patches, bug
-reports and sample PDFs are welcome - I will work on them when I can. If anyone
-is interested in adding features to PDF::Reader in their own effort to learn
-the PDF file format, I'll happy offer help and support.
 = Installation
 The recommended installation method is via Rubygems.
@@ -37,12 +26,12 @@ methods do is entirely up to you - save the text, extract images, count pages,
 read metadata, whatever.
 For a full list of the supported callback methods and a description of when they
-will be called, refer to PDF::Reader::Content. See the code examples below for a
+will be called, refer to PDF::Reader::PagesStrategy. See the examples directory for a
 way to print a list of all the callbacks generated by a file to STDOUT.
-There is also a class called PDF::Hash. This provides direct access to the objects
-in a PDF file using a ruby hash-like API. Checkout the documentation for the class
-for further information.
+There is also a class called PDF::Reader::ObjectHash. This provides direct
+access to the objects in a PDF file using a ruby hash-like API. Checkout the
+documentation for the class for further information.
 = Text Encoding
@@ -50,6 +39,9 @@ Internally, text can be stored inside a PDF in various encodings, including
 zingbats, win-1252, mac roman and a form of Unicode. To avoid confusion, all
 text will be converted to UTF-8 before it is passed back from PDF::Reader.
+Strings that contain binary data (like font blobs) will be marked as such on
+M17N aware VMs.
 = Exceptions
 There are two key exceptions that you will need to watch out for when processing a

data/Rakefile CHANGED

@@ -1,43 +1,21 @@
 require "rubygems"
+require "bundler"
+Bundler.setup
 require 'rake'
-require 'rake/clean'
 require 'rake/rdoctask'
-require 'rake/testtask'
-require "rake/gempackagetask"
-require 'spec/rake/spectask'
-PKG_VERSION = "0.8.6"
-PKG_NAME = "pdf-reader"
-PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
+require 'rspec/core/rake_task'
+require 'roodi'
+require 'roodi_task'
 desc "Default Task"
 task :default => [ :spec ]
 # run all rspecs
 desc "Run all rspec files"
-Spec::Rake::SpecTask.new("spec") do |t|
-  t.spec_files =  FileList['specs/**/*.rb']
-  t.rcov       =  true
-  t.rcov_dir   =  (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + "/rcov"
-  t.ruby_opts  << "-w"
-  # t.rcov_opts = ["--exclude","spec.*\.rb"]
-end
-# generate specdocs
-desc "Generate Specdocs"
-Spec::Rake::SpecTask.new("specdocs") do |t|
-  t.spec_files = FileList['specs/**/*.rb']
-  t.spec_opts = ["--format", "rdoc"]
-  t.out = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/specdoc.rd'
-end
-# generate failing spec report
-desc "Generate failing spec report"
-Spec::Rake::SpecTask.new("spec_report") do |t|
-  t.spec_files = FileList['specs/**/*.rb']
-  t.spec_opts = ["--format", "html", "--diff"]
-  t.out = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/spec_report.html'
-  t.fail_on_error = false
+RSpec::Core::RakeTask.new("spec") do |t|
+  t.rspec_opts  = ["--color", "--format progress"]
+  t.ruby_opts = "-w"
 end
 # Generate the RDoc documentation
@@ -53,35 +31,4 @@ Rake::RDocTask.new("doc") do |rdoc|
   rdoc.options << "--inline-source"
 end
-# a gemspec for packaging this library
-# RSpec files aren't included, as they depend on the PDF files,
-# which will make the gem filesize irritatingly large
-spec = Gem::Specification.new do |spec|
-  spec.name = PKG_NAME
-  spec.version = PKG_VERSION
-  spec.platform = Gem::Platform::RUBY
-  spec.summary = "A library for accessing the content of PDF files"
-  spec.files =  Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
-  spec.require_path = "lib"
-  spec.bindir = "bin"
-  spec.executables << "pdf_object"
-  spec.executables << "pdf_text"
-  spec.executables << "pdf_list_callbacks"
-  spec.has_rdoc = true
-  spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
-  spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
-                       '--main'  << 'README.rdoc' << '-q'
-  spec.author = "James Healy"
-  spec.email = "jimmy@deefa.com"
-  spec.rubyforge_project = "pdf-reader"
-  spec.homepage = "http://github.com/yob/pdf-reader"
-  spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
-  spec.add_dependency('Ascii85', '>=0.9')
-end
-# package the library into a gem
-desc "Generate a gem for pdf-reader"
-Rake::GemPackageTask.new(spec) do |pkg|
-	pkg.need_zip = true
-	pkg.need_tar = true
-end
+RoodiTask.new 'roodi', ['lib/**/*.rb']

data/TODO CHANGED

@@ -1,8 +1,6 @@
 v0.8
-- optimise PDF::Reader::Reference#from_buffer
-  - ruby-prof shows the match() call in this function is a real killer
 - add extra callbacks
-  - list implemented features
+  - list implemented features
     - encrypted? tagged? bookmarks? annotated? optimised?
 - Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
   - bookmarks?
@@ -15,7 +13,6 @@ v0.8
   from the Original encoding to Unicode.
 - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
 - Improve interpretation of non content stream data (ie metadata). recognise dates, etc
-- Support Cross Reference Streams (spec 3.4.7)
 - Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
   are inheritable. See table 3.2.7 in the spec
@@ -33,15 +30,16 @@ Sometime
 - Ship some extra receivers in the standard package, particularly ones that are useful for running
   rspec over generated PDF files
-- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
+- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
   sensible way to convert them to unicode
 - Add support for additional filters: ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode, CCITTFaxDecode, JBIG2Decode, DCTDecode, JPXDecode, Crypt?
-- Add support for additional encodings:
-  - PDFDocEncoding
+- Add support for additional encodings:
   - Identity-V(I *think* this relates to vertical text. Not sure how we'd support it sensibly)
 - Investigate how R->L text is handled
-- Add support for object streams (spec section 3.4.6)
+- fix all callbacks to only ever return basic ruby objects (strings, ints,
+  arrays, symbols, hashes, etc). No PDF::Reader::Reference or
+  PDF::Reader::Font, etc.

data/bin/pdf_object CHANGED

@@ -30,6 +30,9 @@ begin
   case obj
   when Hash, Array
     puts obj.inspect
+  when PDF::Reader::Stream
+    puts obj.hash.inspect
+    puts obj.unfiltered_data
   else
     puts obj
   end

data/bin/pdf_text CHANGED

@@ -17,9 +17,11 @@ class PageTextReceiver
     end
   end
-  def show_text(string, *params)
+  def show_text(*params)
     @content = "" if @content.nil?
-    @content << string
+    params.each do |str|
+      @content << str.to_s
+    end
   end
   # there's a few text callbacks, so make sure we process them all

data/examples/extract_images.rb ADDED

@@ -0,0 +1,108 @@
+# coding: utf-8
+# This demonstrates a way to extract some images (those based on the JPG or
+# TIFF formats) from a PDF. There are other ways to store images, so
+# it may need to be expanded for real world usage, but it should serve
+# as a good guide.
+#
+# Thanks to Jack Rusher for the initial version of this example.
+#
+# USAGE:
+#
+#   ruby extract_images.rb somefile.pdf
+require 'pdf/reader'
+module ExtractImages
+  class Receiver
+    attr_reader :count
+    def initialize
+      @count = 0
+    end
+    def resource_xobject(name, stream)
+      return unless stream.hash[:Subtype] == :Image
+      increment_count
+      case stream.hash[:Filter]
+      when :CCITTFaxDecode
+        ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
+      when :DCTDecode
+        ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
+      else
+        $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
+      end
+    end
+    def increment_count
+      @count += 1
+    end
+    private :increment_count
+  end
+  class Jpg
+    attr_reader :stream
+    def initialize(stream)
+      @stream = stream
+    end
+    def save(filename)
+      w = stream.hash[:Width]
+      h = stream.hash[:Height]
+      puts "#{filename}: h=#{h}, w=#{w}"
+      File.open(filename, "wb") { |file| file.write stream.data }
+    end
+  end
+  class Tiff
+    attr_reader :stream
+    def initialize(stream)
+      @stream = stream
+    end
+    def save(filename)
+      if stream.hash[:DecodeParms][:K] <= 0
+        save_group_four(filename)
+      else
+        $stderr.puts "#{filename}: CCITT non-group 4/2D image."
+      end
+    end
+    private
+    # Group 4, 2D
+    def save_group_four(filename)
+      k    = stream.hash[:DecodeParms][:K]
+      h    = stream.hash[:Height]
+      w    = stream.hash[:Width]
+      bpc  = stream.hash[:BitsPerComponent]
+      mask = stream.hash[:ImageMask]
+      len  = stream.hash[:Length]
+      cols = stream.hash[:DecodeParms][:Columns]
+      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"
+      # Synthesize a TIFF header
+      long_tag  = lambda {|tag, value| [ tag, 4, 1, value ].pack( "ssII" ) }
+      short_tag = lambda {|tag, value| [ tag, 3, 1, value ].pack( "ssII" ) }
+      # header = byte order, version magic, offset of directory, directory count,
+      # followed by a series of tags containing metadata: 259 is a magic number for
+      # the compression type; 273 is the offset of the image data.
+      tiff = [ 73, 73, 42, 8, 5 ].pack("ccsIs") \
+      + short_tag.call( 256, cols ) \
+      + short_tag.call( 257, h ) \
+      + short_tag.call( 259, 4 ) \
+      + long_tag.call( 273, (10 + (5*12)) ) \
+      + long_tag.call( 279, len) \
+      + stream.data
+      File.open(filename, "wb") { |file| file.write tiff }
+    end
+  end
+end
+receiver = ExtractImages::Receiver.new
+PDF::Reader.file(ARGV[0], receiver)

data/examples/hash.rb CHANGED

@@ -8,5 +8,5 @@ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
 require 'pdf/reader'
 filename = File.dirname(__FILE__) + "/../specs/data/cairo-unicode.pdf"
-hash = PDF::Hash.new(filename)
+hash = PDF::Reader::ObjectHash.new(filename)
 puts hash[3]

data/examples/text.rb CHANGED

@@ -3,6 +3,9 @@
 # Extract all text from a single PDF
+require 'rubygems'
+require 'pdf/reader'
 class PageTextReceiver
   attr_accessor :content

data/lib/pdf/hash.rb CHANGED

@@ -1,232 +1,15 @@
-module PDF
-  # Provides low level access to the objects in a PDF file via a hash-like
-  # object.
-  #
-  # A PDF file can be viewed as a large hash map. It is a series of objects
-  # stored at an exact byte offsets, and a table that maps object IDs to byte
-  # offsets. Given an object ID, looking up an object is an O(1) operation.
-  #
-  # Each PDF object can be mapped to a ruby object, so by passing an object
-  # ID to the [] method, a ruby representation of that object will be
-  # retrieved.
-  #
-  # The class behaves much like a standard Ruby hash, including the use of
-  # the Enumerable mixin. The key difference is no []= method - the hash
-  # is read only.
-  #
-  # == Basic Usage
-  #
-  #     h = PDF::Hash.new("somefile.pdf")
-  #     h[1]
-  #     => 3469
-  #
-  #     h[PDF::Reader::Reference.new(1,0)]
-  #     => 3469
-  #
-  class Hash
-    include Enumerable
-    attr_accessor :default
-    attr_reader :trailer, :version
+# coding: utf-8
-    # Creates a new PDF:Hash object. input can be a string with a valid filename,
-    # a string containing a PDF file, or an IO object.
-    #
+module PDF
+  class Hash < PDF::Reader::ObjectHash # :nodoc:
     def initialize(input)
-      if input.kind_of?(IO) || input.kind_of?(StringIO)
-        io = input
-      elsif File.file?(input.to_s)
-        if File.respond_to?(:binread)
-          input = File.binread(input.to_s)
-        else
-          input = File.read(input.to_s)
-        end
-        io = StringIO.new(input)
-      else
-        raise ArgumentError, "input must be an IO-like object or a filename"
-      end
-      @version = read_version(io)
-      @xref  = PDF::Reader::XRef.new(io)
-      @trailer = @xref.load
-    end
-    # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
-    # object.
-    #
-    # If an int is used, the object with that ID and a generation number of 0 will
-    # be returned.
-    #
-    # If a PDF::Reader::Reference object is used the exact ID and generation number
-    # can be specified.
-    #
-    def [](key)
-      return default if key.to_i <= 0
-      begin
-        unless key.kind_of?(PDF::Reader::Reference)
-          key = PDF::Reader::Reference.new(key.to_i, 0)
-        end
-        @xref.object(key)
-      rescue
-        return default
-      end
-    end
-    # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
-    # object.
-    #
-    # If an int is used, the object with that ID and a generation number of 0 will
-    # be returned.
-    #
-    # If a PDF::Reader::Reference object is used the exact ID and generation number
-    # can be specified.
-    #
-    # local_deault is the object that will be returned if the requested key doesn't
-    # exist.
-    #
-    def fetch(key, local_default = nil)
-      obj = self[key]
-      if obj
-        return obj
-      elsif local_default
-        return local_default
-      else
-        raise IndexError, "#{key} is invalid" if key.to_i <= 0
-      end
-    end
-    # iterate over each key, value. Just like a ruby hash.
-    #
-    def each(&block)
-      @xref.each do |ref, obj|
-        yield ref, obj
-      end
-    end
-    alias :each_pair :each
-    # iterate over each key. Just like a ruby hash.
-    #
-    def each_key(&block)
-      each do |id, obj|
-        yield id
-      end
-    end
-    # iterate over each value. Just like a ruby hash.
-    #
-    def each_value(&block)
-      each do |id, obj|
-        yield obj
-      end
-    end
-    # return the number of objects in the file. An object with multiple generations
-    # is counted once.
-    def size
-      @xref.size
-    end
-    alias :length :size
-    # return true if there are no objects in this file
-    #
-    def empty?
-      size == 0 ? true : false
-    end
-    # return true if the specified key exists in the file. key
-    # can be an int or a PDF::Reader::Reference
-    #
-    def has_key?(check_key)
-      # TODO update from O(n) to O(1)
-      each_key do |key|
-        if check_key.kind_of?(PDF::Reader::Reference)
-          return true if check_key == key
-        else
-          return true if check_key.to_i == key.id
-        end
-      end
-      return false
+      warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
+      super
     end
-    alias :include? :has_key?
-    alias :key? :has_key?
-    alias :member? :has_key?
-    # return true if the specifiedvalue exists in the file
-    #
-    def has_value?(value)
-      # TODO update from O(n) to O(1)
-      each_value do |obj|
-        return true if obj == value
-      end
-      return false
+    def version
+      warn "DEPRECATION NOTICE: PDF::Hash#version has been deprecated, use PDF::Reader::ObjectHash#pdf_version instead"
+      pdf_version
     end
-    alias :value? :has_key?
-    def to_s
-      "<PDF::Hash size: #{self.size}>"
-    end
-    # return an array of all keys in the file
-    #
-    def keys
-      ret = []
-      each_key { |k| ret << k }
-      ret
-    end
-    # return an array of all values in the file
-    #
-    def values
-      ret = []
-      each_value { |v| ret << v }
-      ret
-    end
-    # return an array of all values from the specified keys
-    #
-    def values_at(*ids)
-      ids.map { |id| self[id] }
-    end
-    # return an array of arrays. Each sub array contains a key/value pair.
-    #
-    def to_a
-      ret = []
-      each do |id, obj|
-        ret << [id, obj]
-      end
-      ret
-    end
-    # returns an array of PDF::Reader::References. Each reference in the
-    # array points a Page object, one for each page in the PDF. The first
-    # reference is page 1, second reference is page 2, etc.
-    #
-    def page_references
-      root  = fetch(trailer[:Root])
-      @page_references ||= get_page_objects(root[:Pages]).flatten
-    end
-    private
-    # returns a nested array of object references for all pages in this object store.
-    #
-    def get_page_objects(ref)
-      obj = fetch(ref)
-      if obj[:Type] == :Page
-        ref
-      elsif obj[:Type] == :Pages
-        obj[:Kids].map { |kid| get_page_objects(kid) }
-      end
-    end
-    def read_version(io)
-      io.seek(0)
-      m, version = *io.read(10).match(/PDF-(\d.\d)/)
-      io.seek(0)
-      version
-    end
   end
 end