RubyGems - fireinc-pdf-reader - Versions diffs - 0.11.0.alpha - Mend

fireinc-pdf-reader 0.11.0.alpha

Files changed (54) hide show

data/CHANGELOG +168 -0
data/MIT-LICENSE +21 -0
data/README.rdoc +137 -0
data/Rakefile +34 -0
data/TODO +45 -0
data/bin/pdf_list_callbacks +15 -0
data/bin/pdf_object +48 -0
data/bin/pdf_text +15 -0
data/examples/callbacks.rb +21 -0
data/examples/extract_bates.rb +49 -0
data/examples/extract_images.rb +108 -0
data/examples/hash.rb +12 -0
data/examples/metadata.rb +25 -0
data/examples/page_counter_improved.rb +23 -0
data/examples/page_counter_naive.rb +24 -0
data/examples/rspec.rb +57 -0
data/examples/text.rb +40 -0
data/examples/version.rb +25 -0
data/lib/pdf/hash.rb +15 -0
data/lib/pdf/reader/abstract_strategy.rb +81 -0
data/lib/pdf/reader/buffer.rb +346 -0
data/lib/pdf/reader/cmap.rb +138 -0
data/lib/pdf/reader/encoding.rb +190 -0
data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
data/lib/pdf/reader/encodings/standard.txt +47 -0
data/lib/pdf/reader/encodings/symbol.txt +154 -0
data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
data/lib/pdf/reader/error.rb +53 -0
data/lib/pdf/reader/filter.rb +219 -0
data/lib/pdf/reader/font.rb +133 -0
data/lib/pdf/reader/form_xobject.rb +83 -0
data/lib/pdf/reader/glyphlist.txt +4322 -0
data/lib/pdf/reader/lzw.rb +123 -0
data/lib/pdf/reader/metadata_strategy.rb +56 -0
data/lib/pdf/reader/object_cache.rb +85 -0
data/lib/pdf/reader/object_hash.rb +289 -0
data/lib/pdf/reader/object_stream.rb +51 -0
data/lib/pdf/reader/page.rb +185 -0
data/lib/pdf/reader/page_text_receiver.rb +278 -0
data/lib/pdf/reader/pages_strategy.rb +475 -0
data/lib/pdf/reader/parser.rb +225 -0
data/lib/pdf/reader/print_receiver.rb +18 -0
data/lib/pdf/reader/reference.rb +66 -0
data/lib/pdf/reader/register_receiver.rb +95 -0
data/lib/pdf/reader/stream.rb +69 -0
data/lib/pdf/reader/text_receiver.rb +264 -0
data/lib/pdf/reader/token.rb +41 -0
data/lib/pdf/reader/xref.rb +220 -0
data/lib/pdf/reader.rb +296 -0
data/lib/pdf-reader.rb +1 -0
metadata +211 -0

data/CHANGELOG ADDED Viewed

@@ -0,0 +1,168 @@
+v0.9.4 (XXX)
+- support multiple receivers within a single pass over a source file
+  - massive time saving when dealing with multiple receivers
+v0.9.3 (2nd July 2011)
+- add PDF::Reader::Reference#hash method
+  - improves behaviour of Reference objects when they're used as Hash keys
+v0.9.2 (24th April 2011)
+- add basic support for fonts with Identity-V encoding.
+- bug: improve robustness of text extraction
+  - thanks to Evan Arnold for reporting
+- bug: fix loading of nested resources on XObjects
+  - thanks to Samuel Williams for reporting
+- bug: improve parsing of files with XRef object streams
+v0.9.1 (21st December 2010)
+- force gem to only install on ruby 1.8.7 or higher
+  - maintaining support for earlier versions takes more time than I have
+    available at the moment
+- bug: fix parsing of obscure pdf name format
+- bug: fix behaviour when loaded in conjunction with htmldoc gem
+v0.9.0 (19th November 2010)
+- support for pdf 1.5+ files that use object and xref streams
+- support streams that use a flate filter with the predictor option
+- ensure all content instructions are parsed when split over multiple streams
+  - thanks to Jack Rusher for reporting
+- Various string parsing bugs
+  - some character conversions to utf-8 were failing (thanks Andrea Barisani)
+  - hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
+  - escaping bug in tokenising of literal strings (thanks David Westerink)
+- Fix a bug that prevented PDFs with white space after the EOF marker from loading
+  - thanks to Solomon White for reporting the issue
+- Add support for de-filtering some LZW compressed streams
+  - thanks to Jose Ignacio Rubio Iradi for the patch
+- some small speed improvements
+- API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
+  - having a class named Hash was confusing for users
+v0.8.6 (27th August 2010)
+- new method: hash#page_references
+  - returns references to all page objects, gives rapid access to objects
+    for a given page
+v0.8.5 (11th April 2010)
+- fix a regression introduced in 0.8.4.
+  - Parameters passed to resource_font callback were inadvertently changed
+v0.8.4 (30th March 2010)
+- fix parsing of files that use Form XObjects
+  - thanks to Andrea Barisani for reporting the issue
+- fix two issues that caused a small number of characters to convert to Unicode
+  incorrectly
+  - thanks to Andrea Barisani for reporting the issue
+- require 'pdf-reader' now works as well as 'pdf/reader'
+  - good practice to have the require file match the gem name
+  - thanks to Chris O'Meara for highlighting this
+v0.8.3 (14th February 2010)
+- Fix a bug in tokenising of hex strings inside dictionaries
+  - Thanks to Brad Ediger for detecting the issue and proposing a solution
+v0.8.2 (1st January 2010)
+- Fix parsing of files that use Form XObjects behind an indirect reference
+  (thanks Cornelius Illi and Patrick Crosby)
+- Rewrote Buffer class to fix various speed issues reported over the years
+  - On my sample file extracting full text reduced from 220 seconds to 9 seconds.
+v0.8.1 (27th November 2009)
+- Added PDF::Hash#version. Provides access to the source file PDF version
+v0.8.0 (20th November 2009)
+- Added PDF::Hash. It provides direct access to objects from a PDF file
+  with an API that emulates the standard Ruby hash
+v0.7.7 (11th September 2009)
+- Trigger callbacks contained in Form XObjects when we encounter them in a
+  content stream
+- Fix inheritance of page resources to comply with section 3.6.2
+v0.7.6 (28th August 2009)
+- Various bug fixes that increase the files we can successfully parse
+  - Treat float and integer tokens differently (thanks Neil)
+  - Correctly handle PDFs where the Kids element of a Pages dict is an indirect
+    reference (thanks Rob Holland)
+  - Fix conversion of PDF strings to Ruby strings on 1.8.6 (thanks Andrès Koetsier)
+  - Fix decoding with ASCII85 and ASCIIHex filters (thanks Andrès Koetsier)
+  - Fix extracting inline images from content streams (thanks Andrès Koetsier)
+  - Fix extracting [ ] from content streams (thanks Christian Rishøj)
+  - Fix conversion of text to UTF8 when the cmap uses bfrange (thanks Federico Gonzalez Lutteroth)
+v0.7.5 (27th August 2008)
+- Fix a 1.8.7ism
+v0.7.4 (7th August 2008)
+- Raise a MalformedPDFError if a content stream contains an unterminated string
+- Fix a bug that was causing an endless loop on some OSX systems
+  - valid strings were incorrectly thought to be unterminated
+  - thanks to Jeff Webb for playing email ping pong with me as I tracked this
+    issue down
+v0.7.3 (11th June 2008)
+- Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
+- Fix a hard loop bug caused by a content stream that is missing a final operator
+- Significantly simplified the internal code for encoding conversions
+  - Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
+- New callbacks
+  - page_count
+  - pdf_version
+- Fix a bug that prevented a font's BaseFont from being recorded correctly
+v0.7.2 (20th May 2008)
+- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
+- Correctly handle page content instruction sets with trailing whitespace
+- Represent PDF Streams with a new object, PDF::Reader::Stream
+  - there really wasn't any point in separating the stream content from its associated dict. You need both
+    parts to correctly interpret the content
+v0.7.1 (6th May 2008)
+- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
+- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
+  correctly when translating text into UTF-8
+v0.7 (6th May 2008)
+- API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
+- Improved support for converting text in some PDF files to unicode
+- Behave as expected if the Contents key in a Page Dict is a reference
+- Include some basic metadata callbacks
+- Don't interpret a comment token (%) inside a string as a comment
+- Small fixes to improve 1.9 compatibility
+- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
+- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
+- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file (ie. only metadata, etc)
+v0.6.2 (22nd March 2008)
+- Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
+- Added support for processing inline images
+- Support for parsing XRef tables that have multiple subsections
+- Added a few callbacks to improve the way we supply information on page resources
+- Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
+- Use our "unknown character box" when a single character in an Identity-H string fails to decode
+- Support ToUnicode CMaps that use the bfrange operator
+- Tweaked tokenising code to ensure whitespace doesn't get in the way
+v0.6.1 (12th March 2008)
+- Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
+  just replace each character with a little box.
+- Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
+  NoMethodError.
+- Added a method to RegisterReceiver that returns all occurrences of a callback
+v0.6.0 (27th February 2008)
+- all text is now transparently converted to UTF-8 before being passed to the callbacks.
+  before this version, text was just passed as a byte level copy of what was in the PDF file, which
+  was mildly annoying with some encodings, and resulted in garbled text for Unicode encoded text.
+- Fonts that use a difference table are now handled correctly
+- fixed some 1.9 incompatible syntax
+- expanded RegisterReceiver class to record extra info
+- expanded rspec coverage
+- tweaked a README example
+v0.5.1 (1st January 2008)
+- Several documentation tweaks
+- Improve support for parsing PDFs under windows (thanks to Jari Williamsson)
+v0.5 (14th December 2007)
+- Initial Release

data/MIT-LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+Copyright (c) 2009 Peter Jones
+Copyright (c) 2009 James Healy
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,137 @@
+The PDF::Reader library implements a PDF parser conforming as much as possible
+to the PDF specification from Adobe.
+It provides programmatic access to the contents of a PDF file with a high
+degree of flexibility.
+The PDF 1.7 specification is a weighty document and not all aspects are
+currently supported. I welcome submission of PDF files that exhibit
+unsupported aspects of the spec to assist with improving our support.
+= Installation
+The recommended installation method is via Rubygems.
+  gem install pdf-reader
+= Usage
+Begin by creating a PDF::Reader instance that points to a PDF file. Document
+level information (metadata, page count, bookmarks, etc) is available via
+this object.
+    reader = PDF::Reader.new("somefile.pdf")
+    puts reader.pdf_version
+    puts reader.info
+    puts reader.metadata
+    puts reader.page_count
+PDF::Reader.new can accept an IO stream or a filename. Here's an example with
+an IO stream:
+    require 'open-uri'
+    io     = open('http://example.com/somefile.pdf')
+    reader = PDF::Reader.new(io)
+    puts reader.info
+PDF is a page based file format, so most visible information is available via
+page-based iteration
+    reader = PDF::Reader.new("somefile.pdf")
+    reader.pages.each do |page|
+      puts page.fonts
+      puts page.text
+      puts page.raw_content
+    end
+If you need to access the full program for rendering a page, use the walk() method
+of PDF::Reader::Page.
+    class RedGreenBlue
+      def set_rgb_color_for_nonstroking(r, g, b)
+        puts "R: #{r}, G: #{g}, B: #{b}"
+      end
+    end
+    reader   = PDF::Reader.new("somefile.pdf")
+    page     = reader.page(1)
+    receiver = RedGreenBlue.new
+    page.walk(receiver)
+For low level access to the objects in a PDF file, use the ObjectHash class. You can
+build an ObjectHash instance directly:
+    puts PDF::Reader::ObjectHash.new("somefile.pdf")
+or via a PDF::Reader instance:
+    reader  = PDF::Reader.new("somefile.pdf")
+    puts reader.objects
+The second method is preferred to increase the effectiveness of internal caching.
+= Text Encoding
+Internally, text can be stored inside a PDF in various encodings, including
+ZapfDingbats, win-1252, mac roman and a form of Unicode. To avoid confusion, all
+text will be converted to UTF-8 before it is passed back from PDF::Reader.
+Strings that contain binary data (like font blobs) will be marked as such on
+M17N aware VMs.
+= Exceptions
+There are two key exceptions that you will need to watch out for when processing a
+PDF file:
+MalformedPDFError - The PDF appears to be corrupt in some way. If you believe the
+file should be valid, or that a corrupt file didn't raise an exception, please
+forward a copy of the file to the maintainers (preferably via the google group)
+and we can attempt to improve the code.
+UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't currently
+support. Again, we welcome submissions of PDF files that exhibit these features to help
+us with future code improvements.
+MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
+don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
+Any other exceptions should be considered bugs in PDF::Reader (please
+report them!).
+= Maintainers
+- James Healy <mailto:jimmy@deefa.com>
+= Licensing
+This library is distributed under the terms of the MIT License. See the included file for
+more detail.
+= Mailing List
+Any questions or feedback should be sent to the PDF::Reader google group. It's
+better that any answers be available for others instead of hiding in someone's
+inbox.
+http://groups.google.com/group/pdf-reader
+= Examples
+The easiest way to explain how this works in practice is to show some examples.
+Check out the examples/ directory for a few files.
+= Known Limitations
+Occasionally some text cannot be extracted properly due to the way it has been
+stored, or the use of invalid bytes. In these cases PDF::Reader will output a
+little UTF-8 friendly box to indicate an unrecognisable character.
+= Resources
+- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
+- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
+- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html

data/Rakefile ADDED Viewed

@@ -0,0 +1,34 @@
+require "rubygems"
+require "bundler"
+Bundler.setup
+require 'rake'
+require 'rake/rdoctask'
+require 'rspec/core/rake_task'
+require 'roodi'
+require 'roodi_task'
+desc "Default Task"
+task :default => [ :spec ]
+# run all rspecs
+desc "Run all rspec files"
+RSpec::Core::RakeTask.new("spec") do |t|
+  t.rspec_opts  = ["--color", "--format progress"]
+  t.ruby_opts = "-w"
+end
+# Generate the RDoc documentation
+desc "Create documentation"
+Rake::RDocTask.new("doc") do |rdoc|
+  rdoc.title = "pdf-reader"
+  rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
+  rdoc.rdoc_files.include('README.rdoc')
+  rdoc.rdoc_files.include('TODO')
+  rdoc.rdoc_files.include('CHANGELOG')
+  rdoc.rdoc_files.include('MIT-LICENSE')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+  rdoc.options << "--inline-source"
+end
+RoodiTask.new 'roodi', ['lib/**/*.rb']

data/TODO ADDED Viewed

@@ -0,0 +1,45 @@
+v0.8
+- add extra callbacks
+  - list implemented features
+    - encrypted? tagged? bookmarks? annotated? optimised?
+- Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
+  - bookmarks?
+  - outline?
+  - articles?
+  - viewer prefs?
+- Don't remove comment when tokenising in the middle of a string
+- Tweak encoding mappings to differentiate between bytes that are invalid for an encoding, and bytes that are unchanged.
+  poppler seems to do this in a quite reasonable way. Original Encoding -> Glyph Names -> Unicode. As of 0.6 we go straight
+  from the Original encoding to Unicode.
+- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
+- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
+- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
+  are inheritable. See table 3.2.7 in the spec
+v0.9
+- Add a way to extract raster images
+  - see XObjects section of spec (section 4.7)
+- Add a way to extract font data?
+Sometime
+- Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
+  - Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
+- Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
+- Ship some extra receivers in the standard package, particularly ones that are useful for running
+  rspec over generated PDF files
+- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
+  sensible way to convert them to unicode
+- Add support for additional filters: ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode, CCITTFaxDecode, JBIG2Decode, DCTDecode, JPXDecode, Crypt?
+- Add support for additional encodings:
+  - Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
+- Investigate how R->L text is handled
+- fix all callbacks to only ever return basic ruby objects (strings, ints,
+  arrays, symbols, hashes, etc). No PDF::Reader::Reference or
+  PDF::Reader::Font, etc.

data/bin/pdf_list_callbacks ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
+require 'pdf/reader'
+receiver = PDF::Reader::PrintReceiver.new
+if ARGV.empty?
+  PDF::Reader.new.parse($stdin, receiver)
+else
+  PDF::Reader.file(ARGV[0], receiver)
+end

data/bin/pdf_object ADDED Viewed

@@ -0,0 +1,48 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
+USAGE = "USAGE: " + File.basename(__FILE__) + " <file> <object id> [generation]"
+require 'pdf/reader'
+filename, id, gen = *ARGV
+if filename.nil? || id.nil?
+  puts USAGE
+  exit 1
+elsif !File.file?(filename)
+  $stderr.puts "#{filename} does not exist"
+  exit 1
+end
+# tweak the users options
+id  =  id.to_i
+gen ||= 0
+gen = gen.to_i
+# make magic happen
+begin
+  obj = PDF::Reader.object_file(filename, id, gen)
+  case obj
+  when Hash, Array
+    puts obj.inspect
+  when PDF::Reader::Stream
+    puts obj.hash.inspect
+    puts obj.unfiltered_data
+  else
+    puts obj
+  end
+rescue PDF::Reader::InvalidObjectError
+  $stderr.puts "Error retreiving object #{id}, gen #{gen}. Does it exist?"
+  exit 1
+rescue PDF::Reader::MalformedPDFError => e
+  $stderr.puts "Malformed PDF file: #{e.message}"
+  exit 1
+rescue PDF::Reader::UnsupportedFeatureError => e
+  $stderr.puts "PDF file implements a feature unsupported by PDF::Reader: #{e.message}"
+  exit 1
+end

data/bin/pdf_text ADDED Viewed

@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
+require 'pdf/reader'
+if ARGV.empty?
+  browser = PDF::Reader.new($stdin)
+else
+  browser = PDF::Reader.new(ARGV[0])
+end
+browser.pages.each do |page|
+  puts page.text
+end

data/examples/callbacks.rb ADDED Viewed

@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# List all callbacks generated by each page
+#
+# WARNING: this will generate a *lot* of output, so you probably want to pipe
+#          it through less or to a text file.
+require 'rubygems'
+require 'pdf/reader'
+receiver = PDF::Reader::RegisterReceiver.new
+PDF::Reader.open("somefile.pdf") do |reader|
+  reader.pages.each do |page|
+    page.walk(receiver)
+    receiver.callbacks.each do |cb|
+      puts cb
+    end
+  end
+end

data/examples/extract_bates.rb ADDED Viewed

@@ -0,0 +1,49 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# A sample script that attempts to extract bates numbers from a PDF file.
+# Bates numbers are often used to markup documents being used in legal
+# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
+#
+# Acrobat 9 introduced a markup syntax that directly specifies the bates
+# number for each page. For earlier versions, the easiest way to find
+# the number is to look for words that match a pattern.
+#
+# This example attempts to extract numbers using the Acrobat 9 syntax.
+# As a fall back, you can use a regular expression to look for words
+# that match the numbers you expect in the page content.
+require 'rubygems'
+require 'pdf/reader'
+class BatesReceiver
+  attr_reader :numbers
+  def initialize
+    @numbers = []
+  end
+  def begin_marked_content(*args)
+    return unless args.size >= 2
+    return unless args.first == :Artifact
+    return unless args[1][:Subtype] == :BatesN
+    @numbers << args[1][:Contents]
+  end
+  alias :begin_marked_content_with_pl :begin_marked_content
+end
+PDF::Reader.open("bates.pdf") do |reader|
+  reader.pages.each do |page|
+    receiver = BatesReceiver.new
+    page.walk(receiver)
+    if receiver.numbers.empty?
+      puts page.scan(/CC.+/)
+    else
+      puts receiver.numbers.inspect
+    end
+  end
+end

data/examples/extract_images.rb ADDED Viewed

@@ -0,0 +1,108 @@
+# coding: utf-8
+# This demonstrates a way to extract some images (those based on the JPG or
+# TIFF formats) from a PDF. There are other ways to store images, so
+# it may need to be expanded for real world usage, but it should serve
+# as a good guide.
+#
+# Thanks to Jack Rusher for the initial version of this example.
+#
+# USAGE:
+#
+#   ruby extract_images.rb somefile.pdf
+require 'pdf/reader'
+module ExtractImages
+  class Receiver
+    attr_reader :count
+    def initialize
+      @count = 0
+    end
+    def resource_xobject(name, stream)
+      return unless stream.hash[:Subtype] == :Image
+      increment_count
+      case stream.hash[:Filter]
+      when :CCITTFaxDecode
+        ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
+      when :DCTDecode
+        ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
+      else
+        $stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
+      end
+    end
+    def increment_count
+      @count += 1
+    end
+    private :increment_count
+  end
+  class Jpg
+    attr_reader :stream
+    def initialize(stream)
+      @stream = stream
+    end
+    def save(filename)
+      w = stream.hash[:Width]
+      h = stream.hash[:Height]
+      puts "#{filename}: h=#{h}, w=#{w}"
+      File.open(filename, "wb") { |file| file.write stream.data }
+    end
+  end
+  class Tiff
+    attr_reader :stream
+    def initialize(stream)
+      @stream = stream
+    end
+    def save(filename)
+      if stream.hash[:DecodeParms][:K] <= 0
+        save_group_four(filename)
+      else
+        $stderr.puts "#{filename}: CCITT non-group 4/2D image."
+      end
+    end
+    private
+    # Group 4, 2D
+    def save_group_four(filename)
+      k    = stream.hash[:DecodeParms][:K]
+      h    = stream.hash[:Height]
+      w    = stream.hash[:Width]
+      bpc  = stream.hash[:BitsPerComponent]
+      mask = stream.hash[:ImageMask]
+      len  = stream.hash[:Length]
+      cols = stream.hash[:DecodeParms][:Columns]
+      puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"
+      # Synthesize a TIFF header
+      long_tag  = lambda {|tag, value| [ tag, 4, 1, value ].pack( "ssII" ) }
+      short_tag = lambda {|tag, value| [ tag, 3, 1, value ].pack( "ssII" ) }
+      # header = byte order, version magic, offset of directory, directory count,
+      # followed by a series of tags containing metadata: 259 is a magic number for
+      # the compression type; 273 is the offset of the image data.
+      tiff = [ 73, 73, 42, 8, 5 ].pack("ccsIs") \
+      + short_tag.call( 256, cols ) \
+      + short_tag.call( 257, h ) \
+      + short_tag.call( 259, 4 ) \
+      + long_tag.call( 273, (10 + (5*12)) ) \
+      + long_tag.call( 279, len) \
+      + stream.data
+      File.open(filename, "wb") { |file| file.write tiff }
+    end
+  end
+end
+receiver = ExtractImages::Receiver.new
+PDF::Reader.file(ARGV[0], receiver)

data/examples/hash.rb ADDED Viewed

@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# get direct access to PDF objects
+#
+$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
+require 'pdf/reader'
+filename = File.dirname(__FILE__) + "/../specs/data/cairo-unicode.pdf"
+hash = PDF::Reader::ObjectHash.new(filename)
+puts hash[3]