RubyGems - fireinc-pdf-reader - Versions diffs - 0.11.0.alpha - Mend

fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

data/CHANGELOG +168 -0
data/MIT-LICENSE +21 -0
data/README.rdoc +137 -0
data/Rakefile +34 -0
data/TODO +45 -0
data/bin/pdf_list_callbacks +15 -0
data/bin/pdf_object +48 -0
data/bin/pdf_text +15 -0
data/examples/callbacks.rb +21 -0
data/examples/extract_bates.rb +49 -0
data/examples/extract_images.rb +108 -0
data/examples/hash.rb +12 -0
data/examples/metadata.rb +25 -0
data/examples/page_counter_improved.rb +23 -0
data/examples/page_counter_naive.rb +24 -0
data/examples/rspec.rb +57 -0
data/examples/text.rb +40 -0
data/examples/version.rb +25 -0
data/lib/pdf/hash.rb +15 -0
data/lib/pdf/reader/abstract_strategy.rb +81 -0
data/lib/pdf/reader/buffer.rb +346 -0
data/lib/pdf/reader/cmap.rb +138 -0
data/lib/pdf/reader/encoding.rb +190 -0
data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
data/lib/pdf/reader/encodings/standard.txt +47 -0
data/lib/pdf/reader/encodings/symbol.txt +154 -0
data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
data/lib/pdf/reader/error.rb +53 -0
data/lib/pdf/reader/filter.rb +219 -0
data/lib/pdf/reader/font.rb +133 -0
data/lib/pdf/reader/form_xobject.rb +83 -0
data/lib/pdf/reader/glyphlist.txt +4322 -0
data/lib/pdf/reader/lzw.rb +123 -0
data/lib/pdf/reader/metadata_strategy.rb +56 -0
data/lib/pdf/reader/object_cache.rb +85 -0
data/lib/pdf/reader/object_hash.rb +289 -0
data/lib/pdf/reader/object_stream.rb +51 -0
data/lib/pdf/reader/page.rb +185 -0
data/lib/pdf/reader/page_text_receiver.rb +278 -0
data/lib/pdf/reader/pages_strategy.rb +475 -0
data/lib/pdf/reader/parser.rb +225 -0
data/lib/pdf/reader/print_receiver.rb +18 -0
data/lib/pdf/reader/reference.rb +66 -0
data/lib/pdf/reader/register_receiver.rb +95 -0
data/lib/pdf/reader/stream.rb +69 -0
data/lib/pdf/reader/text_receiver.rb +264 -0
data/lib/pdf/reader/token.rb +41 -0
data/lib/pdf/reader/xref.rb +220 -0
data/lib/pdf/reader.rb +296 -0
data/lib/pdf-reader.rb +1 -0
metadata +211 -0

data/examples/metadata.rb ADDED Viewed

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Extract metadata only
+require 'rubygems'
+require 'pdf/reader'
+class MetaDataReceiver
+  attr_accessor :regular
+  attr_accessor :xml
+  def metadata(data)
+    @regular = data
+  end
+  def metadata_xml(data)
+    @xml = data
+  end
+end
+receiver = MetaDataReceiver.new
+pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
+puts receiver.regular.inspect
+puts receiver.xml.inspect

data/examples/page_counter_improved.rb ADDED Viewed

@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Improved Page Counter
+#
+# A simple app to display the number of pages in a PDF File.
+#
+  require 'rubygems'
+  require 'pdf/reader'
+  class PageReceiver
+    attr_accessor :pages
+    # Called when page parsing ends
+    def page_count(arg)
+      @pages = arg
+    end
+  end
+  # Take the PDF path from the command line when one is given (as the metadata
+  # and version examples do); otherwise fall back to the original default.
+  path = ARGV.shift || "somefile.pdf"
+  receiver = PageReceiver.new
+  PDF::Reader.file(path, receiver, :pages => false)
+  puts "#{receiver.pages} pages"

data/examples/page_counter_naive.rb ADDED Viewed

@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# A simple app to count the number of pages in a PDF File.
+require 'rubygems'
+require 'pdf/reader'
+class PageReceiver
+  attr_accessor :counter
+  def initialize
+    @counter = 0
+  end
+  # Called when page parsing ends
+  def end_page
+    @counter += 1
+  end
+end
+# Take the PDF path from the command line when one is given (as the metadata
+# and version examples do); otherwise fall back to the original default.
+path = ARGV.shift || "somefile.pdf"
+receiver = PageReceiver.new
+PDF::Reader.file(path, receiver)
+puts "#{receiver.counter} pages"

data/examples/rspec.rb ADDED Viewed

@@ -0,0 +1,57 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+#  Basic RSpec of a generated PDF
+require 'rubygems'
+require 'pdf/reader'
+require 'pdf/writer'
+require 'spec'
+class PageTextReceiver
+  attr_accessor :content
+  def initialize
+    @content = []
+  end
+  # Called when page parsing starts
+  def begin_page(arg = nil)
+    @content << ""
+  end
+  def show_text(string, *params)
+    @content.last << string.strip
+  end
+  # there's a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+context "My generated PDF" do
+  specify "should have the correct text on 2 pages" do
+    # build a two page PDF with PDF::Writer ("Chunky", new page, "Bacon") and save it to disk
+    pdf = PDF::Writer.new
+    pdf.text "Chunky", :font_size => 32, :justification => :center
+    pdf.start_new_page
+    pdf.text "Bacon", :font_size => 32, :justification => :center
+    pdf.save_as("chunkybacon.pdf")
+    # read the saved file back through PDF::Reader, collecting text with our receiver
+    receiver = PageTextReceiver.new
+    PDF::Reader.file("chunkybacon.pdf", receiver)
+    # expect one content string per page, each holding the text written to that page
+    receiver.content.size.should eql(2)
+    receiver.content[0].should eql("Chunky")
+    receiver.content[1].should eql("Bacon")
+  end
+end

data/examples/text.rb ADDED Viewed

@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Extract all text from a single PDF
+require 'rubygems'
+require 'pdf/reader'
+class PageTextReceiver
+  attr_accessor :content
+  def initialize
+    @content = []
+  end
+  # Called when page parsing starts
+  def begin_page(arg = nil)
+    @content << ""
+  end
+  # record text that is drawn on the page
+  def show_text(string, *params)
+    @content.last << string.strip
+  end
+  # there's a few text callbacks, so make sure we process them all
+  alias :super_show_text :show_text
+  alias :move_to_next_line_and_show_text :show_text
+  alias :set_spacing_next_line_show_text :show_text
+  # this final text callback takes slightly different arguments
+  def show_text_with_positioning(*params)
+    params = params.first
+    params.each { |str| show_text(str) if str.kind_of?(String)}
+  end
+end
+# Take the PDF path from the command line when one is given (as the metadata
+# and version examples do); otherwise fall back to the original default.
+path = ARGV.shift || "somefile.pdf"
+receiver = PageTextReceiver.new
+PDF::Reader.file(path, receiver)
+puts receiver.content.inspect

data/examples/version.rb ADDED Viewed

@@ -0,0 +1,25 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+# Determine the PDF version of a file
+require 'rubygems'
+require 'pdf/reader'
+class VersionReceiver
+  attr_accessor :version
+  def initialize
+    @version = nil
+  end
+  # Called when document parsing starts
+  def pdf_version(arg = nil)
+    @version = arg
+  end
+end
+# Run the reader over the file named by the first command line argument and
+# print the captured version. The return value of PDF::Reader.file is not
+# needed, so it is no longer assigned to an unused local.
+receiver = VersionReceiver.new
+PDF::Reader.file(ARGV.shift, receiver)
+puts receiver.version

data/lib/pdf/hash.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# coding: utf-8
+module PDF
+  class Hash < ::PDF::Reader::ObjectHash # :nodoc:
+    def initialize(input)
+      warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
+      super
+    end
+    def version
+      warn "DEPRECATION NOTICE: PDF::Hash#version has been deprecated, use PDF::Reader::ObjectHash#pdf_version instead"
+      pdf_version
+    end
+  end
+end

data/lib/pdf/reader/abstract_strategy.rb ADDED Viewed

@@ -0,0 +1,81 @@
+# coding: utf-8
+class PDF::Reader
+  # DEPRECATED: this class was deprecated in version 0.11.0 and will
+  #             eventually be removed
+  class AbstractStrategy # :nodoc:
+    def initialize(ohash, receivers, options = {})
+      @ohash, @options = ohash, options
+      if receivers.is_a?(Array)
+        @receivers = receivers
+      else
+        @receivers = [receivers]
+      end
+    end
+    private
+    def options
+      @options || {}
+    end
+    # calls the name callback method on the receiver class with params as the arguments
+    #
+    def callback (name, params=[])
+      @receivers.each do |receiver|
+        receiver.send(name, *params) if receiver.respond_to?(name)
+      end
+    end
+    # strings outside of page content should be in either PDFDocEncoding or UTF-16.
+    def decode_strings(obj)
+      case obj
+      when String then
+        if obj[0,2].unpack("C*").slice(0,2) == [254,255]
+          PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
+        else
+          PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
+        end
+      when Hash   then obj.each { |key,val| obj[key] = decode_strings(val) }
+      when Array  then obj.collect { |item| decode_strings(item) }
+      else
+        obj
+      end
+    end
+    def info
+      ohash.object(trailer[:Info])
+    end
+    def info?
+      info ? true : false
+    end
+    def ohash
+      @ohash
+    end
+    def pages
+      ohash.object(root[:Pages])
+    end
+    def pages?
+      pages ? true : false
+    end
+    def root
+      ohash.object(trailer[:Root])
+    end
+    def root?
+      root ? true : false
+    end
+    def trailer
+      ohash.trailer
+    end
+  end
+end

data/lib/pdf/reader/buffer.rb ADDED Viewed

@@ -0,0 +1,346 @@
+# coding: utf-8
+################################################################################
+#
+# Copyright (C) 2010 James Healy (jimmy@deefa.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+################################################################################
+class PDF::Reader
+  # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
+  # string, repeated calls to token() will return the next token from the source.
+  #
+  # This is very low level, and getting the raw tokens is not very useful in itself.
+  #
+  # This will usually be used in conjunction with PDF::Reader::Parser, which converts
+  # the raw tokens into objects we can work with (strings, ints, arrays, etc)
+  #
+  class Buffer
+    attr_reader :pos
+    # Creates a new buffer.
+    #
+    # Params:
+    #
+    #   io - an IO stream (or IO-like object responding to seek, pos and read) with the raw data to tokenise
+    #
+    # options:
+    #
+    #   :seek - a byte offset to seek to before starting to tokenise
+    #   :content_stream - set to true if buffer will be tokenising a
+    #                     content stream. Defaults to false
+    #
+    def initialize (io, opts = {})
+      @io = io
+      @tokens = []
+      @in_content_stream = opts[:content_stream]
+      @io.seek(opts[:seek]) if opts[:seek]
+      @pos = @io.pos
+    end
+    # return true if there are no more tokens left
+    #
+    def empty?
+      prepare_tokens if @tokens.size < 3
+      @tokens.empty?
+    end
+    # return raw bytes from the underlying IO stream.
+    #
+    #   bytes - the number of bytes to read
+    #
+    # options:
+    #
+    #   :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
+    #               is sitting under the io cursor.
+    #
+    def read(bytes, opts = {})
+      reset_pos
+      if opts[:skip_eol]
+        @io.seek(-1, IO::SEEK_CUR)
+        str = @io.read(2)
+        if str.nil?
+          return nil
+        elsif str == "\r\n"
+          # do nothing
+        elsif str[0,1] == "\n"
+          @io.seek(-1, IO::SEEK_CUR)
+        else
+          @io.seek(-2, IO::SEEK_CUR)
+        end
+      end
+      bytes = @io.read(bytes)
+      save_pos
+      bytes
+    end
+    # return the next token from the source. Returns a string if a token
+    # is found, nil if there are no tokens left.
+    #
+    def token
+      reset_pos
+      prepare_tokens if @tokens.size < 3
+      merge_indirect_reference
+      prepare_tokens if @tokens.size < 3
+      @tokens.shift
+    end
+    # return the byte offset where the first XRef table in the source can be found.
+    #
+    def find_first_xref_offset
+      @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
+      data = @io.read(1024)
+      # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
+      lines = data.split(/[\n\r]+/).reverse
+      eof_index = lines.index { |l| l.strip == "%%EOF" }
+      raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
+      raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
+      lines[eof_index+1].to_i
+    end
+    private
+    # Returns true if this buffer is parsing a content stream
+    #
+    def in_content_stream?
+      @in_content_stream ? true : false
+    end
+    # Some bastard moved our IO stream cursor. Restore it.
+    #
+    def reset_pos
+      @io.seek(@pos) if @io.pos != @pos
+    end
+    # Remember where the underlying IO cursor currently sits by copying @io.pos into @pos.
+    # reset_pos compares the cursor against this value and seeks back to it when they differ.
+    #
+    def save_pos
+      @pos = @io.pos
+    end
+    # attempt to prime the buffer with the next few tokens.
+    #
+    def prepare_tokens
+      10.times do
+        if state == :literal_string
+          prepare_literal_token
+        elsif state == :hex_string
+          prepare_hex_token
+        elsif state == :regular
+          prepare_regular_token
+        elsif state == :inline
+          prepare_inline_token
+        end
+      end
+      save_pos
+    end
+    # tokenising behaves slightly differently based on the current context.
+    # Determine the current context/state by examining the last token we found
+    #
+    def state
+      if @tokens[-1] == "("
+        :literal_string
+      elsif @tokens[-1] == "<"
+        :hex_string
+      elsif @tokens[-1] == "stream"
+        :stream
+      elsif in_content_stream? && @tokens[-1] == "ID"
+        :inline
+      else
+        :regular
+      end
+    end
+    # detect a series of 3 tokens that make up an indirect object. If we find
+    # them, replace the tokens with a PDF::Reader::Reference instance.
+    #
+    # Merging them into a single string was another option, but that would mean
+    # code further up the stack would need to check every token  to see if it looks
+    # like an indirect object. For optimisation reasons, I'd rather avoid
+    # that extra check.
+    #
+    # It's incredibly likely that the next 3 tokens in the buffer are NOT an
+    # indirect reference, so test for that case first and avoid the relatively
+    # expensive regexp checks if possible.
+    #
+    def merge_indirect_reference
+      return if @tokens.size < 3
+      return if @tokens[2] != "R"
+      if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
+        @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
+        @tokens[1] = nil
+        @tokens[2] = nil
+        @tokens.compact!
+      end
+    end
+    def prepare_inline_token
+      str = ""
+      while str[-2,2] != "EI"
+        chr = @io.read(1)
+        break if chr.nil?
+        str << chr
+      end
+      @tokens << str[0, str.size-2].strip
+      @io.seek(-2, IO::SEEK_CUR) unless chr.nil?
+    end
+    # if we're currently inside a hex string, read hex nibbles until
+    # we find a closing >
+    #
+    def prepare_hex_token
+      str = ""
+      finished = false
+      while !finished
+        chr = @io.read(1)
+        codepoint = chr.to_s.unpack("C*").first
+        if chr.nil?
+          finished = true # unbalanced params
+        elsif (48..57).include?(codepoint) || (65..90).include?(codepoint) || (97..122).include?(codepoint)
+          str << chr
+        elsif codepoint <= 32
+          # ignore it
+        else
+          @tokens << str if str.size > 0
+          @tokens << ">" if chr != ">"
+          @tokens << chr
+          finished = true
+        end
+      end
+    end
+    # if we're currently inside a literal string we more or less just read bytes until
+    # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
+    # start of a new token in regular mode are left untouched when inside a literal
+    # string.
+    #
+    # The entire literal string will be returned as a single token. It will need further
+    # processing to fix things like escaped new lines, but that's someone else's
+    # problem.
+    #
+    def prepare_literal_token
+      str = ""
+      count = 1
+      while count > 0
+        chr = @io.read(1)
+        if chr.nil?
+          count = 0 # unbalanced params
+        elsif chr == "\x5c"
+          str << chr << @io.read(1).to_s
+        elsif chr == "("
+          str << "("
+          count += 1
+        elsif chr == ")"
+          count -= 1
+          str << ")" unless count == 0
+        else
+          str << chr unless count == 0
+        end
+      end
+      @tokens << str if str.size > 0
+      @tokens << ")"
+    end
+    # Extract the next regular token and stock it in our buffer, ready to be returned.
+    #
+    # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
+    # to read up on it.
+    #
+    def prepare_regular_token
+      tok = ""
+      while chr = @io.read(1)
+        case chr
+        when "\x25"
+          # comment, ignore everything until the next EOL char
+          done = false
+          while !done
+            chr = @io.read(1)
+            done = true if chr.nil? || chr == "\x0A" || chr == "\x0D"
+          end
+        when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
+          # white space, token finished
+          @tokens << tok if tok.size > 0
+          tok = ""
+          break
+        when "\x3C"
+          # opening delimiter '<', start of new token
+          @tokens << tok if tok.size > 0
+          chr << @io.read(1) if peek_char == "\x3C" # check if token is actually '<<'
+          @tokens << chr
+          tok = ""
+          break
+        when "\x3E"
+          # closing delimiter '>', start of new token
+          @tokens << tok if tok.size > 0
+          chr << @io.read(1) if peek_char == "\x3E" # check if token is actually '>>'
+          @tokens << chr
+          tok = ""
+          break
+        when "\x28", "\x5B", "\x7B", "\x2F"
+          # opening delimiter, start of new token
+          @tokens << tok if tok.size > 0
+          @tokens << chr
+          tok = ""
+          break
+        when "\x29", "\x5D", "\x7D"
+          # closing delimiter
+          @tokens << tok if tok.size > 0
+          @tokens << chr
+          tok = ""
+          break
+        else
+          tok << chr
+        end
+      end
+      @tokens << tok if tok.size > 0
+    end
+    # peek at the next character in the io stream, leaving the stream position
+    # untouched
+    #
+    def peek_char
+      chr = @io.read(1)
+      @io.seek(-1, IO::SEEK_CUR) unless chr.nil?
+      chr
+    end
+  end
+end