RubyGems - pdf-reader - Versions diffs - 1.4.1 → 2.0.0.beta1 - Mend

pdf-reader 1.4.1 → 2.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

checksums.yaml +4 -4
data/CHANGELOG +8 -3
data/{README.rdoc → README.md} +40 -23
data/Rakefile +2 -2
data/bin/pdf_object +4 -1
data/lib/pdf/reader.rb +7 -112
data/lib/pdf/reader/buffer.rb +2 -1
data/lib/pdf/reader/cmap.rb +26 -24
data/lib/pdf/reader/encoding.rb +4 -5
data/lib/pdf/reader/filter.rb +1 -0
data/lib/pdf/reader/filter/run_length.rb +1 -5
data/lib/pdf/reader/font.rb +1 -11
data/lib/pdf/reader/glyph_hash.rb +6 -2
data/lib/pdf/reader/lzw.rb +1 -1
data/lib/pdf/reader/object_hash.rb +35 -16
data/lib/pdf/reader/page_layout.rb +6 -17
data/lib/pdf/reader/pages_strategy.rb +1 -304
data/lib/pdf/reader/parser.rb +6 -4
data/lib/pdf/reader/standard_security_handler.rb +18 -14
data/lib/pdf/reader/text_run.rb +3 -9
metadata +14 -47
data/bin/pdf_list_callbacks +0 -17
data/lib/pdf/reader/abstract_strategy.rb +0 -81
data/lib/pdf/reader/metadata_strategy.rb +0 -56
data/lib/pdf/reader/text_receiver.rb +0 -265

data/lib/pdf/reader/encoding.rb CHANGED Viewed

@@ -147,7 +147,7 @@ class PDF::Reader
       ret = [
         @mapping[glyph_code.to_i] || glyph_code.to_i
       ].pack("U*")
-      ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
+      ret.force_encoding("UTF-8")
       ret
     end
@@ -158,13 +158,13 @@ class PDF::Reader
     def little_boxes(times)
       codepoints = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
       ret = codepoints.pack("U*")
-      ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
+      ret.force_encoding("UTF-8")
       ret
     end
     def convert_to_utf8(str)
       ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
-      ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
+      ret.force_encoding("UTF-8")
       ret
     end
@@ -207,8 +207,7 @@ class PDF::Reader
     end
     def load_mapping(file)
-      RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
-      File.open(file, mode) do |f|
+      File.open(file, "r:BINARY") do |f|
         f.each do |l|
           _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
           @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte

data/lib/pdf/reader/filter.rb CHANGED Viewed

@@ -46,6 +46,7 @@ class PDF::Reader
       when :CCITTFaxDecode  then PDF::Reader::Filter::Null.new(options)
       when :DCTDecode       then PDF::Reader::Filter::Null.new(options)
       when :FlateDecode     then PDF::Reader::Filter::Flate.new(options)
+      when :Fl              then PDF::Reader::Filter::Flate.new(options)
       when :JBIG2Decode     then PDF::Reader::Filter::Null.new(options)
       when :JPXDecode       then PDF::Reader::Filter::Null.new(options)
       when :LZWDecode       then PDF::Reader::Filter::Lzw.new(options)

data/lib/pdf/reader/filter/run_length.rb CHANGED Viewed

@@ -15,11 +15,7 @@ class PDF::Reader # :nodoc:
         out = ""
         while pos < data.length
-          if data.respond_to?(:getbyte)
-            length = data.getbyte(pos)
-          else
-            length = data[pos]
-          end
+          length = data.getbyte(pos)
           pos += 1
           case

data/lib/pdf/reader/font.rb CHANGED Viewed

@@ -36,11 +36,7 @@ class PDF::Reader
     attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
                 :cid_widths, :cid_default_width
-    def initialize(ohash = nil, obj = nil)
-      if ohash.nil? || obj.nil?
-        $stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
-        return
-      end
+    def initialize(ohash, obj)
       @ohash = ohash
       @tounicode = nil
@@ -52,12 +48,6 @@ class PDF::Reader
       @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
     end
-    def basefont=(font)
-      $stderr.puts "Font#basefont= is deprecated and will be removed in the 2.0 release"
-      @encoding ||= default_encoding(font)
-      @basefont = font
-    end
     def to_utf8(params)
       if @tounicode
         to_utf8_via_cmap(params)

data/lib/pdf/reader/glyph_hash.rb CHANGED Viewed

@@ -48,6 +48,9 @@ class PDF::Reader
     #   h.name_to_unicode(:Euro)
     #   => 8364
     #
+    #   h.name_to_unicode(:X4A)
+    #   => 74
+    #
     #   h.name_to_unicode(:G30)
     #   => 48
     #
@@ -62,6 +65,8 @@ class PDF::Reader
       if @by_name.has_key?(name)
         @by_name[name]
+      elsif str.match(/\AX[0-9a-fA-F]{2,4}\Z/)
+        "0x#{str[1,4]}".hex
       elsif str.match(/\Auni[A-F\d]{4}\Z/)
         "0x#{str[3,4]}".hex
       elsif str.match(/\Au[A-F\d]{4,6}\Z/)
@@ -102,8 +107,7 @@ class PDF::Reader
       keyed_by_name      = {}
       keyed_by_codepoint = {}
-      RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
-      File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
+      File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
         f.each do |l|
           _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
           if name && code

data/lib/pdf/reader/lzw.rb CHANGED Viewed

@@ -22,7 +22,7 @@ module PDF
         def initialize(data, bits_in_chunk)
           @data = data
-          @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
+          @data.force_encoding("BINARY")
           @bits_in_chunk = bits_in_chunk
           @current_pos = 0
           @bits_left_in_byte = 8

data/lib/pdf/reader/object_hash.rb CHANGED Viewed

@@ -102,21 +102,7 @@ class PDF::Reader
     # a PDF::Reader::Reference, the key is returned unchanged.
     #
     def deref!(key)
-      case object = deref(key)
-      when Hash
-        {}.tap { |hash|
-          object.each do |k, value|
-            hash[k] = deref!(value)
-          end
-        }
-      when PDF::Reader::Stream
-        object.hash = deref!(object.hash)
-        object
-      when Array
-        object.map { |value| deref!(value) }
-      else
-        object
-      end
+      deref_internal!(key, {})
     end
     # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
@@ -266,6 +252,39 @@ class PDF::Reader
     private
+    # Private implementation of deref!, which exists to ensure the `seen` argument
+    # isn't publicly available. It's used to avoid endless loops in the recursion, and
+    # doesn't need to be part of the public API.
+    #
+    def deref_internal!(key, seen)
+      seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
+      return seen[seen_key] if seen.key?(seen_key)
+      case object = deref(key)
+      when Hash
+        seen[seen_key] ||= {}
+        object.each do |k, value|
+          seen[seen_key][k] = deref_internal!(value, seen)
+        end
+        seen[seen_key]
+      when PDF::Reader::Stream
+        seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
+        object.hash.each do |k,value|
+          seen[seen_key].hash[k] = deref_internal!(value, seen)
+        end
+        seen[seen_key]
+      when Array
+        seen[seen_key] ||= []
+        object.each do |value|
+          seen[seen_key] << deref_internal!(value, seen)
+        end
+        seen[seen_key]
+      else
+        object
+      end
+    end
     def build_security_handler(opts = {})
       return nil if trailer[:Encrypt].nil?
@@ -316,7 +335,7 @@ class PDF::Reader
       if obj[:Type] == :Page
         ref
-      elsif obj[:Type] == :Pages
+      elsif obj[:Kids]
         deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
       end
     end

data/lib/pdf/reader/page_layout.rb CHANGED Viewed

@@ -8,17 +8,19 @@ class PDF::Reader
   # media box should be a 4 number array that describes the dimensions of the
   # page to be rendered as described by the page's MediaBox attribute
   class PageLayout
+    DEFAULT_FONT_SIZE = 12
     def initialize(runs, mediabox)
       raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
       @runs    = merge_runs(runs)
-      @mean_font_size   = mean(@runs.map(&:font_size)) || 0
+      @mean_font_size   = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
+      @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
       @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
       @page_width  = mediabox[2] - mediabox[0]
       @page_height = mediabox[3] - mediabox[1]
       @x_offset = @runs.map(&:x).sort.first
-      @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
-                                      RUBY_VERSION >= "1.9.0"
     end
     def to_s
@@ -110,21 +112,8 @@ class PDF::Reader
       runs
     end
-    # This is a simple alternative to String#[]=. We can't use the string
-    # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
-    #
-    # See my bug report at https://github.com/rubinius/rubinius/issues/1985
     def local_string_insert(haystack, needle, index)
-      if @current_platform_is_rbx_19
-        char_count = needle.length
-        haystack.replace(
-          (haystack[0,index] || "") +
-          needle +
-          (haystack[index+char_count,500] || "")
-        )
-      else
-        haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
-      end
+      haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
     end
   end
 end

data/lib/pdf/reader/pages_strategy.rb CHANGED Viewed

@@ -27,42 +27,8 @@
 class PDF::Reader
   ################################################################################
-  # Walks the pages of the PDF file and calls the appropriate callback methods when
-  # something of interest is found.
-  #
-  # The callback methods should exist on the receiver object passed into the constructor.
-  # Whenever some content is found that will trigger a callback, the receiver is checked
-  # to see if the callback is defined.
-  #
-  # If it is defined it will be called. If not, processing will continue.
-  #
-  # = Available Callbacks
-  # The following callbacks are available and should be methods defined on your receiver class. Only
-  # implement the ones you need - the rest will be ignored.
-  #
-  # Some callbacks will include parameters which will be passed in as an array. For callbacks
-  # that supply no paramters, or where you don't need them, the *params argument can be left off.
-  # Some example callback method definitions are:
-  #
-  #   def begin_document
-  #   def end_page
-  #   def show_text(string, *params)
-  #   def fill_stroke(*params)
-  #
-  # You should be able to infer the basic command the callback is reporting based on the name. For
-  # further experimentation, define the callback with just a *params parameter, then print out the
-  # contents of the array using something like:
-  #
-  #   puts params.inspect
-  #
   # == Text Callbacks
   #
-  # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
-  # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be
-  # careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for
-  # example). The string may not be byte-by-byte identical with the string that was originally
-  # written to the PDF.
-  #
   # - end_text_object
   # - move_to_start_of_next_line
   # - set_character_spacing
@@ -80,14 +46,6 @@ class PDF::Reader
   # - move_to_next_line_and_show_text
   # - set_spacing_next_line_show_text
   #
-  # If the :raw_text option was passed to the PDF::Reader class the following callbacks
-  # may also appear:
-  #
-  # - show_text_raw
-  # - show_text_with_positioning_raw
-  # - move_to_next_line_and_show_text_raw
-  # - set_spacing_next_line_show_text_raw
-  #
   # == Graphics Callbacks
   # - close_fill_stroke
   # - fill_stroke
@@ -145,42 +103,7 @@ class PDF::Reader
   # - set_clipping_path_with_even_odd
   # - append_curved_segment_final_point_replicated
   #
-  # == Misc Callbacks
-  # - begin_compatibility_section
-  # - end_compatibility_section,
-  # - begin_document
-  # - end_document
-  # - begin_page_container
-  # - end_page_container
-  # - begin_page
-  # - end_page
-  # - metadata
-  # - xml_metadata
-  # - page_count
-  # - begin_form_xobject
-  # - end_form_xobject
-  #
-  # == Resource Callbacks
-  #
-  # Each page can contain (or inherit) a range of resources required for the page,
-  # including things like fonts and images. The following callbacks may appear
-  # after begin_page if the relevant resources exist on a page:
-  #
-  # - resource_procset
-  # - resource_xobject
-  # - resource_extgstate
-  # - resource_colorspace
-  # - resource_pattern
-  # - resource_font
-  #
-  # In most cases, these callbacks associate a name with each resource, allowing it
-  # to be referred to by name in the page content. For example, an XObject can hold an image.
-  # If it gets mapped to the name "IM1", then it can be placed on the page using
-  # invoke_xobject "IM1".
-  #
-  # DEPRECATED: this class was deprecated in version 0.11.0 and will
-  #             eventually be removed
-  class PagesStrategy< AbstractStrategy # :nodoc:
+  class PagesStrategy # :nodoc:
     OPERATORS = {
       'b'   => :close_fill_stroke,
       'B'   => :fill_stroke,
@@ -256,232 +179,6 @@ class PDF::Reader
       '\''  => :move_to_next_line_and_show_text,
       '"'   => :set_spacing_next_line_show_text,
     }
-    def self.to_sym
-      :pages
-    end
-    ################################################################################
-    # Begin processing the document
-    def process
-      return false unless options[:pages]
-      callback(:begin_document, [root])
-      walk_pages(@ohash.object(root[:Pages]))
-      callback(:end_document)
-    end
-    private
-    ################################################################################
-    def params_to_utf8(params, font)
-      if params.is_a?(String)
-        font.to_utf8(params)
-      elsif params.is_a?(Array)
-        params.map { |i| params_to_utf8(i, font)}
-      else
-        params
-      end
-    end
-    ################################################################################
-    # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
-    # its content
-    def walk_pages(page)
-      # extract page content
-      if page[:Type] == :Pages
-        callback(:begin_page_container, [page])
-        res = @ohash.object(page[:Resources])
-        resources.push res if res
-        @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
-        resources.pop if res
-        callback(:end_page_container)
-      elsif page[:Type] == :Page
-        callback(:begin_page, [page])
-        res = @ohash.object(page[:Resources])
-        resources.push res if res
-        walk_resources(current_resources)
-        if @ohash.object(page[:Contents]).kind_of?(Array)
-          contents = @ohash.object(page[:Contents])
-        else
-          contents = [page[:Contents]]
-        end
-        fonts = font_hash_from_resources(current_resources)
-        if page.has_key?(:Contents) and page[:Contents]
-          direct_contents = contents.map { |content| @ohash.object(content) }
-          content_stream(direct_contents, fonts)
-        end
-        resources.pop if res
-        callback(:end_page)
-      end
-    end
-    ################################################################################
-    # Retreive the XObject for the supplied label and if it's a Form, walk it
-    # like a regular page content stream.
-    #
-    def walk_xobject_form(label)
-      xobjects = @ohash.object(current_resources[:XObject]) || {}
-      xobject  = @ohash.object(xobjects[label])
-      if xobject && xobject.hash[:Subtype] == :Form
-        callback(:begin_form_xobject)
-        xobj_resources = @ohash.object(xobject.hash[:Resources])
-        if xobj_resources
-          resources.push xobj_resources
-          walk_resources(xobj_resources)
-        end
-        fonts = font_hash_from_resources(xobj_resources)
-        content_stream(xobject, fonts)
-        callback(:end_form_xobject)
-        resources.pop if xobj_resources
-      end
-    end
-    ################################################################################
-    # Return a merged hash of all resources that are current. Pages, page and xobject
-    #
-    def current_resources
-      hash = {}
-      resources.each do |res|
-        hash.merge!(res)
-      end
-      hash
-    end
-    ################################################################################
-    # Reads a PDF content stream and calls all the appropriate callback methods for the operators
-    # it contains
-    #
-    def content_stream(instructions, fonts = {})
-      instructions = [instructions] unless instructions.kind_of?(Array)
-      instructions = instructions.map { |ins|
-        ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
-      }.join
-      buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
-      parser       = Parser.new(buffer, @ohash)
-      current_font = nil
-      params       = []
-      while (token = parser.parse_token(OPERATORS))
-        if token.kind_of?(Token) and OPERATORS.has_key?(token)
-          if OPERATORS[token] == :set_text_font_and_size
-            current_font = params.first
-            if fonts[current_font].nil?
-              raise MalformedPDFError, "Unknown font #{current_font}"
-            end
-          end
-          # handle special cases in response to certain operators
-          if OPERATORS[token].to_s.include?("show_text")
-            # convert any text to utf-8, but output the raw string if the user wants it
-            if options[:raw_text]
-              callback("#{OPERATORS[token]}_raw".to_sym, params)
-            end
-            params = params_to_utf8(params, fonts[current_font])
-          elsif token == "ID"
-            # inline image data, first convert the current params into a more familiar hash
-            map = {}
-            params.each_slice(2) do |key, value|
-              map[key] = value
-            end
-            params = [map, buffer.token]
-          end
-          callback(OPERATORS[token], params)
-          if OPERATORS[token] == :invoke_xobject
-            xobject_label = params.first
-            params.clear
-            walk_xobject_form(xobject_label)
-          else
-            params.clear
-          end
-        else
-          params << token
-        end
-      end
-    rescue EOFError
-      raise MalformedPDFError, "End Of File while processing a content stream"
-    end
-    ################################################################################
-    def walk_resources(resources)
-      return unless resources.respond_to?(:[])
-      resources = resolve_references(resources)
-      # extract any procset information
-      if resources[:ProcSet]
-        callback(:resource_procset, resources[:ProcSet])
-      end
-      # extract any xobject information
-      if resources[:XObject]
-        @ohash.object(resources[:XObject]).each do |name, val|
-          callback(:resource_xobject, [name, @ohash.object(val)])
-        end
-      end
-      # extract any extgstate information
-      if resources[:ExtGState]
-        @ohash.object(resources[:ExtGState]).each do |name, val|
-          callback(:resource_extgstate, [name, @ohash.object(val)])
-        end
-      end
-      # extract any colorspace information
-      if resources[:ColorSpace]
-        @ohash.object(resources[:ColorSpace]).each do |name, val|
-          callback(:resource_colorspace, [name, @ohash.object(val)])
-        end
-      end
-      # extract any pattern information
-      if resources[:Pattern]
-        @ohash.object(resources[:Pattern]).each do |name, val|
-          callback(:resource_pattern, [name, @ohash.object(val)])
-        end
-      end
-      # extract any font information
-      if resources[:Font]
-        fonts = font_hash_from_resources(resources)
-        fonts.each do  |label, font|
-          callback(:resource_font, [label, font])
-        end
-      end
-    end
-    ################################################################################
-    # Convert any PDF::Reader::Resource objects into a real object
-    def resolve_references(obj)
-      case obj
-      when PDF::Reader::Stream then
-        obj.hash = resolve_references(obj.hash)
-        obj
-      when PDF::Reader::Reference then
-        resolve_references(@ohash.object(obj))
-      when Hash                   then
-        arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
-        Hash[*arr]
-      when Array                  then
-        obj.collect { |item| resolve_references(item) }
-      else
-        obj
-      end
-    end
-    ################################################################################
-    ################################################################################
-    def font_hash_from_resources(resources)
-      return {} unless resources.respond_to?(:[])
-      fonts = {}
-      resources = @ohash.object(resources[:Font]) || {}
-      resources.each do |label, desc|
-        fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
-      end
-      fonts
-    end
-    def resources
-      @resources ||= []
-    end
   end
   ################################################################################
 end