RubyGems - pdf-reader - Versions diffs - 1.1.1 → 2.5.0 - Mend

pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82)

checksums.yaml +7 -0
data/CHANGELOG +87 -2
data/{README.rdoc → README.md} +43 -31
data/Rakefile +21 -16
data/bin/pdf_callbacks +1 -1
data/bin/pdf_object +4 -1
data/bin/pdf_text +1 -3
data/examples/callbacks.rb +2 -1
data/examples/extract_images.rb +11 -6
data/examples/fuzzy_paragraphs.rb +24 -0
data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
data/lib/pdf/reader/afm/Courier.afm +342 -0
data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
data/lib/pdf/reader/afm/MustRead.html +19 -0
data/lib/pdf/reader/afm/Symbol.afm +213 -0
data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
data/lib/pdf/reader/buffer.rb +90 -63
data/lib/pdf/reader/cid_widths.rb +63 -0
data/lib/pdf/reader/cmap.rb +69 -38
data/lib/pdf/reader/encoding.rb +74 -48
data/lib/pdf/reader/error.rb +24 -4
data/lib/pdf/reader/filter/ascii85.rb +28 -0
data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
data/lib/pdf/reader/filter/depredict.rb +141 -0
data/lib/pdf/reader/filter/flate.rb +53 -0
data/lib/pdf/reader/filter/lzw.rb +21 -0
data/lib/pdf/reader/filter/null.rb +18 -0
data/lib/pdf/reader/filter/run_length.rb +45 -0
data/lib/pdf/reader/filter.rb +15 -234
data/lib/pdf/reader/font.rb +107 -43
data/lib/pdf/reader/font_descriptor.rb +80 -0
data/lib/pdf/reader/form_xobject.rb +26 -4
data/lib/pdf/reader/glyph_hash.rb +56 -18
data/lib/pdf/reader/lzw.rb +6 -4
data/lib/pdf/reader/null_security_handler.rb +17 -0
data/lib/pdf/reader/object_cache.rb +40 -16
data/lib/pdf/reader/object_hash.rb +94 -40
data/lib/pdf/reader/object_stream.rb +1 -0
data/lib/pdf/reader/orientation_detector.rb +34 -0
data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
data/lib/pdf/reader/page.rb +48 -3
data/lib/pdf/reader/page_layout.rb +125 -0
data/lib/pdf/reader/page_state.rb +185 -70
data/lib/pdf/reader/page_text_receiver.rb +70 -20
data/lib/pdf/reader/pages_strategy.rb +4 -293
data/lib/pdf/reader/parser.rb +37 -61
data/lib/pdf/reader/print_receiver.rb +6 -0
data/lib/pdf/reader/reference.rb +4 -1
data/lib/pdf/reader/register_receiver.rb +17 -31
data/lib/pdf/reader/resource_methods.rb +1 -0
data/lib/pdf/reader/standard_security_handler.rb +82 -42
data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
data/lib/pdf/reader/stream.rb +5 -2
data/lib/pdf/reader/synchronized_cache.rb +33 -0
data/lib/pdf/reader/text_run.rb +99 -0
data/lib/pdf/reader/token.rb +4 -1
data/lib/pdf/reader/transformation_matrix.rb +195 -0
data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
data/lib/pdf/reader/width_calculator/composite.rb +28 -0
data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
data/lib/pdf/reader/width_calculator.rb +12 -0
data/lib/pdf/reader/xref.rb +41 -9
data/lib/pdf/reader.rb +45 -104
data/lib/pdf-reader.rb +4 -1
metadata +220 -101
data/bin/pdf_list_callbacks +0 -17
data/lib/pdf/hash.rb +0 -15
data/lib/pdf/reader/abstract_strategy.rb +0 -81
data/lib/pdf/reader/metadata_strategy.rb +0 -56
data/lib/pdf/reader/text_receiver.rb +0 -264

data/lib/pdf/reader/font.rb CHANGED Viewed

@@ -1,3 +1,6 @@
+# coding: utf-8
+# frozen_string_literal: true
 ################################################################################
 #
 # Copyright (C) 2008 James Healy (jimmy@deefa.com)
@@ -23,41 +26,29 @@
 #
 ################################################################################
+require 'pdf/reader/width_calculator'
 class PDF::Reader
+  # Represents a single font PDF object and provides some useful methods
+  # for extracting info. Mainly used for converting text to UTF-8.
+  #
   class Font
-    attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
-    attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
-    attr_reader :basefont
-    def initialize(ohash = nil, obj = nil)
-      if ohash.nil? || obj.nil?
-        $stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
-        return
-      end
+    attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
+    attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
+                :cid_widths, :cid_default_width
+    def initialize(ohash, obj)
       @ohash = ohash
       @tounicode = nil
       extract_base_info(obj)
       extract_descriptor(obj)
       extract_descendants(obj)
+      @width_calc = build_width_calculator
       @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
     end
-    def basefont=(font)
-      # setup a default encoding for the selected font. It can always be overridden
-      # with encoding= if required
-      case font
-      when "Symbol" then
-        @encoding = PDF::Reader::Encoding.new("SymbolEncoding")
-      when "ZapfDingbats" then
-        @encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
-      else
-        @encoding = nil
-      end
-      @basefont = font
-    end
     def to_utf8(params)
       if @tounicode
         to_utf8_via_cmap(params)
@@ -66,39 +57,102 @@ class PDF::Reader
       end
     end
-    def glyph_width(c)
-      @missing_width ||= 0
-      @widths        ||= []
-      @widths.fetch(c - @first_char, @missing_width)
+    def unpack(data)
+      data.unpack(encoding.unpack)
+    end
+    # looks up the specified codepoint and returns a value that is in (pdf)
+    # glyph space, which is 1000 glyph units = 1 text space unit
+    def glyph_width(code_point)
+      if code_point.is_a?(String)
+        code_point = code_point.unpack(encoding.unpack).first
+      end
+      @cached_widths ||= {}
+      @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
     end
     private
+    def default_encoding(font_name)
+      case font_name.to_s
+      when "Symbol" then
+        PDF::Reader::Encoding.new(:SymbolEncoding)
+      when "ZapfDingbats" then
+        PDF::Reader::Encoding.new(:ZapfDingbatsEncoding)
+      else
+        PDF::Reader::Encoding.new(:StandardEncoding)
+      end
+    end
+    def build_width_calculator
+      if @subtype == :Type0
+        PDF::Reader::WidthCalculator::TypeZero.new(self)
+      elsif @subtype == :Type1
+        if @font_descriptor.nil?
+          PDF::Reader::WidthCalculator::BuiltIn.new(self)
+        else
+          PDF::Reader::WidthCalculator::TypeOneOrThree .new(self)
+        end
+      elsif @subtype == :Type3
+        PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
+      elsif @subtype == :TrueType
+        if @font_descriptor
+          PDF::Reader::WidthCalculator::TrueType.new(self)
+        else
+          # A TrueType font that isn't embedded. Most readers look for a version on the
+          # local system and fallback to a substitute. For now, we go straight to a substitute
+          PDF::Reader::WidthCalculator::BuiltIn.new(self)
+        end
+      elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
+        PDF::Reader::WidthCalculator::Composite.new(self)
+      else
+        PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
+      end
+    end
     def extract_base_info(obj)
       @subtype  = @ohash.object(obj[:Subtype])
       @basefont = @ohash.object(obj[:BaseFont])
-      @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
+      if @ohash.object(obj[:Encoding])
+        @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
+      else
+        @encoding = default_encoding(@basefont)
+      end
       @widths   = @ohash.object(obj[:Widths]) || []
       @first_char = @ohash.object(obj[:FirstChar])
+      @last_char = @ohash.object(obj[:LastChar])
+      # CID Fonts are not required to have a W or DW entry, if they don't exist,
+      # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
+      @cid_widths         = @ohash.object(obj[:W])  || []
+      @cid_default_width  = @ohash.object(obj[:DW]) || 1000
       if obj[:ToUnicode]
+        # ToUnicode is optional for Type1 and Type3
         stream = @ohash.object(obj[:ToUnicode])
-        @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+        if stream.is_a?(PDF::Reader::Stream)
+          @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+        end
       end
     end
     def extract_descriptor(obj)
-      return unless obj[:FontDescriptor]
-      fd       = @ohash.object(obj[:FontDescriptor])
-      @ascent  = @ohash.object(fd[:Ascent])
-      @descent = @ohash.object(fd[:Descent])
-      @missing_width = @ohash.object(fd[:MissingWidth])
-      @bbox    = @ohash.object(fd[:FontBBox])
+      if obj[:FontDescriptor]
+        # create a font descriptor object if we can, in other words, unless this is
+        # a CID Font
+        fd = @ohash.object(obj[:FontDescriptor])
+        @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
+      else
+        @font_descriptor = nil
+      end
     end
     def extract_descendants(obj)
       return unless obj[:DescendantFonts]
+      # per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
+      # A one-element array specifying the CIDFont dictionary that is the
+      # descendant of this Type 0 font.
       descendants = @ohash.object(obj[:DescendantFonts])
       @descendantfonts = descendants.map { |desc|
         PDF::Reader::Font.new(@ohash, @ohash.object(desc))
@@ -106,11 +160,16 @@ class PDF::Reader
     end
     def to_utf8_via_cmap(params)
-      if params.class == String
+      case params
+      when Integer
+        [
+          @tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
+        ].flatten.pack("U*")
+      when String
         params.unpack(encoding.unpack).map { |c|
           @tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
-        }.pack("U*")
-      elsif params.class == Array
+        }.flatten.pack("U*")
+      when Array
         params.collect { |param| to_utf8_via_cmap(param) }
       else
         params
@@ -118,11 +177,16 @@ class PDF::Reader
     end
     def to_utf8_via_encoding(params)
-      raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported" if encoding.kind_of?(String)
+      if encoding.kind_of?(String)
+        raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
+      end
-      if params.class == String
+      case params
+      when Integer
+        encoding.int_to_utf8_string(params)
+      when String
         encoding.to_utf8(params)
-      elsif params.class == Array
+      when Array
         params.collect { |param| to_utf8_via_encoding(param) }
       else
         params

data/lib/pdf/reader/font_descriptor.rb ADDED Viewed

@@ -0,0 +1,80 @@
+# coding: utf-8
+# frozen_string_literal: true
+require 'ttfunk'
+class PDF::Reader
+  # Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
+  class FontDescriptor
+    attr_reader :font_name, :font_family, :font_stretch, :font_weight,
+                :font_bounding_box, :cap_height, :ascent, :descent, :leading,
+                :avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
+                :x_height, :font_flags
+    def initialize(ohash, fd_hash)
+      @ascent                = ohash.object(fd_hash[:Ascent])    || 0
+      @descent               = ohash.object(fd_hash[:Descent])   || 0
+      @missing_width         = ohash.object(fd_hash[:MissingWidth]) || 0
+      @font_bounding_box     = ohash.object(fd_hash[:FontBBox])  || [0,0,0,0]
+      @avg_width             = ohash.object(fd_hash[:AvgWidth])  || 0
+      @cap_height            = ohash.object(fd_hash[:CapHeight]) || 0
+      @font_flags            = ohash.object(fd_hash[:Flags])     || 0
+      @italic_angle          = ohash.object(fd_hash[:ItalicAngle])
+      @font_name             = ohash.object(fd_hash[:FontName]).to_s
+      @leading               = ohash.object(fd_hash[:Leading])   || 0
+      @max_width             = ohash.object(fd_hash[:MaxWidth])  || 0
+      @stem_v                = ohash.object(fd_hash[:StemV])
+      @x_height              = ohash.object(fd_hash[:XHeight])
+      @font_stretch          = ohash.object(fd_hash[:FontStretch]) || :Normal
+      @font_weight           = ohash.object(fd_hash[:FontWeight])  || 400
+      @font_family           = ohash.object(fd_hash[:FontFamily])
+      # A FontDescriptor may have an embedded font program in FontFile
+      # (Type 1 Font Program), FontFile2 (TrueType font program), or
+      # FontFile3 (Other font program as defined by Subtype entry)
+      # Subtype entries:
+      # 1) Type1C:        Type 1 Font Program in Compact Font Format
+      # 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
+      # 3) OpenType:      OpenType Font Program
+      # see Section 9.9, PDF 32000-1:2008, pp 288-292
+      @font_program_stream = ohash.object(fd_hash[:FontFile2])
+      #TODO handle FontFile and FontFile3
+      @is_ttf = true if @font_program_stream
+    end
+    def glyph_width(char_code)
+      if @is_ttf
+        if ttf_program_stream.cmap.unicode.length > 0
+          glyph_id = ttf_program_stream.cmap.unicode.first[char_code]
+        else
+          glyph_id = char_code
+        end
+        char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
+        if char_metric
+          return char_metric.advance_width
+        end
+      end
+    end
+    # PDF states that a glyph is 1000 units wide, true type doesn't enforce
+    # any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
+    def glyph_to_pdf_scale_factor
+      if @is_ttf
+        @glyph_to_pdf_sf ||= (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
+      else
+        @glyph_to_pdf_sf ||= 1.0
+      end
+      @glyph_to_pdf_sf
+    end
+    private
+    def ttf_program_stream
+      @ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
+    end
+  end
+end

data/lib/pdf/reader/form_xobject.rb CHANGED Viewed

@@ -1,4 +1,7 @@
 # coding: utf-8
+# frozen_string_literal: true
+require 'digest/md5'
 module PDF
   class Reader
@@ -15,9 +18,10 @@ module PDF
       attr_reader :xobject
-      def initialize(page, xobject)
+      def initialize(page, xobject, options = {})
         @page    = page
         @objects = page.objects
+        @cache   = options[:cache] || {}
         @xobject = @objects.deref(xobject)
       end
@@ -65,12 +69,30 @@ module PDF
         end
       end
+      def content_stream_md5
+        @content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
+      end
+      def cached_tokens_key
+        @cached_tokens_key ||= "tokens-#{content_stream_md5}"
+      end
+      def tokens
+        @cache[cached_tokens_key] ||= begin
+                      buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
+                      parser = Parser.new(buffer, @objects)
+                      result = []
+                      while (token = parser.parse_token(PagesStrategy::OPERATORS))
+                        result << token
+                      end
+                      result
+                    end
+      end
       def content_stream(receivers, instructions)
-        buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
-        parser       = Parser.new(buffer, @objects)
         params       = []
-        while (token = parser.parse_token(PagesStrategy::OPERATORS))
+        tokens.each do |token|
           if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
             callback(receivers, PagesStrategy::OPERATORS[token], params)
             params.clear

data/lib/pdf/reader/glyph_hash.rb CHANGED Viewed

@@ -1,3 +1,6 @@
+# coding: utf-8
+# frozen_string_literal: true
 ################################################################################
 #
 # Copyright (C) 2011 James Healy (jimmy@deefa.com)
@@ -24,9 +27,15 @@
 ################################################################################
 class PDF::Reader
+  # A Hash-like object that can convert glyph names into a unicode codepoint.
+  # The mapping is read from a data file on disk the first time it's needed.
+  #
   class GlyphHash # :nodoc:
     def initialize
-      @adobe = load_adobe_glyph_mapping
+      # only parse the glyph list once, and cache the results (for performance)
+      adobe = @@cache ||= load_adobe_glyph_mapping
+      @by_name      = adobe.first
+      @by_codepoint = adobe.last
     end
     # attempt to convert a PDF Name to a unicode codepoint. Returns nil
@@ -34,55 +43,84 @@ class PDF::Reader
     #
     #   h = GlyphHash.new
     #
-    #   h[:A]
+    #   h.name_to_unicode(:A)
     #   => 65
     #
-    #   h[:Euro]
+    #   h.name_to_unicode(:Euro)
     #   => 8364
     #
-    #   h[:G30]
+    #   h.name_to_unicode(:X4A)
+    #   => 74
+    #
+    #   h.name_to_unicode(:G30)
     #   => 48
     #
-    #   h[:34]
+    #   h.name_to_unicode(:34)
+    #   => 34
     #
-    def [](name)
+    def name_to_unicode(name)
       return nil unless name.is_a?(Symbol)
       name = name.to_s.gsub('_', '').intern
       str = name.to_s
-      if @adobe.has_key?(name)
-        @adobe[name]
+      if @by_name.has_key?(name)
+        @by_name[name]
+      elsif str.match(/\AX[0-9a-fA-F]{2,4}\Z/)
+        "0x#{str[1,4]}".hex
       elsif str.match(/\Auni[A-F\d]{4}\Z/)
         "0x#{str[3,4]}".hex
       elsif str.match(/\Au[A-F\d]{4,6}\Z/)
         "0x#{str[1,6]}".hex
-      elsif str.match(/\A[A-Za-z]\d{1,4}\Z/)
-        str[1,4].to_i
-      elsif str.match(/\A[A-Za-z]{2}\d{2,4}\Z/)
-        str[2,4].to_i
+      elsif str.match(/\A[A-Za-z]\d{1,5}\Z/)
+        str[1,5].to_i
+      elsif str.match(/\A[A-Za-z]{2}\d{2,5}\Z/)
+        str[2,5].to_i
       else
         nil
       end
     end
+    # attempt to convert a Unicode code point to the equivalent PDF Name. Returns nil
+    # if no conversion is possible.
+    #
+    #   h = GlyphHash.new
+    #
+    #   h.unicode_to_name(65)
+    #   => [:A]
+    #
+    #   h.unicode_to_name(8364)
+    #   => [:Euro]
+    #
+    #   h.unicode_to_name(34)
+    #   => [:34]
+    #
+    def unicode_to_name(codepoint)
+      @by_codepoint[codepoint.to_i] || []
+    end
     private
     # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
     # a text file supplied by Adobe at:
     # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
     def load_adobe_glyph_mapping
-      glyphs = {}
+      keyed_by_name      = {}
+      keyed_by_codepoint = {}
-      RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
-      File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
+      File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
         f.each do |l|
-          m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
-          glyphs[name.to_sym] = "0x#{code}".hex if name
+          _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
+          if name && code
+            cp = "0x#{code}".hex
+            keyed_by_name[name.to_sym]   = cp
+            keyed_by_codepoint[cp]     ||= []
+            keyed_by_codepoint[cp]     << name.to_sym
+          end
         end
       end
-      glyphs
+      [keyed_by_name.freeze, keyed_by_codepoint.freeze]
     end
   end