RubyGems - pdf-reader - Versions diffs - 1.2.0 → 1.3.0 - Mend

pdf-reader 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

data/CHANGELOG +7 -1
data/README.rdoc +1 -0
data/Rakefile +23 -8
data/lib/pdf-reader.rb +3 -1
data/lib/pdf/hash.rb +5 -1
data/lib/pdf/reader.rb +8 -1
data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
data/lib/pdf/reader/afm/Courier.afm +342 -0
data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
data/lib/pdf/reader/afm/Symbol.afm +213 -0
data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
data/lib/pdf/reader/buffer.rb +14 -6
data/lib/pdf/reader/cid_widths.rb +61 -0
data/lib/pdf/reader/cmap.rb +8 -2
data/lib/pdf/reader/encoding.rb +52 -27
data/lib/pdf/reader/error.rb +16 -1
data/lib/pdf/reader/filter.rb +2 -0
data/lib/pdf/reader/filter/ascii85.rb +3 -1
data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
data/lib/pdf/reader/filter/depredict.rb +2 -0
data/lib/pdf/reader/filter/flate.rb +3 -1
data/lib/pdf/reader/filter/lzw.rb +1 -0
data/lib/pdf/reader/filter/null.rb +1 -0
data/lib/pdf/reader/filter/run_length.rb +2 -1
data/lib/pdf/reader/font.rb +74 -18
data/lib/pdf/reader/font_descriptor.rb +80 -0
data/lib/pdf/reader/glyph_hash.rb +6 -0
data/lib/pdf/reader/lzw.rb +1 -0
data/lib/pdf/reader/object_cache.rb +1 -1
data/lib/pdf/reader/object_hash.rb +1 -1
data/lib/pdf/reader/page_layout.rb +125 -0
data/lib/pdf/reader/page_state.rb +172 -69
data/lib/pdf/reader/page_text_receiver.rb +50 -21
data/lib/pdf/reader/pages_strategy.rb +17 -4
data/lib/pdf/reader/parser.rb +25 -52
data/lib/pdf/reader/print_receiver.rb +5 -0
data/lib/pdf/reader/reference.rb +2 -0
data/lib/pdf/reader/register_receiver.rb +1 -1
data/lib/pdf/reader/standard_security_handler.rb +2 -0
data/lib/pdf/reader/stream.rb +2 -0
data/lib/pdf/reader/synchronized_cache.rb +32 -0
data/lib/pdf/reader/text_receiver.rb +5 -4
data/lib/pdf/reader/text_run.rb +80 -0
data/lib/pdf/reader/token.rb +2 -0
data/lib/pdf/reader/transformation_matrix.rb +194 -0
data/lib/pdf/reader/width_calculator.rb +11 -0
data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
data/lib/pdf/reader/width_calculator/composite.rb +27 -0
data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
data/lib/pdf/reader/xref.rb +9 -2
metadata +119 -13

data/lib/pdf/reader/error.rb CHANGED

@@ -1,3 +1,5 @@
+# coding: utf-8
 ################################################################################
 #
 # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -21,7 +23,6 @@
 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #
 class PDF::Reader
   ################################################################################
   # An internal PDF::Reader class that helps to verify various parts of the PDF file
@@ -45,10 +46,24 @@ class PDF::Reader
     end
     ################################################################################
   end
   ################################################################################
+  # an exception that is raised when we believe the current PDF is not following
+  # the PDF spec and cannot be recovered
   class MalformedPDFError < RuntimeError; end
+  ################################################################################
+  # an exception that is raised when a PDF object appears to be invalid
   class InvalidObjectError < MalformedPDFError; end
+  ################################################################################
+  # an exception that is raised when a PDF follows the specs but uses a feature
+  # that we don't support just yet
   class UnsupportedFeatureError < RuntimeError; end
+  ################################################################################
+  # an exception that is raised when a PDF is encrypted and we don't have the
+  # necessary data to decrypt it
   class EncryptedPDFError < UnsupportedFeatureError; end
 end
 ################################################################################

data/lib/pdf/reader/filter.rb CHANGED

@@ -1,3 +1,5 @@
+# coding: utf-8
 ################################################################################
 #
 # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)

data/lib/pdf/reader/filter/ascii85.rb CHANGED

@@ -4,6 +4,7 @@ require 'ascii85'
 class PDF::Reader
   module Filter # :nodoc:
+    # implementation of the Ascii85 filter
     class Ascii85
       def initialize(options = {})
         @options = options
@@ -18,7 +19,8 @@ class PDF::Reader
         ::Ascii85::decode(data)
       rescue Exception => e
         # Oops, there was a problem decoding the stream
-        raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
+        raise MalformedPDFError,
+          "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
       end
     end
   end

data/lib/pdf/reader/filter/ascii_hex.rb CHANGED

@@ -2,6 +2,7 @@
 #
 class PDF::Reader
   module Filter # :nodoc:
+    # implementation of the AsciiHex stream filter
     class AsciiHex
       def initialize(options = {})
         @options = options
@@ -18,7 +19,8 @@ class PDF::Reader
         data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
       rescue Exception => e
         # Oops, there was a problem decoding the stream
-        raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
+        raise MalformedPDFError,
+            "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
       end
     end
   end

data/lib/pdf/reader/filter/depredict.rb CHANGED

@@ -2,6 +2,8 @@
 class PDF::Reader
   module Filter # :nodoc:
+    # some filter implementations support preprocessing of the data to
+    # improve compression
     class Depredict
       def initialize(options = {})
         @options = options || {}

data/lib/pdf/reader/filter/flate.rb CHANGED

@@ -5,6 +5,7 @@ require 'zlib'
 class PDF::Reader
   module Filter # :nodoc:
+    # implementation of the Flate (zlib) stream filter
     class Flate
       def initialize(options = {})
         @options = options
@@ -30,7 +31,8 @@ class PDF::Reader
         Depredict.new(@options).filter(deflated)
       rescue Exception => e
         # Oops, there was a problem inflating the stream
-        raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
+        raise MalformedPDFError,
+          "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
       end
     end
   end

data/lib/pdf/reader/filter/lzw.rb CHANGED

@@ -2,6 +2,7 @@
 #
 class PDF::Reader
   module Filter # :nodoc:
+    # implementation of the LZW stream filter
     class Lzw
       def initialize(options = {})
         @options = options

data/lib/pdf/reader/filter/null.rb CHANGED

@@ -2,6 +2,7 @@
 #
 class PDF::Reader
   module Filter # :nodoc:
+    # implementation of the null stream filter
     class Null
       def initialize(options = {})
         @options = options

data/lib/pdf/reader/filter/run_length.rb CHANGED

@@ -1,7 +1,8 @@
 # coding: utf-8
 #
-class PDF::Reader
+class PDF::Reader # :nodoc:
   module Filter # :nodoc:
+    # implementation of the run length stream filter
     class RunLength
       def initialize(options = {})
         @options = options

data/lib/pdf/reader/font.rb CHANGED

@@ -1,3 +1,5 @@
+# coding: utf-8
 ################################################################################
 #
 # Copyright (C) 2008 James Healy (jimmy@deefa.com)
@@ -23,11 +25,16 @@
 #
 ################################################################################
+require 'pdf/reader/width_calculator'
 class PDF::Reader
+  # Represents a single font PDF object and provides some useful methods
+  # for extracting info. Mainly used for converting text to UTF-8.
+  #
   class Font
-    attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
-    attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
-    attr_reader :basefont
+    attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
+    attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
+                :cid_widths, :cid_default_width
     def initialize(ohash = nil, obj = nil)
       if ohash.nil? || obj.nil?
@@ -40,6 +47,7 @@ class PDF::Reader
       extract_base_info(obj)
       extract_descriptor(obj)
       extract_descendants(obj)
+      @width_calc = build_width_calculator
       @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
     end
@@ -66,39 +74,79 @@ class PDF::Reader
       end
     end
-    def glyph_width(c)
-      @missing_width ||= 0
-      @widths        ||= []
-      @widths.fetch(c - @first_char, @missing_width)
+    def unpack(data)
+      data.unpack(encoding.unpack)
+    end
+    # looks up the specified codepoint and returns a value that is in (pdf)
+    # glyph space, which is 1000 glyph units = 1 text space unit
+    def glyph_width(code_point)
+      if code_point.is_a?(String)
+        code_point = code_point.unpack(encoding.unpack).first
+      end
+      @cached_widths ||= {}
+      @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
     end
     private
+    # Picks the width calculator that matches this font's subtype. Type0 and
+    # CID fonts get their own calculators, a Type1 font without a font
+    # descriptor uses the BuiltIn calculator, and every other subtype (Type3,
+    # TrueType or anything unrecognised) uses TypeOneOrThree.
+    def build_width_calculator
+      calculator_class =
+        case @subtype
+        when :Type0
+          PDF::Reader::WidthCalculator::TypeZero
+        when :CIDFontType0, :CIDFontType2
+          PDF::Reader::WidthCalculator::Composite
+        when :Type1
+          if @font_descriptor.nil?
+            PDF::Reader::WidthCalculator::BuiltIn
+          else
+            PDF::Reader::WidthCalculator::TypeOneOrThree
+          end
+        else
+          PDF::Reader::WidthCalculator::TypeOneOrThree
+        end
+      calculator_class.new(self)
+    end
     def extract_base_info(obj)
       @subtype  = @ohash.object(obj[:Subtype])
       @basefont = @ohash.object(obj[:BaseFont])
       @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
       @widths   = @ohash.object(obj[:Widths]) || []
       @first_char = @ohash.object(obj[:FirstChar])
+      @last_char = @ohash.object(obj[:LastChar])
+      # CID Fonts are not required to have a W or DW entry, if they don't exist,
+      # the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
+      @cid_widths         = @ohash.object(obj[:W])  || []
+      @cid_default_width  = @ohash.object(obj[:DW]) || 1000
       if obj[:ToUnicode]
+        # ToUnicode is optional for Type1 and Type3
         stream = @ohash.object(obj[:ToUnicode])
         @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
       end
     end
     def extract_descriptor(obj)
-      return unless obj[:FontDescriptor]
-      fd       = @ohash.object(obj[:FontDescriptor])
-      @ascent  = @ohash.object(fd[:Ascent])
-      @descent = @ohash.object(fd[:Descent])
-      @missing_width = @ohash.object(fd[:MissingWidth])
-      @bbox    = @ohash.object(fd[:FontBBox])
+      if obj[:FontDescriptor]
+        # create a font descriptor object if we can, in other words, unless this is
+        # a CID Font
+        fd = @ohash.object(obj[:FontDescriptor])
+        @font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
+      else
+        @font_descriptor = nil
+      end
     end
     def extract_descendants(obj)
       return unless obj[:DescendantFonts]
+      # per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
+      # A one-element array specifying the CIDFont dictionary that is the
+      # descendant of this Type 0 font.
       descendants = @ohash.object(obj[:DescendantFonts])
       @descendantfonts = descendants.map { |desc|
         PDF::Reader::Font.new(@ohash, @ohash.object(desc))
@@ -106,7 +154,11 @@ class PDF::Reader
     end
     def to_utf8_via_cmap(params)
-      if params.class == String
+      if params.class == Fixnum
+        [
+          @tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
+        ].flatten.pack("U*")
+      elsif params.class == String
         params.unpack(encoding.unpack).map { |c|
           @tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
         }.flatten.pack("U*")
@@ -118,9 +170,13 @@ class PDF::Reader
     end
     def to_utf8_via_encoding(params)
-      raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported" if encoding.kind_of?(String)
+      if encoding.kind_of?(String)
+        raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
+      end
-      if params.class == String
+      if params.class == Fixnum
+        encoding.int_to_utf8_string(params)
+      elsif params.class == String
         encoding.to_utf8(params)
       elsif params.class == Array
         params.collect { |param| to_utf8_via_encoding(param) }

data/lib/pdf/reader/font_descriptor.rb ADDED

@@ -0,0 +1,80 @@
+# coding: utf-8
+require 'ttfunk'
+class PDF::Reader
+  # Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
+  #
+  # Reads the entries of a FontDescriptor dictionary into plain attributes
+  # and, when a FontFile2 stream is present, exposes glyph metrics taken
+  # from the embedded TrueType font program.
+  class FontDescriptor
+    attr_reader :font_name, :font_family, :font_stretch, :font_weight,
+                :font_bounding_box, :cap_height, :ascent, :descent, :leading,
+                :avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
+                :x_height, :font_flags
+    # ohash   - object whose #object method dereferences each dictionary value
+    # fd_hash - the FontDescriptor dictionary to read entries from
+    #
+    # Entries missing from fd_hash fall back to the defaults given below;
+    # italic_angle, stem_v, x_height and font_family have no default and
+    # stay nil when absent.
+    def initialize(ohash, fd_hash)
+      @ascent                = ohash.object(fd_hash[:Ascent])    || 0
+      @descent               = ohash.object(fd_hash[:Descent])   || 0
+      @missing_width         = ohash.object(fd_hash[:MissingWidth]) || 0
+      @font_bounding_box     = ohash.object(fd_hash[:FontBBox])  || [0,0,0,0]
+      @avg_width             = ohash.object(fd_hash[:AvgWidth])  || 0
+      @cap_height            = ohash.object(fd_hash[:CapHeight]) || 0
+      @font_flags            = ohash.object(fd_hash[:Flags])     || 0
+      @italic_angle          = ohash.object(fd_hash[:ItalicAngle])
+      @font_name             = ohash.object(fd_hash[:FontName]).to_s
+      @leading               = ohash.object(fd_hash[:Leading])   || 0
+      @max_width             = ohash.object(fd_hash[:MaxWidth])  || 0
+      @stem_v                = ohash.object(fd_hash[:StemV])
+      @x_height              = ohash.object(fd_hash[:XHeight])
+      @font_stretch          = ohash.object(fd_hash[:FontStretch]) || :Normal
+      @font_weight           = ohash.object(fd_hash[:FontWeight])  || 400
+      @font_family           = ohash.object(fd_hash[:FontFamily])
+      # A FontDescriptor may have an embedded font program in FontFile
+      # (Type 1 Font Program), FontFile2 (TrueType font program), or
+      # FontFile3 (Other font program as defined by Subtype entry)
+      # Subtype entries:
+      # 1) Type1C:        Type 1 Font Program in Compact Font Format
+      # 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
+      # 3) OpenType:      OpenType Font Program
+      # see Section 9.9, PDF 32000-1:2008, pp 288-292
+      @font_program_stream = ohash.object(fd_hash[:FontFile2])
+      # TODO: handle FontFile and FontFile3
+      @is_ttf = true if @font_program_stream
+    end
+    # Looks up the advance width of a glyph in the embedded TrueType program.
+    #
+    # char_code - the character code to look up. When the font has a unicode
+    #             cmap it is mapped to a glyph id through the first unicode
+    #             subtable, otherwise it is used as the glyph id directly.
+    #
+    # Returns the advance width stored in the font's horizontal metrics, or
+    # nil when no TrueType program is embedded or no metric exists for the
+    # glyph. The value is presumably in font units; see
+    # glyph_to_pdf_scale_factor for the conversion to PDF glyph space.
+    def glyph_width(char_code)
+      return nil unless @is_ttf
+      unicode_cmaps = ttf_program_stream.cmap.unicode
+      glyph_id = unicode_cmaps.length > 0 ? unicode_cmaps.first[char_code] : char_code
+      char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
+      # A leftover debug statement used to sit here (puts "..." > 0). It
+      # compared a String with an Integer and raised ArgumentError before the
+      # width could be returned, so it has been removed.
+      char_metric && char_metric.advance_width
+    end
+    # PDF states that a glyph is 1000 units wide, true type doesn't enforce
+    # any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
+    #
+    # Returns the factor that converts TrueType font units to PDF glyph units
+    # (1000 / units_per_em), or 1.0 when no TrueType program is embedded.
+    # The result is memoised.
+    def glyph_to_pdf_scale_factor
+      @glyph_to_pdf_sf ||= if @is_ttf
+        (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
+      else
+        1.0
+      end
+    end
+    private
+    # Lazily parses the FontFile2 stream with TTFunk and caches the result.
+    def ttf_program_stream
+      @ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
+    end
+  end
+end

data/lib/pdf/reader/glyph_hash.rb CHANGED

@@ -1,3 +1,5 @@
+# coding: utf-8
 ################################################################################
 #
 # Copyright (C) 2011 James Healy (jimmy@deefa.com)
@@ -24,6 +26,9 @@
 ################################################################################
 class PDF::Reader
+  # A Hash-like object that can convert glyph names into a unicode codepoint.
+  # The mapping is read from a data file on disk the first time it's needed.
+  #
   class GlyphHash # :nodoc:
     def initialize
       # only parse the glyph list once, and cache the results (for performance)
@@ -45,6 +50,7 @@ class PDF::Reader
     #   => 48
     #
     #   h[:34]
+    #   => 34
     #
     def [](name)
       return nil unless name.is_a?(Symbol)

data/lib/pdf/reader/lzw.rb CHANGED

@@ -17,6 +17,7 @@ module PDF
     #
     class LZW # :nodoc:
+      # Wraps an LZW encoded string
       class BitStream # :nodoc:
         def initialize(data, bits_in_chunk)

data/lib/pdf/reader/object_cache.rb CHANGED

@@ -1,6 +1,6 @@
 # coding: utf-8
-require 'hashery'
+require 'hashery/lru_hash'
 class PDF::Reader

data/lib/pdf/reader/object_hash.rb CHANGED

@@ -41,8 +41,8 @@ class PDF::Reader
     #
     def initialize(input, opts = {})
       @io          = extract_io_from(input)
-      @pdf_version = read_version
       @xref        = PDF::Reader::XRef.new(@io)
+      @pdf_version = read_version
       @trailer     = @xref.trailer
       @cache       = opts[:cache] || PDF::Reader::ObjectCache.new
       @sec_handler = build_security_handler(opts)

data/lib/pdf/reader/page_layout.rb ADDED

@@ -0,0 +1,125 @@
+# coding: utf-8
+class PDF::Reader
+  # Takes a collection of TextRun objects and renders them into a single
+  # string that best approximates the way they'd appear on a rendered PDF page.
+  #
+  # media box should be a 4 number array that describes the dimensions of the
+  # page to be rendered as described by the page's MediaBox attribute
+  class PageLayout
+    # runs     - collection of run objects; each must respond to x, y, text,
+    #            font_size, mean_character_width, mergable? and +
+    # mediabox - 4 number array; indexes 0/2 give the horizontal extent and
+    #            indexes 1/3 the vertical extent of the page
+    def initialize(runs, mediabox)
+      @runs    = merge_runs(runs)
+      @mean_font_size   = mean(@runs.map(&:font_size)) || 0
+      @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
+      @page_width  = mediabox[2] - mediabox[0]
+      @page_height = mediabox[3] - mediabox[1]
+      @x_offset = @runs.map(&:x).sort.first
+      @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
+                                      RUBY_VERSION >= "1.9.0"
+    end
+    # Renders the runs onto a grid of row_count x col_count space characters
+    # and returns the rows joined with newlines. Blank leading and trailing
+    # rows and trailing whitespace on each row are removed. Runs that map
+    # outside the grid are ignored. Returns "" when there are no runs.
+    def to_s
+      return "" if @runs.empty?
+      page = row_count.times.map { " " * col_count }
+      @runs.each do |run|
+        x_pos = ((run.x - @x_offset) / col_multiplier).round
+        y_pos = row_count - (run.y / row_multiplier).round
+        if y_pos < row_count && y_pos >= 0 && x_pos < col_count && x_pos >= 0
+          local_string_insert(page[y_pos], run.text, x_pos)
+        end
+      end
+      interesting_rows(page).map(&:rstrip).join("\n")
+    end
+    private
+    # given an array of strings, return a new array with empty rows from the
+    # beginning and end removed.
+    #
+    #   interesting_rows([ "", "one", "two", "" ])
+    #   => [ "one", "two" ]
+    #
+    # Returns an empty array when no row contains any text. The earlier
+    # version raised TypeError in that case, and otherwise returned an
+    # Enumerator because of a trailing block-less map.
+    def interesting_rows(rows)
+      first_line_with_text = rows.index { |row| !row.strip.empty? }
+      return [] if first_line_with_text.nil?
+      last_line_with_text = rows.rindex { |row| !row.strip.empty? }
+      rows[first_line_with_text..last_line_with_text]
+    end
+    # number of text rows on the page, based on the mean font size
+    def row_count
+      @row_count ||= (@page_height / @mean_font_size).floor
+    end
+    # number of text columns on the page, based on the mean glyph width
+    # with 5% of extra room
+    def col_count
+      @col_count ||= ((@page_width  / @mean_glyph_width) * 1.05).floor
+    end
+    # page units covered by a single row
+    def row_multiplier
+      @row_multiplier ||= @page_height / row_count
+    end
+    # page units covered by a single column
+    def col_multiplier
+      @col_multiplier ||= @page_width / col_count
+    end
+    # arithmetic mean of a collection of numbers, 0 for an empty collection
+    def mean(collection)
+      if collection.size == 0
+        0
+      else
+        collection.inject(0) { |accum, v| accum + v} / collection.size.to_f
+      end
+    end
+    # yields each integer y position along with the runs found on that line
+    def each_line(&block)
+      @runs.sort.group_by { |run|
+        run.y.to_i
+      }.map { |y, collection|
+        yield y, collection
+      }
+    end
+    # take a collection of TextRun objects and merge any that are in close
+    # proximity
+    def merge_runs(runs)
+      runs.group_by { |char|
+        char.y.to_i
+      }.map { |y, chars|
+        group_chars_into_runs(chars.sort)
+      }.flatten.sort
+    end
+    # walks the sorted chars of one line, folding each one into the previous
+    # run when mergable? says so and starting a new run otherwise
+    def group_chars_into_runs(chars)
+      runs = []
+      while head = chars.shift
+        if !runs.empty? && runs.last.mergable?(head)
+          runs[-1] = runs.last + head
+        else
+          runs << head
+        end
+      end
+      runs
+    end
+    # This is a simple alternative to String#[]=. We can't use the string
+    # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
+    #
+    # See my bug report at https://github.com/rubinius/rubinius/issues/1985
+    #
+    # Overwrites haystack in place, starting at index, with needle.
+    def local_string_insert(haystack, needle, index)
+      if @current_platform_is_rbx_19
+        char_count = needle.length
+        # keep the whole remainder of the row; the old fixed 500 character
+        # slice silently dropped the tail of very wide rows
+        haystack.replace(
+          (haystack[0,index] || "") +
+          needle +
+          (haystack[(index+char_count)..-1] || "")
+        )
+      else
+        haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
+      end
+    end
+  end
+end