RubyGems - pdf-reader - Versions diffs - 2.2.1 → 2.5.0 - Mend

pdf-reader 2.2.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +4 -4
data/CHANGELOG +30 -0
data/README.md +2 -2
data/bin/pdf_callbacks +1 -1
data/bin/pdf_text +1 -1
data/lib/pdf/reader.rb +1 -2
data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
data/lib/pdf/reader/afm/Courier.afm +342 -342
data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
data/lib/pdf/reader/afm/MustRead.html +19 -0
data/lib/pdf/reader/afm/Symbol.afm +213 -213
data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
data/lib/pdf/reader/buffer.rb +1 -1
data/lib/pdf/reader/cmap.rb +8 -0
data/lib/pdf/reader/encoding.rb +11 -9
data/lib/pdf/reader/filter/flate.rb +28 -16
data/lib/pdf/reader/font.rb +10 -2
data/lib/pdf/reader/object_hash.rb +24 -11
data/lib/pdf/reader/orientation_detector.rb +2 -2
data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
data/lib/pdf/reader/page.rb +28 -0
data/lib/pdf/reader/page_layout.rb +10 -5
data/lib/pdf/reader/page_state.rb +7 -5
data/lib/pdf/reader/page_text_receiver.rb +22 -1
data/lib/pdf/reader/text_run.rb +24 -0
data/lib/pdf/reader/width_calculator/built_in.rb +24 -16
data/lib/pdf/reader/xref.rb +7 -4
metadata +22 -17
data/lib/pdf/hash.rb +0 -20

data/lib/pdf/reader/cmap.rb CHANGED Viewed

@@ -96,6 +96,14 @@ class PDF::Reader
       Parser.new(buffer)
     end
+    # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
+    # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
+    #
+    #    str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
+    #
+    # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
+    # exception when we try converting broken UTF-16 to UTF-8
+    #
     def str_to_int(str)
       return nil if str.nil? || str.size == 0
       unpacked_string = if str.bytesize == 1 # UTF-8

data/lib/pdf/reader/encoding.rb CHANGED Viewed

@@ -40,20 +40,22 @@ class PDF::Reader
       @mapping  = default_mapping # maps from character codes to Unicode codepoints
       @string_cache  = {} # maps from character codes to UTF-8 strings.
-      if enc.kind_of?(Hash)
-        self.differences = enc[:Differences] if enc[:Differences]
-        enc = enc[:Encoding] || enc[:BaseEncoding]
-      elsif enc != nil
-        enc = enc.to_sym
+      @enc_name = if enc.kind_of?(Hash)
+        enc[:Encoding] || enc[:BaseEncoding]
+      elsif enc && enc.respond_to?(:to_sym)
+        enc.to_sym
       else
-        enc = nil
+        :StandardEncoding
       end
-      @enc_name = enc
-      @unpack   = get_unpack(enc)
-      @map_file = get_mapping_file(enc)
+      @unpack   = get_unpack(@enc_name)
+      @map_file = get_mapping_file(@enc_name)
       load_mapping(@map_file) if @map_file
+      if enc.is_a?(Hash) && enc[:Differences]
+        self.differences = enc[:Differences]
+      end
     end
     # set the differences table for this encoding. should be an array in the following format:

data/lib/pdf/reader/filter/flate.rb CHANGED Viewed

@@ -8,6 +8,9 @@ class PDF::Reader
   module Filter # :nodoc:
     # implementation of the Flate (zlib) stream filter
     class Flate
+      ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47  # Zlib::MAX_WBITS + 32
+      ZLIB_RAW_DEFLATE              = -15 # Zlib::MAX_WBITS * -1
       def initialize(options = {})
         @options = options
       end
@@ -15,25 +18,34 @@ class PDF::Reader
       ################################################################################
       # Decode the specified data with the Zlib compression algorithm
       def filter(data)
-        deflated = nil
+        deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
+        if deflated.nil?
+          raise MalformedPDFError,
+            "Error while inflating a compressed stream (no suitable inflation algorithm found)"
+        end
+        Depredict.new(@options).filter(deflated)
+      end
+      private
+      def zlib_inflate(data)
         begin
-          deflated = Zlib::Inflate.new.inflate(data)
-        rescue Zlib::DataError => e
+          return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
+        rescue Zlib::DataError
           # by default, Ruby's Zlib assumes the data it's inflating
-          # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
-          # If that fails, then use an undocumented 'feature' to attempt to inflate
-          # the data as a raw RFC1951 stream.
-          #
-          # See
-          # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
-          # - http://www.gzip.org/zlib/zlib_faq.html#faq38
-          deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
+          # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
+          # fails, swallow the exception and attempt to inflate the data as a raw
+          # RFC1951 stream.
         end
-        Depredict.new(@options).filter(deflated)
-      rescue Exception => e
-        # Oops, there was a problem inflating the stream
-        raise MalformedPDFError,
-          "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
+        begin
+          return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
+        rescue StandardError
+          # swallow this one too, so we can try some other fallback options
+        end
+        nil
       end
     end
   end

data/lib/pdf/reader/font.rb CHANGED Viewed

@@ -97,7 +97,13 @@ class PDF::Reader
       elsif @subtype == :Type3
         PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
       elsif @subtype == :TrueType
-        PDF::Reader::WidthCalculator::TrueType.new(self)
+        if @font_descriptor
+          PDF::Reader::WidthCalculator::TrueType.new(self)
+        else
+          # A TrueType font that isn't embedded. Most readers look for a version on the
+          # local system and fallback to a substitute. For now, we go straight to a substitute
+          PDF::Reader::WidthCalculator::BuiltIn.new(self)
+        end
       elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
         PDF::Reader::WidthCalculator::Composite.new(self)
       else
@@ -125,7 +131,9 @@ class PDF::Reader
       if obj[:ToUnicode]
         # ToUnicode is optional for Type1 and Type3
         stream = @ohash.object(obj[:ToUnicode])
-        @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+        if stream.is_a?(PDF::Reader::Stream)
+          @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+        end
       end
     end

data/lib/pdf/reader/object_hash.rb CHANGED Viewed

@@ -78,16 +78,7 @@ class PDF::Reader
         key = PDF::Reader::Reference.new(key.to_i, 0)
       end
-      if @cache.has_key?(key)
-        @cache[key]
-      elsif xref[key].is_a?(Integer)
-        buf = new_buffer(xref[key])
-        @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
-      elsif xref[key].is_a?(PDF::Reader::Reference)
-        container_key = xref[key]
-        object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
-        @cache[key] = object_streams[container_key][key.id]
-      end
+      @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
     rescue InvalidObjectError
       return default
     end
@@ -254,6 +245,26 @@ class PDF::Reader
     private
+    # parse a traditional object from the PDF, starting from the byte offset indicated
+    # in the xref table
+    #
+    def fetch_object(key)
+      # only keys whose xref entry is an Integer (a byte offset) are handled here;
+      # for anything else return nil so the caller can try another lookup
+      offset = xref[key]
+      return unless offset.is_a?(Integer)
+      parser = Parser.new(new_buffer(offset), self)
+      decrypt(key, parser.object(key.id, key.gen))
+    end
+    # parse an object that's embedded in an object stream in the PDF
+    #
+    def fetch_object_stream(key)
+      # an xref entry that is itself a Reference names the object stream holding the
+      # target; any other kind of entry is not handled here and yields nil
+      container_key = xref[key]
+      return unless container_key.is_a?(PDF::Reader::Reference)
+      # build the ObjectStream wrapper once per container and reuse it afterwards
+      object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
+      object_streams[container_key][key.id]
+    end
     # Private implementation of deref!, which exists to ensure the `seen` argument
     # isn't publicly available. It's used to avoid endless loops in the recursion, and
     # doesn't need to be part of the public API.
@@ -320,7 +331,9 @@ class PDF::Reader
     def decrypt(ref, obj)
       case obj
       when PDF::Reader::Stream then
-        obj.data = sec_handler.decrypt(obj.data, ref)
+        # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
+        # Therefore we shouldn't try to decrypt it.
+        obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
         obj
       when Hash                then
         arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)

data/lib/pdf/reader/orientation_detector.rb CHANGED Viewed

@@ -22,8 +22,8 @@ class PDF::Reader
     def detect_orientation
       llx,lly,urx,ury = @attributes[:MediaBox]
       rotation        = @attributes[:Rotate].to_i
-      width           = urx.to_i - llx.to_i
-      height          = ury.to_i - lly.to_i
+      width           = (urx.to_i - llx.to_i).abs
+      height          = (ury.to_i - lly.to_i).abs
       if width > height
         (rotation % 180).zero? ? 'landscape' : 'portrait'
       else

data/lib/pdf/reader/overlapping_runs_filter.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# coding: utf-8
+class PDF::Reader
+  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
+  # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
+  class OverlappingRunsFilter
+    # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
+    # have identical characters) then one will be discarded
+    OVERLAPPING_THRESHOLD = 0.5
+    # Sweep across the runs in order of x position and return a new array that omits
+    # every run judged redundant. A run is redundant when, at the point it starts, a
+    # run that is still open has the same text and is overlapped by at least
+    # OVERLAPPING_THRESHOLD. The input array is not modified.
+    def self.exclude_redundant_runs(runs)
+      open_runs = []
+      redundant = []
+      events = runs.flat_map { |run| [EventPoint.new(run.x, run), EventPoint.new(run.endx, run)] }
+      events.sort! { |a, b| a.x <=> b.x }
+      events.each do |event|
+        unless event.start?
+          # the sweep has passed the end of this run, so it can no longer obscure anything
+          open_runs.delete(event.run)
+          next
+        end
+        redundant << event.run if detect_intersection(open_runs, event)
+        open_runs.push(event.run)
+      end
+      runs - redundant
+    end
+    # Returns true if any run in sweep_line_status has the same text as the run starting
+    # at event_point, spans event_point's x position, and has at least
+    # OVERLAPPING_THRESHOLD of its area covered by that run.
+    def self.detect_intersection(sweep_line_status, event_point)
+      candidate = event_point.run
+      sweep_line_status.any? do |open_text_run|
+        # only identical text can be a duplicate; distinct characters that happen to
+        # overlap (tight kerning, accents, etc) must both be kept
+        open_text_run.text == candidate.text &&
+          event_point.x >= open_text_run.x &&
+          event_point.x <= open_text_run.endx &&
+          open_text_run.intersection_area_percent(candidate) >= OVERLAPPING_THRESHOLD
+      end
+    end
+  end
+  # Utility class used to avoid modifying the underlying TextRun objects while we're
+  # looking for duplicates
+  class EventPoint
+    # x is the position of this event; run is the TextRun the event belongs to
+    attr_reader :x, :run
+    def initialize(x, run)
+      @x = x
+      @run = run
+    end
+    # true when this event sits at its run's x, i.e. it marks where the run begins
+    def start?
+      x == run.x
+    end
+  end
+end

data/lib/pdf/reader/page.rb CHANGED Viewed

@@ -124,6 +124,34 @@ module PDF
         }.join(" ")
       end
+      # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
+      #
+      def rotate
+        # any value other than the four supported angles is treated as no rotation
+        angle = attributes[:Rotate].to_i
+        [0, 90, 180, 270].include?(angle) ? angle : 0
+      end
+      # returns the "boxes" that define the page object.
+      # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
+      #
+      def boxes
+        mediabox = attributes[:MediaBox]
+        # the attribute key is :CropBox (capital B), matching the other box keys; a
+        # lookup with any other casing never matches and would silently fall back to
+        # the media box even when the page declares a crop box
+        cropbox = attributes[:CropBox] || mediabox
+        # the bleed, trim and art boxes each default to the crop box when absent
+        {
+          MediaBox: objects.deref!(mediabox),
+          CropBox: objects.deref!(cropbox),
+          BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
+          TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
+          ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
+        }
+      end
       private
       def root

data/lib/pdf/reader/page_layout.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 # coding: utf-8
 # frozen_string_literal: true
+require 'pdf/reader/overlapping_runs_filter'
 class PDF::Reader
   # Takes a collection of TextRun objects and renders them into a single
@@ -15,22 +17,25 @@ class PDF::Reader
     def initialize(runs, mediabox)
       raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
-      @runs    = merge_runs(runs)
+      @runs    = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
       @mean_font_size   = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
       @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
       @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
-      @page_width  = mediabox[2] - mediabox[0]
-      @page_height = mediabox[3] - mediabox[1]
-      @x_offset = @runs.map(&:x).sort.first
+      @page_width  = (mediabox[2] - mediabox[0]).abs
+      @page_height = (mediabox[3] - mediabox[1]).abs
+      @x_offset = @runs.map(&:x).sort.first || 0
+      lowest_y = @runs.map(&:y).sort.first || 0
+      @y_offset = lowest_y > 0 ? 0 : lowest_y
     end
     def to_s
       return "" if @runs.empty?
+      return "" if row_count == 0
       page = row_count.times.map { |i| " " * col_count }
       @runs.each do |run|
         x_pos = ((run.x - @x_offset) / col_multiplier).round
-        y_pos = row_count - (run.y / row_multiplier).round
+        y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
         if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
           local_string_insert(page[y_pos-1], run.text, x_pos)
         end

data/lib/pdf/reader/page_state.rb CHANGED Viewed

@@ -30,7 +30,7 @@ class PDF::Reader
         @xobject_stack = [page.xobjects]
         @cs_stack      = [page.color_spaces]
         @stack         = [DEFAULT_GRAPHICS_STATE.dup]
-        state[:ctm]    = identity_matrix
+        state[:ctm]  = identity_matrix
       end
       #####################################################
@@ -322,11 +322,13 @@ class PDF::Reader
         th = state[:h_scaling]
         # optimise the common path to reduce Float allocations
         if th == 1 && tj == 0 && tc == 0 && tw == 0
-          glyph_width = w0 * fs
-          tx = glyph_width
+          tx = w0 * fs
+        elsif tj != 0
+          # don't apply spacing to TJ displacement
+          tx = (w0 - (tj/1000.0)) * fs * th
         else
-          glyph_width = ((w0 - (tj/1000.0)) * fs) * th
-          tx = glyph_width + ((tc + tw) * th)
+          # apply horizontal scaling to spacing values but not font size
+          tx = ((w0 * fs) + tc + tw) * th
         end
         # TODO: I'm pretty sure that tx shouldn't need to be divided by

data/lib/pdf/reader/page_text_receiver.rb CHANGED Viewed

@@ -41,13 +41,17 @@ module PDF
       # starting a new page
       def page=(page)
         @state = PageState.new(page)
+        @page = page
         @content = []
         @characters = []
         @mediabox = page.objects.deref(page.attributes[:MediaBox])
+        device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
+        device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
+        @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
       end
       def content
-        PageLayout.new(@characters, @mediabox).to_s
+        PageLayout.new(@characters, @device_mediabox).to_s
       end
       #####################################################
@@ -101,6 +105,8 @@ module PDF
         glyphs.each_with_index do |glyph_code, index|
           # paint the current glyph
           newx, newy = @state.trm_transform(0,0)
+          newx, newy = apply_rotation(newx, newy)
           utf8_chars = @state.current_font.to_utf8(glyph_code)
           # apply the glyph displacement for the current glyph so the next
@@ -115,6 +121,21 @@ module PDF
         end
       end
+      # Rotate the point (x, y) clockwise about the origin by the page's rotation and
+      # return the result as [x, y]. @page.rotate is one of 0, 90, 180 or 270; a value
+      # of 0 returns the point unchanged.
+      def apply_rotation(x, y)
+        case @page.rotate
+        when 90
+          [y, x * -1]
+        when 180
+          # a half turn negates both axes; negating only one would mirror the page
+          [x * -1, y * -1]
+        when 270
+          # three quarter turns clockwise: (x, y) -> (-y, x)
+          [y * -1, x]
+        else
+          [x, y]
+        end
+      end
     end
   end
 end

data/lib/pdf/reader/text_run.rb CHANGED Viewed

@@ -38,6 +38,10 @@ class PDF::Reader
       @endx ||= x + width
     end
+    # y plus the font size; computed on first use and then cached in @endy
+    def endy
+      return @endy if @endy
+      @endy = y + font_size
+    end
     def mean_character_width
       @width / character_count
     end
@@ -60,8 +64,28 @@ class PDF::Reader
       "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
     end
+    # true when the x range (x..endx) and the y range (y..endy) of this run both
+    # overlap or touch the corresponding ranges of other_run
+    def intersect?(other_run)
+      return false unless x <= other_run.endx && endx >= other_run.x
+      endy >= other_run.y && y <= other_run.endy
+    end
+    # return what fraction (between 0 and 1) of this text run's area is overlapped by another run
+    def intersection_area_percent(other_run)
+      return 0 unless intersect?(other_run)
+      # the shared region runs from the larger of the two starts to the smaller of
+      # the two ends on each axis
+      overlap_width = [endx, other_run.endx].min - [x, other_run.x].max
+      overlap_height = [endy, other_run.endy].min - [y, other_run.y].max
+      (overlap_width * overlap_height).to_f / area
+    end
     private
+    # (endx - x) multiplied by (endy - y)
+    def area
+      box_width = endx - x
+      box_height = endy - y
+      box_width * box_height
+    end
     def mergable_range
       @mergable_range ||= Range.new(endx - 3, endx + font_size)
     end