RubyGems - pdf-reader - Versions diffs - 2.2.1 → 2.3.0 - Mend

pdf-reader 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/CHANGELOG +7 -0
data/lib/pdf/reader/buffer.rb +1 -1
data/lib/pdf/reader/cmap.rb +8 -0
data/lib/pdf/reader/encoding.rb +10 -8
data/lib/pdf/reader/object_hash.rb +21 -10
data/lib/pdf/reader/overlapping_runs_filter.rb +66 -0
data/lib/pdf/reader/page_layout.rb +3 -1
data/lib/pdf/reader/text_run.rb +24 -0
data/lib/pdf/reader/xref.rb +7 -4
metadata +4 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e1d87a1e4cc6989cb579c5c720ebe8277ab8099a2a6d7044a5c6f843cfabe2a7
-  data.tar.gz: '02693bdcc7d21572494ffa3f7e4a7e7ecaa558601951590f6c348c97c892ead3'
+  metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
+  data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
 SHA512:
-  metadata.gz: ae3845f040bff4089ba8e4b2df1e22c10ddea1019475e4525b89fdf3889ffd904f98c72162cfe451ff7cfbe2f697e9462d5b2efe4a4144fdfa34568343c51f2c
-  data.tar.gz: 26755a0cc78cd490e7013f548ed8b46999629995109986f4ee474fff430fd77913888e6172512630118a2b99ef14546a3e94a38681ff66ac9b80482f7504351b
+  metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
+  data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33

data/CHANGELOG CHANGED

@@ -1,3 +1,10 @@
+v2.3.0 (7th November 2019)
+- Text extraction now makes an effort to skip duplicate characters that overlap, a
+  common approach used for a fake "bold" effect. This will make text extraction a bit
+  slower - if that turns out to be an issue I'll look into further optimisations or
+  provide a toggle to turn it off
+- Several small bug fixes
 v2.2.1 (27th July 2019)
 - Improve utf8 text extraction from CMaps that contain surrogate pair ligatures

data/lib/pdf/reader/buffer.rb CHANGED

@@ -55,7 +55,7 @@ class PDF::Reader
     #
     # Params:
     #
-    #   io - an IO stream or string with the raw data to tokenise
+    #   io - an IO stream (usually a StringIO) with the raw data to tokenise
     #
     # options:
     #

data/lib/pdf/reader/cmap.rb CHANGED

@@ -96,6 +96,14 @@ class PDF::Reader
       Parser.new(buffer)
     end
+    # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
+    # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
+    #
+    #    str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
+    #
+    # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
+    # exception when we try converting broken UTF-16 to UTF-8
+    #
     def str_to_int(str)
       return nil if str.nil? || str.size == 0
       unpacked_string = if str.bytesize == 1 # UTF-8

data/lib/pdf/reader/encoding.rb CHANGED

@@ -40,20 +40,22 @@ class PDF::Reader
       @mapping  = default_mapping # maps from character codes to Unicode codepoints
       @string_cache  = {} # maps from character codes to UTF-8 strings.
-      if enc.kind_of?(Hash)
-        self.differences = enc[:Differences] if enc[:Differences]
-        enc = enc[:Encoding] || enc[:BaseEncoding]
+      @enc_name = if enc.kind_of?(Hash)
+        enc[:Encoding] || enc[:BaseEncoding]
       elsif enc != nil
-        enc = enc.to_sym
+        enc.to_sym
       else
-        enc = nil
+        nil
       end
-      @enc_name = enc
-      @unpack   = get_unpack(enc)
-      @map_file = get_mapping_file(enc)
+      @unpack   = get_unpack(@enc_name)
+      @map_file = get_mapping_file(@enc_name)
       load_mapping(@map_file) if @map_file
+      if enc.is_a?(Hash) && enc[:Differences]
+        self.differences = enc[:Differences]
+      end
     end
     # set the differences table for this encoding. should be an array in the following format:

data/lib/pdf/reader/object_hash.rb CHANGED

@@ -78,16 +78,7 @@ class PDF::Reader
         key = PDF::Reader::Reference.new(key.to_i, 0)
       end
-      if @cache.has_key?(key)
-        @cache[key]
-      elsif xref[key].is_a?(Integer)
-        buf = new_buffer(xref[key])
-        @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
-      elsif xref[key].is_a?(PDF::Reader::Reference)
-        container_key = xref[key]
-        object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
-        @cache[key] = object_streams[container_key][key.id]
-      end
+      @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
     rescue InvalidObjectError
       return default
     end
@@ -254,6 +245,26 @@ class PDF::Reader
     private
+    # parse a traditional object from the PDF, starting from the byte offset indicated
+    # in the xref table
+    #
+    def fetch_object(key)
+      if xref[key].is_a?(Integer)
+        buf = new_buffer(xref[key])
+        decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
+      end
+    end
+    # parse a object that's embedded in an object stream in the PDF
+    #
+    def fetch_object_stream(key)
+      if xref[key].is_a?(PDF::Reader::Reference)
+        container_key = xref[key]
+        object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
+        object_streams[container_key][key.id]
+      end
+    end
     # Private implementation of deref!, which exists to ensure the `seen` argument
     # isn't publicly available. It's used to avoid endless loops in the recursion, and
     # doesn't need to be part of the public API.

data/lib/pdf/reader/overlapping_runs_filter.rb ADDED

@@ -0,0 +1,66 @@
+# coding: utf-8
+class PDF::Reader
+  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
+  # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
+  class OverlappingRunsFilter
+    # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
+    # have identical characters) then one will be discarded
+    OVERLAPPING_THRESHOLD = 0.5
+    def self.exclude_redundant_runs(runs)
+      sweep_line_status = Array.new
+      event_point_schedule = Array.new
+      to_exclude = []
+      runs.each do |run|
+        event_point_schedule << EventPoint.new(run.x, run)
+        event_point_schedule << EventPoint.new(run.endx, run)
+      end
+      event_point_schedule.sort! { |a,b| a.x <=> b.x }
+      while not event_point_schedule.empty? do
+        event_point = event_point_schedule.shift
+        break unless event_point
+        if event_point.start? then
+          if detect_intersection(sweep_line_status, event_point)
+            to_exclude << event_point.run
+          end
+          sweep_line_status.push event_point
+        else
+          sweep_line_status.delete event_point
+        end
+      end
+      runs - to_exclude
+    end
+    def self.detect_intersection(sweep_line_status, event_point)
+      sweep_line_status.each do |point_in_sls|
+        if event_point.x >= point_in_sls.run.x &&
+            event_point.x <= point_in_sls.run.endx &&
+            point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
+          return true
+        end
+      end
+      return false
+    end
+  end
+  # Utility class used to avoid modifying the underlying TextRun objects while we're
+  # looking for duplicates
+  class EventPoint
+    attr_reader :x, :run
+    def initialize x, run
+      @x, @run = x, run
+    end
+    def start?
+      @x == @run.x
+    end
+  end
+end

data/lib/pdf/reader/page_layout.rb CHANGED

@@ -1,6 +1,8 @@
 # coding: utf-8
 # frozen_string_literal: true
+require 'pdf/reader/overlapping_runs_filter'
 class PDF::Reader
   # Takes a collection of TextRun objects and renders them into a single
@@ -15,7 +17,7 @@ class PDF::Reader
     def initialize(runs, mediabox)
       raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
-      @runs    = merge_runs(runs)
+      @runs    = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
       @mean_font_size   = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
       @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
       @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0

data/lib/pdf/reader/text_run.rb CHANGED

@@ -38,6 +38,10 @@ class PDF::Reader
       @endx ||= x + width
     end
+    def endy
+      @endy ||= y + font_size
+    end
     # Average width of a single character in this run: the run's width divided by
     # character_count.
     # NOTE(review): if both operands are Integers this is integer division - confirm the
     # types supplied by callers.
     def mean_character_width
       @width / character_count
     end
@@ -60,8 +64,28 @@ class PDF::Reader
       "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
     end
+    def intersect?(other_run)
+      x <= other_run.endx && endx >= other_run.x &&
+        endy >= other_run.y && y <= other_run.endy
+    end
+    # return what percentage of this text run is overlapped by another run
+    def intersection_area_percent(other_run)
+      return 0 unless intersect?(other_run)
+      dx = [endx, other_run.endx].min - [x, other_run.x].max
+      dy = [endy, other_run.endy].min - [y, other_run.y].max
+      intersection_area = dx*dy
+      intersection_area.to_f / area
+    end
     private
+    def area
+      (endx - x) * (endy - y)
+    end
     # Memoised range of x values running from 3 units before endx to font_size units after
     # it (presumably the window in which a following run may be merged - confirm at caller)
     def mergable_range
       return @mergable_range if @mergable_range
       @mergable_range = (endx - 3)..(endx + font_size)
     end

data/lib/pdf/reader/xref.rb CHANGED

@@ -230,18 +230,21 @@ class PDF::Reader
     # should always be 0, but all sort of crazy junk is prefixed to PDF files
     # in the real world.
     #
-    # Checks up to 50 chars into the file, returns nil if no PDF data detected.
+    # Checks up to 1024 chars into the file,
+    # returns nil if no PDF data detected.
+    # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
+    # header appear somewhere within the first 1024 bytes of the file
     #
     def calc_junk_offset(io)
       io.rewind
       offset = io.pos
-      until (c = io.readchar) == '%' || c == 37 || offset > 50
+      until (c = io.readchar) == '%' || c == 37 || offset > 1024
         offset += 1
       end
       io.rewind
-      offset < 50 ? offset : nil
+      offset < 1024 ? offset : nil
     rescue EOFError
-      return nil
+      nil
     end
   end
   ################################################################################

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 2.2.1
+  version: 2.3.0
 platform: ruby
 authors:
 - James Healy
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-07-27 00:00:00.000000000 Z
+date: 2019-11-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -246,6 +246,7 @@ files:
 - lib/pdf/reader/object_hash.rb
 - lib/pdf/reader/object_stream.rb
 - lib/pdf/reader/orientation_detector.rb
+- lib/pdf/reader/overlapping_runs_filter.rb
 - lib/pdf/reader/page.rb
 - lib/pdf/reader/page_layout.rb
 - lib/pdf/reader/page_state.rb
@@ -295,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.1
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: A library for accessing the content of PDF files