RubyGems - pdf-reader - Versions diffs - 2.2.0 → 2.2.1 - Mend

pdf-reader 2.2.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/CHANGELOG +3 -0
data/lib/pdf/reader/cmap.rb +13 -12
data/lib/pdf/reader/orientation_detector.rb +2 -2
metadata +4 -5

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cfc4ed13692a51d8b78fc181d67fcf8b5e00fb1679dbca36137961f63365edaf
-  data.tar.gz: de5556fabc41642746fd242a2623c92c9424c56da2d845507c49624c312b646b
+  metadata.gz: e1d87a1e4cc6989cb579c5c720ebe8277ab8099a2a6d7044a5c6f843cfabe2a7
+  data.tar.gz: '02693bdcc7d21572494ffa3f7e4a7e7ecaa558601951590f6c348c97c892ead3'
 SHA512:
-  metadata.gz: 4074d5dd87f1ad9286f4022ad46a4160f44c6afed2341f9115029770770ae80b248ace9a8d5df0e444046bed662f9aa5a9334822b23222abec9574523d9e7c36
-  data.tar.gz: a69837921f7581d2aeb9226d0791b4b0dd5925a9f83e9cb4cee4dbaf43af33e6a7a570292650a14006ffc9d1759f2ea4ef268381e5aa63fc6da5c1a6d38f46a7
+  metadata.gz: ae3845f040bff4089ba8e4b2df1e22c10ddea1019475e4525b89fdf3889ffd904f98c72162cfe451ff7cfbe2f697e9462d5b2efe4a4144fdfa34568343c51f2c
+  data.tar.gz: 26755a0cc78cd490e7013f548ed8b46999629995109986f4ee474fff430fd77913888e6172512630118a2b99ef14546a3e94a38681ff66ac9b80482f7504351b

data/CHANGELOG CHANGED

@@ -1,3 +1,6 @@
+v2.2.1 (27th July 2019)
+- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
 v2.2.0 (18th December 2018)
 - Support additional XRef Stream variants (thanks Stefan Wienert)
 - Add frozen_strings pragma to reduce object allocations on ruby 2.3+

data/lib/pdf/reader/cmap.rb CHANGED

@@ -98,23 +98,24 @@ class PDF::Reader
     def str_to_int(str)
       return nil if str.nil? || str.size == 0
-      unpacked_string = if str.size == 1 # UTF-8
+      unpacked_string = if str.bytesize == 1 # UTF-8
         str.unpack("C*")
       else # UTF-16
          str.unpack("n*")
       end
-      if unpacked_string.size == 1
-        unpacked_string
-      elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
-        # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
-        # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
-        # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
-        [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
-      else
-        # it is a bad idea to just return the first 16 bits, as this doesn't allow
-        # for ligatures for example fi (U+0066 U+0069)
-        unpacked_string
+      result = []
+      while unpacked_string.any? do
+        if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
+          # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
+          # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
+          # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
+          points = [unpacked_string.shift, unpacked_string.shift]
+          result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
+        else
+          result << unpacked_string.shift
+        end
       end
+      result
     end
     def process_bfchar_instructions(instructions)

data/lib/pdf/reader/orientation_detector.rb CHANGED

@@ -25,9 +25,9 @@ class PDF::Reader
       width           = urx.to_i - llx.to_i
       height          = ury.to_i - lly.to_i
       if width > height
-        [0,180].include?(rotation) ? 'landscape' : 'portrait'
+        (rotation % 180).zero? ? 'landscape' : 'portrait'
       else
-        [0,180].include?(rotation) ? 'portrait' : 'landscape'
+        (rotation % 180).zero? ? 'portrait' : 'landscape'
       end
     end
   end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: 2.2.0
+  version: 2.2.1
 platform: ruby
 authors:
 - James Healy
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-12-18 00:00:00.000000000 Z
+date: 2019-07-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -167,7 +167,7 @@ dependencies:
 description: The PDF::Reader library implements a PDF parser conforming as much as
   possible to the PDF specification from Adobe
 email:
-- jimmy@deefa.com
+- james@yob.id.au
 executables:
 - pdf_object
 - pdf_text
@@ -295,8 +295,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.0.1
 signing_key:
 specification_version: 4
 summary: A library for accessing the content of PDF files