RubyGems - pdf-reader - Versions diffs - 0.7 → 0.7.1 - Mend

pdf-reader 0.7 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/CHANGELOG CHANGED

@@ -1,3 +1,8 @@
+v0.7.1 (6th May 2008)
+- Non-page strings (ie. metadata, etc) are now converted to UTF-8 more accurately
+- Fixed a regression between 0.6.2 and 0.7 that prevented difference tables from being applied
+  correctly when translating text into UTF-8
 v0.7 (6th May 2008)
 - API INCOMPATIBLE CHANGE: any hashes that are passed to callbacks use symbols as keys instead of PDF::Reader::Name instances.
 - Improved support for converting text in some PDF files to unicode
@@ -5,7 +10,7 @@ v0.7 (6th May 2008)
 - Include some basic metadata callbacks
 - Don't interpret a comment token (%) inside a string as a comment
 - Small fixes to improve 1.9 compatibility
-- Improved our Zlib deflating to make it more slightly more robust - still some more issues to work out though
+- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
 - Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
 - Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)

data/Rakefile CHANGED

@@ -6,7 +6,7 @@ require 'rake/testtask'
 require "rake/gempackagetask"
 require 'spec/rake/spectask'
-PKG_VERSION = "0.7"
+PKG_VERSION = "0.7.1"
 PKG_NAME = "pdf-reader"
 PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"

data/TODO CHANGED

@@ -10,7 +10,7 @@ v0.8
   from the Original encoding to Unicode.
 - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
 - Provide a way to get raw access to a particular object. Good for testing purposes
-- Improve interpretation of non content stream data (ie metadata). Use PDFDofEncoding, recognise UTF16 strings, recognise dates, etc
+- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
 - Support Cross Reference Streams (spec 3.4.7)
 v0.9

data/lib/pdf/reader/content.rb CHANGED

@@ -23,7 +23,6 @@
 #
 ################################################################################
 require 'stringio'
-#require 'enumerable'
 class PDF::Reader
   ################################################################################
@@ -254,7 +253,7 @@ class PDF::Reader
     ################################################################################
     # Begin processing the document metadata
     def metadata (info)
-      info = utf16_to_utf8(info)
+      info = decode_strings(info)
       callback(:metadata, [info]) if info
     end
     ################################################################################
@@ -430,16 +429,17 @@ class PDF::Reader
     end
     ################################################################################
     private
-    def utf16_to_utf8(obj)
+    # strings outside of page content should be in either PDFDocEncoding or UTF-16.
+    def decode_strings(obj)
       case obj
       when String then
         if obj[0,2] == "\376\377"
-          obj[2, obj.size-2].unpack("n*").pack("U*")
+          PDF::Reader::Encoding::UTF16Encoding.new.to_utf8(obj)
         else
-          obj
+          PDF::Reader::Encoding::PDFDocEncoding.new.to_utf8(obj)
         end
-      when Hash   then obj.each { |key,val| obj[key] = utf16_to_utf8(val) }
-      when Array  then obj.collect { |item| utf16_to_utf8(item) }
+      when Hash   then obj.each { |key,val| obj[key] = decode_strings(val) }
+      when Array  then obj.collect { |item| decode_strings(item) }
       else
         obj
       end

data/lib/pdf/reader/encoding.rb CHANGED

@@ -34,13 +34,13 @@ class PDF::Reader
     # set the differences table for this encoding. should be an array in the following format:
     #
-    #   [25, "A", 26, "B"]
+    #   [25, :A, 26, :B]
     #
     # The array alternates between a decimal byte number and a glyph name to map to that byte
     #
     # To save space the following array is also valid and equivalent to the previous one
     #
-    #   [25, "A", "B"]
+    #   [25, :A, :B]
     def differences=(diff)
       raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
@@ -498,6 +498,84 @@ class PDF::Reader
       end
     end
+    class PDFDocEncoding < Encoding
+      # convert a PDFDocEncoding string into UTF-8
+      def to_utf8(str, tounicode = nil)
+        array_pdf = str.unpack('C*')
+        array_pdf = self.process_differences(array_pdf)
+        array_enc = []
+        array_pdf.each do |num|
+          if tounicode && (code = tounicode.decode(num))
+            array_enc << code
+          elsif tounicode
+            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
+          else
+            case num
+              # change necessary characters to equivalent Unicode codepoints
+            when 0x18; array_enc << 0x02D8
+            when 0x19; array_enc << 0x02C7
+            when 0x1A; array_enc << 0x02C6
+            when 0x1B; array_enc << 0x02D9
+            when 0x1C; array_enc << 0x02DD
+            when 0x1D; array_enc << 0x02DB
+            when 0x1E; array_enc << 0x02DA
+            when 0x1F; array_enc << 0x02DC
+            when 0x7F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
+            when 0x80; array_enc << 0x2022
+            when 0x81; array_enc << 0x2020
+            when 0x82; array_enc << 0x2021
+            when 0x83; array_enc << 0x2026
+            when 0x84; array_enc << 0x2014
+            when 0x85; array_enc << 0x2013
+            when 0x86; array_enc << 0x0192
+            when 0x87; array_enc << 0x2044
+            when 0x88; array_enc << 0x2039
+            when 0x89; array_enc << 0x203A
+            when 0x8A; array_enc << 0x2212
+            when 0x8B; array_enc << 0x2030
+            when 0x8C; array_enc << 0x201E
+            when 0x8D; array_enc << 0x201C
+            when 0x8E; array_enc << 0x201D
+            when 0x8F; array_enc << 0x2018
+            when 0x90; array_enc << 0x2019
+            when 0x91; array_enc << 0x201A
+            when 0x92; array_enc << 0x2122
+            when 0x93; array_enc << 0xFB01
+            when 0x94; array_enc << 0xFB02
+            when 0x95; array_enc << 0x0141
+            when 0x96; array_enc << 0x0152
+            when 0x97; array_enc << 0x0160
+            when 0x98; array_enc << 0x0178
+            when 0x99; array_enc << 0x017D
+            when 0x9A; array_enc << 0x0131
+            when 0x9B; array_enc << 0x0142
+            when 0x9C; array_enc << 0x0153
+            when 0x9D; array_enc << 0x0161
+            when 0x9E; array_enc << 0x017E
+            when 0x9F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
+            when 0xA0; array_enc << 0x20AC
+            else
+              array_enc << num
+            end
+          end
+        end
+        # convert any glyph names to unicode codepoints
+        array_enc = self.process_glyphnames(array_enc)
+        # replace characters that didn't convert to unicode nicely with something valid
+        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
+        # pack all our Unicode codepoints into a UTF-8 string
+        ret = array_enc.pack("U*")
+        # set the strings encoding correctly under ruby 1.9+
+        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
+        return ret
+      end
+    end
     class StandardEncoding < Encoding
       # convert an Adobe Standard Encoding string into UTF-8
       def to_utf8(str, tounicode = nil)
@@ -771,6 +849,23 @@ class PDF::Reader
       end
     end
+    class UTF16Encoding < Encoding
+      # convert a UTF-16 string into UTF-8
+      def to_utf8(str, tounicode = nil)
+        # remove the UTF-16 Byte Order Mark if it exists
+        str = str[2, str.size-2] if str[0,2] == "\376\377"
+        # convert away
+        str = str.unpack("n*").pack("U*")
+        # set the strings encoding correctly under ruby 1.9+
+        str.force_encoding("UTF-8") if str.respond_to?(:force_encoding)
+        return str
+      end
+    end
     class WinAnsiEncoding < Encoding
       # convert a WinAnsiEncoding string into UTF-8
       def to_utf8(str, tounicode = nil)

data/lib/pdf/reader/font.rb CHANGED

@@ -39,7 +39,7 @@ class PDF::Reader
         File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
           f.each do |l|
             m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
-            @@glyphs[name] = "0x#{code}".hex if name
+            @@glyphs[name.to_sym] = "0x#{code}".hex if name
           end
         end
       end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pdf-reader
 version: !ruby/object:Gem::Version
-  version: "0.7"
+  version: 0.7.1
 platform: ruby
 authors:
 - Peter Jones