RubyGems - hexapdf - Versions diffs - 0.3.0 → 0.4.0 - Mend

hexapdf 0.3.0 → 0.4.0

Files changed (142) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +68 -0
data/CONTRIBUTERS +1 -1
data/README.md +35 -4
data/Rakefile +1 -0
data/VERSION +1 -1
data/data/hexapdf/cmap/83pv-RKSJ-H +314 -0
data/data/hexapdf/cmap/90ms-RKSJ-H +259 -0
data/data/hexapdf/cmap/90ms-RKSJ-V +156 -0
data/data/hexapdf/cmap/90msp-RKSJ-H +257 -0
data/data/hexapdf/cmap/90msp-RKSJ-V +155 -0
data/data/hexapdf/cmap/90pv-RKSJ-H +355 -0
data/data/hexapdf/cmap/Add-RKSJ-H +738 -0
data/data/hexapdf/cmap/Add-RKSJ-V +135 -0
data/data/hexapdf/cmap/Adobe-CNS1-UCS2 +18209 -0
data/data/hexapdf/cmap/Adobe-GB1-UCS2 +14267 -0
data/data/hexapdf/cmap/Adobe-Japan1-UCS2 +19159 -0
data/data/hexapdf/cmap/Adobe-Korea1-UCS2 +9267 -0
data/data/hexapdf/cmap/B5pc-H +337 -0
data/data/hexapdf/cmap/B5pc-V +90 -0
data/data/hexapdf/cmap/CNS-EUC-H +490 -0
data/data/hexapdf/cmap/CNS-EUC-V +538 -0
data/data/hexapdf/cmap/ETen-B5-H +343 -0
data/data/hexapdf/cmap/ETen-B5-V +91 -0
data/data/hexapdf/cmap/ETenms-B5-H +79 -0
data/data/hexapdf/cmap/ETenms-B5-V +99 -0
data/data/hexapdf/cmap/EUC-H +207 -0
data/data/hexapdf/cmap/EUC-V +105 -0
data/data/hexapdf/cmap/Ext-RKSJ-H +768 -0
data/data/hexapdf/cmap/Ext-RKSJ-V +117 -0
data/data/hexapdf/cmap/GB-EUC-H +173 -0
data/data/hexapdf/cmap/GB-EUC-V +98 -0
data/data/hexapdf/cmap/GBK-EUC-H +4273 -0
data/data/hexapdf/cmap/GBK-EUC-V +97 -0
data/data/hexapdf/cmap/GBK2K-H +5325 -0
data/data/hexapdf/cmap/GBK2K-V +118 -0
data/data/hexapdf/cmap/GBKp-EUC-H +4272 -0
data/data/hexapdf/cmap/GBKp-EUC-V +97 -0
data/data/hexapdf/cmap/GBpc-EUC-H +175 -0
data/data/hexapdf/cmap/GBpc-EUC-V +98 -0
data/data/hexapdf/cmap/H +200 -0
data/data/hexapdf/cmap/HKscs-B5-H +1331 -0
data/data/hexapdf/cmap/HKscs-B5-V +90 -0
data/data/hexapdf/cmap/Identity-H +339 -0
data/data/hexapdf/cmap/Identity-V +73 -0
data/data/hexapdf/cmap/KSC-EUC-H +562 -0
data/data/hexapdf/cmap/KSC-EUC-V +94 -0
data/data/hexapdf/cmap/KSCms-UHC-H +776 -0
data/data/hexapdf/cmap/KSCms-UHC-HW-H +775 -0
data/data/hexapdf/cmap/KSCms-UHC-HW-V +93 -0
data/data/hexapdf/cmap/KSCms-UHC-V +94 -0
data/data/hexapdf/cmap/KSCpc-EUC-H +608 -0
data/data/hexapdf/cmap/LICENSE.txt +26 -0
data/data/hexapdf/cmap/README.txt +9 -0
data/data/hexapdf/cmap/UniCNS-UCS2-H +16992 -0
data/data/hexapdf/cmap/UniCNS-UCS2-V +90 -0
data/data/hexapdf/cmap/UniCNS-UTF16-H +19117 -0
data/data/hexapdf/cmap/UniCNS-UTF16-V +94 -0
data/data/hexapdf/cmap/UniGB-UCS2-H +14321 -0
data/data/hexapdf/cmap/UniGB-UCS2-V +101 -0
data/data/hexapdf/cmap/UniGB-UTF16-H +14381 -0
data/data/hexapdf/cmap/UniGB-UTF16-V +104 -0
data/data/hexapdf/cmap/UniJIS-UCS2-H +8870 -0
data/data/hexapdf/cmap/UniJIS-UCS2-HW-H +81 -0
data/data/hexapdf/cmap/UniJIS-UCS2-HW-V +279 -0
data/data/hexapdf/cmap/UniJIS-UCS2-V +275 -0
data/data/hexapdf/cmap/UniJIS-UTF16-H +14450 -0
data/data/hexapdf/cmap/UniJIS-UTF16-V +299 -0
data/data/hexapdf/cmap/UniKS-UCS2-H +8725 -0
data/data/hexapdf/cmap/UniKS-UCS2-V +95 -0
data/data/hexapdf/cmap/UniKS-UTF16-H +8895 -0
data/data/hexapdf/cmap/UniKS-UTF16-V +99 -0
data/data/hexapdf/cmap/V +105 -0
data/examples/arc.rb +3 -3
data/examples/merging.rb +4 -1
data/examples/optimizing.rb +3 -0
data/examples/show_char_bboxes.rb +2 -2
data/examples/truetype.rb +2 -2
data/lib/hexapdf/cli.rb +40 -1
data/lib/hexapdf/cli/batch.rb +72 -0
data/lib/hexapdf/cli/command.rb +112 -15
data/lib/hexapdf/cli/files.rb +2 -2
data/lib/hexapdf/cli/images.rb +14 -6
data/lib/hexapdf/cli/info.rb +6 -8
data/lib/hexapdf/cli/inspect.rb +5 -8
data/lib/hexapdf/cli/merge.rb +13 -20
data/lib/hexapdf/cli/modify.rb +4 -7
data/lib/hexapdf/cli/optimize.rb +2 -5
data/lib/hexapdf/configuration.rb +32 -3
data/lib/hexapdf/content/canvas.rb +130 -37
data/lib/hexapdf/content/parser.rb +40 -6
data/lib/hexapdf/content/processor.rb +4 -4
data/lib/hexapdf/document.rb +40 -10
data/lib/hexapdf/document/fonts.rb +1 -0
data/lib/hexapdf/encryption/security_handler.rb +8 -12
data/lib/hexapdf/filter/flate_decode.rb +25 -2
data/lib/hexapdf/font/cmap.rb +124 -8
data/lib/hexapdf/font/cmap/parser.rb +65 -15
data/lib/hexapdf/font/encoding/base.rb +2 -2
data/lib/hexapdf/font/encoding/glyph_list.rb +2 -4
data/lib/hexapdf/font/true_type.rb +1 -0
data/lib/hexapdf/font/true_type/builder.rb +75 -0
data/lib/hexapdf/font/true_type/optimizer.rb +65 -0
data/lib/hexapdf/font/true_type/subsetter.rb +9 -22
data/lib/hexapdf/font/true_type_wrapper.rb +9 -21
data/lib/hexapdf/font_loader.rb +1 -1
data/lib/hexapdf/importer.rb +1 -1
data/lib/hexapdf/serializer.rb +5 -3
data/lib/hexapdf/type.rb +2 -0
data/lib/hexapdf/type/cid_font.rb +120 -0
data/lib/hexapdf/type/font.rb +32 -12
data/lib/hexapdf/type/font_simple.rb +34 -42
data/lib/hexapdf/type/font_type0.rb +148 -0
data/lib/hexapdf/type/form.rb +4 -4
data/lib/hexapdf/type/page.rb +12 -11
data/lib/hexapdf/type/resources.rb +14 -0
data/lib/hexapdf/utils/graphics_helpers.rb +77 -0
data/lib/hexapdf/version.rb +1 -1
data/man/man1/hexapdf.1 +43 -1
data/test/hexapdf/content/test_canvas.rb +76 -0
data/test/hexapdf/content/test_parser.rb +20 -1
data/test/hexapdf/content/test_processor.rb +11 -7
data/test/hexapdf/document/test_fonts.rb +3 -1
data/test/hexapdf/font/cmap/test_parser.rb +42 -7
data/test/hexapdf/font/encoding/test_base.rb +1 -1
data/test/hexapdf/font/encoding/test_glyph_list.rb +3 -3
data/test/hexapdf/font/test_cmap.rb +104 -0
data/test/hexapdf/font/test_true_type_wrapper.rb +63 -46
data/test/hexapdf/font/true_type/test_builder.rb +37 -0
data/test/hexapdf/font/true_type/test_optimizer.rb +27 -0
data/test/hexapdf/font/true_type/test_subsetter.rb +6 -13
data/test/hexapdf/test_configuration.rb +12 -7
data/test/hexapdf/test_document.rb +24 -0
data/test/hexapdf/test_importer.rb +9 -1
data/test/hexapdf/test_writer.rb +2 -2
data/test/hexapdf/type/test_cid_font.rb +61 -0
data/test/hexapdf/type/test_font.rb +31 -4
data/test/hexapdf/type/test_font_simple.rb +6 -21
data/test/hexapdf/type/test_font_type0.rb +114 -0
data/test/hexapdf/type/test_resources.rb +17 -1
data/test/hexapdf/utils/test_graphics_helpers.rb +29 -0
metadata +82 -3

data/lib/hexapdf/content/parser.rb CHANGED Viewed

@@ -33,6 +33,7 @@
 require 'stringio'
 require 'hexapdf/tokenizer'
+require 'hexapdf/content/processor'
 module HexaPDF
   module Content
@@ -45,6 +46,9 @@ module HexaPDF
     # See: PDF1.7 s7.2
     class Tokenizer < HexaPDF::Tokenizer #:nodoc:
+      # The string that is tokenized.
+      attr_reader :string
       # Creates a new tokenizer.
       def initialize(string)
         @ss = StringScanner.new(string)
@@ -168,6 +172,8 @@ module HexaPDF
       private
+      MAX_TOKEN_CHECK = 5 #:nodoc:
       # Parses the inline image at the current position.
       def parse_inline_image(tokenizer)
         # BI has already been read, so read the image dictionary
@@ -190,13 +196,41 @@ module HexaPDF
         # one whitespace character after ID
         tokenizer.next_byte
-        # find the EI operator
-        data = tokenizer.scan_until(/(?=EI[#{Tokenizer::WHITESPACE}])/o)
-        if data.nil?
-          raise HexaPDF::Error, "End inline image marker EI not found"
+        real_end_found = false
+        image_data = ''.b
+        # find the EI operator and handle EI appearing inside the image data
+        until real_end_found
+          data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
+          if data.nil?
+            raise HexaPDF::Error, "End inline image marker EI not found"
+          end
+          image_data << data
+          tokenizer.pos += 2
+          last_pos = tokenizer.pos
+          # Check if we found EI inside of the image data
+          count = 0
+          while count < MAX_TOKEN_CHECK
+            token = tokenizer.next_object(allow_keyword: true) rescue break
+            if token == Tokenizer::NO_MORE_TOKENS
+              count += MAX_TOKEN_CHECK
+            elsif token.kind_of?(Tokenizer::Token) &&
+                !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
+              break #  invalid token
+            end
+            count += 1
+          end
+          if count >= MAX_TOKEN_CHECK
+            real_end_found = true
+          else
+            image_data << "EI"
+          end
+          tokenizer.pos = last_pos
         end
-        tokenizer.pos += 3
-        [dict, data]
+        [dict, image_data]
       end
     end

data/lib/hexapdf/content/processor.rb CHANGED Viewed

@@ -408,7 +408,7 @@ module HexaPDF
       def decode_horizontal_text(array)
         font = graphics_state.font
         scaled_char_space = graphics_state.scaled_character_spacing
-        scaled_word_space = graphics_state.scaled_word_spacing
+        scaled_word_space = (font.word_spacing_applicable? ? graphics_state.scaled_word_spacing : 0)
         scaled_font_size = graphics_state.scaled_font_size
         below_baseline = font.bounding_box[1] * scaled_font_size / \
@@ -423,15 +423,15 @@ module HexaPDF
           else
             font.decode(item).each do |code_point|
               char = font.to_utf8(code_point)
-              width = font.width(code_point) * scaled_font_size
+              width = font.width(code_point) * scaled_font_size + scaled_char_space + \
+                (code_point == 32 ? scaled_word_space : 0)
               matrix = graphics_state.ctm.dup.premultiply(*graphics_state.tm)
               fragment = GlyphBox.new(code_point, char,
                                       *matrix.evaluate(0, below_baseline),
                                       *matrix.evaluate(width, below_baseline),
                                       *matrix.evaluate(0, above_baseline))
               text << fragment
-              graphics_state.tm.translate(width + scaled_char_space + \
-                                          (char == ' ' ? scaled_word_space : 0), 0)
+              graphics_state.tm.translate(width, 0)
             end
           end
         end

data/lib/hexapdf/document.rb CHANGED Viewed

@@ -135,6 +135,7 @@ module HexaPDF
       end
       @listeners = {}
+      @cache = Hash.new {|h, k| h[k] = {} }
     end
     # :call-seq:
@@ -315,25 +316,24 @@ module HexaPDF
       if type.kind_of?(Class)
         klass = type
       else
-        default = if data.stream
-                    HexaPDF::Stream
-                  elsif data.value.kind_of?(Hash)
-                    HexaPDF::Dictionary
-                  else
-                    HexaPDF::Object
-                  end
         if data.value.kind_of?(Hash)
           type ||= deref(data.value[:Type])
           subtype ||= deref(data.value[:Subtype])
         end
         if subtype
-          klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype)
+          klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype) { nil }
         end
         if type && !klass
-          klass = GlobalConfiguration.constantize('object.type_map'.freeze, type)
+          klass = GlobalConfiguration.constantize('object.type_map'.freeze, type) { nil }
         end
-        klass ||= default
+        klass ||= if data.stream
+                    HexaPDF::Stream
+                  elsif data.value.kind_of?(Hash)
+                    HexaPDF::Dictionary
+                  else
+                    HexaPDF::Object
+                  end
       end
       klass.new(data, document: self)
@@ -418,6 +418,36 @@ module HexaPDF
       @listeners[name] && @listeners[name].each {|obj| obj.call(*args)}
     end
+    # Caches the value or the return value of the given block using the given Object::PDFData and
+    # key arguments as composite hash key. If a cached value already exists, it is just returned.
+    #
+    # This facility can be used to cache expensive operations in PDF objects that are easy to
+    # compute again.
+    #
+    # Use #clear_cache to clear the cache if necessary.
+    def cache(pdf_data, key, value = nil)
+      @cache[pdf_data][key] ||= value || yield
+    end
+    # Returns +true+ if there is a value cached for the composite key consisting of the given
+    # +pdf_data+ and +key+ objects.
+    #
+    # Also see: #cache
+    def cached?(pdf_data, key)
+      @cache.key?(pdf_data) && @cache[pdf_data].key?(key)
+    end
+    # Clears all cached data or, if a Object::PDFData object is given, just the cache for this one
+    # object.
+    #
+    # It is *not* recommended to clear the whole cache! Better clear the cache for individual PDF
+    # objects!
+    #
+    # Also see: #cache
+    def clear_cache(pdf_data = nil)
+      pdf_data ? @cache[pdf_data].clear : @cache.clear
+    end
     # Returns the Pages object that provides convenience methods for working with pages.
     #
     # Also see: HexaPDF::Type::PageTreeNode

data/lib/hexapdf/document/fonts.rb CHANGED Viewed

@@ -55,6 +55,7 @@ module HexaPDF
       #
       # If a font with the same parameters has been loaded before, the cached font object is used.
       def load(name, **options)
+        options[:variant] ||= :none # assign default value for consistency with caching
         font = @loaded_fonts_cache[[name, options]]
         return font if font

data/lib/hexapdf/encryption/security_handler.rb CHANGED Viewed

@@ -143,12 +143,10 @@ module HexaPDF
       #
       # See: #set_up_encryption (for the common encryption options).
       def self.set_up_encryption(document, handler_name, **options)
-        handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', handler_name)
-        if handler.nil?
-          handler = HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name)
-        end
-        if handler.nil?
-          raise HexaPDF::EncryptionError, "Could not find the specified security handler"
+        handler = GlobalConfiguration.constantize('encryption.filter_map', handler_name) do
+          GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name) do
+            raise HexaPDF::EncryptionError, "Could not find the specified security handler"
+          end
         end
         handler = handler.new(document)
@@ -172,12 +170,10 @@ module HexaPDF
         if dict.nil?
           raise HexaPDF::EncryptionError, "No /Encrypt dictionary found"
         end
-        handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter])
-        if handler.nil?
-          handler = HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter])
-        end
-        if handler.nil?
-          raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
+        handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter]) do
+          HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter]) do
+            raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
+          end
         end
         handler = handler.new(document)

data/lib/hexapdf/filter/flate_decode.rb CHANGED Viewed

@@ -45,10 +45,33 @@ module HexaPDF
     # See: HexaPDF::Filter, PDF1.7 s7.4.4
     module FlateDecode
+      class Pool #:nodoc:
+        # Creates a new Zlib::Stream pool. A block must be given that returns a new Zlib::Stream
+        # instance.
+        def initialize(&block)
+          @creator = block
+          @pool = []
+        end
+        # Returns the next available stream of the pool, already reset to its initial state.
+        def next_available
+          @pool.find(-> { e = @creator.call; @pool << e; e }, &:finished?).tap(&:reset)
+        end
+      end
+      @inflate_pool = Pool.new { Zlib::Inflate.new }
+      @deflate_pool = Pool.new do
+        Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'],
+                          Zlib::MAX_WBITS,
+                          HexaPDF::GlobalConfiguration['filter.flate_memory'])
+      end
       # See HexaPDF::Filter
       def self.decoder(source, options = nil)
         fib = Fiber.new do
-          inflater = Zlib::Inflate.new
+          inflater = @inflate_pool.next_available
           while source.alive? && (data = source.resume)
             begin
               data = inflater.inflate(data)
@@ -78,7 +101,7 @@ module HexaPDF
         end
         Fiber.new do
-          deflater = Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'])
+          deflater = @deflate_pool.next_available
           while source.alive? && (data = source.resume)
             data = deflater.deflate(data)
             Fiber.yield(data)

data/lib/hexapdf/font/cmap.rb CHANGED Viewed

@@ -31,20 +31,44 @@
 # is created or manipulated using HexaPDF.
 #++
+require 'hexapdf/error'
+require 'hexapdf/data_dir'
 module HexaPDF
   module Font
     # Represents a CMap, a mapping from character codes to CIDs (character IDs) or to their Unicode
     # value.
     #
-    # Currently, only the mapping to the Unicode values is supported.
-    #
-    # See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Note #5411
+    # See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Notes #5014 and #5411
     class CMap
       autoload(:Parser, 'hexapdf/font/cmap/parser')
       autoload(:Writer, 'hexapdf/font/cmap/writer')
+      CMAP_DIR = File.join(HexaPDF.data_dir, 'cmap') #:nodoc:
+      @cmap_cache = {}
+      # Returns +true+ if the given name specifies a predefined CMap.
+      def self.predefined?(name)
+        File.exist?(File.join(CMAP_DIR, name))
+      end
+      # Creates a new CMap object by parsing a predefined CMap with the given name.
+      #
+      # Raises an error if the given CMap is not found.
+      def self.for_name(name)
+        return @cmap_cache[name] if @cmap_cache.key?(name)
+        file = File.join(CMAP_DIR, name)
+        if File.exist?(file)
+          @cmap_cache[name] = parse(File.read(file, encoding: ::Encoding::UTF_8))
+        else
+          raise HexaPDF::Error, "No CMap named '#{name}' found"
+        end
+      end
       # Creates a new CMap object from the given string which needs to contain a valid CMap file.
       def self.parse(string)
         Parser.new.parse(string)
@@ -58,6 +82,7 @@ module HexaPDF
         Writer.new.create_to_unicode_cmap(mapping)
       end
       # The registry part of the CMap version.
       attr_accessor :registry
@@ -70,16 +95,107 @@ module HexaPDF
       # The name of the CMap.
       attr_accessor :name
-      # The mapping from character codes to Unicode values.
-      attr_accessor :unicode_mapping
+      # The writing mode of the CMap: 0 for horizontal, 1 for vertical writing.
+      attr_accessor :wmode
+      attr_reader :codespace_ranges     #: nodoc:
+      attr_reader :cid_mapping          # :nodoc:
+      attr_reader :cid_range_mappings   # :nodoc:
+      attr_reader :unicode_mapping      # :nodoc:
+      protected :codespace_ranges, :cid_mapping, :cid_range_mappings, :unicode_mapping
       # Creates a new CMap object.
       def initialize
-        @unicode_mapping = Hash.new("".freeze)
+        @codespace_ranges = []
+        @cid_mapping = {}
+        @cid_range_mappings = []
+        @unicode_mapping = {}
+      end
+      # Add all mappings from the given CMap to this CMap.
+      def use_cmap(cmap)
+        @codespace_ranges.concat(cmap.codespace_ranges)
+        @cid_mapping.merge!(cmap.cid_mapping)
+        @cid_range_mappings.concat(cmap.cid_range_mappings)
+        @unicode_mapping.merge!(cmap.unicode_mapping)
+      end
+      # Add a codespace range using an array of ranges for the individual bytes.
+      #
+      # This means that the first range is checked against the first byte, the second range against
+      # the second byte and so on.
+      def add_codespace_range(first, *rest)
+        @codespace_ranges << [first, rest]
+      end
+      # Parses the string and returns all character codes.
+      #
+      # An error is raised if the string contains invalid bytes.
+      def read_codes(string)
+        codes = []
+        bytes = string.each_byte
+        loop do
+          byte = bytes.next
+          code = 0
+          found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
+            next unless first_byte_range.cover?(byte)
+            code = (code << 8) + byte
+            valid = rest_ranges.all? do |range|
+              begin
+                byte = bytes.next
+              rescue StopIteration
+                raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
+              end
+              code = (code << 8) + byte
+              range.cover?(byte)
+            end
+            codes << code if valid
+          end
+          unless found
+            raise HexaPDF::Error, "Invalid byte while reading codes via CMap: #{byte}"
+          end
+        end
+        codes
+      end
+      # Adds an individual mapping from character code to CID.
+      def add_cid_mapping(code, cid)
+        @cid_mapping[code] = cid
+      end
+      # Adds a CID range, mapping characters codes from +start_code+ to +end_code+ to CIDs starting
+      # with +start_cid+.
+      def add_cid_range(start_code, end_code, start_cid)
+        @cid_range_mappings << [start_code..end_code, start_cid]
+      end
+      # Returns the CID for the given character code, or 0 if no mapping was found.
+      def to_cid(code)
+        cid = @cid_mapping.fetch(code, -1)
+        if cid == -1
+          @cid_range_mappings.reverse_each do |range, start_cid|
+            if range.cover?(code)
+              cid = start_cid + code - range.first
+              break
+            end
+          end
+        end
+        (cid == -1 ? 0 : cid)
+      end
+      # Adds a mapping from character code to Unicode string in UTF-8 encoding.
+      def add_unicode_mapping(code, string)
+        @unicode_mapping[code] = string
       end
-      # Returns the Unicode string in UTF-8 encoding for the given character code, or an empty
-      # string if no mapping was found.
+      # Returns the Unicode string in UTF-8 encoding for the given character code, or +nil+ if no
+      # mapping was found.
       def to_unicode(code)
         unicode_mapping[code]
       end

data/lib/hexapdf/font/cmap/parser.rb CHANGED Viewed

@@ -41,7 +41,7 @@ module HexaPDF
       # Parses CMap files.
       #
-      # Currently only ToUnicode CMaps are supported.
+      # See: Adobe Technical Notes #5014 and #5411
       class Parser
         # Parses the given string and returns a CMap object.
@@ -54,10 +54,18 @@ module HexaPDF
               case token
               when 'beginbfchar'.freeze then parse_bf_char(tokenizer, cmap)
               when 'beginbfrange'.freeze then parse_bf_range(tokenizer, cmap)
+              when 'begincidchar'.freeze then parse_cid_char(tokenizer, cmap)
+              when 'begincidrange'.freeze then parse_cid_range(tokenizer, cmap)
+              when 'begincodespacerange'.freeze then parse_codespace_range(tokenizer, cmap)
               when 'endcmap' then break
               end
             elsif token.kind_of?(Symbol)
-              parse_dict_mapping(tokenizer, cmap, token)
+              value = tokenizer.next_token
+              if value.kind_of?(HexaPDF::Tokenizer::Token)
+                parse_cmap(cmap, token) if value == 'usecmap'.freeze
+              else
+                parse_dict_mapping(cmap, token, value)
+              end
             end
           end
@@ -68,17 +76,59 @@ module HexaPDF
         private
-        # Parses a single mapping of a dictionary pair. The +name+ of the mapping has already been
-        # parsed.
-        def parse_dict_mapping(tokenizer, cmap, name)
-          value = tokenizer.next_token
-          return if value.kind_of?(HexaPDF::Tokenizer::Token)
+        # Populates the CMap with the values from the CMap with the given name.
+        def parse_cmap(cmap, name)
+          cmap.use_cmap(CMap.for_name(name.to_s))
+        end
+        # Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
+        # already been parsed.
+        def parse_dict_mapping(cmap, name, value)
           case name
-          when :Registry then cmap.registry = value if value.kind_of?(String)
-          when :Ordering then cmap.ordering = value if value.kind_of?(String)
-          when :Supplement then cmap.supplement = value if value.kind_of?(Integer)
-          when :CMapName then cmap.name = value.to_s if value.kind_of?(Symbol)
+          when :Registry
+            cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
+          when :Ordering
+            cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
+          when :Supplement
+            cmap.supplement = value if value.kind_of?(Integer)
+          when :CMapName
+            cmap.name = value.to_s.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
+          when :WMode
+            cmap.wmode = value
+          end
+        end
+        # Parses the "begincodespacerange" operator at the current position.
+        def parse_codespace_range(tokenizer, cmap)
+          until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
+            code2 = tokenizer.next_token
+            byte_ranges = []
+            code1.each_byte.with_index do |byte, index|
+              byte_ranges << (byte..(code2.getbyte(index)))
+            end
+            cmap.add_codespace_range(*byte_ranges)
+          end
+        end
+        # Parses the "cidchar" operator at the current position.
+        def parse_cid_char(tokenizer, cmap)
+          until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
+            cmap.add_cid_mapping(bytes_to_int(code), tokenizer.next_token)
+          end
+        end
+        # Parses the "cidrange" operator at the current position.
+        def parse_cid_range(tokenizer, cmap)
+          until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
+            code1 = bytes_to_int(code1)
+            code2 = bytes_to_int(tokenizer.next_token)
+            cid_start = tokenizer.next_object
+            if code1 == code2
+              cmap.add_cid_mapping(code1, cid_start)
+            else
+              cmap.add_cid_range(code1, code2, cid_start)
+            end
           end
         end
@@ -86,7 +136,7 @@ module HexaPDF
         def parse_bf_char(tokenizer, cmap)
           until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
             str = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
-            cmap.unicode_mapping[bytes_to_int(code)] = str
+            cmap.add_unicode_mapping(bytes_to_int(code), str)
           end
         end
@@ -112,13 +162,13 @@ module HexaPDF
             if dest.kind_of?(String)
               codepoint = dest.force_encoding(::Encoding::UTF_16BE).ord
               code1.upto(code2) do |code|
-                cmap.unicode_mapping[code] = '' << codepoint
+                cmap.add_unicode_mapping(code, '' << codepoint)
                 codepoint += 1
               end
             elsif dest.kind_of?(Array)
               code1.upto(code2) do |code|
-                cmap.unicode_mapping[code] =
-                  dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
+                str = dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
+                cmap.add_unicode_mapping(code, str)
               end
             else
               raise HexaPDF::Error, "Invalid bfrange operator in CMap"