hexapdf 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (142) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +68 -0
  3. data/CONTRIBUTERS +1 -1
  4. data/README.md +35 -4
  5. data/Rakefile +1 -0
  6. data/VERSION +1 -1
  7. data/data/hexapdf/cmap/83pv-RKSJ-H +314 -0
  8. data/data/hexapdf/cmap/90ms-RKSJ-H +259 -0
  9. data/data/hexapdf/cmap/90ms-RKSJ-V +156 -0
  10. data/data/hexapdf/cmap/90msp-RKSJ-H +257 -0
  11. data/data/hexapdf/cmap/90msp-RKSJ-V +155 -0
  12. data/data/hexapdf/cmap/90pv-RKSJ-H +355 -0
  13. data/data/hexapdf/cmap/Add-RKSJ-H +738 -0
  14. data/data/hexapdf/cmap/Add-RKSJ-V +135 -0
  15. data/data/hexapdf/cmap/Adobe-CNS1-UCS2 +18209 -0
  16. data/data/hexapdf/cmap/Adobe-GB1-UCS2 +14267 -0
  17. data/data/hexapdf/cmap/Adobe-Japan1-UCS2 +19159 -0
  18. data/data/hexapdf/cmap/Adobe-Korea1-UCS2 +9267 -0
  19. data/data/hexapdf/cmap/B5pc-H +337 -0
  20. data/data/hexapdf/cmap/B5pc-V +90 -0
  21. data/data/hexapdf/cmap/CNS-EUC-H +490 -0
  22. data/data/hexapdf/cmap/CNS-EUC-V +538 -0
  23. data/data/hexapdf/cmap/ETen-B5-H +343 -0
  24. data/data/hexapdf/cmap/ETen-B5-V +91 -0
  25. data/data/hexapdf/cmap/ETenms-B5-H +79 -0
  26. data/data/hexapdf/cmap/ETenms-B5-V +99 -0
  27. data/data/hexapdf/cmap/EUC-H +207 -0
  28. data/data/hexapdf/cmap/EUC-V +105 -0
  29. data/data/hexapdf/cmap/Ext-RKSJ-H +768 -0
  30. data/data/hexapdf/cmap/Ext-RKSJ-V +117 -0
  31. data/data/hexapdf/cmap/GB-EUC-H +173 -0
  32. data/data/hexapdf/cmap/GB-EUC-V +98 -0
  33. data/data/hexapdf/cmap/GBK-EUC-H +4273 -0
  34. data/data/hexapdf/cmap/GBK-EUC-V +97 -0
  35. data/data/hexapdf/cmap/GBK2K-H +5325 -0
  36. data/data/hexapdf/cmap/GBK2K-V +118 -0
  37. data/data/hexapdf/cmap/GBKp-EUC-H +4272 -0
  38. data/data/hexapdf/cmap/GBKp-EUC-V +97 -0
  39. data/data/hexapdf/cmap/GBpc-EUC-H +175 -0
  40. data/data/hexapdf/cmap/GBpc-EUC-V +98 -0
  41. data/data/hexapdf/cmap/H +200 -0
  42. data/data/hexapdf/cmap/HKscs-B5-H +1331 -0
  43. data/data/hexapdf/cmap/HKscs-B5-V +90 -0
  44. data/data/hexapdf/cmap/Identity-H +339 -0
  45. data/data/hexapdf/cmap/Identity-V +73 -0
  46. data/data/hexapdf/cmap/KSC-EUC-H +562 -0
  47. data/data/hexapdf/cmap/KSC-EUC-V +94 -0
  48. data/data/hexapdf/cmap/KSCms-UHC-H +776 -0
  49. data/data/hexapdf/cmap/KSCms-UHC-HW-H +775 -0
  50. data/data/hexapdf/cmap/KSCms-UHC-HW-V +93 -0
  51. data/data/hexapdf/cmap/KSCms-UHC-V +94 -0
  52. data/data/hexapdf/cmap/KSCpc-EUC-H +608 -0
  53. data/data/hexapdf/cmap/LICENSE.txt +26 -0
  54. data/data/hexapdf/cmap/README.txt +9 -0
  55. data/data/hexapdf/cmap/UniCNS-UCS2-H +16992 -0
  56. data/data/hexapdf/cmap/UniCNS-UCS2-V +90 -0
  57. data/data/hexapdf/cmap/UniCNS-UTF16-H +19117 -0
  58. data/data/hexapdf/cmap/UniCNS-UTF16-V +94 -0
  59. data/data/hexapdf/cmap/UniGB-UCS2-H +14321 -0
  60. data/data/hexapdf/cmap/UniGB-UCS2-V +101 -0
  61. data/data/hexapdf/cmap/UniGB-UTF16-H +14381 -0
  62. data/data/hexapdf/cmap/UniGB-UTF16-V +104 -0
  63. data/data/hexapdf/cmap/UniJIS-UCS2-H +8870 -0
  64. data/data/hexapdf/cmap/UniJIS-UCS2-HW-H +81 -0
  65. data/data/hexapdf/cmap/UniJIS-UCS2-HW-V +279 -0
  66. data/data/hexapdf/cmap/UniJIS-UCS2-V +275 -0
  67. data/data/hexapdf/cmap/UniJIS-UTF16-H +14450 -0
  68. data/data/hexapdf/cmap/UniJIS-UTF16-V +299 -0
  69. data/data/hexapdf/cmap/UniKS-UCS2-H +8725 -0
  70. data/data/hexapdf/cmap/UniKS-UCS2-V +95 -0
  71. data/data/hexapdf/cmap/UniKS-UTF16-H +8895 -0
  72. data/data/hexapdf/cmap/UniKS-UTF16-V +99 -0
  73. data/data/hexapdf/cmap/V +105 -0
  74. data/examples/arc.rb +3 -3
  75. data/examples/merging.rb +4 -1
  76. data/examples/optimizing.rb +3 -0
  77. data/examples/show_char_bboxes.rb +2 -2
  78. data/examples/truetype.rb +2 -2
  79. data/lib/hexapdf/cli.rb +40 -1
  80. data/lib/hexapdf/cli/batch.rb +72 -0
  81. data/lib/hexapdf/cli/command.rb +112 -15
  82. data/lib/hexapdf/cli/files.rb +2 -2
  83. data/lib/hexapdf/cli/images.rb +14 -6
  84. data/lib/hexapdf/cli/info.rb +6 -8
  85. data/lib/hexapdf/cli/inspect.rb +5 -8
  86. data/lib/hexapdf/cli/merge.rb +13 -20
  87. data/lib/hexapdf/cli/modify.rb +4 -7
  88. data/lib/hexapdf/cli/optimize.rb +2 -5
  89. data/lib/hexapdf/configuration.rb +32 -3
  90. data/lib/hexapdf/content/canvas.rb +130 -37
  91. data/lib/hexapdf/content/parser.rb +40 -6
  92. data/lib/hexapdf/content/processor.rb +4 -4
  93. data/lib/hexapdf/document.rb +40 -10
  94. data/lib/hexapdf/document/fonts.rb +1 -0
  95. data/lib/hexapdf/encryption/security_handler.rb +8 -12
  96. data/lib/hexapdf/filter/flate_decode.rb +25 -2
  97. data/lib/hexapdf/font/cmap.rb +124 -8
  98. data/lib/hexapdf/font/cmap/parser.rb +65 -15
  99. data/lib/hexapdf/font/encoding/base.rb +2 -2
  100. data/lib/hexapdf/font/encoding/glyph_list.rb +2 -4
  101. data/lib/hexapdf/font/true_type.rb +1 -0
  102. data/lib/hexapdf/font/true_type/builder.rb +75 -0
  103. data/lib/hexapdf/font/true_type/optimizer.rb +65 -0
  104. data/lib/hexapdf/font/true_type/subsetter.rb +9 -22
  105. data/lib/hexapdf/font/true_type_wrapper.rb +9 -21
  106. data/lib/hexapdf/font_loader.rb +1 -1
  107. data/lib/hexapdf/importer.rb +1 -1
  108. data/lib/hexapdf/serializer.rb +5 -3
  109. data/lib/hexapdf/type.rb +2 -0
  110. data/lib/hexapdf/type/cid_font.rb +120 -0
  111. data/lib/hexapdf/type/font.rb +32 -12
  112. data/lib/hexapdf/type/font_simple.rb +34 -42
  113. data/lib/hexapdf/type/font_type0.rb +148 -0
  114. data/lib/hexapdf/type/form.rb +4 -4
  115. data/lib/hexapdf/type/page.rb +12 -11
  116. data/lib/hexapdf/type/resources.rb +14 -0
  117. data/lib/hexapdf/utils/graphics_helpers.rb +77 -0
  118. data/lib/hexapdf/version.rb +1 -1
  119. data/man/man1/hexapdf.1 +43 -1
  120. data/test/hexapdf/content/test_canvas.rb +76 -0
  121. data/test/hexapdf/content/test_parser.rb +20 -1
  122. data/test/hexapdf/content/test_processor.rb +11 -7
  123. data/test/hexapdf/document/test_fonts.rb +3 -1
  124. data/test/hexapdf/font/cmap/test_parser.rb +42 -7
  125. data/test/hexapdf/font/encoding/test_base.rb +1 -1
  126. data/test/hexapdf/font/encoding/test_glyph_list.rb +3 -3
  127. data/test/hexapdf/font/test_cmap.rb +104 -0
  128. data/test/hexapdf/font/test_true_type_wrapper.rb +63 -46
  129. data/test/hexapdf/font/true_type/test_builder.rb +37 -0
  130. data/test/hexapdf/font/true_type/test_optimizer.rb +27 -0
  131. data/test/hexapdf/font/true_type/test_subsetter.rb +6 -13
  132. data/test/hexapdf/test_configuration.rb +12 -7
  133. data/test/hexapdf/test_document.rb +24 -0
  134. data/test/hexapdf/test_importer.rb +9 -1
  135. data/test/hexapdf/test_writer.rb +2 -2
  136. data/test/hexapdf/type/test_cid_font.rb +61 -0
  137. data/test/hexapdf/type/test_font.rb +31 -4
  138. data/test/hexapdf/type/test_font_simple.rb +6 -21
  139. data/test/hexapdf/type/test_font_type0.rb +114 -0
  140. data/test/hexapdf/type/test_resources.rb +17 -1
  141. data/test/hexapdf/utils/test_graphics_helpers.rb +29 -0
  142. metadata +82 -3
@@ -33,6 +33,7 @@
33
33
 
34
34
  require 'stringio'
35
35
  require 'hexapdf/tokenizer'
36
+ require 'hexapdf/content/processor'
36
37
 
37
38
  module HexaPDF
38
39
  module Content
@@ -45,6 +46,9 @@ module HexaPDF
45
46
  # See: PDF1.7 s7.2
46
47
  class Tokenizer < HexaPDF::Tokenizer #:nodoc:
47
48
 
49
+ # The string that is tokenized.
50
+ attr_reader :string
51
+
48
52
  # Creates a new tokenizer.
49
53
  def initialize(string)
50
54
  @ss = StringScanner.new(string)
@@ -168,6 +172,8 @@ module HexaPDF
168
172
 
169
173
  private
170
174
 
175
+ MAX_TOKEN_CHECK = 5 #:nodoc:
176
+
171
177
  # Parses the inline image at the current position.
172
178
  def parse_inline_image(tokenizer)
173
179
  # BI has already been read, so read the image dictionary
@@ -190,13 +196,41 @@ module HexaPDF
190
196
  # one whitespace character after ID
191
197
  tokenizer.next_byte
192
198
 
193
- # find the EI operator
194
- data = tokenizer.scan_until(/(?=EI[#{Tokenizer::WHITESPACE}])/o)
195
- if data.nil?
196
- raise HexaPDF::Error, "End inline image marker EI not found"
199
+ real_end_found = false
200
+ image_data = ''.b
201
+
202
+ # find the EI operator and handle EI appearing inside the image data
203
+ until real_end_found
204
+ data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
205
+ if data.nil?
206
+ raise HexaPDF::Error, "End inline image marker EI not found"
207
+ end
208
+ image_data << data
209
+ tokenizer.pos += 2
210
+ last_pos = tokenizer.pos
211
+
212
+ # Check if we found EI inside of the image data
213
+ count = 0
214
+ while count < MAX_TOKEN_CHECK
215
+ token = tokenizer.next_object(allow_keyword: true) rescue break
216
+ if token == Tokenizer::NO_MORE_TOKENS
217
+ count += MAX_TOKEN_CHECK
218
+ elsif token.kind_of?(Tokenizer::Token) &&
219
+ !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
220
+ break # invalid token
221
+ end
222
+ count += 1
223
+ end
224
+
225
+ if count >= MAX_TOKEN_CHECK
226
+ real_end_found = true
227
+ else
228
+ image_data << "EI"
229
+ end
230
+ tokenizer.pos = last_pos
197
231
  end
198
- tokenizer.pos += 3
199
- [dict, data]
232
+
233
+ [dict, image_data]
200
234
  end
201
235
 
202
236
  end
@@ -408,7 +408,7 @@ module HexaPDF
408
408
  def decode_horizontal_text(array)
409
409
  font = graphics_state.font
410
410
  scaled_char_space = graphics_state.scaled_character_spacing
411
- scaled_word_space = graphics_state.scaled_word_spacing
411
+ scaled_word_space = (font.word_spacing_applicable? ? graphics_state.scaled_word_spacing : 0)
412
412
  scaled_font_size = graphics_state.scaled_font_size
413
413
 
414
414
  below_baseline = font.bounding_box[1] * scaled_font_size / \
@@ -423,15 +423,15 @@ module HexaPDF
423
423
  else
424
424
  font.decode(item).each do |code_point|
425
425
  char = font.to_utf8(code_point)
426
- width = font.width(code_point) * scaled_font_size
426
+ width = font.width(code_point) * scaled_font_size + scaled_char_space + \
427
+ (code_point == 32 ? scaled_word_space : 0)
427
428
  matrix = graphics_state.ctm.dup.premultiply(*graphics_state.tm)
428
429
  fragment = GlyphBox.new(code_point, char,
429
430
  *matrix.evaluate(0, below_baseline),
430
431
  *matrix.evaluate(width, below_baseline),
431
432
  *matrix.evaluate(0, above_baseline))
432
433
  text << fragment
433
- graphics_state.tm.translate(width + scaled_char_space + \
434
- (char == ' ' ? scaled_word_space : 0), 0)
434
+ graphics_state.tm.translate(width, 0)
435
435
  end
436
436
  end
437
437
  end
@@ -135,6 +135,7 @@ module HexaPDF
135
135
  end
136
136
 
137
137
  @listeners = {}
138
+ @cache = Hash.new {|h, k| h[k] = {} }
138
139
  end
139
140
 
140
141
  # :call-seq:
@@ -315,25 +316,24 @@ module HexaPDF
315
316
  if type.kind_of?(Class)
316
317
  klass = type
317
318
  else
318
- default = if data.stream
319
- HexaPDF::Stream
320
- elsif data.value.kind_of?(Hash)
321
- HexaPDF::Dictionary
322
- else
323
- HexaPDF::Object
324
- end
325
319
  if data.value.kind_of?(Hash)
326
320
  type ||= deref(data.value[:Type])
327
321
  subtype ||= deref(data.value[:Subtype])
328
322
  end
329
323
 
330
324
  if subtype
331
- klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype)
325
+ klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype) { nil }
332
326
  end
333
327
  if type && !klass
334
- klass = GlobalConfiguration.constantize('object.type_map'.freeze, type)
328
+ klass = GlobalConfiguration.constantize('object.type_map'.freeze, type) { nil }
335
329
  end
336
- klass ||= default
330
+ klass ||= if data.stream
331
+ HexaPDF::Stream
332
+ elsif data.value.kind_of?(Hash)
333
+ HexaPDF::Dictionary
334
+ else
335
+ HexaPDF::Object
336
+ end
337
337
  end
338
338
 
339
339
  klass.new(data, document: self)
@@ -418,6 +418,36 @@ module HexaPDF
418
418
  @listeners[name] && @listeners[name].each {|obj| obj.call(*args)}
419
419
  end
420
420
 
421
+ # Caches the value or the return value of the given block using the given Object::PDFData and
422
+ # key arguments as composite hash key. If a cached value already exists, it is just returned.
423
+ #
424
+ # This facility can be used to cache expensive operations in PDF objects that are easy to
425
+ # compute again.
426
+ #
427
+ # Use #clear_cache to clear the cache if necessary.
428
+ def cache(pdf_data, key, value = nil)
429
+ @cache[pdf_data][key] ||= value || yield
430
+ end
431
+
432
+ # Returns +true+ if there is a value cached for the composite key consisting of the given
433
+ # +pdf_data+ and +key+ objects.
434
+ #
435
+ # Also see: #cache
436
+ def cached?(pdf_data, key)
437
+ @cache.key?(pdf_data) && @cache[pdf_data].key?(key)
438
+ end
439
+
440
+ # Clears all cached data or, if a Object::PDFData object is given, just the cache for this one
441
+ # object.
442
+ #
443
+ # It is *not* recommended to clear the whole cache! Better clear the cache for individual PDF
444
+ # objects!
445
+ #
446
+ # Also see: #cache
447
+ def clear_cache(pdf_data = nil)
448
+ pdf_data ? @cache[pdf_data].clear : @cache.clear
449
+ end
450
+
421
451
  # Returns the Pages object that provides convenience methods for working with pages.
422
452
  #
423
453
  # Also see: HexaPDF::Type::PageTreeNode
@@ -55,6 +55,7 @@ module HexaPDF
55
55
  #
56
56
  # If a font with the same parameters has been loaded before, the cached font object is used.
57
57
  def load(name, **options)
58
+ options[:variant] ||= :none # assign default value for consistency with caching
58
59
  font = @loaded_fonts_cache[[name, options]]
59
60
  return font if font
60
61
 
@@ -143,12 +143,10 @@ module HexaPDF
143
143
  #
144
144
  # See: #set_up_encryption (for the common encryption options).
145
145
  def self.set_up_encryption(document, handler_name, **options)
146
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', handler_name)
147
- if handler.nil?
148
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name)
149
- end
150
- if handler.nil?
151
- raise HexaPDF::EncryptionError, "Could not find the specified security handler"
146
+ handler = GlobalConfiguration.constantize('encryption.filter_map', handler_name) do
147
+ GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name) do
148
+ raise HexaPDF::EncryptionError, "Could not find the specified security handler"
149
+ end
152
150
  end
153
151
 
154
152
  handler = handler.new(document)
@@ -172,12 +170,10 @@ module HexaPDF
172
170
  if dict.nil?
173
171
  raise HexaPDF::EncryptionError, "No /Encrypt dictionary found"
174
172
  end
175
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter])
176
- if handler.nil?
177
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter])
178
- end
179
- if handler.nil?
180
- raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
173
+ handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter]) do
174
+ HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter]) do
175
+ raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
176
+ end
181
177
  end
182
178
 
183
179
  handler = handler.new(document)
@@ -45,10 +45,33 @@ module HexaPDF
45
45
  # See: HexaPDF::Filter, PDF1.7 s7.4.4
46
46
  module FlateDecode
47
47
 
48
+ class Pool #:nodoc:
49
+
50
+ # Creates a new Zlib::Stream pool. A block must be given that returns a new Zlib::Stream
51
+ # instance.
52
+ def initialize(&block)
53
+ @creator = block
54
+ @pool = []
55
+ end
56
+
57
+ # Returns the next available stream of the pool, already reset to its initial state.
58
+ def next_available
59
+ @pool.find(-> { e = @creator.call; @pool << e; e }, &:finished?).tap(&:reset)
60
+ end
61
+
62
+ end
63
+
64
+ @inflate_pool = Pool.new { Zlib::Inflate.new }
65
+ @deflate_pool = Pool.new do
66
+ Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'],
67
+ Zlib::MAX_WBITS,
68
+ HexaPDF::GlobalConfiguration['filter.flate_memory'])
69
+ end
70
+
48
71
  # See HexaPDF::Filter
49
72
  def self.decoder(source, options = nil)
50
73
  fib = Fiber.new do
51
- inflater = Zlib::Inflate.new
74
+ inflater = @inflate_pool.next_available
52
75
  while source.alive? && (data = source.resume)
53
76
  begin
54
77
  data = inflater.inflate(data)
@@ -78,7 +101,7 @@ module HexaPDF
78
101
  end
79
102
 
80
103
  Fiber.new do
81
- deflater = Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'])
104
+ deflater = @deflate_pool.next_available
82
105
  while source.alive? && (data = source.resume)
83
106
  data = deflater.deflate(data)
84
107
  Fiber.yield(data)
@@ -31,20 +31,44 @@
31
31
  # is created or manipulated using HexaPDF.
32
32
  #++
33
33
 
34
+ require 'hexapdf/error'
35
+ require 'hexapdf/data_dir'
36
+
34
37
  module HexaPDF
35
38
  module Font
36
39
 
37
40
  # Represents a CMap, a mapping from character codes to CIDs (character IDs) or to their Unicode
38
41
  # value.
39
42
  #
40
- # Currently, only the mapping to the Unicode values is supported.
41
- #
42
- # See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Note #5411
43
+ # See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Notes #5014 and #5411
43
44
  class CMap
44
45
 
45
46
  autoload(:Parser, 'hexapdf/font/cmap/parser')
46
47
  autoload(:Writer, 'hexapdf/font/cmap/writer')
47
48
 
49
+ CMAP_DIR = File.join(HexaPDF.data_dir, 'cmap') #:nodoc:
50
+
51
+ @cmap_cache = {}
52
+
53
+ # Returns +true+ if the given name specifies a predefined CMap.
54
+ def self.predefined?(name)
55
+ File.exist?(File.join(CMAP_DIR, name))
56
+ end
57
+
58
+ # Creates a new CMap object by parsing a predefined CMap with the given name.
59
+ #
60
+ # Raises an error if the given CMap is not found.
61
+ def self.for_name(name)
62
+ return @cmap_cache[name] if @cmap_cache.key?(name)
63
+
64
+ file = File.join(CMAP_DIR, name)
65
+ if File.exist?(file)
66
+ @cmap_cache[name] = parse(File.read(file, encoding: ::Encoding::UTF_8))
67
+ else
68
+ raise HexaPDF::Error, "No CMap named '#{name}' found"
69
+ end
70
+ end
71
+
48
72
  # Creates a new CMap object from the given string which needs to contain a valid CMap file.
49
73
  def self.parse(string)
50
74
  Parser.new.parse(string)
@@ -58,6 +82,7 @@ module HexaPDF
58
82
  Writer.new.create_to_unicode_cmap(mapping)
59
83
  end
60
84
 
85
+
61
86
  # The registry part of the CMap version.
62
87
  attr_accessor :registry
63
88
 
@@ -70,16 +95,107 @@ module HexaPDF
70
95
  # The name of the CMap.
71
96
  attr_accessor :name
72
97
 
73
- # The mapping from character codes to Unicode values.
74
- attr_accessor :unicode_mapping
98
+ # The writing mode of the CMap: 0 for horizontal, 1 for vertical writing.
99
+ attr_accessor :wmode
100
+
101
+ attr_reader :codespace_ranges #: nodoc:
102
+ attr_reader :cid_mapping # :nodoc:
103
+ attr_reader :cid_range_mappings # :nodoc:
104
+ attr_reader :unicode_mapping # :nodoc:
105
+ protected :codespace_ranges, :cid_mapping, :cid_range_mappings, :unicode_mapping
75
106
 
76
107
  # Creates a new CMap object.
77
108
  def initialize
78
- @unicode_mapping = Hash.new("".freeze)
109
+ @codespace_ranges = []
110
+ @cid_mapping = {}
111
+ @cid_range_mappings = []
112
+ @unicode_mapping = {}
113
+ end
114
+
115
+ # Add all mappings from the given CMap to this CMap.
116
+ def use_cmap(cmap)
117
+ @codespace_ranges.concat(cmap.codespace_ranges)
118
+ @cid_mapping.merge!(cmap.cid_mapping)
119
+ @cid_range_mappings.concat(cmap.cid_range_mappings)
120
+ @unicode_mapping.merge!(cmap.unicode_mapping)
121
+ end
122
+
123
+ # Add a codespace range using an array of ranges for the individual bytes.
124
+ #
125
+ # This means that the first range is checked against the first byte, the second range against
126
+ # the second byte and so on.
127
+ def add_codespace_range(first, *rest)
128
+ @codespace_ranges << [first, rest]
129
+ end
130
+
131
+ # Parses the string and returns all character codes.
132
+ #
133
+ # An error is raised if the string contains invalid bytes.
134
+ def read_codes(string)
135
+ codes = []
136
+ bytes = string.each_byte
137
+
138
+ loop do
139
+ byte = bytes.next
140
+ code = 0
141
+
142
+ found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
143
+ next unless first_byte_range.cover?(byte)
144
+
145
+ code = (code << 8) + byte
146
+ valid = rest_ranges.all? do |range|
147
+ begin
148
+ byte = bytes.next
149
+ rescue StopIteration
150
+ raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
151
+ end
152
+ code = (code << 8) + byte
153
+ range.cover?(byte)
154
+ end
155
+
156
+ codes << code if valid
157
+ end
158
+
159
+ unless found
160
+ raise HexaPDF::Error, "Invalid byte while reading codes via CMap: #{byte}"
161
+ end
162
+ end
163
+
164
+ codes
165
+ end
166
+
167
+ # Adds an individual mapping from character code to CID.
168
+ def add_cid_mapping(code, cid)
169
+ @cid_mapping[code] = cid
170
+ end
171
+
172
+ # Adds a CID range, mapping characters codes from +start_code+ to +end_code+ to CIDs starting
173
+ # with +start_cid+.
174
+ def add_cid_range(start_code, end_code, start_cid)
175
+ @cid_range_mappings << [start_code..end_code, start_cid]
176
+ end
177
+
178
+ # Returns the CID for the given character code, or 0 if no mapping was found.
179
+ def to_cid(code)
180
+ cid = @cid_mapping.fetch(code, -1)
181
+ if cid == -1
182
+ @cid_range_mappings.reverse_each do |range, start_cid|
183
+ if range.cover?(code)
184
+ cid = start_cid + code - range.first
185
+ break
186
+ end
187
+ end
188
+ end
189
+ (cid == -1 ? 0 : cid)
190
+ end
191
+
192
+ # Adds a mapping from character code to Unicode string in UTF-8 encoding.
193
+ def add_unicode_mapping(code, string)
194
+ @unicode_mapping[code] = string
79
195
  end
80
196
 
81
- # Returns the Unicode string in UTF-8 encoding for the given character code, or an empty
82
- # string if no mapping was found.
197
+ # Returns the Unicode string in UTF-8 encoding for the given character code, or +nil+ if no
198
+ # mapping was found.
83
199
  def to_unicode(code)
84
200
  unicode_mapping[code]
85
201
  end
@@ -41,7 +41,7 @@ module HexaPDF
41
41
 
42
42
  # Parses CMap files.
43
43
  #
44
- # Currently only ToUnicode CMaps are supported.
44
+ # See: Adobe Technical Notes #5014 and #5411
45
45
  class Parser
46
46
 
47
47
  # Parses the given string and returns a CMap object.
@@ -54,10 +54,18 @@ module HexaPDF
54
54
  case token
55
55
  when 'beginbfchar'.freeze then parse_bf_char(tokenizer, cmap)
56
56
  when 'beginbfrange'.freeze then parse_bf_range(tokenizer, cmap)
57
+ when 'begincidchar'.freeze then parse_cid_char(tokenizer, cmap)
58
+ when 'begincidrange'.freeze then parse_cid_range(tokenizer, cmap)
59
+ when 'begincodespacerange'.freeze then parse_codespace_range(tokenizer, cmap)
57
60
  when 'endcmap' then break
58
61
  end
59
62
  elsif token.kind_of?(Symbol)
60
- parse_dict_mapping(tokenizer, cmap, token)
63
+ value = tokenizer.next_token
64
+ if value.kind_of?(HexaPDF::Tokenizer::Token)
65
+ parse_cmap(cmap, token) if value == 'usecmap'.freeze
66
+ else
67
+ parse_dict_mapping(cmap, token, value)
68
+ end
61
69
  end
62
70
  end
63
71
 
@@ -68,17 +76,59 @@ module HexaPDF
68
76
 
69
77
  private
70
78
 
71
- # Parses a single mapping of a dictionary pair. The +name+ of the mapping has already been
72
- # parsed.
73
- def parse_dict_mapping(tokenizer, cmap, name)
74
- value = tokenizer.next_token
75
- return if value.kind_of?(HexaPDF::Tokenizer::Token)
79
+ # Populates the CMap with the values from the CMap with the given name.
80
+ def parse_cmap(cmap, name)
81
+ cmap.use_cmap(CMap.for_name(name.to_s))
82
+ end
76
83
 
84
+ # Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
85
+ # already been parsed.
86
+ def parse_dict_mapping(cmap, name, value)
77
87
  case name
78
- when :Registry then cmap.registry = value if value.kind_of?(String)
79
- when :Ordering then cmap.ordering = value if value.kind_of?(String)
80
- when :Supplement then cmap.supplement = value if value.kind_of?(Integer)
81
- when :CMapName then cmap.name = value.to_s if value.kind_of?(Symbol)
88
+ when :Registry
89
+ cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
90
+ when :Ordering
91
+ cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
92
+ when :Supplement
93
+ cmap.supplement = value if value.kind_of?(Integer)
94
+ when :CMapName
95
+ cmap.name = value.to_s.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
96
+ when :WMode
97
+ cmap.wmode = value
98
+ end
99
+ end
100
+
101
+ # Parses the "begincodespacerange" operator at the current position.
102
+ def parse_codespace_range(tokenizer, cmap)
103
+ until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
104
+ code2 = tokenizer.next_token
105
+ byte_ranges = []
106
+ code1.each_byte.with_index do |byte, index|
107
+ byte_ranges << (byte..(code2.getbyte(index)))
108
+ end
109
+ cmap.add_codespace_range(*byte_ranges)
110
+ end
111
+ end
112
+
113
+ # Parses the "cidchar" operator at the current position.
114
+ def parse_cid_char(tokenizer, cmap)
115
+ until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
116
+ cmap.add_cid_mapping(bytes_to_int(code), tokenizer.next_token)
117
+ end
118
+ end
119
+
120
+ # Parses the "cidrange" operator at the current position.
121
+ def parse_cid_range(tokenizer, cmap)
122
+ until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
123
+ code1 = bytes_to_int(code1)
124
+ code2 = bytes_to_int(tokenizer.next_token)
125
+ cid_start = tokenizer.next_object
126
+
127
+ if code1 == code2
128
+ cmap.add_cid_mapping(code1, cid_start)
129
+ else
130
+ cmap.add_cid_range(code1, code2, cid_start)
131
+ end
82
132
  end
83
133
  end
84
134
 
@@ -86,7 +136,7 @@ module HexaPDF
86
136
  def parse_bf_char(tokenizer, cmap)
87
137
  until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
88
138
  str = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
89
- cmap.unicode_mapping[bytes_to_int(code)] = str
139
+ cmap.add_unicode_mapping(bytes_to_int(code), str)
90
140
  end
91
141
  end
92
142
 
@@ -112,13 +162,13 @@ module HexaPDF
112
162
  if dest.kind_of?(String)
113
163
  codepoint = dest.force_encoding(::Encoding::UTF_16BE).ord
114
164
  code1.upto(code2) do |code|
115
- cmap.unicode_mapping[code] = '' << codepoint
165
+ cmap.add_unicode_mapping(code, '' << codepoint)
116
166
  codepoint += 1
117
167
  end
118
168
  elsif dest.kind_of?(Array)
119
169
  code1.upto(code2) do |code|
120
- cmap.unicode_mapping[code] =
121
- dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
170
+ str = dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
171
+ cmap.add_unicode_mapping(code, str)
122
172
  end
123
173
  else
124
174
  raise HexaPDF::Error, "Invalid bfrange operator in CMap"