hexapdf 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +68 -0
  3. data/CONTRIBUTERS +1 -1
  4. data/README.md +35 -4
  5. data/Rakefile +1 -0
  6. data/VERSION +1 -1
  7. data/data/hexapdf/cmap/83pv-RKSJ-H +314 -0
  8. data/data/hexapdf/cmap/90ms-RKSJ-H +259 -0
  9. data/data/hexapdf/cmap/90ms-RKSJ-V +156 -0
  10. data/data/hexapdf/cmap/90msp-RKSJ-H +257 -0
  11. data/data/hexapdf/cmap/90msp-RKSJ-V +155 -0
  12. data/data/hexapdf/cmap/90pv-RKSJ-H +355 -0
  13. data/data/hexapdf/cmap/Add-RKSJ-H +738 -0
  14. data/data/hexapdf/cmap/Add-RKSJ-V +135 -0
  15. data/data/hexapdf/cmap/Adobe-CNS1-UCS2 +18209 -0
  16. data/data/hexapdf/cmap/Adobe-GB1-UCS2 +14267 -0
  17. data/data/hexapdf/cmap/Adobe-Japan1-UCS2 +19159 -0
  18. data/data/hexapdf/cmap/Adobe-Korea1-UCS2 +9267 -0
  19. data/data/hexapdf/cmap/B5pc-H +337 -0
  20. data/data/hexapdf/cmap/B5pc-V +90 -0
  21. data/data/hexapdf/cmap/CNS-EUC-H +490 -0
  22. data/data/hexapdf/cmap/CNS-EUC-V +538 -0
  23. data/data/hexapdf/cmap/ETen-B5-H +343 -0
  24. data/data/hexapdf/cmap/ETen-B5-V +91 -0
  25. data/data/hexapdf/cmap/ETenms-B5-H +79 -0
  26. data/data/hexapdf/cmap/ETenms-B5-V +99 -0
  27. data/data/hexapdf/cmap/EUC-H +207 -0
  28. data/data/hexapdf/cmap/EUC-V +105 -0
  29. data/data/hexapdf/cmap/Ext-RKSJ-H +768 -0
  30. data/data/hexapdf/cmap/Ext-RKSJ-V +117 -0
  31. data/data/hexapdf/cmap/GB-EUC-H +173 -0
  32. data/data/hexapdf/cmap/GB-EUC-V +98 -0
  33. data/data/hexapdf/cmap/GBK-EUC-H +4273 -0
  34. data/data/hexapdf/cmap/GBK-EUC-V +97 -0
  35. data/data/hexapdf/cmap/GBK2K-H +5325 -0
  36. data/data/hexapdf/cmap/GBK2K-V +118 -0
  37. data/data/hexapdf/cmap/GBKp-EUC-H +4272 -0
  38. data/data/hexapdf/cmap/GBKp-EUC-V +97 -0
  39. data/data/hexapdf/cmap/GBpc-EUC-H +175 -0
  40. data/data/hexapdf/cmap/GBpc-EUC-V +98 -0
  41. data/data/hexapdf/cmap/H +200 -0
  42. data/data/hexapdf/cmap/HKscs-B5-H +1331 -0
  43. data/data/hexapdf/cmap/HKscs-B5-V +90 -0
  44. data/data/hexapdf/cmap/Identity-H +339 -0
  45. data/data/hexapdf/cmap/Identity-V +73 -0
  46. data/data/hexapdf/cmap/KSC-EUC-H +562 -0
  47. data/data/hexapdf/cmap/KSC-EUC-V +94 -0
  48. data/data/hexapdf/cmap/KSCms-UHC-H +776 -0
  49. data/data/hexapdf/cmap/KSCms-UHC-HW-H +775 -0
  50. data/data/hexapdf/cmap/KSCms-UHC-HW-V +93 -0
  51. data/data/hexapdf/cmap/KSCms-UHC-V +94 -0
  52. data/data/hexapdf/cmap/KSCpc-EUC-H +608 -0
  53. data/data/hexapdf/cmap/LICENSE.txt +26 -0
  54. data/data/hexapdf/cmap/README.txt +9 -0
  55. data/data/hexapdf/cmap/UniCNS-UCS2-H +16992 -0
  56. data/data/hexapdf/cmap/UniCNS-UCS2-V +90 -0
  57. data/data/hexapdf/cmap/UniCNS-UTF16-H +19117 -0
  58. data/data/hexapdf/cmap/UniCNS-UTF16-V +94 -0
  59. data/data/hexapdf/cmap/UniGB-UCS2-H +14321 -0
  60. data/data/hexapdf/cmap/UniGB-UCS2-V +101 -0
  61. data/data/hexapdf/cmap/UniGB-UTF16-H +14381 -0
  62. data/data/hexapdf/cmap/UniGB-UTF16-V +104 -0
  63. data/data/hexapdf/cmap/UniJIS-UCS2-H +8870 -0
  64. data/data/hexapdf/cmap/UniJIS-UCS2-HW-H +81 -0
  65. data/data/hexapdf/cmap/UniJIS-UCS2-HW-V +279 -0
  66. data/data/hexapdf/cmap/UniJIS-UCS2-V +275 -0
  67. data/data/hexapdf/cmap/UniJIS-UTF16-H +14450 -0
  68. data/data/hexapdf/cmap/UniJIS-UTF16-V +299 -0
  69. data/data/hexapdf/cmap/UniKS-UCS2-H +8725 -0
  70. data/data/hexapdf/cmap/UniKS-UCS2-V +95 -0
  71. data/data/hexapdf/cmap/UniKS-UTF16-H +8895 -0
  72. data/data/hexapdf/cmap/UniKS-UTF16-V +99 -0
  73. data/data/hexapdf/cmap/V +105 -0
  74. data/examples/arc.rb +3 -3
  75. data/examples/merging.rb +4 -1
  76. data/examples/optimizing.rb +3 -0
  77. data/examples/show_char_bboxes.rb +2 -2
  78. data/examples/truetype.rb +2 -2
  79. data/lib/hexapdf/cli.rb +40 -1
  80. data/lib/hexapdf/cli/batch.rb +72 -0
  81. data/lib/hexapdf/cli/command.rb +112 -15
  82. data/lib/hexapdf/cli/files.rb +2 -2
  83. data/lib/hexapdf/cli/images.rb +14 -6
  84. data/lib/hexapdf/cli/info.rb +6 -8
  85. data/lib/hexapdf/cli/inspect.rb +5 -8
  86. data/lib/hexapdf/cli/merge.rb +13 -20
  87. data/lib/hexapdf/cli/modify.rb +4 -7
  88. data/lib/hexapdf/cli/optimize.rb +2 -5
  89. data/lib/hexapdf/configuration.rb +32 -3
  90. data/lib/hexapdf/content/canvas.rb +130 -37
  91. data/lib/hexapdf/content/parser.rb +40 -6
  92. data/lib/hexapdf/content/processor.rb +4 -4
  93. data/lib/hexapdf/document.rb +40 -10
  94. data/lib/hexapdf/document/fonts.rb +1 -0
  95. data/lib/hexapdf/encryption/security_handler.rb +8 -12
  96. data/lib/hexapdf/filter/flate_decode.rb +25 -2
  97. data/lib/hexapdf/font/cmap.rb +124 -8
  98. data/lib/hexapdf/font/cmap/parser.rb +65 -15
  99. data/lib/hexapdf/font/encoding/base.rb +2 -2
  100. data/lib/hexapdf/font/encoding/glyph_list.rb +2 -4
  101. data/lib/hexapdf/font/true_type.rb +1 -0
  102. data/lib/hexapdf/font/true_type/builder.rb +75 -0
  103. data/lib/hexapdf/font/true_type/optimizer.rb +65 -0
  104. data/lib/hexapdf/font/true_type/subsetter.rb +9 -22
  105. data/lib/hexapdf/font/true_type_wrapper.rb +9 -21
  106. data/lib/hexapdf/font_loader.rb +1 -1
  107. data/lib/hexapdf/importer.rb +1 -1
  108. data/lib/hexapdf/serializer.rb +5 -3
  109. data/lib/hexapdf/type.rb +2 -0
  110. data/lib/hexapdf/type/cid_font.rb +120 -0
  111. data/lib/hexapdf/type/font.rb +32 -12
  112. data/lib/hexapdf/type/font_simple.rb +34 -42
  113. data/lib/hexapdf/type/font_type0.rb +148 -0
  114. data/lib/hexapdf/type/form.rb +4 -4
  115. data/lib/hexapdf/type/page.rb +12 -11
  116. data/lib/hexapdf/type/resources.rb +14 -0
  117. data/lib/hexapdf/utils/graphics_helpers.rb +77 -0
  118. data/lib/hexapdf/version.rb +1 -1
  119. data/man/man1/hexapdf.1 +43 -1
  120. data/test/hexapdf/content/test_canvas.rb +76 -0
  121. data/test/hexapdf/content/test_parser.rb +20 -1
  122. data/test/hexapdf/content/test_processor.rb +11 -7
  123. data/test/hexapdf/document/test_fonts.rb +3 -1
  124. data/test/hexapdf/font/cmap/test_parser.rb +42 -7
  125. data/test/hexapdf/font/encoding/test_base.rb +1 -1
  126. data/test/hexapdf/font/encoding/test_glyph_list.rb +3 -3
  127. data/test/hexapdf/font/test_cmap.rb +104 -0
  128. data/test/hexapdf/font/test_true_type_wrapper.rb +63 -46
  129. data/test/hexapdf/font/true_type/test_builder.rb +37 -0
  130. data/test/hexapdf/font/true_type/test_optimizer.rb +27 -0
  131. data/test/hexapdf/font/true_type/test_subsetter.rb +6 -13
  132. data/test/hexapdf/test_configuration.rb +12 -7
  133. data/test/hexapdf/test_document.rb +24 -0
  134. data/test/hexapdf/test_importer.rb +9 -1
  135. data/test/hexapdf/test_writer.rb +2 -2
  136. data/test/hexapdf/type/test_cid_font.rb +61 -0
  137. data/test/hexapdf/type/test_font.rb +31 -4
  138. data/test/hexapdf/type/test_font_simple.rb +6 -21
  139. data/test/hexapdf/type/test_font_type0.rb +114 -0
  140. data/test/hexapdf/type/test_resources.rb +17 -1
  141. data/test/hexapdf/utils/test_graphics_helpers.rb +29 -0
  142. metadata +82 -3
@@ -33,6 +33,7 @@
33
33
 
34
34
  require 'stringio'
35
35
  require 'hexapdf/tokenizer'
36
+ require 'hexapdf/content/processor'
36
37
 
37
38
  module HexaPDF
38
39
  module Content
@@ -45,6 +46,9 @@ module HexaPDF
45
46
  # See: PDF1.7 s7.2
46
47
  class Tokenizer < HexaPDF::Tokenizer #:nodoc:
47
48
 
49
+ # The string that is tokenized.
50
+ attr_reader :string
51
+
48
52
  # Creates a new tokenizer.
49
53
  def initialize(string)
50
54
  @ss = StringScanner.new(string)
@@ -168,6 +172,8 @@ module HexaPDF
168
172
 
169
173
  private
170
174
 
175
+ MAX_TOKEN_CHECK = 5 #:nodoc:
176
+
171
177
  # Parses the inline image at the current position.
172
178
  def parse_inline_image(tokenizer)
173
179
  # BI has already been read, so read the image dictionary
@@ -190,13 +196,41 @@ module HexaPDF
190
196
  # one whitespace character after ID
191
197
  tokenizer.next_byte
192
198
 
193
- # find the EI operator
194
- data = tokenizer.scan_until(/(?=EI[#{Tokenizer::WHITESPACE}])/o)
195
- if data.nil?
196
- raise HexaPDF::Error, "End inline image marker EI not found"
199
+ real_end_found = false
200
+ image_data = ''.b
201
+
202
+ # find the EI operator and handle EI appearing inside the image data
203
+ until real_end_found
204
+ data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
205
+ if data.nil?
206
+ raise HexaPDF::Error, "End inline image marker EI not found"
207
+ end
208
+ image_data << data
209
+ tokenizer.pos += 2
210
+ last_pos = tokenizer.pos
211
+
212
+ # Check if we found EI inside of the image data
213
+ count = 0
214
+ while count < MAX_TOKEN_CHECK
215
+ token = tokenizer.next_object(allow_keyword: true) rescue break
216
+ if token == Tokenizer::NO_MORE_TOKENS
217
+ count += MAX_TOKEN_CHECK
218
+ elsif token.kind_of?(Tokenizer::Token) &&
219
+ !Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
220
+ break # invalid token
221
+ end
222
+ count += 1
223
+ end
224
+
225
+ if count >= MAX_TOKEN_CHECK
226
+ real_end_found = true
227
+ else
228
+ image_data << "EI"
229
+ end
230
+ tokenizer.pos = last_pos
197
231
  end
198
- tokenizer.pos += 3
199
- [dict, data]
232
+
233
+ [dict, image_data]
200
234
  end
201
235
 
202
236
  end
@@ -408,7 +408,7 @@ module HexaPDF
408
408
  def decode_horizontal_text(array)
409
409
  font = graphics_state.font
410
410
  scaled_char_space = graphics_state.scaled_character_spacing
411
- scaled_word_space = graphics_state.scaled_word_spacing
411
+ scaled_word_space = (font.word_spacing_applicable? ? graphics_state.scaled_word_spacing : 0)
412
412
  scaled_font_size = graphics_state.scaled_font_size
413
413
 
414
414
  below_baseline = font.bounding_box[1] * scaled_font_size / \
@@ -423,15 +423,15 @@ module HexaPDF
423
423
  else
424
424
  font.decode(item).each do |code_point|
425
425
  char = font.to_utf8(code_point)
426
- width = font.width(code_point) * scaled_font_size
426
+ width = font.width(code_point) * scaled_font_size + scaled_char_space + \
427
+ (code_point == 32 ? scaled_word_space : 0)
427
428
  matrix = graphics_state.ctm.dup.premultiply(*graphics_state.tm)
428
429
  fragment = GlyphBox.new(code_point, char,
429
430
  *matrix.evaluate(0, below_baseline),
430
431
  *matrix.evaluate(width, below_baseline),
431
432
  *matrix.evaluate(0, above_baseline))
432
433
  text << fragment
433
- graphics_state.tm.translate(width + scaled_char_space + \
434
- (char == ' ' ? scaled_word_space : 0), 0)
434
+ graphics_state.tm.translate(width, 0)
435
435
  end
436
436
  end
437
437
  end
@@ -135,6 +135,7 @@ module HexaPDF
135
135
  end
136
136
 
137
137
  @listeners = {}
138
+ @cache = Hash.new {|h, k| h[k] = {} }
138
139
  end
139
140
 
140
141
  # :call-seq:
@@ -315,25 +316,24 @@ module HexaPDF
315
316
  if type.kind_of?(Class)
316
317
  klass = type
317
318
  else
318
- default = if data.stream
319
- HexaPDF::Stream
320
- elsif data.value.kind_of?(Hash)
321
- HexaPDF::Dictionary
322
- else
323
- HexaPDF::Object
324
- end
325
319
  if data.value.kind_of?(Hash)
326
320
  type ||= deref(data.value[:Type])
327
321
  subtype ||= deref(data.value[:Subtype])
328
322
  end
329
323
 
330
324
  if subtype
331
- klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype)
325
+ klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype) { nil }
332
326
  end
333
327
  if type && !klass
334
- klass = GlobalConfiguration.constantize('object.type_map'.freeze, type)
328
+ klass = GlobalConfiguration.constantize('object.type_map'.freeze, type) { nil }
335
329
  end
336
- klass ||= default
330
+ klass ||= if data.stream
331
+ HexaPDF::Stream
332
+ elsif data.value.kind_of?(Hash)
333
+ HexaPDF::Dictionary
334
+ else
335
+ HexaPDF::Object
336
+ end
337
337
  end
338
338
 
339
339
  klass.new(data, document: self)
@@ -418,6 +418,36 @@ module HexaPDF
418
418
  @listeners[name] && @listeners[name].each {|obj| obj.call(*args)}
419
419
  end
420
420
 
421
+ # Caches the value or the return value of the given block using the given Object::PDFData and
422
+ # key arguments as composite hash key. If a cached value already exists, it is just returned.
423
+ #
424
+ # This facility can be used to cache expensive operations in PDF objects that are easy to
425
+ # compute again.
426
+ #
427
+ # Use #clear_cache to clear the cache if necessary.
428
+ def cache(pdf_data, key, value = nil)
429
+ @cache[pdf_data][key] ||= value || yield
430
+ end
431
+
432
+ # Returns +true+ if there is a value cached for the composite key consisting of the given
433
+ # +pdf_data+ and +key+ objects.
434
+ #
435
+ # Also see: #cache
436
+ def cached?(pdf_data, key)
437
+ @cache.key?(pdf_data) && @cache[pdf_data].key?(key)
438
+ end
439
+
440
+ # Clears all cached data or, if an Object::PDFData object is given, just the cache for this one
441
+ # object.
442
+ #
443
+ # It is *not* recommended to clear the whole cache! Better clear the cache for individual PDF
444
+ # objects!
445
+ #
446
+ # Also see: #cache
447
+ def clear_cache(pdf_data = nil)
448
+ pdf_data ? @cache[pdf_data].clear : @cache.clear
449
+ end
450
+
421
451
  # Returns the Pages object that provides convenience methods for working with pages.
422
452
  #
423
453
  # Also see: HexaPDF::Type::PageTreeNode
@@ -55,6 +55,7 @@ module HexaPDF
55
55
  #
56
56
  # If a font with the same parameters has been loaded before, the cached font object is used.
57
57
  def load(name, **options)
58
+ options[:variant] ||= :none # assign default value for consistency with caching
58
59
  font = @loaded_fonts_cache[[name, options]]
59
60
  return font if font
60
61
 
@@ -143,12 +143,10 @@ module HexaPDF
143
143
  #
144
144
  # See: #set_up_encryption (for the common encryption options).
145
145
  def self.set_up_encryption(document, handler_name, **options)
146
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', handler_name)
147
- if handler.nil?
148
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name)
149
- end
150
- if handler.nil?
151
- raise HexaPDF::EncryptionError, "Could not find the specified security handler"
146
+ handler = GlobalConfiguration.constantize('encryption.filter_map', handler_name) do
147
+ GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name) do
148
+ raise HexaPDF::EncryptionError, "Could not find the specified security handler"
149
+ end
152
150
  end
153
151
 
154
152
  handler = handler.new(document)
@@ -172,12 +170,10 @@ module HexaPDF
172
170
  if dict.nil?
173
171
  raise HexaPDF::EncryptionError, "No /Encrypt dictionary found"
174
172
  end
175
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter])
176
- if handler.nil?
177
- handler = HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter])
178
- end
179
- if handler.nil?
180
- raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
173
+ handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter]) do
174
+ HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter]) do
175
+ raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
176
+ end
181
177
  end
182
178
 
183
179
  handler = handler.new(document)
@@ -45,10 +45,33 @@ module HexaPDF
45
45
  # See: HexaPDF::Filter, PDF1.7 s7.4.4
46
46
  module FlateDecode
47
47
 
48
+ class Pool #:nodoc:
49
+
50
+ # Creates a new Zlib::Stream pool. A block must be given that returns a new Zlib::Stream
51
+ # instance.
52
+ def initialize(&block)
53
+ @creator = block
54
+ @pool = []
55
+ end
56
+
57
+ # Returns the next available stream of the pool, already reset to its initial state.
58
+ def next_available
59
+ @pool.find(-> { e = @creator.call; @pool << e; e }, &:finished?).tap(&:reset)
60
+ end
61
+
62
+ end
63
+
64
+ @inflate_pool = Pool.new { Zlib::Inflate.new }
65
+ @deflate_pool = Pool.new do
66
+ Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'],
67
+ Zlib::MAX_WBITS,
68
+ HexaPDF::GlobalConfiguration['filter.flate_memory'])
69
+ end
70
+
48
71
  # See HexaPDF::Filter
49
72
  def self.decoder(source, options = nil)
50
73
  fib = Fiber.new do
51
- inflater = Zlib::Inflate.new
74
+ inflater = @inflate_pool.next_available
52
75
  while source.alive? && (data = source.resume)
53
76
  begin
54
77
  data = inflater.inflate(data)
@@ -78,7 +101,7 @@ module HexaPDF
78
101
  end
79
102
 
80
103
  Fiber.new do
81
- deflater = Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'])
104
+ deflater = @deflate_pool.next_available
82
105
  while source.alive? && (data = source.resume)
83
106
  data = deflater.deflate(data)
84
107
  Fiber.yield(data)
@@ -31,20 +31,44 @@
31
31
  # is created or manipulated using HexaPDF.
32
32
  #++
33
33
 
34
+ require 'hexapdf/error'
35
+ require 'hexapdf/data_dir'
36
+
34
37
  module HexaPDF
35
38
  module Font
36
39
 
37
40
  # Represents a CMap, a mapping from character codes to CIDs (character IDs) or to their Unicode
38
41
  # value.
39
42
  #
40
- # Currently, only the mapping to the Unicode values is supported.
41
- #
42
- # See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Note #5411
43
+ # See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Notes #5014 and #5411
43
44
  class CMap
44
45
 
45
46
  autoload(:Parser, 'hexapdf/font/cmap/parser')
46
47
  autoload(:Writer, 'hexapdf/font/cmap/writer')
47
48
 
49
+ CMAP_DIR = File.join(HexaPDF.data_dir, 'cmap') #:nodoc:
50
+
51
+ @cmap_cache = {}
52
+
53
+ # Returns +true+ if the given name specifies a predefined CMap.
54
+ def self.predefined?(name)
55
+ File.exist?(File.join(CMAP_DIR, name))
56
+ end
57
+
58
+ # Creates a new CMap object by parsing a predefined CMap with the given name.
59
+ #
60
+ # Raises an error if the given CMap is not found.
61
+ def self.for_name(name)
62
+ return @cmap_cache[name] if @cmap_cache.key?(name)
63
+
64
+ file = File.join(CMAP_DIR, name)
65
+ if File.exist?(file)
66
+ @cmap_cache[name] = parse(File.read(file, encoding: ::Encoding::UTF_8))
67
+ else
68
+ raise HexaPDF::Error, "No CMap named '#{name}' found"
69
+ end
70
+ end
71
+
48
72
  # Creates a new CMap object from the given string which needs to contain a valid CMap file.
49
73
  def self.parse(string)
50
74
  Parser.new.parse(string)
@@ -58,6 +82,7 @@ module HexaPDF
58
82
  Writer.new.create_to_unicode_cmap(mapping)
59
83
  end
60
84
 
85
+
61
86
  # The registry part of the CMap version.
62
87
  attr_accessor :registry
63
88
 
@@ -70,16 +95,107 @@ module HexaPDF
70
95
  # The name of the CMap.
71
96
  attr_accessor :name
72
97
 
73
- # The mapping from character codes to Unicode values.
74
- attr_accessor :unicode_mapping
98
+ # The writing mode of the CMap: 0 for horizontal, 1 for vertical writing.
99
+ attr_accessor :wmode
100
+
101
+ attr_reader :codespace_ranges #: nodoc:
102
+ attr_reader :cid_mapping # :nodoc:
103
+ attr_reader :cid_range_mappings # :nodoc:
104
+ attr_reader :unicode_mapping # :nodoc:
105
+ protected :codespace_ranges, :cid_mapping, :cid_range_mappings, :unicode_mapping
75
106
 
76
107
  # Creates a new CMap object.
77
108
  def initialize
78
- @unicode_mapping = Hash.new("".freeze)
109
+ @codespace_ranges = []
110
+ @cid_mapping = {}
111
+ @cid_range_mappings = []
112
+ @unicode_mapping = {}
113
+ end
114
+
115
+ # Add all mappings from the given CMap to this CMap.
116
+ def use_cmap(cmap)
117
+ @codespace_ranges.concat(cmap.codespace_ranges)
118
+ @cid_mapping.merge!(cmap.cid_mapping)
119
+ @cid_range_mappings.concat(cmap.cid_range_mappings)
120
+ @unicode_mapping.merge!(cmap.unicode_mapping)
121
+ end
122
+
123
+ # Add a codespace range using an array of ranges for the individual bytes.
124
+ #
125
+ # This means that the first range is checked against the first byte, the second range against
126
+ # the second byte and so on.
127
+ def add_codespace_range(first, *rest)
128
+ @codespace_ranges << [first, rest]
129
+ end
130
+
131
+ # Parses the string and returns all character codes.
132
+ #
133
+ # An error is raised if the string contains invalid bytes.
134
+ def read_codes(string)
135
+ codes = []
136
+ bytes = string.each_byte
137
+
138
+ loop do
139
+ byte = bytes.next
140
+ code = 0
141
+
142
+ found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
143
+ next unless first_byte_range.cover?(byte)
144
+
145
+ code = (code << 8) + byte
146
+ valid = rest_ranges.all? do |range|
147
+ begin
148
+ byte = bytes.next
149
+ rescue StopIteration
150
+ raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
151
+ end
152
+ code = (code << 8) + byte
153
+ range.cover?(byte)
154
+ end
155
+
156
+ codes << code if valid
157
+ end
158
+
159
+ unless found
160
+ raise HexaPDF::Error, "Invalid byte while reading codes via CMap: #{byte}"
161
+ end
162
+ end
163
+
164
+ codes
165
+ end
166
+
167
+ # Adds an individual mapping from character code to CID.
168
+ def add_cid_mapping(code, cid)
169
+ @cid_mapping[code] = cid
170
+ end
171
+
172
+ # Adds a CID range, mapping character codes from +start_code+ to +end_code+ to CIDs starting
173
+ # with +start_cid+.
174
+ def add_cid_range(start_code, end_code, start_cid)
175
+ @cid_range_mappings << [start_code..end_code, start_cid]
176
+ end
177
+
178
+ # Returns the CID for the given character code, or 0 if no mapping was found.
179
+ def to_cid(code)
180
+ cid = @cid_mapping.fetch(code, -1)
181
+ if cid == -1
182
+ @cid_range_mappings.reverse_each do |range, start_cid|
183
+ if range.cover?(code)
184
+ cid = start_cid + code - range.first
185
+ break
186
+ end
187
+ end
188
+ end
189
+ (cid == -1 ? 0 : cid)
190
+ end
191
+
192
+ # Adds a mapping from character code to Unicode string in UTF-8 encoding.
193
+ def add_unicode_mapping(code, string)
194
+ @unicode_mapping[code] = string
79
195
  end
80
196
 
81
- # Returns the Unicode string in UTF-8 encoding for the given character code, or an empty
82
- # string if no mapping was found.
197
+ # Returns the Unicode string in UTF-8 encoding for the given character code, or +nil+ if no
198
+ # mapping was found.
83
199
  def to_unicode(code)
84
200
  unicode_mapping[code]
85
201
  end
@@ -41,7 +41,7 @@ module HexaPDF
41
41
 
42
42
  # Parses CMap files.
43
43
  #
44
- # Currently only ToUnicode CMaps are supported.
44
+ # See: Adobe Technical Notes #5014 and #5411
45
45
  class Parser
46
46
 
47
47
  # Parses the given string and returns a CMap object.
@@ -54,10 +54,18 @@ module HexaPDF
54
54
  case token
55
55
  when 'beginbfchar'.freeze then parse_bf_char(tokenizer, cmap)
56
56
  when 'beginbfrange'.freeze then parse_bf_range(tokenizer, cmap)
57
+ when 'begincidchar'.freeze then parse_cid_char(tokenizer, cmap)
58
+ when 'begincidrange'.freeze then parse_cid_range(tokenizer, cmap)
59
+ when 'begincodespacerange'.freeze then parse_codespace_range(tokenizer, cmap)
57
60
  when 'endcmap' then break
58
61
  end
59
62
  elsif token.kind_of?(Symbol)
60
- parse_dict_mapping(tokenizer, cmap, token)
63
+ value = tokenizer.next_token
64
+ if value.kind_of?(HexaPDF::Tokenizer::Token)
65
+ parse_cmap(cmap, token) if value == 'usecmap'.freeze
66
+ else
67
+ parse_dict_mapping(cmap, token, value)
68
+ end
61
69
  end
62
70
  end
63
71
 
@@ -68,17 +76,59 @@ module HexaPDF
68
76
 
69
77
  private
70
78
 
71
- # Parses a single mapping of a dictionary pair. The +name+ of the mapping has already been
72
- # parsed.
73
- def parse_dict_mapping(tokenizer, cmap, name)
74
- value = tokenizer.next_token
75
- return if value.kind_of?(HexaPDF::Tokenizer::Token)
79
+ # Populates the CMap with the values from the CMap with the given name.
80
+ def parse_cmap(cmap, name)
81
+ cmap.use_cmap(CMap.for_name(name.to_s))
82
+ end
76
83
 
84
+ # Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
85
+ # already been parsed.
86
+ def parse_dict_mapping(cmap, name, value)
77
87
  case name
78
- when :Registry then cmap.registry = value if value.kind_of?(String)
79
- when :Ordering then cmap.ordering = value if value.kind_of?(String)
80
- when :Supplement then cmap.supplement = value if value.kind_of?(Integer)
81
- when :CMapName then cmap.name = value.to_s if value.kind_of?(Symbol)
88
+ when :Registry
89
+ cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
90
+ when :Ordering
91
+ cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
92
+ when :Supplement
93
+ cmap.supplement = value if value.kind_of?(Integer)
94
+ when :CMapName
95
+ cmap.name = value.to_s.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
96
+ when :WMode
97
+ cmap.wmode = value
98
+ end
99
+ end
100
+
101
+ # Parses the "begincodespacerange" operator at the current position.
102
+ def parse_codespace_range(tokenizer, cmap)
103
+ until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
104
+ code2 = tokenizer.next_token
105
+ byte_ranges = []
106
+ code1.each_byte.with_index do |byte, index|
107
+ byte_ranges << (byte..(code2.getbyte(index)))
108
+ end
109
+ cmap.add_codespace_range(*byte_ranges)
110
+ end
111
+ end
112
+
113
+ # Parses the "cidchar" operator at the current position.
114
+ def parse_cid_char(tokenizer, cmap)
115
+ until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
116
+ cmap.add_cid_mapping(bytes_to_int(code), tokenizer.next_token)
117
+ end
118
+ end
119
+
120
+ # Parses the "cidrange" operator at the current position.
121
+ def parse_cid_range(tokenizer, cmap)
122
+ until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
123
+ code1 = bytes_to_int(code1)
124
+ code2 = bytes_to_int(tokenizer.next_token)
125
+ cid_start = tokenizer.next_object
126
+
127
+ if code1 == code2
128
+ cmap.add_cid_mapping(code1, cid_start)
129
+ else
130
+ cmap.add_cid_range(code1, code2, cid_start)
131
+ end
82
132
  end
83
133
  end
84
134
 
@@ -86,7 +136,7 @@ module HexaPDF
86
136
  def parse_bf_char(tokenizer, cmap)
87
137
  until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
88
138
  str = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
89
- cmap.unicode_mapping[bytes_to_int(code)] = str
139
+ cmap.add_unicode_mapping(bytes_to_int(code), str)
90
140
  end
91
141
  end
92
142
 
@@ -112,13 +162,13 @@ module HexaPDF
112
162
  if dest.kind_of?(String)
113
163
  codepoint = dest.force_encoding(::Encoding::UTF_16BE).ord
114
164
  code1.upto(code2) do |code|
115
- cmap.unicode_mapping[code] = '' << codepoint
165
+ cmap.add_unicode_mapping(code, '' << codepoint)
116
166
  codepoint += 1
117
167
  end
118
168
  elsif dest.kind_of?(Array)
119
169
  code1.upto(code2) do |code|
120
- cmap.unicode_mapping[code] =
121
- dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
170
+ str = dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
171
+ cmap.add_unicode_mapping(code, str)
122
172
  end
123
173
  else
124
174
  raise HexaPDF::Error, "Invalid bfrange operator in CMap"