hexapdf 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +68 -0
- data/CONTRIBUTERS +1 -1
- data/README.md +35 -4
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/data/hexapdf/cmap/83pv-RKSJ-H +314 -0
- data/data/hexapdf/cmap/90ms-RKSJ-H +259 -0
- data/data/hexapdf/cmap/90ms-RKSJ-V +156 -0
- data/data/hexapdf/cmap/90msp-RKSJ-H +257 -0
- data/data/hexapdf/cmap/90msp-RKSJ-V +155 -0
- data/data/hexapdf/cmap/90pv-RKSJ-H +355 -0
- data/data/hexapdf/cmap/Add-RKSJ-H +738 -0
- data/data/hexapdf/cmap/Add-RKSJ-V +135 -0
- data/data/hexapdf/cmap/Adobe-CNS1-UCS2 +18209 -0
- data/data/hexapdf/cmap/Adobe-GB1-UCS2 +14267 -0
- data/data/hexapdf/cmap/Adobe-Japan1-UCS2 +19159 -0
- data/data/hexapdf/cmap/Adobe-Korea1-UCS2 +9267 -0
- data/data/hexapdf/cmap/B5pc-H +337 -0
- data/data/hexapdf/cmap/B5pc-V +90 -0
- data/data/hexapdf/cmap/CNS-EUC-H +490 -0
- data/data/hexapdf/cmap/CNS-EUC-V +538 -0
- data/data/hexapdf/cmap/ETen-B5-H +343 -0
- data/data/hexapdf/cmap/ETen-B5-V +91 -0
- data/data/hexapdf/cmap/ETenms-B5-H +79 -0
- data/data/hexapdf/cmap/ETenms-B5-V +99 -0
- data/data/hexapdf/cmap/EUC-H +207 -0
- data/data/hexapdf/cmap/EUC-V +105 -0
- data/data/hexapdf/cmap/Ext-RKSJ-H +768 -0
- data/data/hexapdf/cmap/Ext-RKSJ-V +117 -0
- data/data/hexapdf/cmap/GB-EUC-H +173 -0
- data/data/hexapdf/cmap/GB-EUC-V +98 -0
- data/data/hexapdf/cmap/GBK-EUC-H +4273 -0
- data/data/hexapdf/cmap/GBK-EUC-V +97 -0
- data/data/hexapdf/cmap/GBK2K-H +5325 -0
- data/data/hexapdf/cmap/GBK2K-V +118 -0
- data/data/hexapdf/cmap/GBKp-EUC-H +4272 -0
- data/data/hexapdf/cmap/GBKp-EUC-V +97 -0
- data/data/hexapdf/cmap/GBpc-EUC-H +175 -0
- data/data/hexapdf/cmap/GBpc-EUC-V +98 -0
- data/data/hexapdf/cmap/H +200 -0
- data/data/hexapdf/cmap/HKscs-B5-H +1331 -0
- data/data/hexapdf/cmap/HKscs-B5-V +90 -0
- data/data/hexapdf/cmap/Identity-H +339 -0
- data/data/hexapdf/cmap/Identity-V +73 -0
- data/data/hexapdf/cmap/KSC-EUC-H +562 -0
- data/data/hexapdf/cmap/KSC-EUC-V +94 -0
- data/data/hexapdf/cmap/KSCms-UHC-H +776 -0
- data/data/hexapdf/cmap/KSCms-UHC-HW-H +775 -0
- data/data/hexapdf/cmap/KSCms-UHC-HW-V +93 -0
- data/data/hexapdf/cmap/KSCms-UHC-V +94 -0
- data/data/hexapdf/cmap/KSCpc-EUC-H +608 -0
- data/data/hexapdf/cmap/LICENSE.txt +26 -0
- data/data/hexapdf/cmap/README.txt +9 -0
- data/data/hexapdf/cmap/UniCNS-UCS2-H +16992 -0
- data/data/hexapdf/cmap/UniCNS-UCS2-V +90 -0
- data/data/hexapdf/cmap/UniCNS-UTF16-H +19117 -0
- data/data/hexapdf/cmap/UniCNS-UTF16-V +94 -0
- data/data/hexapdf/cmap/UniGB-UCS2-H +14321 -0
- data/data/hexapdf/cmap/UniGB-UCS2-V +101 -0
- data/data/hexapdf/cmap/UniGB-UTF16-H +14381 -0
- data/data/hexapdf/cmap/UniGB-UTF16-V +104 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-H +8870 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-HW-H +81 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-HW-V +279 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-V +275 -0
- data/data/hexapdf/cmap/UniJIS-UTF16-H +14450 -0
- data/data/hexapdf/cmap/UniJIS-UTF16-V +299 -0
- data/data/hexapdf/cmap/UniKS-UCS2-H +8725 -0
- data/data/hexapdf/cmap/UniKS-UCS2-V +95 -0
- data/data/hexapdf/cmap/UniKS-UTF16-H +8895 -0
- data/data/hexapdf/cmap/UniKS-UTF16-V +99 -0
- data/data/hexapdf/cmap/V +105 -0
- data/examples/arc.rb +3 -3
- data/examples/merging.rb +4 -1
- data/examples/optimizing.rb +3 -0
- data/examples/show_char_bboxes.rb +2 -2
- data/examples/truetype.rb +2 -2
- data/lib/hexapdf/cli.rb +40 -1
- data/lib/hexapdf/cli/batch.rb +72 -0
- data/lib/hexapdf/cli/command.rb +112 -15
- data/lib/hexapdf/cli/files.rb +2 -2
- data/lib/hexapdf/cli/images.rb +14 -6
- data/lib/hexapdf/cli/info.rb +6 -8
- data/lib/hexapdf/cli/inspect.rb +5 -8
- data/lib/hexapdf/cli/merge.rb +13 -20
- data/lib/hexapdf/cli/modify.rb +4 -7
- data/lib/hexapdf/cli/optimize.rb +2 -5
- data/lib/hexapdf/configuration.rb +32 -3
- data/lib/hexapdf/content/canvas.rb +130 -37
- data/lib/hexapdf/content/parser.rb +40 -6
- data/lib/hexapdf/content/processor.rb +4 -4
- data/lib/hexapdf/document.rb +40 -10
- data/lib/hexapdf/document/fonts.rb +1 -0
- data/lib/hexapdf/encryption/security_handler.rb +8 -12
- data/lib/hexapdf/filter/flate_decode.rb +25 -2
- data/lib/hexapdf/font/cmap.rb +124 -8
- data/lib/hexapdf/font/cmap/parser.rb +65 -15
- data/lib/hexapdf/font/encoding/base.rb +2 -2
- data/lib/hexapdf/font/encoding/glyph_list.rb +2 -4
- data/lib/hexapdf/font/true_type.rb +1 -0
- data/lib/hexapdf/font/true_type/builder.rb +75 -0
- data/lib/hexapdf/font/true_type/optimizer.rb +65 -0
- data/lib/hexapdf/font/true_type/subsetter.rb +9 -22
- data/lib/hexapdf/font/true_type_wrapper.rb +9 -21
- data/lib/hexapdf/font_loader.rb +1 -1
- data/lib/hexapdf/importer.rb +1 -1
- data/lib/hexapdf/serializer.rb +5 -3
- data/lib/hexapdf/type.rb +2 -0
- data/lib/hexapdf/type/cid_font.rb +120 -0
- data/lib/hexapdf/type/font.rb +32 -12
- data/lib/hexapdf/type/font_simple.rb +34 -42
- data/lib/hexapdf/type/font_type0.rb +148 -0
- data/lib/hexapdf/type/form.rb +4 -4
- data/lib/hexapdf/type/page.rb +12 -11
- data/lib/hexapdf/type/resources.rb +14 -0
- data/lib/hexapdf/utils/graphics_helpers.rb +77 -0
- data/lib/hexapdf/version.rb +1 -1
- data/man/man1/hexapdf.1 +43 -1
- data/test/hexapdf/content/test_canvas.rb +76 -0
- data/test/hexapdf/content/test_parser.rb +20 -1
- data/test/hexapdf/content/test_processor.rb +11 -7
- data/test/hexapdf/document/test_fonts.rb +3 -1
- data/test/hexapdf/font/cmap/test_parser.rb +42 -7
- data/test/hexapdf/font/encoding/test_base.rb +1 -1
- data/test/hexapdf/font/encoding/test_glyph_list.rb +3 -3
- data/test/hexapdf/font/test_cmap.rb +104 -0
- data/test/hexapdf/font/test_true_type_wrapper.rb +63 -46
- data/test/hexapdf/font/true_type/test_builder.rb +37 -0
- data/test/hexapdf/font/true_type/test_optimizer.rb +27 -0
- data/test/hexapdf/font/true_type/test_subsetter.rb +6 -13
- data/test/hexapdf/test_configuration.rb +12 -7
- data/test/hexapdf/test_document.rb +24 -0
- data/test/hexapdf/test_importer.rb +9 -1
- data/test/hexapdf/test_writer.rb +2 -2
- data/test/hexapdf/type/test_cid_font.rb +61 -0
- data/test/hexapdf/type/test_font.rb +31 -4
- data/test/hexapdf/type/test_font_simple.rb +6 -21
- data/test/hexapdf/type/test_font_type0.rb +114 -0
- data/test/hexapdf/type/test_resources.rb +17 -1
- data/test/hexapdf/utils/test_graphics_helpers.rb +29 -0
- metadata +82 -3
@@ -33,6 +33,7 @@
|
|
33
33
|
|
34
34
|
require 'stringio'
|
35
35
|
require 'hexapdf/tokenizer'
|
36
|
+
require 'hexapdf/content/processor'
|
36
37
|
|
37
38
|
module HexaPDF
|
38
39
|
module Content
|
@@ -45,6 +46,9 @@ module HexaPDF
|
|
45
46
|
# See: PDF1.7 s7.2
|
46
47
|
class Tokenizer < HexaPDF::Tokenizer #:nodoc:
|
47
48
|
|
49
|
+
# The string that is tokenized.
|
50
|
+
attr_reader :string
|
51
|
+
|
48
52
|
# Creates a new tokenizer.
|
49
53
|
def initialize(string)
|
50
54
|
@ss = StringScanner.new(string)
|
@@ -168,6 +172,8 @@ module HexaPDF
|
|
168
172
|
|
169
173
|
private
|
170
174
|
|
175
|
+
MAX_TOKEN_CHECK = 5 #:nodoc:
|
176
|
+
|
171
177
|
# Parses the inline image at the current position.
|
172
178
|
def parse_inline_image(tokenizer)
|
173
179
|
# BI has already been read, so read the image dictionary
|
@@ -190,13 +196,41 @@ module HexaPDF
|
|
190
196
|
# one whitespace character after ID
|
191
197
|
tokenizer.next_byte
|
192
198
|
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
199
|
+
real_end_found = false
|
200
|
+
image_data = ''.b
|
201
|
+
|
202
|
+
# find the EI operator and handle EI appearing inside the image data
|
203
|
+
until real_end_found
|
204
|
+
data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
|
205
|
+
if data.nil?
|
206
|
+
raise HexaPDF::Error, "End inline image marker EI not found"
|
207
|
+
end
|
208
|
+
image_data << data
|
209
|
+
tokenizer.pos += 2
|
210
|
+
last_pos = tokenizer.pos
|
211
|
+
|
212
|
+
# Check if we found EI inside of the image data
|
213
|
+
count = 0
|
214
|
+
while count < MAX_TOKEN_CHECK
|
215
|
+
token = tokenizer.next_object(allow_keyword: true) rescue break
|
216
|
+
if token == Tokenizer::NO_MORE_TOKENS
|
217
|
+
count += MAX_TOKEN_CHECK
|
218
|
+
elsif token.kind_of?(Tokenizer::Token) &&
|
219
|
+
!Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
|
220
|
+
break # invalid token
|
221
|
+
end
|
222
|
+
count += 1
|
223
|
+
end
|
224
|
+
|
225
|
+
if count >= MAX_TOKEN_CHECK
|
226
|
+
real_end_found = true
|
227
|
+
else
|
228
|
+
image_data << "EI"
|
229
|
+
end
|
230
|
+
tokenizer.pos = last_pos
|
197
231
|
end
|
198
|
-
|
199
|
-
[dict,
|
232
|
+
|
233
|
+
[dict, image_data]
|
200
234
|
end
|
201
235
|
|
202
236
|
end
|
@@ -408,7 +408,7 @@ module HexaPDF
|
|
408
408
|
def decode_horizontal_text(array)
|
409
409
|
font = graphics_state.font
|
410
410
|
scaled_char_space = graphics_state.scaled_character_spacing
|
411
|
-
scaled_word_space = graphics_state.scaled_word_spacing
|
411
|
+
scaled_word_space = (font.word_spacing_applicable? ? graphics_state.scaled_word_spacing : 0)
|
412
412
|
scaled_font_size = graphics_state.scaled_font_size
|
413
413
|
|
414
414
|
below_baseline = font.bounding_box[1] * scaled_font_size / \
|
@@ -423,15 +423,15 @@ module HexaPDF
|
|
423
423
|
else
|
424
424
|
font.decode(item).each do |code_point|
|
425
425
|
char = font.to_utf8(code_point)
|
426
|
-
width = font.width(code_point) * scaled_font_size
|
426
|
+
width = font.width(code_point) * scaled_font_size + scaled_char_space + \
|
427
|
+
(code_point == 32 ? scaled_word_space : 0)
|
427
428
|
matrix = graphics_state.ctm.dup.premultiply(*graphics_state.tm)
|
428
429
|
fragment = GlyphBox.new(code_point, char,
|
429
430
|
*matrix.evaluate(0, below_baseline),
|
430
431
|
*matrix.evaluate(width, below_baseline),
|
431
432
|
*matrix.evaluate(0, above_baseline))
|
432
433
|
text << fragment
|
433
|
-
graphics_state.tm.translate(width
|
434
|
-
(char == ' ' ? scaled_word_space : 0), 0)
|
434
|
+
graphics_state.tm.translate(width, 0)
|
435
435
|
end
|
436
436
|
end
|
437
437
|
end
|
data/lib/hexapdf/document.rb
CHANGED
@@ -135,6 +135,7 @@ module HexaPDF
|
|
135
135
|
end
|
136
136
|
|
137
137
|
@listeners = {}
|
138
|
+
@cache = Hash.new {|h, k| h[k] = {} }
|
138
139
|
end
|
139
140
|
|
140
141
|
# :call-seq:
|
@@ -315,25 +316,24 @@ module HexaPDF
|
|
315
316
|
if type.kind_of?(Class)
|
316
317
|
klass = type
|
317
318
|
else
|
318
|
-
default = if data.stream
|
319
|
-
HexaPDF::Stream
|
320
|
-
elsif data.value.kind_of?(Hash)
|
321
|
-
HexaPDF::Dictionary
|
322
|
-
else
|
323
|
-
HexaPDF::Object
|
324
|
-
end
|
325
319
|
if data.value.kind_of?(Hash)
|
326
320
|
type ||= deref(data.value[:Type])
|
327
321
|
subtype ||= deref(data.value[:Subtype])
|
328
322
|
end
|
329
323
|
|
330
324
|
if subtype
|
331
|
-
klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype)
|
325
|
+
klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype) { nil }
|
332
326
|
end
|
333
327
|
if type && !klass
|
334
|
-
klass = GlobalConfiguration.constantize('object.type_map'.freeze, type)
|
328
|
+
klass = GlobalConfiguration.constantize('object.type_map'.freeze, type) { nil }
|
335
329
|
end
|
336
|
-
klass ||=
|
330
|
+
klass ||= if data.stream
|
331
|
+
HexaPDF::Stream
|
332
|
+
elsif data.value.kind_of?(Hash)
|
333
|
+
HexaPDF::Dictionary
|
334
|
+
else
|
335
|
+
HexaPDF::Object
|
336
|
+
end
|
337
337
|
end
|
338
338
|
|
339
339
|
klass.new(data, document: self)
|
@@ -418,6 +418,36 @@ module HexaPDF
|
|
418
418
|
@listeners[name] && @listeners[name].each {|obj| obj.call(*args)}
|
419
419
|
end
|
420
420
|
|
421
|
+
# Caches the value or the return value of the given block using the given Object::PDFData and
|
422
|
+
# key arguments as composite hash key. If a cached value already exists, it is just returned.
|
423
|
+
#
|
424
|
+
# This facility can be used to cache expensive operations in PDF objects that are easy to
|
425
|
+
# compute again.
|
426
|
+
#
|
427
|
+
# Use #clear_cache to clear the cache if necessary.
|
428
|
+
def cache(pdf_data, key, value = nil)
|
429
|
+
@cache[pdf_data][key] ||= value || yield
|
430
|
+
end
|
431
|
+
|
432
|
+
# Returns +true+ if there is a value cached for the composite key consisting of the given
|
433
|
+
# +pdf_data+ and +key+ objects.
|
434
|
+
#
|
435
|
+
# Also see: #cache
|
436
|
+
def cached?(pdf_data, key)
|
437
|
+
@cache.key?(pdf_data) && @cache[pdf_data].key?(key)
|
438
|
+
end
|
439
|
+
|
440
|
+
# Clears all cached data or, if a Object::PDFData object is given, just the cache for this one
|
441
|
+
# object.
|
442
|
+
#
|
443
|
+
# It is *not* recommended to clear the whole cache! Better clear the cache for individual PDF
|
444
|
+
# objects!
|
445
|
+
#
|
446
|
+
# Also see: #cache
|
447
|
+
def clear_cache(pdf_data = nil)
|
448
|
+
pdf_data ? @cache[pdf_data].clear : @cache.clear
|
449
|
+
end
|
450
|
+
|
421
451
|
# Returns the Pages object that provides convenience methods for working with pages.
|
422
452
|
#
|
423
453
|
# Also see: HexaPDF::Type::PageTreeNode
|
@@ -55,6 +55,7 @@ module HexaPDF
|
|
55
55
|
#
|
56
56
|
# If a font with the same parameters has been loaded before, the cached font object is used.
|
57
57
|
def load(name, **options)
|
58
|
+
options[:variant] ||= :none # assign default value for consistency with caching
|
58
59
|
font = @loaded_fonts_cache[[name, options]]
|
59
60
|
return font if font
|
60
61
|
|
@@ -143,12 +143,10 @@ module HexaPDF
|
|
143
143
|
#
|
144
144
|
# See: #set_up_encryption (for the common encryption options).
|
145
145
|
def self.set_up_encryption(document, handler_name, **options)
|
146
|
-
handler =
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
if handler.nil?
|
151
|
-
raise HexaPDF::EncryptionError, "Could not find the specified security handler"
|
146
|
+
handler = GlobalConfiguration.constantize('encryption.filter_map', handler_name) do
|
147
|
+
GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name) do
|
148
|
+
raise HexaPDF::EncryptionError, "Could not find the specified security handler"
|
149
|
+
end
|
152
150
|
end
|
153
151
|
|
154
152
|
handler = handler.new(document)
|
@@ -172,12 +170,10 @@ module HexaPDF
|
|
172
170
|
if dict.nil?
|
173
171
|
raise HexaPDF::EncryptionError, "No /Encrypt dictionary found"
|
174
172
|
end
|
175
|
-
handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter])
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
if handler.nil?
|
180
|
-
raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
|
173
|
+
handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter]) do
|
174
|
+
HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter]) do
|
175
|
+
raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
|
176
|
+
end
|
181
177
|
end
|
182
178
|
|
183
179
|
handler = handler.new(document)
|
@@ -45,10 +45,33 @@ module HexaPDF
|
|
45
45
|
# See: HexaPDF::Filter, PDF1.7 s7.4.4
|
46
46
|
module FlateDecode
|
47
47
|
|
48
|
+
class Pool #:nodoc:
|
49
|
+
|
50
|
+
# Creates a new Zlib::Stream pool. A block must be given that returns a new Zlib::Stream
|
51
|
+
# instance.
|
52
|
+
def initialize(&block)
|
53
|
+
@creator = block
|
54
|
+
@pool = []
|
55
|
+
end
|
56
|
+
|
57
|
+
# Returns the next available stream of the pool, already reset to its initial state.
|
58
|
+
def next_available
|
59
|
+
@pool.find(-> { e = @creator.call; @pool << e; e }, &:finished?).tap(&:reset)
|
60
|
+
end
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
@inflate_pool = Pool.new { Zlib::Inflate.new }
|
65
|
+
@deflate_pool = Pool.new do
|
66
|
+
Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'],
|
67
|
+
Zlib::MAX_WBITS,
|
68
|
+
HexaPDF::GlobalConfiguration['filter.flate_memory'])
|
69
|
+
end
|
70
|
+
|
48
71
|
# See HexaPDF::Filter
|
49
72
|
def self.decoder(source, options = nil)
|
50
73
|
fib = Fiber.new do
|
51
|
-
inflater =
|
74
|
+
inflater = @inflate_pool.next_available
|
52
75
|
while source.alive? && (data = source.resume)
|
53
76
|
begin
|
54
77
|
data = inflater.inflate(data)
|
@@ -78,7 +101,7 @@ module HexaPDF
|
|
78
101
|
end
|
79
102
|
|
80
103
|
Fiber.new do
|
81
|
-
deflater =
|
104
|
+
deflater = @deflate_pool.next_available
|
82
105
|
while source.alive? && (data = source.resume)
|
83
106
|
data = deflater.deflate(data)
|
84
107
|
Fiber.yield(data)
|
data/lib/hexapdf/font/cmap.rb
CHANGED
@@ -31,20 +31,44 @@
|
|
31
31
|
# is created or manipulated using HexaPDF.
|
32
32
|
#++
|
33
33
|
|
34
|
+
require 'hexapdf/error'
|
35
|
+
require 'hexapdf/data_dir'
|
36
|
+
|
34
37
|
module HexaPDF
|
35
38
|
module Font
|
36
39
|
|
37
40
|
# Represents a CMap, a mapping from character codes to CIDs (character IDs) or to their Unicode
|
38
41
|
# value.
|
39
42
|
#
|
40
|
-
#
|
41
|
-
#
|
42
|
-
# See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Note #5411
|
43
|
+
# See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Notes #5014 and #5411
|
43
44
|
class CMap
|
44
45
|
|
45
46
|
autoload(:Parser, 'hexapdf/font/cmap/parser')
|
46
47
|
autoload(:Writer, 'hexapdf/font/cmap/writer')
|
47
48
|
|
49
|
+
CMAP_DIR = File.join(HexaPDF.data_dir, 'cmap') #:nodoc:
|
50
|
+
|
51
|
+
@cmap_cache = {}
|
52
|
+
|
53
|
+
# Returns +true+ if the given name specifies a predefined CMap.
|
54
|
+
def self.predefined?(name)
|
55
|
+
File.exist?(File.join(CMAP_DIR, name))
|
56
|
+
end
|
57
|
+
|
58
|
+
# Creates a new CMap object by parsing a predefined CMap with the given name.
|
59
|
+
#
|
60
|
+
# Raises an error if the given CMap is not found.
|
61
|
+
def self.for_name(name)
|
62
|
+
return @cmap_cache[name] if @cmap_cache.key?(name)
|
63
|
+
|
64
|
+
file = File.join(CMAP_DIR, name)
|
65
|
+
if File.exist?(file)
|
66
|
+
@cmap_cache[name] = parse(File.read(file, encoding: ::Encoding::UTF_8))
|
67
|
+
else
|
68
|
+
raise HexaPDF::Error, "No CMap named '#{name}' found"
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
48
72
|
# Creates a new CMap object from the given string which needs to contain a valid CMap file.
|
49
73
|
def self.parse(string)
|
50
74
|
Parser.new.parse(string)
|
@@ -58,6 +82,7 @@ module HexaPDF
|
|
58
82
|
Writer.new.create_to_unicode_cmap(mapping)
|
59
83
|
end
|
60
84
|
|
85
|
+
|
61
86
|
# The registry part of the CMap version.
|
62
87
|
attr_accessor :registry
|
63
88
|
|
@@ -70,16 +95,107 @@ module HexaPDF
|
|
70
95
|
# The name of the CMap.
|
71
96
|
attr_accessor :name
|
72
97
|
|
73
|
-
# The
|
74
|
-
attr_accessor :
|
98
|
+
# The writing mode of the CMap: 0 for horizontal, 1 for vertical writing.
|
99
|
+
attr_accessor :wmode
|
100
|
+
|
101
|
+
attr_reader :codespace_ranges #: nodoc:
|
102
|
+
attr_reader :cid_mapping # :nodoc:
|
103
|
+
attr_reader :cid_range_mappings # :nodoc:
|
104
|
+
attr_reader :unicode_mapping # :nodoc:
|
105
|
+
protected :codespace_ranges, :cid_mapping, :cid_range_mappings, :unicode_mapping
|
75
106
|
|
76
107
|
# Creates a new CMap object.
|
77
108
|
def initialize
|
78
|
-
@
|
109
|
+
@codespace_ranges = []
|
110
|
+
@cid_mapping = {}
|
111
|
+
@cid_range_mappings = []
|
112
|
+
@unicode_mapping = {}
|
113
|
+
end
|
114
|
+
|
115
|
+
# Add all mappings from the given CMap to this CMap.
|
116
|
+
def use_cmap(cmap)
|
117
|
+
@codespace_ranges.concat(cmap.codespace_ranges)
|
118
|
+
@cid_mapping.merge!(cmap.cid_mapping)
|
119
|
+
@cid_range_mappings.concat(cmap.cid_range_mappings)
|
120
|
+
@unicode_mapping.merge!(cmap.unicode_mapping)
|
121
|
+
end
|
122
|
+
|
123
|
+
# Add a codespace range using an array of ranges for the individual bytes.
|
124
|
+
#
|
125
|
+
# This means that the first range is checked against the first byte, the second range against
|
126
|
+
# the second byte and so on.
|
127
|
+
def add_codespace_range(first, *rest)
|
128
|
+
@codespace_ranges << [first, rest]
|
129
|
+
end
|
130
|
+
|
131
|
+
# Parses the string and returns all character codes.
|
132
|
+
#
|
133
|
+
# An error is raised if the string contains invalid bytes.
|
134
|
+
def read_codes(string)
|
135
|
+
codes = []
|
136
|
+
bytes = string.each_byte
|
137
|
+
|
138
|
+
loop do
|
139
|
+
byte = bytes.next
|
140
|
+
code = 0
|
141
|
+
|
142
|
+
found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
|
143
|
+
next unless first_byte_range.cover?(byte)
|
144
|
+
|
145
|
+
code = (code << 8) + byte
|
146
|
+
valid = rest_ranges.all? do |range|
|
147
|
+
begin
|
148
|
+
byte = bytes.next
|
149
|
+
rescue StopIteration
|
150
|
+
raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
|
151
|
+
end
|
152
|
+
code = (code << 8) + byte
|
153
|
+
range.cover?(byte)
|
154
|
+
end
|
155
|
+
|
156
|
+
codes << code if valid
|
157
|
+
end
|
158
|
+
|
159
|
+
unless found
|
160
|
+
raise HexaPDF::Error, "Invalid byte while reading codes via CMap: #{byte}"
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
codes
|
165
|
+
end
|
166
|
+
|
167
|
+
# Adds an individual mapping from character code to CID.
|
168
|
+
def add_cid_mapping(code, cid)
|
169
|
+
@cid_mapping[code] = cid
|
170
|
+
end
|
171
|
+
|
172
|
+
# Adds a CID range, mapping characters codes from +start_code+ to +end_code+ to CIDs starting
|
173
|
+
# with +start_cid+.
|
174
|
+
def add_cid_range(start_code, end_code, start_cid)
|
175
|
+
@cid_range_mappings << [start_code..end_code, start_cid]
|
176
|
+
end
|
177
|
+
|
178
|
+
# Returns the CID for the given character code, or 0 if no mapping was found.
|
179
|
+
def to_cid(code)
|
180
|
+
cid = @cid_mapping.fetch(code, -1)
|
181
|
+
if cid == -1
|
182
|
+
@cid_range_mappings.reverse_each do |range, start_cid|
|
183
|
+
if range.cover?(code)
|
184
|
+
cid = start_cid + code - range.first
|
185
|
+
break
|
186
|
+
end
|
187
|
+
end
|
188
|
+
end
|
189
|
+
(cid == -1 ? 0 : cid)
|
190
|
+
end
|
191
|
+
|
192
|
+
# Adds a mapping from character code to Unicode string in UTF-8 encoding.
|
193
|
+
def add_unicode_mapping(code, string)
|
194
|
+
@unicode_mapping[code] = string
|
79
195
|
end
|
80
196
|
|
81
|
-
# Returns the Unicode string in UTF-8 encoding for the given character code, or
|
82
|
-
#
|
197
|
+
# Returns the Unicode string in UTF-8 encoding for the given character code, or +nil+ if no
|
198
|
+
# mapping was found.
|
83
199
|
def to_unicode(code)
|
84
200
|
unicode_mapping[code]
|
85
201
|
end
|
@@ -41,7 +41,7 @@ module HexaPDF
|
|
41
41
|
|
42
42
|
# Parses CMap files.
|
43
43
|
#
|
44
|
-
#
|
44
|
+
# See: Adobe Technical Notes #5014 and #5411
|
45
45
|
class Parser
|
46
46
|
|
47
47
|
# Parses the given string and returns a CMap object.
|
@@ -54,10 +54,18 @@ module HexaPDF
|
|
54
54
|
case token
|
55
55
|
when 'beginbfchar'.freeze then parse_bf_char(tokenizer, cmap)
|
56
56
|
when 'beginbfrange'.freeze then parse_bf_range(tokenizer, cmap)
|
57
|
+
when 'begincidchar'.freeze then parse_cid_char(tokenizer, cmap)
|
58
|
+
when 'begincidrange'.freeze then parse_cid_range(tokenizer, cmap)
|
59
|
+
when 'begincodespacerange'.freeze then parse_codespace_range(tokenizer, cmap)
|
57
60
|
when 'endcmap' then break
|
58
61
|
end
|
59
62
|
elsif token.kind_of?(Symbol)
|
60
|
-
|
63
|
+
value = tokenizer.next_token
|
64
|
+
if value.kind_of?(HexaPDF::Tokenizer::Token)
|
65
|
+
parse_cmap(cmap, token) if value == 'usecmap'.freeze
|
66
|
+
else
|
67
|
+
parse_dict_mapping(cmap, token, value)
|
68
|
+
end
|
61
69
|
end
|
62
70
|
end
|
63
71
|
|
@@ -68,17 +76,59 @@ module HexaPDF
|
|
68
76
|
|
69
77
|
private
|
70
78
|
|
71
|
-
#
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
return if value.kind_of?(HexaPDF::Tokenizer::Token)
|
79
|
+
# Populates the CMap with the values from the CMap with the given name.
|
80
|
+
def parse_cmap(cmap, name)
|
81
|
+
cmap.use_cmap(CMap.for_name(name.to_s))
|
82
|
+
end
|
76
83
|
|
84
|
+
# Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
|
85
|
+
# already been parsed.
|
86
|
+
def parse_dict_mapping(cmap, name, value)
|
77
87
|
case name
|
78
|
-
when :Registry
|
79
|
-
|
80
|
-
when :
|
81
|
-
|
88
|
+
when :Registry
|
89
|
+
cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
|
90
|
+
when :Ordering
|
91
|
+
cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
|
92
|
+
when :Supplement
|
93
|
+
cmap.supplement = value if value.kind_of?(Integer)
|
94
|
+
when :CMapName
|
95
|
+
cmap.name = value.to_s.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
|
96
|
+
when :WMode
|
97
|
+
cmap.wmode = value
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
# Parses the "begincodespacerange" operator at the current position.
|
102
|
+
def parse_codespace_range(tokenizer, cmap)
|
103
|
+
until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
104
|
+
code2 = tokenizer.next_token
|
105
|
+
byte_ranges = []
|
106
|
+
code1.each_byte.with_index do |byte, index|
|
107
|
+
byte_ranges << (byte..(code2.getbyte(index)))
|
108
|
+
end
|
109
|
+
cmap.add_codespace_range(*byte_ranges)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
# Parses the "cidchar" operator at the current position.
|
114
|
+
def parse_cid_char(tokenizer, cmap)
|
115
|
+
until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
116
|
+
cmap.add_cid_mapping(bytes_to_int(code), tokenizer.next_token)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
# Parses the "cidrange" operator at the current position.
|
121
|
+
def parse_cid_range(tokenizer, cmap)
|
122
|
+
until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
123
|
+
code1 = bytes_to_int(code1)
|
124
|
+
code2 = bytes_to_int(tokenizer.next_token)
|
125
|
+
cid_start = tokenizer.next_object
|
126
|
+
|
127
|
+
if code1 == code2
|
128
|
+
cmap.add_cid_mapping(code1, cid_start)
|
129
|
+
else
|
130
|
+
cmap.add_cid_range(code1, code2, cid_start)
|
131
|
+
end
|
82
132
|
end
|
83
133
|
end
|
84
134
|
|
@@ -86,7 +136,7 @@ module HexaPDF
|
|
86
136
|
def parse_bf_char(tokenizer, cmap)
|
87
137
|
until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
88
138
|
str = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
|
89
|
-
cmap.
|
139
|
+
cmap.add_unicode_mapping(bytes_to_int(code), str)
|
90
140
|
end
|
91
141
|
end
|
92
142
|
|
@@ -112,13 +162,13 @@ module HexaPDF
|
|
112
162
|
if dest.kind_of?(String)
|
113
163
|
codepoint = dest.force_encoding(::Encoding::UTF_16BE).ord
|
114
164
|
code1.upto(code2) do |code|
|
115
|
-
cmap.
|
165
|
+
cmap.add_unicode_mapping(code, '' << codepoint)
|
116
166
|
codepoint += 1
|
117
167
|
end
|
118
168
|
elsif dest.kind_of?(Array)
|
119
169
|
code1.upto(code2) do |code|
|
120
|
-
|
121
|
-
|
170
|
+
str = dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
|
171
|
+
cmap.add_unicode_mapping(code, str)
|
122
172
|
end
|
123
173
|
else
|
124
174
|
raise HexaPDF::Error, "Invalid bfrange operator in CMap"
|