hexapdf 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +68 -0
- data/CONTRIBUTERS +1 -1
- data/README.md +35 -4
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/data/hexapdf/cmap/83pv-RKSJ-H +314 -0
- data/data/hexapdf/cmap/90ms-RKSJ-H +259 -0
- data/data/hexapdf/cmap/90ms-RKSJ-V +156 -0
- data/data/hexapdf/cmap/90msp-RKSJ-H +257 -0
- data/data/hexapdf/cmap/90msp-RKSJ-V +155 -0
- data/data/hexapdf/cmap/90pv-RKSJ-H +355 -0
- data/data/hexapdf/cmap/Add-RKSJ-H +738 -0
- data/data/hexapdf/cmap/Add-RKSJ-V +135 -0
- data/data/hexapdf/cmap/Adobe-CNS1-UCS2 +18209 -0
- data/data/hexapdf/cmap/Adobe-GB1-UCS2 +14267 -0
- data/data/hexapdf/cmap/Adobe-Japan1-UCS2 +19159 -0
- data/data/hexapdf/cmap/Adobe-Korea1-UCS2 +9267 -0
- data/data/hexapdf/cmap/B5pc-H +337 -0
- data/data/hexapdf/cmap/B5pc-V +90 -0
- data/data/hexapdf/cmap/CNS-EUC-H +490 -0
- data/data/hexapdf/cmap/CNS-EUC-V +538 -0
- data/data/hexapdf/cmap/ETen-B5-H +343 -0
- data/data/hexapdf/cmap/ETen-B5-V +91 -0
- data/data/hexapdf/cmap/ETenms-B5-H +79 -0
- data/data/hexapdf/cmap/ETenms-B5-V +99 -0
- data/data/hexapdf/cmap/EUC-H +207 -0
- data/data/hexapdf/cmap/EUC-V +105 -0
- data/data/hexapdf/cmap/Ext-RKSJ-H +768 -0
- data/data/hexapdf/cmap/Ext-RKSJ-V +117 -0
- data/data/hexapdf/cmap/GB-EUC-H +173 -0
- data/data/hexapdf/cmap/GB-EUC-V +98 -0
- data/data/hexapdf/cmap/GBK-EUC-H +4273 -0
- data/data/hexapdf/cmap/GBK-EUC-V +97 -0
- data/data/hexapdf/cmap/GBK2K-H +5325 -0
- data/data/hexapdf/cmap/GBK2K-V +118 -0
- data/data/hexapdf/cmap/GBKp-EUC-H +4272 -0
- data/data/hexapdf/cmap/GBKp-EUC-V +97 -0
- data/data/hexapdf/cmap/GBpc-EUC-H +175 -0
- data/data/hexapdf/cmap/GBpc-EUC-V +98 -0
- data/data/hexapdf/cmap/H +200 -0
- data/data/hexapdf/cmap/HKscs-B5-H +1331 -0
- data/data/hexapdf/cmap/HKscs-B5-V +90 -0
- data/data/hexapdf/cmap/Identity-H +339 -0
- data/data/hexapdf/cmap/Identity-V +73 -0
- data/data/hexapdf/cmap/KSC-EUC-H +562 -0
- data/data/hexapdf/cmap/KSC-EUC-V +94 -0
- data/data/hexapdf/cmap/KSCms-UHC-H +776 -0
- data/data/hexapdf/cmap/KSCms-UHC-HW-H +775 -0
- data/data/hexapdf/cmap/KSCms-UHC-HW-V +93 -0
- data/data/hexapdf/cmap/KSCms-UHC-V +94 -0
- data/data/hexapdf/cmap/KSCpc-EUC-H +608 -0
- data/data/hexapdf/cmap/LICENSE.txt +26 -0
- data/data/hexapdf/cmap/README.txt +9 -0
- data/data/hexapdf/cmap/UniCNS-UCS2-H +16992 -0
- data/data/hexapdf/cmap/UniCNS-UCS2-V +90 -0
- data/data/hexapdf/cmap/UniCNS-UTF16-H +19117 -0
- data/data/hexapdf/cmap/UniCNS-UTF16-V +94 -0
- data/data/hexapdf/cmap/UniGB-UCS2-H +14321 -0
- data/data/hexapdf/cmap/UniGB-UCS2-V +101 -0
- data/data/hexapdf/cmap/UniGB-UTF16-H +14381 -0
- data/data/hexapdf/cmap/UniGB-UTF16-V +104 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-H +8870 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-HW-H +81 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-HW-V +279 -0
- data/data/hexapdf/cmap/UniJIS-UCS2-V +275 -0
- data/data/hexapdf/cmap/UniJIS-UTF16-H +14450 -0
- data/data/hexapdf/cmap/UniJIS-UTF16-V +299 -0
- data/data/hexapdf/cmap/UniKS-UCS2-H +8725 -0
- data/data/hexapdf/cmap/UniKS-UCS2-V +95 -0
- data/data/hexapdf/cmap/UniKS-UTF16-H +8895 -0
- data/data/hexapdf/cmap/UniKS-UTF16-V +99 -0
- data/data/hexapdf/cmap/V +105 -0
- data/examples/arc.rb +3 -3
- data/examples/merging.rb +4 -1
- data/examples/optimizing.rb +3 -0
- data/examples/show_char_bboxes.rb +2 -2
- data/examples/truetype.rb +2 -2
- data/lib/hexapdf/cli.rb +40 -1
- data/lib/hexapdf/cli/batch.rb +72 -0
- data/lib/hexapdf/cli/command.rb +112 -15
- data/lib/hexapdf/cli/files.rb +2 -2
- data/lib/hexapdf/cli/images.rb +14 -6
- data/lib/hexapdf/cli/info.rb +6 -8
- data/lib/hexapdf/cli/inspect.rb +5 -8
- data/lib/hexapdf/cli/merge.rb +13 -20
- data/lib/hexapdf/cli/modify.rb +4 -7
- data/lib/hexapdf/cli/optimize.rb +2 -5
- data/lib/hexapdf/configuration.rb +32 -3
- data/lib/hexapdf/content/canvas.rb +130 -37
- data/lib/hexapdf/content/parser.rb +40 -6
- data/lib/hexapdf/content/processor.rb +4 -4
- data/lib/hexapdf/document.rb +40 -10
- data/lib/hexapdf/document/fonts.rb +1 -0
- data/lib/hexapdf/encryption/security_handler.rb +8 -12
- data/lib/hexapdf/filter/flate_decode.rb +25 -2
- data/lib/hexapdf/font/cmap.rb +124 -8
- data/lib/hexapdf/font/cmap/parser.rb +65 -15
- data/lib/hexapdf/font/encoding/base.rb +2 -2
- data/lib/hexapdf/font/encoding/glyph_list.rb +2 -4
- data/lib/hexapdf/font/true_type.rb +1 -0
- data/lib/hexapdf/font/true_type/builder.rb +75 -0
- data/lib/hexapdf/font/true_type/optimizer.rb +65 -0
- data/lib/hexapdf/font/true_type/subsetter.rb +9 -22
- data/lib/hexapdf/font/true_type_wrapper.rb +9 -21
- data/lib/hexapdf/font_loader.rb +1 -1
- data/lib/hexapdf/importer.rb +1 -1
- data/lib/hexapdf/serializer.rb +5 -3
- data/lib/hexapdf/type.rb +2 -0
- data/lib/hexapdf/type/cid_font.rb +120 -0
- data/lib/hexapdf/type/font.rb +32 -12
- data/lib/hexapdf/type/font_simple.rb +34 -42
- data/lib/hexapdf/type/font_type0.rb +148 -0
- data/lib/hexapdf/type/form.rb +4 -4
- data/lib/hexapdf/type/page.rb +12 -11
- data/lib/hexapdf/type/resources.rb +14 -0
- data/lib/hexapdf/utils/graphics_helpers.rb +77 -0
- data/lib/hexapdf/version.rb +1 -1
- data/man/man1/hexapdf.1 +43 -1
- data/test/hexapdf/content/test_canvas.rb +76 -0
- data/test/hexapdf/content/test_parser.rb +20 -1
- data/test/hexapdf/content/test_processor.rb +11 -7
- data/test/hexapdf/document/test_fonts.rb +3 -1
- data/test/hexapdf/font/cmap/test_parser.rb +42 -7
- data/test/hexapdf/font/encoding/test_base.rb +1 -1
- data/test/hexapdf/font/encoding/test_glyph_list.rb +3 -3
- data/test/hexapdf/font/test_cmap.rb +104 -0
- data/test/hexapdf/font/test_true_type_wrapper.rb +63 -46
- data/test/hexapdf/font/true_type/test_builder.rb +37 -0
- data/test/hexapdf/font/true_type/test_optimizer.rb +27 -0
- data/test/hexapdf/font/true_type/test_subsetter.rb +6 -13
- data/test/hexapdf/test_configuration.rb +12 -7
- data/test/hexapdf/test_document.rb +24 -0
- data/test/hexapdf/test_importer.rb +9 -1
- data/test/hexapdf/test_writer.rb +2 -2
- data/test/hexapdf/type/test_cid_font.rb +61 -0
- data/test/hexapdf/type/test_font.rb +31 -4
- data/test/hexapdf/type/test_font_simple.rb +6 -21
- data/test/hexapdf/type/test_font_type0.rb +114 -0
- data/test/hexapdf/type/test_resources.rb +17 -1
- data/test/hexapdf/utils/test_graphics_helpers.rb +29 -0
- metadata +82 -3
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
|
|
34
34
|
require 'stringio'
|
|
35
35
|
require 'hexapdf/tokenizer'
|
|
36
|
+
require 'hexapdf/content/processor'
|
|
36
37
|
|
|
37
38
|
module HexaPDF
|
|
38
39
|
module Content
|
|
@@ -45,6 +46,9 @@ module HexaPDF
|
|
|
45
46
|
# See: PDF1.7 s7.2
|
|
46
47
|
class Tokenizer < HexaPDF::Tokenizer #:nodoc:
|
|
47
48
|
|
|
49
|
+
# The string that is tokenized.
|
|
50
|
+
attr_reader :string
|
|
51
|
+
|
|
48
52
|
# Creates a new tokenizer.
|
|
49
53
|
def initialize(string)
|
|
50
54
|
@ss = StringScanner.new(string)
|
|
@@ -168,6 +172,8 @@ module HexaPDF
|
|
|
168
172
|
|
|
169
173
|
private
|
|
170
174
|
|
|
175
|
+
MAX_TOKEN_CHECK = 5 #:nodoc:
|
|
176
|
+
|
|
171
177
|
# Parses the inline image at the current position.
|
|
172
178
|
def parse_inline_image(tokenizer)
|
|
173
179
|
# BI has already been read, so read the image dictionary
|
|
@@ -190,13 +196,41 @@ module HexaPDF
|
|
|
190
196
|
# one whitespace character after ID
|
|
191
197
|
tokenizer.next_byte
|
|
192
198
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
199
|
+
real_end_found = false
|
|
200
|
+
image_data = ''.b
|
|
201
|
+
|
|
202
|
+
# find the EI operator and handle EI appearing inside the image data
|
|
203
|
+
until real_end_found
|
|
204
|
+
data = tokenizer.scan_until(/(?=EI(?:[#{Tokenizer::WHITESPACE}]|\z))/o)
|
|
205
|
+
if data.nil?
|
|
206
|
+
raise HexaPDF::Error, "End inline image marker EI not found"
|
|
207
|
+
end
|
|
208
|
+
image_data << data
|
|
209
|
+
tokenizer.pos += 2
|
|
210
|
+
last_pos = tokenizer.pos
|
|
211
|
+
|
|
212
|
+
# Check if we found EI inside of the image data
|
|
213
|
+
count = 0
|
|
214
|
+
while count < MAX_TOKEN_CHECK
|
|
215
|
+
token = tokenizer.next_object(allow_keyword: true) rescue break
|
|
216
|
+
if token == Tokenizer::NO_MORE_TOKENS
|
|
217
|
+
count += MAX_TOKEN_CHECK
|
|
218
|
+
elsif token.kind_of?(Tokenizer::Token) &&
|
|
219
|
+
!Processor::OPERATOR_MESSAGE_NAME_MAP.key?(token.to_sym)
|
|
220
|
+
break # invalid token
|
|
221
|
+
end
|
|
222
|
+
count += 1
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
if count >= MAX_TOKEN_CHECK
|
|
226
|
+
real_end_found = true
|
|
227
|
+
else
|
|
228
|
+
image_data << "EI"
|
|
229
|
+
end
|
|
230
|
+
tokenizer.pos = last_pos
|
|
197
231
|
end
|
|
198
|
-
|
|
199
|
-
[dict,
|
|
232
|
+
|
|
233
|
+
[dict, image_data]
|
|
200
234
|
end
|
|
201
235
|
|
|
202
236
|
end
|
|
@@ -408,7 +408,7 @@ module HexaPDF
|
|
|
408
408
|
def decode_horizontal_text(array)
|
|
409
409
|
font = graphics_state.font
|
|
410
410
|
scaled_char_space = graphics_state.scaled_character_spacing
|
|
411
|
-
scaled_word_space = graphics_state.scaled_word_spacing
|
|
411
|
+
scaled_word_space = (font.word_spacing_applicable? ? graphics_state.scaled_word_spacing : 0)
|
|
412
412
|
scaled_font_size = graphics_state.scaled_font_size
|
|
413
413
|
|
|
414
414
|
below_baseline = font.bounding_box[1] * scaled_font_size / \
|
|
@@ -423,15 +423,15 @@ module HexaPDF
|
|
|
423
423
|
else
|
|
424
424
|
font.decode(item).each do |code_point|
|
|
425
425
|
char = font.to_utf8(code_point)
|
|
426
|
-
width = font.width(code_point) * scaled_font_size
|
|
426
|
+
width = font.width(code_point) * scaled_font_size + scaled_char_space + \
|
|
427
|
+
(code_point == 32 ? scaled_word_space : 0)
|
|
427
428
|
matrix = graphics_state.ctm.dup.premultiply(*graphics_state.tm)
|
|
428
429
|
fragment = GlyphBox.new(code_point, char,
|
|
429
430
|
*matrix.evaluate(0, below_baseline),
|
|
430
431
|
*matrix.evaluate(width, below_baseline),
|
|
431
432
|
*matrix.evaluate(0, above_baseline))
|
|
432
433
|
text << fragment
|
|
433
|
-
graphics_state.tm.translate(width
|
|
434
|
-
(char == ' ' ? scaled_word_space : 0), 0)
|
|
434
|
+
graphics_state.tm.translate(width, 0)
|
|
435
435
|
end
|
|
436
436
|
end
|
|
437
437
|
end
|
data/lib/hexapdf/document.rb
CHANGED
|
@@ -135,6 +135,7 @@ module HexaPDF
|
|
|
135
135
|
end
|
|
136
136
|
|
|
137
137
|
@listeners = {}
|
|
138
|
+
@cache = Hash.new {|h, k| h[k] = {} }
|
|
138
139
|
end
|
|
139
140
|
|
|
140
141
|
# :call-seq:
|
|
@@ -315,25 +316,24 @@ module HexaPDF
|
|
|
315
316
|
if type.kind_of?(Class)
|
|
316
317
|
klass = type
|
|
317
318
|
else
|
|
318
|
-
default = if data.stream
|
|
319
|
-
HexaPDF::Stream
|
|
320
|
-
elsif data.value.kind_of?(Hash)
|
|
321
|
-
HexaPDF::Dictionary
|
|
322
|
-
else
|
|
323
|
-
HexaPDF::Object
|
|
324
|
-
end
|
|
325
319
|
if data.value.kind_of?(Hash)
|
|
326
320
|
type ||= deref(data.value[:Type])
|
|
327
321
|
subtype ||= deref(data.value[:Subtype])
|
|
328
322
|
end
|
|
329
323
|
|
|
330
324
|
if subtype
|
|
331
|
-
klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype)
|
|
325
|
+
klass = GlobalConfiguration.constantize('object.subtype_map'.freeze, subtype) { nil }
|
|
332
326
|
end
|
|
333
327
|
if type && !klass
|
|
334
|
-
klass = GlobalConfiguration.constantize('object.type_map'.freeze, type)
|
|
328
|
+
klass = GlobalConfiguration.constantize('object.type_map'.freeze, type) { nil }
|
|
335
329
|
end
|
|
336
|
-
klass ||=
|
|
330
|
+
klass ||= if data.stream
|
|
331
|
+
HexaPDF::Stream
|
|
332
|
+
elsif data.value.kind_of?(Hash)
|
|
333
|
+
HexaPDF::Dictionary
|
|
334
|
+
else
|
|
335
|
+
HexaPDF::Object
|
|
336
|
+
end
|
|
337
337
|
end
|
|
338
338
|
|
|
339
339
|
klass.new(data, document: self)
|
|
@@ -418,6 +418,36 @@ module HexaPDF
|
|
|
418
418
|
@listeners[name] && @listeners[name].each {|obj| obj.call(*args)}
|
|
419
419
|
end
|
|
420
420
|
|
|
421
|
+
# Caches the value or the return value of the given block using the given Object::PDFData and
|
|
422
|
+
# key arguments as composite hash key. If a cached value already exists, it is just returned.
|
|
423
|
+
#
|
|
424
|
+
# This facility can be used to cache expensive operations in PDF objects that are easy to
|
|
425
|
+
# compute again.
|
|
426
|
+
#
|
|
427
|
+
# Use #clear_cache to clear the cache if necessary.
|
|
428
|
+
def cache(pdf_data, key, value = nil)
|
|
429
|
+
@cache[pdf_data][key] ||= value || yield
|
|
430
|
+
end
|
|
431
|
+
|
|
432
|
+
# Returns +true+ if there is a value cached for the composite key consisting of the given
|
|
433
|
+
# +pdf_data+ and +key+ objects.
|
|
434
|
+
#
|
|
435
|
+
# Also see: #cache
|
|
436
|
+
def cached?(pdf_data, key)
|
|
437
|
+
@cache.key?(pdf_data) && @cache[pdf_data].key?(key)
|
|
438
|
+
end
|
|
439
|
+
|
|
440
|
+
# Clears all cached data or, if a Object::PDFData object is given, just the cache for this one
|
|
441
|
+
# object.
|
|
442
|
+
#
|
|
443
|
+
# It is *not* recommended to clear the whole cache! Better clear the cache for individual PDF
|
|
444
|
+
# objects!
|
|
445
|
+
#
|
|
446
|
+
# Also see: #cache
|
|
447
|
+
def clear_cache(pdf_data = nil)
|
|
448
|
+
pdf_data ? @cache[pdf_data].clear : @cache.clear
|
|
449
|
+
end
|
|
450
|
+
|
|
421
451
|
# Returns the Pages object that provides convenience methods for working with pages.
|
|
422
452
|
#
|
|
423
453
|
# Also see: HexaPDF::Type::PageTreeNode
|
|
@@ -55,6 +55,7 @@ module HexaPDF
|
|
|
55
55
|
#
|
|
56
56
|
# If a font with the same parameters has been loaded before, the cached font object is used.
|
|
57
57
|
def load(name, **options)
|
|
58
|
+
options[:variant] ||= :none # assign default value for consistency with caching
|
|
58
59
|
font = @loaded_fonts_cache[[name, options]]
|
|
59
60
|
return font if font
|
|
60
61
|
|
|
@@ -143,12 +143,10 @@ module HexaPDF
|
|
|
143
143
|
#
|
|
144
144
|
# See: #set_up_encryption (for the common encryption options).
|
|
145
145
|
def self.set_up_encryption(document, handler_name, **options)
|
|
146
|
-
handler =
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if handler.nil?
|
|
151
|
-
raise HexaPDF::EncryptionError, "Could not find the specified security handler"
|
|
146
|
+
handler = GlobalConfiguration.constantize('encryption.filter_map', handler_name) do
|
|
147
|
+
GlobalConfiguration.constantize('encryption.sub_filter_map', handler_name) do
|
|
148
|
+
raise HexaPDF::EncryptionError, "Could not find the specified security handler"
|
|
149
|
+
end
|
|
152
150
|
end
|
|
153
151
|
|
|
154
152
|
handler = handler.new(document)
|
|
@@ -172,12 +170,10 @@ module HexaPDF
|
|
|
172
170
|
if dict.nil?
|
|
173
171
|
raise HexaPDF::EncryptionError, "No /Encrypt dictionary found"
|
|
174
172
|
end
|
|
175
|
-
handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter])
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
if handler.nil?
|
|
180
|
-
raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
|
|
173
|
+
handler = HexaPDF::GlobalConfiguration.constantize('encryption.filter_map', dict[:Filter]) do
|
|
174
|
+
HexaPDF::GlobalConfiguration.constantize('encryption.sub_filter_map', dict[:SubFilter]) do
|
|
175
|
+
raise HexaPDF::EncryptionError, "Could not find a suitable security handler"
|
|
176
|
+
end
|
|
181
177
|
end
|
|
182
178
|
|
|
183
179
|
handler = handler.new(document)
|
|
@@ -45,10 +45,33 @@ module HexaPDF
|
|
|
45
45
|
# See: HexaPDF::Filter, PDF1.7 s7.4.4
|
|
46
46
|
module FlateDecode
|
|
47
47
|
|
|
48
|
+
class Pool #:nodoc:
|
|
49
|
+
|
|
50
|
+
# Creates a new Zlib::Stream pool. A block must be given that returns a new Zlib::Stream
|
|
51
|
+
# instance.
|
|
52
|
+
def initialize(&block)
|
|
53
|
+
@creator = block
|
|
54
|
+
@pool = []
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# Returns the next available stream of the pool, already reset to its initial state.
|
|
58
|
+
def next_available
|
|
59
|
+
@pool.find(-> { e = @creator.call; @pool << e; e }, &:finished?).tap(&:reset)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
@inflate_pool = Pool.new { Zlib::Inflate.new }
|
|
65
|
+
@deflate_pool = Pool.new do
|
|
66
|
+
Zlib::Deflate.new(HexaPDF::GlobalConfiguration['filter.flate_compression'],
|
|
67
|
+
Zlib::MAX_WBITS,
|
|
68
|
+
HexaPDF::GlobalConfiguration['filter.flate_memory'])
|
|
69
|
+
end
|
|
70
|
+
|
|
48
71
|
# See HexaPDF::Filter
|
|
49
72
|
def self.decoder(source, options = nil)
|
|
50
73
|
fib = Fiber.new do
|
|
51
|
-
inflater =
|
|
74
|
+
inflater = @inflate_pool.next_available
|
|
52
75
|
while source.alive? && (data = source.resume)
|
|
53
76
|
begin
|
|
54
77
|
data = inflater.inflate(data)
|
|
@@ -78,7 +101,7 @@ module HexaPDF
|
|
|
78
101
|
end
|
|
79
102
|
|
|
80
103
|
Fiber.new do
|
|
81
|
-
deflater =
|
|
104
|
+
deflater = @deflate_pool.next_available
|
|
82
105
|
while source.alive? && (data = source.resume)
|
|
83
106
|
data = deflater.deflate(data)
|
|
84
107
|
Fiber.yield(data)
|
data/lib/hexapdf/font/cmap.rb
CHANGED
|
@@ -31,20 +31,44 @@
|
|
|
31
31
|
# is created or manipulated using HexaPDF.
|
|
32
32
|
#++
|
|
33
33
|
|
|
34
|
+
require 'hexapdf/error'
|
|
35
|
+
require 'hexapdf/data_dir'
|
|
36
|
+
|
|
34
37
|
module HexaPDF
|
|
35
38
|
module Font
|
|
36
39
|
|
|
37
40
|
# Represents a CMap, a mapping from character codes to CIDs (character IDs) or to their Unicode
|
|
38
41
|
# value.
|
|
39
42
|
#
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
# See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Note #5411
|
|
43
|
+
# See: PDF1.7 s9.7.5, s9.10.3; Adobe Technical Notes #5014 and #5411
|
|
43
44
|
class CMap
|
|
44
45
|
|
|
45
46
|
autoload(:Parser, 'hexapdf/font/cmap/parser')
|
|
46
47
|
autoload(:Writer, 'hexapdf/font/cmap/writer')
|
|
47
48
|
|
|
49
|
+
CMAP_DIR = File.join(HexaPDF.data_dir, 'cmap') #:nodoc:
|
|
50
|
+
|
|
51
|
+
@cmap_cache = {}
|
|
52
|
+
|
|
53
|
+
# Returns +true+ if the given name specifies a predefined CMap.
|
|
54
|
+
def self.predefined?(name)
|
|
55
|
+
File.exist?(File.join(CMAP_DIR, name))
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Creates a new CMap object by parsing a predefined CMap with the given name.
|
|
59
|
+
#
|
|
60
|
+
# Raises an error if the given CMap is not found.
|
|
61
|
+
def self.for_name(name)
|
|
62
|
+
return @cmap_cache[name] if @cmap_cache.key?(name)
|
|
63
|
+
|
|
64
|
+
file = File.join(CMAP_DIR, name)
|
|
65
|
+
if File.exist?(file)
|
|
66
|
+
@cmap_cache[name] = parse(File.read(file, encoding: ::Encoding::UTF_8))
|
|
67
|
+
else
|
|
68
|
+
raise HexaPDF::Error, "No CMap named '#{name}' found"
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
48
72
|
# Creates a new CMap object from the given string which needs to contain a valid CMap file.
|
|
49
73
|
def self.parse(string)
|
|
50
74
|
Parser.new.parse(string)
|
|
@@ -58,6 +82,7 @@ module HexaPDF
|
|
|
58
82
|
Writer.new.create_to_unicode_cmap(mapping)
|
|
59
83
|
end
|
|
60
84
|
|
|
85
|
+
|
|
61
86
|
# The registry part of the CMap version.
|
|
62
87
|
attr_accessor :registry
|
|
63
88
|
|
|
@@ -70,16 +95,107 @@ module HexaPDF
|
|
|
70
95
|
# The name of the CMap.
|
|
71
96
|
attr_accessor :name
|
|
72
97
|
|
|
73
|
-
# The
|
|
74
|
-
attr_accessor :
|
|
98
|
+
# The writing mode of the CMap: 0 for horizontal, 1 for vertical writing.
|
|
99
|
+
attr_accessor :wmode
|
|
100
|
+
|
|
101
|
+
attr_reader :codespace_ranges #: nodoc:
|
|
102
|
+
attr_reader :cid_mapping # :nodoc:
|
|
103
|
+
attr_reader :cid_range_mappings # :nodoc:
|
|
104
|
+
attr_reader :unicode_mapping # :nodoc:
|
|
105
|
+
protected :codespace_ranges, :cid_mapping, :cid_range_mappings, :unicode_mapping
|
|
75
106
|
|
|
76
107
|
# Creates a new CMap object.
|
|
77
108
|
def initialize
|
|
78
|
-
@
|
|
109
|
+
@codespace_ranges = []
|
|
110
|
+
@cid_mapping = {}
|
|
111
|
+
@cid_range_mappings = []
|
|
112
|
+
@unicode_mapping = {}
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# Add all mappings from the given CMap to this CMap.
|
|
116
|
+
def use_cmap(cmap)
|
|
117
|
+
@codespace_ranges.concat(cmap.codespace_ranges)
|
|
118
|
+
@cid_mapping.merge!(cmap.cid_mapping)
|
|
119
|
+
@cid_range_mappings.concat(cmap.cid_range_mappings)
|
|
120
|
+
@unicode_mapping.merge!(cmap.unicode_mapping)
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Add a codespace range using an array of ranges for the individual bytes.
|
|
124
|
+
#
|
|
125
|
+
# This means that the first range is checked against the first byte, the second range against
|
|
126
|
+
# the second byte and so on.
|
|
127
|
+
def add_codespace_range(first, *rest)
|
|
128
|
+
@codespace_ranges << [first, rest]
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# Parses the string and returns all character codes.
|
|
132
|
+
#
|
|
133
|
+
# An error is raised if the string contains invalid bytes.
|
|
134
|
+
def read_codes(string)
|
|
135
|
+
codes = []
|
|
136
|
+
bytes = string.each_byte
|
|
137
|
+
|
|
138
|
+
loop do
|
|
139
|
+
byte = bytes.next
|
|
140
|
+
code = 0
|
|
141
|
+
|
|
142
|
+
found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
|
|
143
|
+
next unless first_byte_range.cover?(byte)
|
|
144
|
+
|
|
145
|
+
code = (code << 8) + byte
|
|
146
|
+
valid = rest_ranges.all? do |range|
|
|
147
|
+
begin
|
|
148
|
+
byte = bytes.next
|
|
149
|
+
rescue StopIteration
|
|
150
|
+
raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
|
|
151
|
+
end
|
|
152
|
+
code = (code << 8) + byte
|
|
153
|
+
range.cover?(byte)
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
codes << code if valid
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
unless found
|
|
160
|
+
raise HexaPDF::Error, "Invalid byte while reading codes via CMap: #{byte}"
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
codes
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
# Adds an individual mapping from character code to CID.
|
|
168
|
+
def add_cid_mapping(code, cid)
|
|
169
|
+
@cid_mapping[code] = cid
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
# Adds a CID range, mapping characters codes from +start_code+ to +end_code+ to CIDs starting
|
|
173
|
+
# with +start_cid+.
|
|
174
|
+
def add_cid_range(start_code, end_code, start_cid)
|
|
175
|
+
@cid_range_mappings << [start_code..end_code, start_cid]
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
# Returns the CID for the given character code, or 0 if no mapping was found.
|
|
179
|
+
def to_cid(code)
|
|
180
|
+
cid = @cid_mapping.fetch(code, -1)
|
|
181
|
+
if cid == -1
|
|
182
|
+
@cid_range_mappings.reverse_each do |range, start_cid|
|
|
183
|
+
if range.cover?(code)
|
|
184
|
+
cid = start_cid + code - range.first
|
|
185
|
+
break
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
(cid == -1 ? 0 : cid)
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Adds a mapping from character code to Unicode string in UTF-8 encoding.
|
|
193
|
+
def add_unicode_mapping(code, string)
|
|
194
|
+
@unicode_mapping[code] = string
|
|
79
195
|
end
|
|
80
196
|
|
|
81
|
-
# Returns the Unicode string in UTF-8 encoding for the given character code, or
|
|
82
|
-
#
|
|
197
|
+
# Returns the Unicode string in UTF-8 encoding for the given character code, or +nil+ if no
|
|
198
|
+
# mapping was found.
|
|
83
199
|
def to_unicode(code)
|
|
84
200
|
unicode_mapping[code]
|
|
85
201
|
end
|
|
@@ -41,7 +41,7 @@ module HexaPDF
|
|
|
41
41
|
|
|
42
42
|
# Parses CMap files.
|
|
43
43
|
#
|
|
44
|
-
#
|
|
44
|
+
# See: Adobe Technical Notes #5014 and #5411
|
|
45
45
|
class Parser
|
|
46
46
|
|
|
47
47
|
# Parses the given string and returns a CMap object.
|
|
@@ -54,10 +54,18 @@ module HexaPDF
|
|
|
54
54
|
case token
|
|
55
55
|
when 'beginbfchar'.freeze then parse_bf_char(tokenizer, cmap)
|
|
56
56
|
when 'beginbfrange'.freeze then parse_bf_range(tokenizer, cmap)
|
|
57
|
+
when 'begincidchar'.freeze then parse_cid_char(tokenizer, cmap)
|
|
58
|
+
when 'begincidrange'.freeze then parse_cid_range(tokenizer, cmap)
|
|
59
|
+
when 'begincodespacerange'.freeze then parse_codespace_range(tokenizer, cmap)
|
|
57
60
|
when 'endcmap' then break
|
|
58
61
|
end
|
|
59
62
|
elsif token.kind_of?(Symbol)
|
|
60
|
-
|
|
63
|
+
value = tokenizer.next_token
|
|
64
|
+
if value.kind_of?(HexaPDF::Tokenizer::Token)
|
|
65
|
+
parse_cmap(cmap, token) if value == 'usecmap'.freeze
|
|
66
|
+
else
|
|
67
|
+
parse_dict_mapping(cmap, token, value)
|
|
68
|
+
end
|
|
61
69
|
end
|
|
62
70
|
end
|
|
63
71
|
|
|
@@ -68,17 +76,59 @@ module HexaPDF
|
|
|
68
76
|
|
|
69
77
|
private
|
|
70
78
|
|
|
71
|
-
#
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
return if value.kind_of?(HexaPDF::Tokenizer::Token)
|
|
79
|
+
# Populates the CMap with the values from the CMap with the given name.
|
|
80
|
+
def parse_cmap(cmap, name)
|
|
81
|
+
cmap.use_cmap(CMap.for_name(name.to_s))
|
|
82
|
+
end
|
|
76
83
|
|
|
84
|
+
# Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
|
|
85
|
+
# already been parsed.
|
|
86
|
+
def parse_dict_mapping(cmap, name, value)
|
|
77
87
|
case name
|
|
78
|
-
when :Registry
|
|
79
|
-
|
|
80
|
-
when :
|
|
81
|
-
|
|
88
|
+
when :Registry
|
|
89
|
+
cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
|
|
90
|
+
when :Ordering
|
|
91
|
+
cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
|
|
92
|
+
when :Supplement
|
|
93
|
+
cmap.supplement = value if value.kind_of?(Integer)
|
|
94
|
+
when :CMapName
|
|
95
|
+
cmap.name = value.to_s.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
|
|
96
|
+
when :WMode
|
|
97
|
+
cmap.wmode = value
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Parses the "begincodespacerange" operator at the current position.
|
|
102
|
+
def parse_codespace_range(tokenizer, cmap)
|
|
103
|
+
until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
|
104
|
+
code2 = tokenizer.next_token
|
|
105
|
+
byte_ranges = []
|
|
106
|
+
code1.each_byte.with_index do |byte, index|
|
|
107
|
+
byte_ranges << (byte..(code2.getbyte(index)))
|
|
108
|
+
end
|
|
109
|
+
cmap.add_codespace_range(*byte_ranges)
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# Parses the "cidchar" operator at the current position.
|
|
114
|
+
def parse_cid_char(tokenizer, cmap)
|
|
115
|
+
until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
|
116
|
+
cmap.add_cid_mapping(bytes_to_int(code), tokenizer.next_token)
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Parses the "cidrange" operator at the current position.
|
|
121
|
+
def parse_cid_range(tokenizer, cmap)
|
|
122
|
+
until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
|
123
|
+
code1 = bytes_to_int(code1)
|
|
124
|
+
code2 = bytes_to_int(tokenizer.next_token)
|
|
125
|
+
cid_start = tokenizer.next_object
|
|
126
|
+
|
|
127
|
+
if code1 == code2
|
|
128
|
+
cmap.add_cid_mapping(code1, cid_start)
|
|
129
|
+
else
|
|
130
|
+
cmap.add_cid_range(code1, code2, cid_start)
|
|
131
|
+
end
|
|
82
132
|
end
|
|
83
133
|
end
|
|
84
134
|
|
|
@@ -86,7 +136,7 @@ module HexaPDF
|
|
|
86
136
|
def parse_bf_char(tokenizer, cmap)
|
|
87
137
|
until (code = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
|
|
88
138
|
str = tokenizer.next_token.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
|
|
89
|
-
cmap.
|
|
139
|
+
cmap.add_unicode_mapping(bytes_to_int(code), str)
|
|
90
140
|
end
|
|
91
141
|
end
|
|
92
142
|
|
|
@@ -112,13 +162,13 @@ module HexaPDF
|
|
|
112
162
|
if dest.kind_of?(String)
|
|
113
163
|
codepoint = dest.force_encoding(::Encoding::UTF_16BE).ord
|
|
114
164
|
code1.upto(code2) do |code|
|
|
115
|
-
cmap.
|
|
165
|
+
cmap.add_unicode_mapping(code, '' << codepoint)
|
|
116
166
|
codepoint += 1
|
|
117
167
|
end
|
|
118
168
|
elsif dest.kind_of?(Array)
|
|
119
169
|
code1.upto(code2) do |code|
|
|
120
|
-
|
|
121
|
-
|
|
170
|
+
str = dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
|
|
171
|
+
cmap.add_unicode_mapping(code, str)
|
|
122
172
|
end
|
|
123
173
|
else
|
|
124
174
|
raise HexaPDF::Error, "Invalid bfrange operator in CMap"
|