pdf-reader 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -2
- data/{README → README.rdoc} +27 -47
- data/Rakefile +5 -4
- data/TODO +3 -1
- data/bin/pdf_list_callbacks +1 -5
- data/bin/pdf_object +43 -0
- data/bin/pdf_text +1 -0
- data/lib/pdf/reader.rb +25 -7
- data/lib/pdf/reader/buffer.rb +3 -1
- data/lib/pdf/reader/content.rb +56 -48
- data/lib/pdf/reader/encoding.rb +82 -1088
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/font.rb +4 -3
- data/lib/pdf/reader/parser.rb +1 -0
- data/lib/pdf/reader/print_receiver.rb +19 -0
- data/lib/pdf/reader/xref.rb +12 -0
- metadata +26 -17
- data/lib/pdf/reader/parser.rb.rej +0 -29
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -32,6 +32,48 @@ class PDF::Reader
|
|
32
32
|
|
33
33
|
attr_reader :differences
|
34
34
|
|
35
|
+
def initialize(enc)
|
36
|
+
if enc.kind_of?(Hash)
|
37
|
+
self.differences=enc[:Differences] if enc[:Differences]
|
38
|
+
enc = enc[:Encoding] || enc[:BaseEncoding]
|
39
|
+
elsif enc != nil
|
40
|
+
enc = enc.to_sym
|
41
|
+
end
|
42
|
+
|
43
|
+
case enc
|
44
|
+
when nil then
|
45
|
+
load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
|
46
|
+
@unpack = "C*"
|
47
|
+
when "Identity-H".to_sym then
|
48
|
+
@unpack = "n*"
|
49
|
+
@to_unicode_required = true
|
50
|
+
when :MacRomanEncoding then
|
51
|
+
load_mapping File.dirname(__FILE__) + "/encodings/mac_roman.txt"
|
52
|
+
@unpack = "C*"
|
53
|
+
when :MacExpertEncoding then
|
54
|
+
load_mapping File.dirname(__FILE__) + "/encodings/mac_expert.txt"
|
55
|
+
@unpack = "C*"
|
56
|
+
when :PDFDocEncoding then
|
57
|
+
load_mapping File.dirname(__FILE__) + "/encodings/pdf_doc.txt"
|
58
|
+
@unpack = "C*"
|
59
|
+
when :StandardEncoding then
|
60
|
+
load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
|
61
|
+
@unpack = "C*"
|
62
|
+
when :SymbolEncoding then
|
63
|
+
load_mapping File.dirname(__FILE__) + "/encodings/symbol.txt"
|
64
|
+
@unpack = "C*"
|
65
|
+
when :UTF16Encoding then
|
66
|
+
@unpack = "n*"
|
67
|
+
when :WinAnsiEncoding then
|
68
|
+
load_mapping File.dirname(__FILE__) + "/encodings/win_ansi.txt"
|
69
|
+
@unpack = "C*"
|
70
|
+
when :ZapfDingbatsEncoding then
|
71
|
+
load_mapping File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
|
72
|
+
@unpack = "C*"
|
73
|
+
else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
35
77
|
# set the differences table for this encoding. should be an array in the following format:
|
36
78
|
#
|
37
79
|
# [25, :A, 26, :B]
|
@@ -57,1117 +99,69 @@ class PDF::Reader
|
|
57
99
|
@differences
|
58
100
|
end
|
59
101
|
|
60
|
-
#
|
61
|
-
def
|
62
|
-
if enc.kind_of?(Hash)
|
63
|
-
diff = enc[:Differences]
|
64
|
-
enc = enc[:Encoding] || enc[:BaseEncoding]
|
65
|
-
elsif enc != nil
|
66
|
-
enc = enc.to_sym
|
67
|
-
end
|
102
|
+
# convert the specified string to utf8
|
103
|
+
def to_utf8(str, tounicode = nil)
|
68
104
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
105
|
+
# unpack the raw string into an array of character codes (1 or 2 bytes each, depending on @unpack)
|
106
|
+
array_orig = str.unpack(@unpack)
|
107
|
+
|
108
|
+
# replace any relevant bytes with a glyph name
|
109
|
+
array_orig = process_differences(array_orig)
|
110
|
+
|
111
|
+
# replace any remaining bytes with a unicode codepoint
|
112
|
+
array_enc = []
|
113
|
+
array_orig.each do |num|
|
114
|
+
if tounicode && (code = tounicode.decode(num))
|
115
|
+
array_enc << code
|
116
|
+
elsif tounicode || (tounicode.nil? && @to_unicode_required)
|
117
|
+
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
118
|
+
elsif @mapping && @mapping[num]
|
119
|
+
array_enc << @mapping[num]
|
120
|
+
else
|
121
|
+
array_enc << num
|
122
|
+
end
|
79
123
|
end
|
80
124
|
|
81
|
-
|
125
|
+
# convert any glyph names to unicode codepoints
|
126
|
+
array_enc = process_glyphnames(array_enc)
|
82
127
|
|
83
|
-
|
84
|
-
|
128
|
+
# replace characters that didn't convert to unicode nicely with something valid
|
129
|
+
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
85
130
|
|
86
|
-
|
87
|
-
|
88
|
-
|
131
|
+
# pack all our Unicode codepoints into a UTF-8 string
|
132
|
+
ret = array_enc.pack("U*")
|
133
|
+
|
134
|
+
# set the string's encoding correctly under ruby 1.9+
|
135
|
+
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
136
|
+
|
137
|
+
return ret
|
89
138
|
end
|
90
139
|
|
140
|
+
private
|
141
|
+
|
91
142
|
# accepts an array of byte numbers, and replaces any that have entries in the differences table
|
92
143
|
# with a glyph name
|
93
144
|
def process_differences(arr)
|
94
145
|
@differences ||= {}
|
95
146
|
arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
|
96
147
|
end
|
97
|
-
protected :process_differences
|
98
148
|
|
99
149
|
# accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
|
100
150
|
def process_glyphnames(arr)
|
101
151
|
@differences ||= {}
|
102
152
|
arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
|
103
153
|
end
|
104
|
-
protected :process_glyphnames
|
105
|
-
|
106
|
-
class IdentityH < Encoding
|
107
|
-
def to_utf8(str, tounicode = nil)
|
108
|
-
|
109
|
-
array_enc = []
|
110
|
-
|
111
|
-
# iterate over string, reading it in 2 byte chunks and interpreting those
|
112
|
-
# chunks as ints
|
113
|
-
str.unpack("n*").each do |num|
|
114
|
-
|
115
|
-
# convert the int to a unicode codepoint if possible.
|
116
|
-
# without a ToUnicode CMap, it's impossible to reliably convert this text
|
117
|
-
# to unicode, so just replace each character with a little box. Big smacks
|
118
|
-
# to the PDF producing app.
|
119
|
-
if tounicode && (code = tounicode.decode(num))
|
120
|
-
array_enc << code
|
121
|
-
else
|
122
|
-
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
# replace characters that didn't convert to unicode nicely with something valid
|
127
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
128
|
-
|
129
|
-
# pack all our Unicode codepoints into a UTF-8 string
|
130
|
-
ret = array_enc.pack("U*")
|
131
|
-
|
132
|
-
# set the strings encoding correctly under ruby 1.9+
|
133
|
-
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
134
|
-
|
135
|
-
return ret
|
136
|
-
end
|
137
|
-
end
|
138
|
-
|
139
|
-
class MacExpertEncoding < Encoding
|
140
|
-
# convert a MacExpertEncoding string into UTF-8
|
141
|
-
def to_utf8(str, tounicode = nil)
|
142
|
-
array_expert = str.unpack('C*')
|
143
|
-
array_expert = self.process_differences(array_expert)
|
144
|
-
array_enc = []
|
145
|
-
array_expert.each do |num|
|
146
|
-
if tounicode && (code = tounicode.decode(num))
|
147
|
-
array_enc << code
|
148
|
-
elsif tounicode
|
149
|
-
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
150
|
-
else
|
151
|
-
case num
|
152
|
-
# change necessary characters to equivalent Unicode codepoints
|
153
|
-
when 0x21; array_enc << 0xF721
|
154
|
-
when 0x22; array_enc << 0xF6F8 # Hungarumlautsmall
|
155
|
-
when 0x23; array_enc << 0xF7A2
|
156
|
-
when 0x24; array_enc << 0xF724
|
157
|
-
when 0x25; array_enc << 0xF6E4
|
158
|
-
when 0x26; array_enc << 0xF726
|
159
|
-
when 0x27; array_enc << 0xF7B4
|
160
|
-
when 0x28; array_enc << 0x207D
|
161
|
-
when 0x29; array_enc << 0xF07E
|
162
|
-
when 0x2A; array_enc << 0x2025
|
163
|
-
when 0x2B; array_enc << 0x2024
|
164
|
-
when 0x2F; array_enc << 0x2044
|
165
|
-
when 0x30; array_enc << 0xF730
|
166
|
-
when 0x31; array_enc << 0xF731
|
167
|
-
when 0x32; array_enc << 0xF732
|
168
|
-
when 0x33; array_enc << 0xF733
|
169
|
-
when 0x34; array_enc << 0xF734
|
170
|
-
when 0x35; array_enc << 0xF735
|
171
|
-
when 0x36; array_enc << 0xF736
|
172
|
-
when 0x37; array_enc << 0xF737
|
173
|
-
when 0x38; array_enc << 0xF738
|
174
|
-
when 0x39; array_enc << 0xF739
|
175
|
-
when 0x3D; array_enc << 0xF6DE
|
176
|
-
when 0x3F; array_enc << 0xF73F
|
177
|
-
when 0x44; array_enc << 0xF7F0
|
178
|
-
when 0x47; array_enc << 0x00BC
|
179
|
-
when 0x48; array_enc << 0x00BD
|
180
|
-
when 0x49; array_enc << 0x00BE
|
181
|
-
when 0x4A; array_enc << 0x215B
|
182
|
-
when 0x4B; array_enc << 0x215C
|
183
|
-
when 0x4C; array_enc << 0x215D
|
184
|
-
when 0x4D; array_enc << 0x215E
|
185
|
-
when 0x4E; array_enc << 0x2153
|
186
|
-
when 0x4F; array_enc << 0x2154
|
187
|
-
when 0x56; array_enc << 0xFB00
|
188
|
-
when 0x57; array_enc << 0xFB01
|
189
|
-
when 0x58; array_enc << 0xFB02
|
190
|
-
when 0x59; array_enc << 0xFB03
|
191
|
-
when 0x5A; array_enc << 0xFB04
|
192
|
-
when 0x5B; array_enc << 0x208D
|
193
|
-
when 0x5D; array_enc << 0x208E
|
194
|
-
when 0x5E; array_enc << 0xF6F6
|
195
|
-
when 0x5F; array_enc << 0xF6E5
|
196
|
-
when 0x60; array_enc << 0xF760
|
197
|
-
when 0x61; array_enc << 0xF761
|
198
|
-
when 0x62; array_enc << 0xF762
|
199
|
-
when 0x63; array_enc << 0xF763
|
200
|
-
when 0x64; array_enc << 0xF764
|
201
|
-
when 0x65; array_enc << 0xF765
|
202
|
-
when 0x66; array_enc << 0xF766
|
203
|
-
when 0x67; array_enc << 0xF767
|
204
|
-
when 0x68; array_enc << 0xF768
|
205
|
-
when 0x69; array_enc << 0xF769
|
206
|
-
when 0x6A; array_enc << 0xF76A
|
207
|
-
when 0x6B; array_enc << 0xF76B
|
208
|
-
when 0x6C; array_enc << 0xF76C
|
209
|
-
when 0x6D; array_enc << 0xF76D
|
210
|
-
when 0x6E; array_enc << 0xF76E
|
211
|
-
when 0x6F; array_enc << 0xF76F
|
212
|
-
when 0x70; array_enc << 0xF770
|
213
|
-
when 0x71; array_enc << 0xF771
|
214
|
-
when 0x72; array_enc << 0xF772
|
215
|
-
when 0x73; array_enc << 0xF773
|
216
|
-
when 0x74; array_enc << 0xF774
|
217
|
-
when 0x75; array_enc << 0xF775
|
218
|
-
when 0x76; array_enc << 0xF776
|
219
|
-
when 0x77; array_enc << 0xF777
|
220
|
-
when 0x78; array_enc << 0xF778
|
221
|
-
when 0x79; array_enc << 0xF779
|
222
|
-
when 0x7A; array_enc << 0xF77A
|
223
|
-
when 0x7B; array_enc << 0x20A1
|
224
|
-
when 0x7C; array_enc << 0xF6DC
|
225
|
-
when 0x7D; array_enc << 0xF6DD
|
226
|
-
when 0x7E; array_enc << 0xF6FE
|
227
|
-
when 0x81; array_enc << 0xF6E9
|
228
|
-
when 0x82; array_enc << 0xF6E0
|
229
|
-
when 0x87; array_enc << 0xF7E1 # Acircumflexsmall
|
230
|
-
when 0x88; array_enc << 0xF7E0
|
231
|
-
when 0x89; array_enc << 0xF7E2 # Acutesmall
|
232
|
-
when 0x8A; array_enc << 0xF7E4
|
233
|
-
when 0x8B; array_enc << 0xF7E3
|
234
|
-
when 0x8C; array_enc << 0xF7E5
|
235
|
-
when 0x8D; array_enc << 0xF7E7
|
236
|
-
when 0x8E; array_enc << 0xF7E9
|
237
|
-
when 0x8F; array_enc << 0xF7E8
|
238
|
-
when 0x90; array_enc << 0xF7E4
|
239
|
-
when 0x91; array_enc << 0xF7EB
|
240
|
-
when 0x92; array_enc << 0xF7ED
|
241
|
-
when 0x93; array_enc << 0xF7EC
|
242
|
-
when 0x94; array_enc << 0xF7EE
|
243
|
-
when 0x95; array_enc << 0xF7EF
|
244
|
-
when 0x96; array_enc << 0xF7F1
|
245
|
-
when 0x97; array_enc << 0xF7F3
|
246
|
-
when 0x98; array_enc << 0xF7F2
|
247
|
-
when 0x99; array_enc << 0xF7F4
|
248
|
-
when 0x9A; array_enc << 0xF7F6
|
249
|
-
when 0x9B; array_enc << 0xF7F5
|
250
|
-
when 0x9C; array_enc << 0xF7FA
|
251
|
-
when 0x9D; array_enc << 0xF7F9
|
252
|
-
when 0x9E; array_enc << 0xF7FB
|
253
|
-
when 0x9F; array_enc << 0xF7FC
|
254
|
-
when 0xA1; array_enc << 0x2078
|
255
|
-
when 0xA2; array_enc << 0x2084
|
256
|
-
when 0xA3; array_enc << 0x2083
|
257
|
-
when 0xA4; array_enc << 0x2086
|
258
|
-
when 0xA5; array_enc << 0x2088
|
259
|
-
when 0xA6; array_enc << 0x2087
|
260
|
-
when 0xA7; array_enc << 0xF6FD
|
261
|
-
when 0xA9; array_enc << 0xF6DF
|
262
|
-
when 0xAA; array_enc << 0x2082
|
263
|
-
when 0xAC; array_enc << 0xF7A8
|
264
|
-
when 0xAE; array_enc << 0xF6F5
|
265
|
-
when 0xAF; array_enc << 0xF6F0
|
266
|
-
when 0xB0; array_enc << 0x2085
|
267
|
-
when 0xB2; array_enc << 0xF6E1
|
268
|
-
when 0xB3; array_enc << 0xF6E7
|
269
|
-
when 0xB4; array_enc << 0xF7FD
|
270
|
-
when 0xB6; array_enc << 0xF6E3
|
271
|
-
when 0xB9; array_enc << 0xF7FE
|
272
|
-
when 0xBB; array_enc << 0x2089
|
273
|
-
when 0xBC; array_enc << 0x2080
|
274
|
-
when 0xBD; array_enc << 0xF6FF
|
275
|
-
when 0xBE; array_enc << 0xF7E6 # AEsmall
|
276
|
-
when 0xBF; array_enc << 0xF7F8
|
277
|
-
when 0xC0; array_enc << 0xF7BF
|
278
|
-
when 0xC1; array_enc << 0x2081
|
279
|
-
when 0xC2; array_enc << 0xF6F9
|
280
|
-
when 0xC9; array_enc << 0xF7B8
|
281
|
-
when 0xCF; array_enc << 0xF6FA
|
282
|
-
when 0xD0; array_enc << 0x2012
|
283
|
-
when 0xD1; array_enc << 0xF6E6
|
284
|
-
when 0xD6; array_enc << 0xF7A1
|
285
|
-
when 0xD8; array_enc << 0xF7FF
|
286
|
-
when 0xDA; array_enc << 0x00B9
|
287
|
-
when 0xDB; array_enc << 0x00B2
|
288
|
-
when 0xDC; array_enc << 0x00B3
|
289
|
-
when 0xDD; array_enc << 0x2074
|
290
|
-
when 0xDE; array_enc << 0x2075
|
291
|
-
when 0xDF; array_enc << 0x2076
|
292
|
-
when 0xE0; array_enc << 0x2077
|
293
|
-
when 0xE1; array_enc << 0x2079
|
294
|
-
when 0xE2; array_enc << 0x2070
|
295
|
-
when 0xE4; array_enc << 0xF6EC
|
296
|
-
when 0xE5; array_enc << 0xF6F1
|
297
|
-
when 0xE6; array_enc << 0xF6F3
|
298
|
-
when 0xE9; array_enc << 0xF6ED
|
299
|
-
when 0xEA; array_enc << 0xF6F2
|
300
|
-
when 0xEB; array_enc << 0xF6EB
|
301
|
-
when 0xF1; array_enc << 0xF6EE
|
302
|
-
when 0xF2; array_enc << 0xF6FB
|
303
|
-
when 0xF3; array_enc << 0xF6F4
|
304
|
-
when 0xF4; array_enc << 0xF7AF
|
305
|
-
when 0xF5; array_enc << 0xF6EF
|
306
|
-
when 0xF6; array_enc << 0x207F
|
307
|
-
when 0xF7; array_enc << 0xF6EF
|
308
|
-
when 0xF8; array_enc << 0xF6E2
|
309
|
-
when 0xF9; array_enc << 0xF6E8
|
310
|
-
when 0xFA; array_enc << 0xF6F7
|
311
|
-
when 0xFB; array_enc << 0xF6FC
|
312
|
-
else
|
313
|
-
array_enc << num
|
314
|
-
end
|
315
|
-
end
|
316
|
-
end
|
317
|
-
|
318
|
-
# convert any glyph names to unicode codepoints
|
319
|
-
array_enc = self.process_glyphnames(array_enc)
|
320
|
-
|
321
|
-
# replace characters that didn't convert to unicode nicely with something valid
|
322
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
323
|
-
|
324
|
-
# pack all our Unicode codepoints into a UTF-8 string
|
325
|
-
ret = array_enc.pack("U*")
|
326
|
-
|
327
|
-
# set the strings encoding correctly under ruby 1.9+
|
328
|
-
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
329
|
-
|
330
|
-
return ret
|
331
|
-
end
|
332
|
-
end
|
333
|
-
|
334
|
-
# The default encoding for OSX <= v9
|
335
|
-
# see: http://en.wikipedia.org/wiki/Mac_OS_Roman
|
336
|
-
class MacRomanEncoding < Encoding
|
337
|
-
# convert a MacRomanEncoding string into UTF-8
|
338
|
-
def to_utf8(str, tounicode = nil)
|
339
|
-
# content of this method borrowed from REXML::Encoding.decode_cp1252
|
340
|
-
array_mac = str.unpack('C*')
|
341
|
-
array_mac = self.process_differences(array_mac)
|
342
|
-
array_enc = []
|
343
|
-
array_mac.each do |num|
|
344
|
-
if tounicode && (code = tounicode.decode(num))
|
345
|
-
array_enc << code
|
346
|
-
elsif tounicode
|
347
|
-
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
348
|
-
else
|
349
|
-
case num
|
350
|
-
# change necessary characters to equivalent Unicode codepoints
|
351
|
-
when 0x80; array_enc << 0x00C4
|
352
|
-
when 0x81; array_enc << 0x00C5
|
353
|
-
when 0x82; array_enc << 0x00C7
|
354
|
-
when 0x83; array_enc << 0x00C9
|
355
|
-
when 0x84; array_enc << 0x00D1
|
356
|
-
when 0x85; array_enc << 0x00D6
|
357
|
-
when 0x86; array_enc << 0x00DC
|
358
|
-
when 0x87; array_enc << 0x00E1
|
359
|
-
when 0x88; array_enc << 0x00E0
|
360
|
-
when 0x89; array_enc << 0x00E2
|
361
|
-
when 0x8A; array_enc << 0x00E4
|
362
|
-
when 0x8B; array_enc << 0x00E3
|
363
|
-
when 0x8C; array_enc << 0x00E5
|
364
|
-
when 0x8D; array_enc << 0x00E7
|
365
|
-
when 0x8E; array_enc << 0x00E9
|
366
|
-
when 0x8F; array_enc << 0x00E8
|
367
|
-
when 0x90; array_enc << 0x00EA
|
368
|
-
when 0x91; array_enc << 0x00EB
|
369
|
-
when 0x92; array_enc << 0x00ED
|
370
|
-
when 0x93; array_enc << 0x00EC
|
371
|
-
when 0x94; array_enc << 0x00EE
|
372
|
-
when 0x95; array_enc << 0x00EF
|
373
|
-
when 0x96; array_enc << 0x00F1
|
374
|
-
when 0x97; array_enc << 0x00F3
|
375
|
-
when 0x98; array_enc << 0x00F2
|
376
|
-
when 0x99; array_enc << 0x00F4
|
377
|
-
when 0x9A; array_enc << 0x00F6
|
378
|
-
when 0x9B; array_enc << 0x00F5
|
379
|
-
when 0x9C; array_enc << 0x00FA
|
380
|
-
when 0x9D; array_enc << 0x00F9
|
381
|
-
when 0x9E; array_enc << 0x00FB
|
382
|
-
when 0x9F; array_enc << 0x00FC
|
383
|
-
when 0xA0; array_enc << 0x2020
|
384
|
-
when 0xA1; array_enc << 0x00B0
|
385
|
-
when 0xA2; array_enc << 0x00A2
|
386
|
-
when 0xA3; array_enc << 0x00A3
|
387
|
-
when 0xA4; array_enc << 0x00A7
|
388
|
-
when 0xA5; array_enc << 0x2022
|
389
|
-
when 0xA6; array_enc << 0x00B6
|
390
|
-
when 0xA7; array_enc << 0x00DF
|
391
|
-
when 0xA8; array_enc << 0x00AE
|
392
|
-
when 0xA9; array_enc << 0x00A9
|
393
|
-
when 0xAA; array_enc << 0x2122
|
394
|
-
when 0xAB; array_enc << 0x00B4
|
395
|
-
when 0xAC; array_enc << 0x00A8
|
396
|
-
when 0xAD; array_enc << 0x2260
|
397
|
-
when 0xAE; array_enc << 0x00C6
|
398
|
-
when 0xAF; array_enc << 0x00D8
|
399
|
-
when 0xB0; array_enc << 0x221E
|
400
|
-
when 0xB1; array_enc << 0x00B1
|
401
|
-
when 0xB2; array_enc << 0x2264
|
402
|
-
when 0xB3; array_enc << 0x2265
|
403
|
-
when 0xB4; array_enc << 0x00A5
|
404
|
-
when 0xB5; array_enc << 0x00B5
|
405
|
-
when 0xB6; array_enc << 0x2202
|
406
|
-
when 0xB7; array_enc << 0x2211
|
407
|
-
when 0xB8; array_enc << 0x220F
|
408
|
-
when 0xB9; array_enc << 0x03C0
|
409
|
-
when 0xBA; array_enc << 0x222B
|
410
|
-
when 0xBB; array_enc << 0x00AA
|
411
|
-
when 0xBC; array_enc << 0x00BA
|
412
|
-
when 0xBD; array_enc << 0x03A9
|
413
|
-
when 0xBE; array_enc << 0x00E6
|
414
|
-
when 0xBF; array_enc << 0x00F8
|
415
|
-
when 0xC0; array_enc << 0x00BF
|
416
|
-
when 0xC1; array_enc << 0x00A1
|
417
|
-
when 0xC2; array_enc << 0x00AC
|
418
|
-
when 0xC3; array_enc << 0x221A
|
419
|
-
when 0xC4; array_enc << 0x0192
|
420
|
-
when 0xC5; array_enc << 0x2248
|
421
|
-
when 0xC6; array_enc << 0x2206
|
422
|
-
when 0xC7; array_enc << 0x00AB
|
423
|
-
when 0xC8; array_enc << 0x00BB
|
424
|
-
when 0xC9; array_enc << 0x2026
|
425
|
-
when 0xCA; array_enc << 0x00A0
|
426
|
-
when 0xCB; array_enc << 0x00C0
|
427
|
-
when 0xCC; array_enc << 0x00C3
|
428
|
-
when 0xCD; array_enc << 0x00D5
|
429
|
-
when 0xCE; array_enc << 0x0152
|
430
|
-
when 0xCF; array_enc << 0x0153
|
431
|
-
when 0xD0; array_enc << 0x2013
|
432
|
-
when 0xD1; array_enc << 0x2014
|
433
|
-
when 0xD2; array_enc << 0x201C
|
434
|
-
when 0xD3; array_enc << 0x201D
|
435
|
-
when 0xD4; array_enc << 0x2018
|
436
|
-
when 0xD5; array_enc << 0x2019
|
437
|
-
when 0xD6; array_enc << 0x00F7
|
438
|
-
when 0xD7; array_enc << 0x25CA
|
439
|
-
when 0xD8; array_enc << 0x00FF
|
440
|
-
when 0xD9; array_enc << 0x0178
|
441
|
-
when 0xDA; array_enc << 0x2044
|
442
|
-
when 0xDB; array_enc << 0x20AC
|
443
|
-
when 0xDC; array_enc << 0x2039
|
444
|
-
when 0xDD; array_enc << 0x203A
|
445
|
-
when 0xDE; array_enc << 0xFB01
|
446
|
-
when 0xDF; array_enc << 0xFB02
|
447
|
-
when 0xE0; array_enc << 0x2021
|
448
|
-
when 0xE1; array_enc << 0x00B7
|
449
|
-
when 0xE2; array_enc << 0x201A
|
450
|
-
when 0xE3; array_enc << 0x201E
|
451
|
-
when 0xE4; array_enc << 0x2030
|
452
|
-
when 0xE5; array_enc << 0x00C2
|
453
|
-
when 0xE6; array_enc << 0x00CA
|
454
|
-
when 0xE7; array_enc << 0x00C1
|
455
|
-
when 0xE8; array_enc << 0x00CB
|
456
|
-
when 0xE9; array_enc << 0x00C8
|
457
|
-
when 0xEA; array_enc << 0x00CD
|
458
|
-
when 0xEB; array_enc << 0x00CE
|
459
|
-
when 0xEC; array_enc << 0x00CF
|
460
|
-
when 0xED; array_enc << 0x00CC
|
461
|
-
when 0xEE; array_enc << 0x00D3
|
462
|
-
when 0xEF; array_enc << 0x00D4
|
463
|
-
when 0xF0; array_enc << 0xF8FF
|
464
|
-
when 0xF1; array_enc << 0x00D2
|
465
|
-
when 0xF2; array_enc << 0x00DA
|
466
|
-
when 0xF3; array_enc << 0x00D8
|
467
|
-
when 0xF4; array_enc << 0x00D9
|
468
|
-
when 0xF5; array_enc << 0x0131
|
469
|
-
when 0xF6; array_enc << 0x02C6
|
470
|
-
when 0xF7; array_enc << 0x02DC
|
471
|
-
when 0xF8; array_enc << 0x00AF
|
472
|
-
when 0xF9; array_enc << 0x02D8
|
473
|
-
when 0xFA; array_enc << 0x02D9
|
474
|
-
when 0xFB; array_enc << 0x02DA
|
475
|
-
when 0xFC; array_enc << 0x00B8
|
476
|
-
when 0xFD; array_enc << 0x02DD
|
477
|
-
when 0xFE; array_enc << 0x02DB
|
478
|
-
when 0xFF; array_enc << 0x02C7
|
479
|
-
else
|
480
|
-
array_enc << num
|
481
|
-
end
|
482
|
-
end
|
483
|
-
end
|
484
|
-
|
485
|
-
# convert any glyph names to unicode codepoints
|
486
|
-
array_enc = self.process_glyphnames(array_enc)
|
487
|
-
|
488
|
-
# replace characters that didn't convert to unicode nicely with something valid
|
489
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
490
|
-
|
491
|
-
# pack all our Unicode codepoints into a UTF-8 string
|
492
|
-
ret = array_enc.pack("U*")
|
493
|
-
|
494
|
-
# set the strings encoding correctly under ruby 1.9+
|
495
|
-
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
496
|
-
|
497
|
-
return ret
|
498
|
-
end
|
499
|
-
end
|
500
|
-
|
501
|
-
class PDFDocEncoding < Encoding
|
502
|
-
# convert a PDFDocEncoding string into UTF-8
|
503
|
-
def to_utf8(str, tounicode = nil)
|
504
|
-
array_pdf = str.unpack('C*')
|
505
|
-
array_pdf = self.process_differences(array_pdf)
|
506
|
-
array_enc = []
|
507
|
-
array_pdf.each do |num|
|
508
|
-
if tounicode && (code = tounicode.decode(num))
|
509
|
-
array_enc << code
|
510
|
-
elsif tounicode
|
511
|
-
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
512
|
-
else
|
513
|
-
case num
|
514
|
-
# change necessary characters to equivalent Unicode codepoints
|
515
|
-
when 0x18; array_enc << 0x02D8
|
516
|
-
when 0x19; array_enc << 0x02C7
|
517
|
-
when 0x1A; array_enc << 0x02C6
|
518
|
-
when 0x1B; array_enc << 0x02D9
|
519
|
-
when 0x1C; array_enc << 0x02DD
|
520
|
-
when 0x1D; array_enc << 0x02DB
|
521
|
-
when 0x1E; array_enc << 0x02DA
|
522
|
-
when 0x1F; array_enc << 0x02DC
|
523
|
-
when 0x7F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
|
524
|
-
when 0x80; array_enc << 0x2022
|
525
|
-
when 0x81; array_enc << 0x2020
|
526
|
-
when 0x82; array_enc << 0x2021
|
527
|
-
when 0x83; array_enc << 0x2026
|
528
|
-
when 0x84; array_enc << 0x2014
|
529
|
-
when 0x85; array_enc << 0x2013
|
530
|
-
when 0x86; array_enc << 0x0192
|
531
|
-
when 0x87; array_enc << 0x2044
|
532
|
-
when 0x88; array_enc << 0x2039
|
533
|
-
when 0x89; array_enc << 0x203A
|
534
|
-
when 0x8A; array_enc << 0x2212
|
535
|
-
when 0x8B; array_enc << 0x2030
|
536
|
-
when 0x8C; array_enc << 0x201E
|
537
|
-
when 0x8D; array_enc << 0x201C
|
538
|
-
when 0x8E; array_enc << 0x201D
|
539
|
-
when 0x8F; array_enc << 0x2018
|
540
|
-
when 0x90; array_enc << 0x2019
|
541
|
-
when 0x91; array_enc << 0x201A
|
542
|
-
when 0x92; array_enc << 0x2122
|
543
|
-
when 0x93; array_enc << 0xFB01
|
544
|
-
when 0x94; array_enc << 0xFB02
|
545
|
-
when 0x95; array_enc << 0x0141
|
546
|
-
when 0x96; array_enc << 0x0152
|
547
|
-
when 0x97; array_enc << 0x0160
|
548
|
-
when 0x98; array_enc << 0x0178
|
549
|
-
when 0x99; array_enc << 0x017D
|
550
|
-
when 0x9A; array_enc << 0x0131
|
551
|
-
when 0x9B; array_enc << 0x0142
|
552
|
-
when 0x9C; array_enc << 0x0153
|
553
|
-
when 0x9D; array_enc << 0x0161
|
554
|
-
when 0x9E; array_enc << 0x017E
|
555
|
-
when 0x9F; array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR # Undefined
|
556
|
-
when 0xA0; array_enc << 0x20AC
|
557
|
-
else
|
558
|
-
array_enc << num
|
559
|
-
end
|
560
|
-
end
|
561
|
-
end
|
562
|
-
|
563
|
-
# convert any glyph names to unicode codepoints
|
564
|
-
array_enc = self.process_glyphnames(array_enc)
|
565
|
-
|
566
|
-
# replace characters that didn't convert to unicode nicely with something valid
|
567
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
568
|
-
|
569
|
-
# pack all our Unicode codepoints into a UTF-8 string
|
570
|
-
ret = array_enc.pack("U*")
|
571
|
-
|
572
|
-
# set the strings encoding correctly under ruby 1.9+
|
573
|
-
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
574
|
-
|
575
|
-
return ret
|
576
|
-
end
|
577
|
-
end
|
578
154
|
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
array_enc = []
|
587
|
-
array_std.each do |num|
|
588
|
-
if tounicode && (code = tounicode.decode(num))
|
589
|
-
array_enc << code
|
590
|
-
elsif tounicode
|
591
|
-
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
592
|
-
else
|
593
|
-
case num
|
594
|
-
when 0x27; array_enc << 0x2019
|
595
|
-
when 0x60; array_enc << 0x2018
|
596
|
-
when 0xA4; array_enc << 0x2044
|
597
|
-
when 0xA6; array_enc << 0x0192
|
598
|
-
when 0xA8; array_enc << 0x00A4
|
599
|
-
when 0xA9; array_enc << 0x0027
|
600
|
-
when 0xAA; array_enc << 0x201C
|
601
|
-
when 0xAC; array_enc << 0x2039
|
602
|
-
when 0xAD; array_enc << 0x203A
|
603
|
-
when 0xAE; array_enc << 0xFB01
|
604
|
-
when 0xAF; array_enc << 0xFB02
|
605
|
-
when 0xB1; array_enc << 0x2013
|
606
|
-
when 0xB2; array_enc << 0x2020
|
607
|
-
when 0xB3; array_enc << 0x2021
|
608
|
-
when 0xB4; array_enc << 0x00B7
|
609
|
-
when 0xB7; array_enc << 0x2022
|
610
|
-
when 0xB8; array_enc << 0x201A
|
611
|
-
when 0xB9; array_enc << 0x201E
|
612
|
-
when 0xBA; array_enc << 0x201D
|
613
|
-
when 0xBC; array_enc << 0x2026
|
614
|
-
when 0xBD; array_enc << 0x2030
|
615
|
-
when 0xC1; array_enc << 0x0060
|
616
|
-
when 0xC2; array_enc << 0x00B4
|
617
|
-
when 0xC3; array_enc << 0x02C6
|
618
|
-
when 0xC4; array_enc << 0x02DC
|
619
|
-
when 0xC5; array_enc << 0x00AF
|
620
|
-
when 0xC6; array_enc << 0x02D8
|
621
|
-
when 0xC7; array_enc << 0x02D9
|
622
|
-
when 0xC8; array_enc << 0x00A8
|
623
|
-
when 0xCA; array_enc << 0x02DA
|
624
|
-
when 0xCB; array_enc << 0x00B8
|
625
|
-
when 0xCD; array_enc << 0x02DD
|
626
|
-
when 0xCE; array_enc << 0x02DB
|
627
|
-
when 0xCF; array_enc << 0x02C7
|
628
|
-
when 0xD0; array_enc << 0x2014
|
629
|
-
when 0xE1; array_enc << 0x00C6
|
630
|
-
when 0xE3; array_enc << 0x00AA
|
631
|
-
when 0xE8; array_enc << 0x0141
|
632
|
-
when 0xE9; array_enc << 0x00D8
|
633
|
-
when 0xEA; array_enc << 0x0152
|
634
|
-
when 0xEB; array_enc << 0x00BA
|
635
|
-
when 0xF1; array_enc << 0x00E6
|
636
|
-
when 0xF5; array_enc << 0x0131
|
637
|
-
when 0xF8; array_enc << 0x0142
|
638
|
-
when 0xF9; array_enc << 0x00F8
|
639
|
-
when 0xFA; array_enc << 0x0153
|
640
|
-
when 0xFB; array_enc << 0x00DF
|
641
|
-
else
|
642
|
-
array_enc << num
|
643
|
-
end
|
644
|
-
end
|
155
|
+
def load_mapping(file)
|
156
|
+
@mapping = {}
|
157
|
+
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
158
|
+
File.open(file, mode) do |f|
|
159
|
+
f.each do |l|
|
160
|
+
m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
161
|
+
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
645
162
|
end
|
646
|
-
|
647
|
-
# convert any glyph names to unicode codepoints
|
648
|
-
array_enc = self.process_glyphnames(array_enc)
|
649
|
-
|
650
|
-
# replace characters that didn't convert to unicode nicely with something valid
|
651
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
652
|
-
|
653
|
-
# pack all our Unicode codepoints into a UTF-8 string
|
654
|
-
ret = array_enc.pack("U*")
|
655
|
-
|
656
|
-
# set the strings encoding correctly under ruby 1.9+
|
657
|
-
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
658
|
-
|
659
|
-
return ret
|
660
163
|
end
|
661
164
|
end
|
662
165
|
|
663
|
-
class SymbolEncoding < Encoding
|
664
|
-
# convert a SymbolEncoding string into UTF-8
|
665
|
-
def to_utf8(str, tounicode = nil)
|
666
|
-
array_symbol = str.unpack('C*')
|
667
|
-
array_symbol = self.process_differences(array_symbol)
|
668
|
-
array_enc = []
|
669
|
-
array_symbol.each do |num|
|
670
|
-
if tounicode && (code = tounicode.decode(num))
|
671
|
-
array_enc << code
|
672
|
-
elsif tounicode
|
673
|
-
array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
|
674
|
-
else
|
675
|
-
case num
|
676
|
-
when 0x22; array_enc << 0x2200
|
677
|
-
when 0x24; array_enc << 0x2203
|
678
|
-
when 0x27; array_enc << 0x220B
|
679
|
-
when 0x2A; array_enc << 0x2217
|
680
|
-
when 0x2D; array_enc << 0x2212
|
681
|
-
when 0x40; array_enc << 0x2245
|
682
|
-
when 0x41; array_enc << 0x0391
|
683
|
-
when 0x42; array_enc << 0x0392
|
684
|
-
when 0x43; array_enc << 0x03A7
|
685
|
-
when 0x44; array_enc << 0x0394
|
686
|
-
when 0x45; array_enc << 0x0395
|
687
|
-
when 0x46; array_enc << 0x03A6
|
688
|
-
when 0x47; array_enc << 0x0393
|
689
|
-
when 0x48; array_enc << 0x0397
|
690
|
-
when 0x49; array_enc << 0x0399
|
691
|
-
when 0x4A; array_enc << 0x03D1
|
692
|
-
when 0x4B; array_enc << 0x039A
|
693
|
-
when 0x4C; array_enc << 0x039B
|
694
|
-
when 0x4D; array_enc << 0x039C
|
695
|
-
when 0x4E; array_enc << 0x039D
|
696
|
-
when 0x4F; array_enc << 0x039F
|
697
|
-
when 0x50; array_enc << 0x03A0
|
698
|
-
when 0x51; array_enc << 0x0398
|
699
|
-
when 0x52; array_enc << 0x03A1
|
700
|
-
when 0x53; array_enc << 0x03A3
|
701
|
-
when 0x54; array_enc << 0x03A4
|
702
|
-
when 0x55; array_enc << 0x03A5
|
703
|
-
when 0x56; array_enc << 0x03C2
|
704
|
-
when 0x57; array_enc << 0x03A9
|
705
|
-
when 0x58; array_enc << 0x039E
|
706
|
-
when 0x59; array_enc << 0x03A8
|
707
|
-
when 0x5A; array_enc << 0x0396
|
708
|
-
when 0x5C; array_enc << 0x2234
|
709
|
-
when 0x5E; array_enc << 0x22A5
|
710
|
-
when 0x60; array_enc << 0xF8E5
|
711
|
-
when 0x61; array_enc << 0x03B1
|
712
|
-
when 0x62; array_enc << 0x03B2
|
713
|
-
when 0x63; array_enc << 0x03C7
|
714
|
-
when 0x64; array_enc << 0x03B4
|
715
|
-
when 0x65; array_enc << 0x03B5
|
716
|
-
when 0x66; array_enc << 0x03C6
|
717
|
-
when 0x67; array_enc << 0x03B3
|
718
|
-
when 0x68; array_enc << 0x03B7
|
719
|
-
when 0x69; array_enc << 0x03B9
|
720
|
-
when 0x6A; array_enc << 0x03D5
|
721
|
-
when 0x6B; array_enc << 0x03BA
|
722
|
-
when 0x6C; array_enc << 0x03BB
|
723
|
-
when 0x6D; array_enc << 0x03BC
|
724
|
-
when 0x6E; array_enc << 0x03BD
|
725
|
-
when 0x6F; array_enc << 0x03BF
|
726
|
-
when 0x70; array_enc << 0x03C0
|
727
|
-
when 0x71; array_enc << 0x03B8
|
728
|
-
when 0x72; array_enc << 0x03C1
|
729
|
-
when 0x73; array_enc << 0x03C3
|
730
|
-
when 0x74; array_enc << 0x03C4
|
731
|
-
when 0x75; array_enc << 0x03C5
|
732
|
-
when 0x76; array_enc << 0x03D6
|
733
|
-
when 0x77; array_enc << 0x03C9
|
734
|
-
when 0x78; array_enc << 0x03BE
|
735
|
-
when 0x79; array_enc << 0x03C8
|
736
|
-
when 0x7A; array_enc << 0x03B6
|
737
|
-
when 0x7E; array_enc << 0x223C
|
738
|
-
when 0xA0; array_enc << 0x20AC
|
739
|
-
when 0xA1; array_enc << 0x03D2
|
740
|
-
when 0xA2; array_enc << 0x2032
|
741
|
-
when 0xA3; array_enc << 0x2264
|
742
|
-
when 0xA4; array_enc << 0x2215
|
743
|
-
when 0xA5; array_enc << 0x221E
|
744
|
-
when 0xA6; array_enc << 0x0192
|
745
|
-
when 0xA7; array_enc << 0x2663
|
746
|
-
when 0xA8; array_enc << 0x2666
|
747
|
-
when 0xA9; array_enc << 0x2665
|
748
|
-
when 0xAA; array_enc << 0x2660
|
749
|
-
when 0xAB; array_enc << 0x2194
|
750
|
-
when 0xAC; array_enc << 0x2190
|
751
|
-
when 0xAD; array_enc << 0x2191
|
752
|
-
when 0xAE; array_enc << 0x2192
|
753
|
-
when 0xAF; array_enc << 0x2193
|
754
|
-
when 0xB2; array_enc << 0x2033
|
755
|
-
when 0xB3; array_enc << 0x2265
|
756
|
-
when 0xB4; array_enc << 0x00D7
|
757
|
-
when 0xB5; array_enc << 0x221D
|
758
|
-
when 0xB6; array_enc << 0x2202
|
759
|
-
when 0xB7; array_enc << 0x2022
|
760
|
-
when 0xB8; array_enc << 0x00F7
|
761
|
-
when 0xB9; array_enc << 0x2260
|
762
|
-
when 0xBA; array_enc << 0x2261
|
763
|
-
when 0xBB; array_enc << 0x2248
|
764
|
-
when 0xBC; array_enc << 0x2026
|
765
|
-
when 0xBD; array_enc << 0xF8E6
|
766
|
-
when 0xBE; array_enc << 0xF8E7
|
767
|
-
when 0xBF; array_enc << 0x21B5
|
768
|
-
when 0xC0; array_enc << 0x2135
|
769
|
-
when 0xC1; array_enc << 0x2111
|
770
|
-
when 0xC2; array_enc << 0x211C
|
771
|
-
when 0xC3; array_enc << 0x2118
|
772
|
-
when 0xC4; array_enc << 0x2297
|
773
|
-
when 0xC5; array_enc << 0x2295
|
774
|
-
when 0xC6; array_enc << 0x2205
|
775
|
-
when 0xC7; array_enc << 0x2229
|
776
|
-
when 0xC8; array_enc << 0x222A
|
777
|
-
when 0xC9; array_enc << 0x2283
|
778
|
-
when 0xCA; array_enc << 0x2287
|
779
|
-
when 0xCB; array_enc << 0x2284
|
780
|
-
when 0xCC; array_enc << 0x2282
|
781
|
-
when 0xCD; array_enc << 0x2286
|
782
|
-
when 0xCE; array_enc << 0x2208
|
783
|
-
when 0xCF; array_enc << 0x2209
|
784
|
-
when 0xD0; array_enc << 0x2220
|
785
|
-
when 0xD1; array_enc << 0x2207
|
786
|
-
when 0xD2; array_enc << 0xF6DA
|
787
|
-
when 0xD3; array_enc << 0xF6D9
|
788
|
-
when 0xD4; array_enc << 0xF6DB
|
789
|
-
when 0xD5; array_enc << 0x220F
|
790
|
-
when 0xD6; array_enc << 0x221A
|
791
|
-
when 0xD7; array_enc << 0x22C5
|
792
|
-
when 0xD8; array_enc << 0x00AC
|
793
|
-
when 0xD9; array_enc << 0x2227
|
794
|
-
when 0xDA; array_enc << 0x2228
|
795
|
-
when 0xDB; array_enc << 0x21D4
|
796
|
-
when 0xDC; array_enc << 0x21D0
|
797
|
-
when 0xDD; array_enc << 0x21D1
|
798
|
-
when 0xDE; array_enc << 0x21D2
|
799
|
-
when 0xDF; array_enc << 0x21D3
|
800
|
-
when 0xE0; array_enc << 0x25CA
|
801
|
-
when 0xE1; array_enc << 0x2329
|
802
|
-
when 0xE2; array_enc << 0xF8E8
|
803
|
-
when 0xE3; array_enc << 0xF8E9
|
804
|
-
when 0xE4; array_enc << 0xF8EA
|
805
|
-
when 0xE5; array_enc << 0x2211
|
806
|
-
when 0xE6; array_enc << 0xF8EB
|
807
|
-
when 0xE7; array_enc << 0xF8EC
|
808
|
-
when 0xE8; array_enc << 0xF8ED
|
809
|
-
when 0xE9; array_enc << 0xF8EE
|
810
|
-
when 0xEA; array_enc << 0xF8EF
|
811
|
-
when 0xEB; array_enc << 0xF8F0
|
812
|
-
when 0xEC; array_enc << 0xF8F1
|
813
|
-
when 0xED; array_enc << 0xF8F2
|
814
|
-
when 0xEE; array_enc << 0xF8F3
|
815
|
-
when 0xEF; array_enc << 0xF8F4
|
816
|
-
when 0xF1; array_enc << 0x232A
|
817
|
-
when 0xF2; array_enc << 0x222B
|
818
|
-
when 0xF3; array_enc << 0x2320
|
819
|
-
when 0xF4; array_enc << 0xF8F5
|
820
|
-
when 0xF5; array_enc << 0x2321
|
821
|
-
when 0xF6; array_enc << 0xF8F6
|
822
|
-
when 0xF7; array_enc << 0xF8F7
|
823
|
-
when 0xF8; array_enc << 0xF8F8
|
824
|
-
when 0xF9; array_enc << 0xF8F9
|
825
|
-
when 0xFA; array_enc << 0xF8FA
|
826
|
-
when 0xFB; array_enc << 0xF8FB
|
827
|
-
when 0xFC; array_enc << 0xF8FC
|
828
|
-
when 0xFD; array_enc << 0xF8FD
|
829
|
-
when 0xFE; array_enc << 0xF8FE
|
830
|
-
else
|
831
|
-
array_enc << num
|
832
|
-
end
|
833
|
-
end
|
834
|
-
end
|
835
|
-
|
836
|
-
# replace characters that didn't convert to unicode nicely with something valid
|
837
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
838
|
-
|
839
|
-
# convert any glyph names to unicode codepoints
|
840
|
-
array_enc = self.process_glyphnames(array_enc)
|
841
|
-
|
842
|
-
# pack all our Unicode codepoints into a UTF-8 string
|
843
|
-
ret = array_enc.pack("U*")
|
844
|
-
|
845
|
-
# set the strings encoding correctly under ruby 1.9+
|
846
|
-
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
847
|
-
|
848
|
-
return ret
|
849
|
-
end
|
850
|
-
end
|
851
|
-
|
852
|
-
class UTF16Encoding < Encoding
  # Convert a big-endian UTF-16 string into UTF-8.
  #
  # str       - the raw UTF-16BE bytes, optionally starting with a Byte Order
  #             Mark (0xFE 0xFF).
  # tounicode - accepted so the signature matches the other encodings'
  #             to_utf8; it is not used by this method.
  #
  # Returns a String holding the UTF-8 encoded text. On ruby 1.9+ the string
  # is also tagged as UTF-8.
  def to_utf8(str, tounicode = nil)
    # Work on 16 bit code units rather than on string slices. Comparing
    # slices against a "\376\377" literal depends on how the two strings are
    # tagged under ruby 1.9+, whereas unpack always looks at the raw bytes.
    # A trailing odd byte is ignored by unpack.
    units = str.unpack("n*")

    # remove the UTF-16 Byte Order Mark if it exists
    units.shift if units.first == 0xFEFF

    codepoints = []
    idx = 0
    while idx < units.size
      unit   = units[idx]
      follow = units[idx + 1]

      if unit.between?(0xD800, 0xDBFF) && follow && follow.between?(0xDC00, 0xDFFF)
        # a high surrogate followed by a low surrogate encodes one codepoint
        # above U+FFFF; packing the two halves separately would emit bytes
        # that are not valid UTF-8
        codepoints << (0x10000 + ((unit - 0xD800) << 10) + (follow - 0xDC00))
        idx += 2
      elsif unit.between?(0xD800, 0xDFFF)
        # an unpaired surrogate has no UTF-8 representation, so swap in the
        # same placeholder the other encodings use for unconvertible input
        codepoints << PDF::Reader::Encoding::UNKNOWN_CHAR
        idx += 1
      else
        codepoints << unit
        idx += 1
      end
    end

    # pack all our Unicode codepoints into a UTF-8 string
    ret = codepoints.pack("U*")

    # set the strings encoding correctly under ruby 1.9+
    ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

    return ret
  end
end
|
868
|
-
|
869
|
-
class WinAnsiEncoding < Encoding
  # Bytes whose Unicode codepoint differs from their own value, i.e. the
  # characters Windows-1252 adds compared to iso-8859-1. Every byte that is
  # not listed here maps to the codepoint with the same number.
  #
  # content borrowed from REXML::Encoding.decode_cp1252, for further reading:
  #   http://www.intertwingly.net/stories/2004/04/14/i18n.html
  CP1252_TO_UNICODE = {
    0x80 => 0x20AC, # EURO SIGN
    0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
    0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
    0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
    0x85 => 0x2026, # HORIZONTAL ELLIPSIS
    0x86 => 0x2020, # DAGGER
    0x87 => 0x2021, # DOUBLE DAGGER
    0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89 => 0x2030, # PER MILLE SIGN
    0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
    0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
    0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
    0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
    0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
    0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
    0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
    0x95 => 0x2022, # BULLET
    0x96 => 0x2013, # EN DASH
    0x97 => 0x2014, # EM DASH
    0x98 => 0x02DC, # SMALL TILDE
    0x99 => 0x2122, # TRADE MARK SIGN
    0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
    0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C => 0x0153, # LATIN SMALL LIGATURE OE (was 0x0152, a copy of 0x8C)
    0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
    0x9F => 0x0178  # LATIN CAPITAL LETTER Y WITH DIAERESIS
  }.freeze

  # Convert a WinAnsiEncoding string into UTF-8.
  #
  # str       - the raw single byte encoded string.
  # tounicode - optional object responding to decode(code). When given it is
  #             the only source of codepoints: a code it cannot decode
  #             becomes PDF::Reader::Encoding::UNKNOWN_CHAR and the built-in
  #             table is not consulted.
  #
  # Returns a String holding the UTF-8 encoded text. On ruby 1.9+ the string
  # is also tagged as UTF-8.
  def to_utf8(str, tounicode = nil)
    # process_differences is inherited; it is applied to the byte values
    # before any table lookup happens
    codes = self.process_differences(str.unpack('C*'))

    array_enc = codes.map do |num|
      if tounicode
        tounicode.decode(num) || PDF::Reader::Encoding::UNKNOWN_CHAR
      else
        # anything that is not a remapped byte passes through untouched
        CP1252_TO_UNICODE.fetch(num, num)
      end
    end

    # convert any glyph names to unicode codepoints
    array_enc = self.process_glyphnames(array_enc)

    # replace characters that didn't convert to unicode nicely with something valid
    array_enc.collect! { |c| c || PDF::Reader::Encoding::UNKNOWN_CHAR }

    # pack all our Unicode codepoints into a UTF-8 string
    ret = array_enc.pack("U*")

    # set the strings encoding correctly under ruby 1.9+
    ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

    return ret
  end
end
|
934
|
-
|
935
|
-
class ZapfDingbatsEncoding < Encoding
  # Byte to Unicode codepoint table for the ZapfDingbats font.
  #
  # mapping to unicode taken from:
  #   http://unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
  #
  # Most glyphs sit in contiguous runs, so the runs are generated and only
  # the exceptions are spelled out. Bytes that are absent from the table
  # (0x00-0x20, 0x7F, 0x8E-0xA0, 0xF0, 0xFF) map to the codepoint with the
  # same number.
  ZAPF_TO_UNICODE = begin
    table = {}

    (0x21..0x7E).each { |byte| table[byte] = 0x2700 + (byte - 0x20) }
    (0x80..0x8D).each { |byte| table[byte] = 0xF8D7 + (byte - 0x80) } # private use area
    (0xA1..0xA7).each { |byte| table[byte] = 0x2761 + (byte - 0xA1) }
    (0xAC..0xB5).each { |byte| table[byte] = 0x2460 + (byte - 0xAC) } # circled digits
    (0xB6..0xEF).each { |byte| table[byte] = 0x2776 + (byte - 0xB6) }
    (0xF1..0xFE).each { |byte| table[byte] = 0x27B1 + (byte - 0xF1) }

    # Glyphs that live outside the runs above. The previous hand written
    # table mapped 0x73-0x75, 0x77 and 0xD5-0xD7 to the value the run would
    # give them, and repeated 0x271E for 0x3F.
    table.update(
      0x25 => 0x260E, # BLACK TELEPHONE
      0x2A => 0x261B, # BLACK RIGHT POINTING INDEX
      0x2B => 0x261E, # WHITE RIGHT POINTING INDEX
      0x48 => 0x2605, # BLACK STAR
      0x6C => 0x25CF, # BLACK CIRCLE
      0x6E => 0x25A0, # BLACK SQUARE
      0x73 => 0x25B2, # BLACK UP-POINTING TRIANGLE
      0x74 => 0x25BC, # BLACK DOWN-POINTING TRIANGLE
      0x75 => 0x25C6, # BLACK DIAMOND
      0x77 => 0x25D7, # RIGHT HALF BLACK CIRCLE
      0xA8 => 0x2663, # BLACK CLUB SUIT
      0xA9 => 0x2666, # BLACK DIAMOND SUIT
      0xAA => 0x2665, # BLACK HEART SUIT
      0xAB => 0x2660, # BLACK SPADE SUIT
      0xD5 => 0x2192, # RIGHTWARDS ARROW
      0xD6 => 0x2194, # LEFT RIGHT ARROW
      0xD7 => 0x2195  # UP DOWN ARROW
    )

    table.freeze
  end

  # Convert a ZapfDingbatsEncoding string into UTF-8.
  #
  # str       - the raw single byte encoded string.
  # tounicode - optional object responding to decode(code). When given it is
  #             the only source of codepoints: a code it cannot decode
  #             becomes PDF::Reader::Encoding::UNKNOWN_CHAR and the built-in
  #             table is not consulted.
  #
  # Returns a String holding the UTF-8 encoded text. On ruby 1.9+ the string
  # is also tagged as UTF-8.
  def to_utf8(str, tounicode = nil)
    # process_differences is inherited; it is applied to the byte values
    # before any table lookup happens
    codes = self.process_differences(str.unpack('C*'))

    array_enc = codes.map do |num|
      if tounicode
        tounicode.decode(num) || PDF::Reader::Encoding::UNKNOWN_CHAR
      else
        # anything that is not a remapped byte passes through untouched
        ZAPF_TO_UNICODE.fetch(num, num)
      end
    end

    # convert any glyph names to unicode codepoints
    array_enc = self.process_glyphnames(array_enc)

    # replace characters that didn't convert to unicode nicely with something valid
    array_enc.collect! { |c| c || PDF::Reader::Encoding::UNKNOWN_CHAR }

    # pack all our Unicode codepoints into a UTF-8 string
    ret = array_enc.pack("U*")

    # set the strings encoding correctly under ruby 1.9+
    ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

    return ret
  end
end
|
1172
166
|
end
|
1173
167
|
end
|