pdf-reader 1.1.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +87 -2
- data/{README.rdoc → README.md} +43 -31
- data/Rakefile +21 -16
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -3
- data/examples/callbacks.rb +2 -1
- data/examples/extract_images.rb +11 -6
- data/examples/fuzzy_paragraphs.rb +24 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +90 -63
- data/lib/pdf/reader/cid_widths.rb +63 -0
- data/lib/pdf/reader/cmap.rb +69 -38
- data/lib/pdf/reader/encoding.rb +74 -48
- data/lib/pdf/reader/error.rb +24 -4
- data/lib/pdf/reader/filter/ascii85.rb +28 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
- data/lib/pdf/reader/filter/depredict.rb +141 -0
- data/lib/pdf/reader/filter/flate.rb +53 -0
- data/lib/pdf/reader/filter/lzw.rb +21 -0
- data/lib/pdf/reader/filter/null.rb +18 -0
- data/lib/pdf/reader/filter/run_length.rb +45 -0
- data/lib/pdf/reader/filter.rb +15 -234
- data/lib/pdf/reader/font.rb +107 -43
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +26 -4
- data/lib/pdf/reader/glyph_hash.rb +56 -18
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +40 -16
- data/lib/pdf/reader/object_hash.rb +94 -40
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +34 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +48 -3
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +185 -70
- data/lib/pdf/reader/page_text_receiver.rb +70 -20
- data/lib/pdf/reader/pages_strategy.rb +4 -293
- data/lib/pdf/reader/parser.rb +37 -61
- data/lib/pdf/reader/print_receiver.rb +6 -0
- data/lib/pdf/reader/reference.rb +4 -1
- data/lib/pdf/reader/register_receiver.rb +17 -31
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +82 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +5 -2
- data/lib/pdf/reader/synchronized_cache.rb +33 -0
- data/lib/pdf/reader/text_run.rb +99 -0
- data/lib/pdf/reader/token.rb +4 -1
- data/lib/pdf/reader/transformation_matrix.rb +195 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
- data/lib/pdf/reader/width_calculator/composite.rb +28 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
- data/lib/pdf/reader/width_calculator.rb +12 -0
- data/lib/pdf/reader/xref.rb +41 -9
- data/lib/pdf/reader.rb +45 -104
- data/lib/pdf-reader.rb +4 -1
- metadata +220 -101
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -15
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -264
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
|
@@ -23,41 +26,29 @@
|
|
23
26
|
#
|
24
27
|
################################################################################
|
25
28
|
|
29
|
+
require 'pdf/reader/width_calculator'
|
30
|
+
|
26
31
|
class PDF::Reader
|
32
|
+
# Represents a single font PDF object and provides some useful methods
|
33
|
+
# for extracting info. Mainly used for converting text to UTF-8.
|
34
|
+
#
|
27
35
|
class Font
|
28
|
-
attr_accessor :
|
29
|
-
attr_reader :widths, :first_char, :
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(ohash
|
33
|
-
if ohash.nil? || obj.nil?
|
34
|
-
$stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
|
35
|
-
return
|
36
|
-
end
|
36
|
+
attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
|
37
|
+
attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
|
38
|
+
:cid_widths, :cid_default_width
|
39
|
+
|
40
|
+
def initialize(ohash, obj)
|
37
41
|
@ohash = ohash
|
38
42
|
@tounicode = nil
|
39
43
|
|
40
44
|
extract_base_info(obj)
|
41
45
|
extract_descriptor(obj)
|
42
46
|
extract_descendants(obj)
|
47
|
+
@width_calc = build_width_calculator
|
43
48
|
|
44
49
|
@encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
|
45
50
|
end
|
46
51
|
|
47
|
-
def basefont=(font)
|
48
|
-
# setup a default encoding for the selected font. It can always be overridden
|
49
|
-
# with encoding= if required
|
50
|
-
case font
|
51
|
-
when "Symbol" then
|
52
|
-
@encoding = PDF::Reader::Encoding.new("SymbolEncoding")
|
53
|
-
when "ZapfDingbats" then
|
54
|
-
@encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
|
55
|
-
else
|
56
|
-
@encoding = nil
|
57
|
-
end
|
58
|
-
@basefont = font
|
59
|
-
end
|
60
|
-
|
61
52
|
def to_utf8(params)
|
62
53
|
if @tounicode
|
63
54
|
to_utf8_via_cmap(params)
|
@@ -66,39 +57,102 @@ class PDF::Reader
|
|
66
57
|
end
|
67
58
|
end
|
68
59
|
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
60
|
+
def unpack(data)
|
61
|
+
data.unpack(encoding.unpack)
|
62
|
+
end
|
63
|
+
|
64
|
+
# looks up the specified codepoint and returns a value that is in (pdf)
|
65
|
+
# glyph space, which is 1000 glyph units = 1 text space unit
|
66
|
+
def glyph_width(code_point)
|
67
|
+
if code_point.is_a?(String)
|
68
|
+
code_point = code_point.unpack(encoding.unpack).first
|
69
|
+
end
|
70
|
+
|
71
|
+
@cached_widths ||= {}
|
72
|
+
@cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
|
73
73
|
end
|
74
74
|
|
75
75
|
private
|
76
76
|
|
77
|
+
def default_encoding(font_name)
|
78
|
+
case font_name.to_s
|
79
|
+
when "Symbol" then
|
80
|
+
PDF::Reader::Encoding.new(:SymbolEncoding)
|
81
|
+
when "ZapfDingbats" then
|
82
|
+
PDF::Reader::Encoding.new(:ZapfDingbatsEncoding)
|
83
|
+
else
|
84
|
+
PDF::Reader::Encoding.new(:StandardEncoding)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def build_width_calculator
|
89
|
+
if @subtype == :Type0
|
90
|
+
PDF::Reader::WidthCalculator::TypeZero.new(self)
|
91
|
+
elsif @subtype == :Type1
|
92
|
+
if @font_descriptor.nil?
|
93
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
94
|
+
else
|
95
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree .new(self)
|
96
|
+
end
|
97
|
+
elsif @subtype == :Type3
|
98
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
99
|
+
elsif @subtype == :TrueType
|
100
|
+
if @font_descriptor
|
101
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
102
|
+
else
|
103
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
104
|
+
# local system and fallback to a substitute. For now, we go straight to a substitute
|
105
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
106
|
+
end
|
107
|
+
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
108
|
+
PDF::Reader::WidthCalculator::Composite.new(self)
|
109
|
+
else
|
110
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
77
114
|
def extract_base_info(obj)
|
78
115
|
@subtype = @ohash.object(obj[:Subtype])
|
79
116
|
@basefont = @ohash.object(obj[:BaseFont])
|
80
|
-
|
117
|
+
if @ohash.object(obj[:Encoding])
|
118
|
+
@encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
|
119
|
+
else
|
120
|
+
@encoding = default_encoding(@basefont)
|
121
|
+
end
|
81
122
|
@widths = @ohash.object(obj[:Widths]) || []
|
82
123
|
@first_char = @ohash.object(obj[:FirstChar])
|
124
|
+
@last_char = @ohash.object(obj[:LastChar])
|
125
|
+
|
126
|
+
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
127
|
+
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
128
|
+
@cid_widths = @ohash.object(obj[:W]) || []
|
129
|
+
@cid_default_width = @ohash.object(obj[:DW]) || 1000
|
130
|
+
|
83
131
|
if obj[:ToUnicode]
|
132
|
+
# ToUnicode is optional for Type1 and Type3
|
84
133
|
stream = @ohash.object(obj[:ToUnicode])
|
85
|
-
|
134
|
+
if stream.is_a?(PDF::Reader::Stream)
|
135
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
136
|
+
end
|
86
137
|
end
|
87
138
|
end
|
88
139
|
|
89
140
|
def extract_descriptor(obj)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
141
|
+
if obj[:FontDescriptor]
|
142
|
+
# create a font descriptor object if we can, in other words, unless this is
|
143
|
+
# a CID Font
|
144
|
+
fd = @ohash.object(obj[:FontDescriptor])
|
145
|
+
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
146
|
+
else
|
147
|
+
@font_descriptor = nil
|
148
|
+
end
|
97
149
|
end
|
98
150
|
|
99
151
|
def extract_descendants(obj)
|
100
152
|
return unless obj[:DescendantFonts]
|
101
|
-
|
153
|
+
# per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
|
154
|
+
# A one-element array specifying the CIDFont dictionary that is the
|
155
|
+
# descendant of this Type 0 font.
|
102
156
|
descendants = @ohash.object(obj[:DescendantFonts])
|
103
157
|
@descendantfonts = descendants.map { |desc|
|
104
158
|
PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
@@ -106,11 +160,16 @@ class PDF::Reader
|
|
106
160
|
end
|
107
161
|
|
108
162
|
def to_utf8_via_cmap(params)
|
109
|
-
|
163
|
+
case params
|
164
|
+
when Integer
|
165
|
+
[
|
166
|
+
@tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
167
|
+
].flatten.pack("U*")
|
168
|
+
when String
|
110
169
|
params.unpack(encoding.unpack).map { |c|
|
111
170
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
|
-
}.pack("U*")
|
113
|
-
|
171
|
+
}.flatten.pack("U*")
|
172
|
+
when Array
|
114
173
|
params.collect { |param| to_utf8_via_cmap(param) }
|
115
174
|
else
|
116
175
|
params
|
@@ -118,11 +177,16 @@ class PDF::Reader
|
|
118
177
|
end
|
119
178
|
|
120
179
|
def to_utf8_via_encoding(params)
|
121
|
-
|
180
|
+
if encoding.kind_of?(String)
|
181
|
+
raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
|
182
|
+
end
|
122
183
|
|
123
|
-
|
184
|
+
case params
|
185
|
+
when Integer
|
186
|
+
encoding.int_to_utf8_string(params)
|
187
|
+
when String
|
124
188
|
encoding.to_utf8(params)
|
125
|
-
|
189
|
+
when Array
|
126
190
|
params.collect { |param| to_utf8_via_encoding(param) }
|
127
191
|
else
|
128
192
|
params
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'ttfunk'
|
5
|
+
|
6
|
+
class PDF::Reader
|
7
|
+
|
8
|
+
# Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
|
9
|
+
class FontDescriptor
|
10
|
+
|
11
|
+
attr_reader :font_name, :font_family, :font_stretch, :font_weight,
|
12
|
+
:font_bounding_box, :cap_height, :ascent, :descent, :leading,
|
13
|
+
:avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
|
14
|
+
:x_height, :font_flags
|
15
|
+
|
16
|
+
def initialize(ohash, fd_hash)
|
17
|
+
@ascent = ohash.object(fd_hash[:Ascent]) || 0
|
18
|
+
@descent = ohash.object(fd_hash[:Descent]) || 0
|
19
|
+
@missing_width = ohash.object(fd_hash[:MissingWidth]) || 0
|
20
|
+
@font_bounding_box = ohash.object(fd_hash[:FontBBox]) || [0,0,0,0]
|
21
|
+
@avg_width = ohash.object(fd_hash[:AvgWidth]) || 0
|
22
|
+
@cap_height = ohash.object(fd_hash[:CapHeight]) || 0
|
23
|
+
@font_flags = ohash.object(fd_hash[:Flags]) || 0
|
24
|
+
@italic_angle = ohash.object(fd_hash[:ItalicAngle])
|
25
|
+
@font_name = ohash.object(fd_hash[:FontName]).to_s
|
26
|
+
@leading = ohash.object(fd_hash[:Leading]) || 0
|
27
|
+
@max_width = ohash.object(fd_hash[:MaxWidth]) || 0
|
28
|
+
@stem_v = ohash.object(fd_hash[:StemV])
|
29
|
+
@x_height = ohash.object(fd_hash[:XHeight])
|
30
|
+
@font_stretch = ohash.object(fd_hash[:FontStretch]) || :Normal
|
31
|
+
@font_weight = ohash.object(fd_hash[:FontWeight]) || 400
|
32
|
+
@font_family = ohash.object(fd_hash[:FontFamily])
|
33
|
+
|
34
|
+
# A FontDescriptor may have an embedded font program in FontFile
|
35
|
+
# (Type 1 Font Program), FontFile2 (TrueType font program), or
|
36
|
+
# FontFile3 (Other font program as defined by Subtype entry)
|
37
|
+
# Subtype entries:
|
38
|
+
# 1) Type1C: Type 1 Font Program in Compact Font Format
|
39
|
+
# 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
|
40
|
+
# 3) OpenType: OpenType Font Program
|
41
|
+
# see Section 9.9, PDF 32000-1:2008, pp 288-292
|
42
|
+
@font_program_stream = ohash.object(fd_hash[:FontFile2])
|
43
|
+
#TODO handle FontFile and FontFile3
|
44
|
+
|
45
|
+
@is_ttf = true if @font_program_stream
|
46
|
+
end
|
47
|
+
|
48
|
+
def glyph_width(char_code)
|
49
|
+
if @is_ttf
|
50
|
+
if ttf_program_stream.cmap.unicode.length > 0
|
51
|
+
glyph_id = ttf_program_stream.cmap.unicode.first[char_code]
|
52
|
+
else
|
53
|
+
glyph_id = char_code
|
54
|
+
end
|
55
|
+
char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
|
56
|
+
if char_metric
|
57
|
+
return char_metric.advance_width
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# PDF states that a glyph is 1000 units wide, true type doesn't enforce
|
63
|
+
# any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
|
64
|
+
def glyph_to_pdf_scale_factor
|
65
|
+
if @is_ttf
|
66
|
+
@glyph_to_pdf_sf ||= (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
|
67
|
+
else
|
68
|
+
@glyph_to_pdf_sf ||= 1.0
|
69
|
+
end
|
70
|
+
@glyph_to_pdf_sf
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def ttf_program_stream
|
76
|
+
@ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
@@ -1,4 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'digest/md5'
|
2
5
|
|
3
6
|
module PDF
|
4
7
|
class Reader
|
@@ -15,9 +18,10 @@ module PDF
|
|
15
18
|
|
16
19
|
attr_reader :xobject
|
17
20
|
|
18
|
-
def initialize(page, xobject)
|
21
|
+
def initialize(page, xobject, options = {})
|
19
22
|
@page = page
|
20
23
|
@objects = page.objects
|
24
|
+
@cache = options[:cache] || {}
|
21
25
|
@xobject = @objects.deref(xobject)
|
22
26
|
end
|
23
27
|
|
@@ -65,12 +69,30 @@ module PDF
|
|
65
69
|
end
|
66
70
|
end
|
67
71
|
|
72
|
+
def content_stream_md5
|
73
|
+
@content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
|
74
|
+
end
|
75
|
+
|
76
|
+
def cached_tokens_key
|
77
|
+
@cached_tokens_key ||= "tokens-#{content_stream_md5}"
|
78
|
+
end
|
79
|
+
|
80
|
+
def tokens
|
81
|
+
@cache[cached_tokens_key] ||= begin
|
82
|
+
buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
|
83
|
+
parser = Parser.new(buffer, @objects)
|
84
|
+
result = []
|
85
|
+
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
86
|
+
result << token
|
87
|
+
end
|
88
|
+
result
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
68
92
|
def content_stream(receivers, instructions)
|
69
|
-
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
70
|
-
parser = Parser.new(buffer, @objects)
|
71
93
|
params = []
|
72
94
|
|
73
|
-
|
95
|
+
tokens.each do |token|
|
74
96
|
if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
|
75
97
|
callback(receivers, PagesStrategy::OPERATORS[token], params)
|
76
98
|
params.clear
|
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2011 James Healy (jimmy@deefa.com)
|
@@ -24,9 +27,15 @@
|
|
24
27
|
################################################################################
|
25
28
|
|
26
29
|
class PDF::Reader
|
30
|
+
# A Hash-like object that can convert glyph names into a unicode codepoint.
|
31
|
+
# The mapping is read from a data file on disk the first time it's needed.
|
32
|
+
#
|
27
33
|
class GlyphHash # :nodoc:
|
28
34
|
def initialize
|
29
|
-
|
35
|
+
# only parse the glyph list once, and cache the results (for performance)
|
36
|
+
adobe = @@cache ||= load_adobe_glyph_mapping
|
37
|
+
@by_name = adobe.first
|
38
|
+
@by_codepoint = adobe.last
|
30
39
|
end
|
31
40
|
|
32
41
|
# attempt to convert a PDF Name to a unicode codepoint. Returns nil
|
@@ -34,55 +43,84 @@ class PDF::Reader
|
|
34
43
|
#
|
35
44
|
# h = GlyphHash.new
|
36
45
|
#
|
37
|
-
# h
|
46
|
+
# h.name_to_unicode(:A)
|
38
47
|
# => 65
|
39
48
|
#
|
40
|
-
# h
|
49
|
+
# h.name_to_unicode(:Euro)
|
41
50
|
# => 8364
|
42
51
|
#
|
43
|
-
# h
|
52
|
+
# h.name_to_unicode(:X4A)
|
53
|
+
# => 74
|
54
|
+
#
|
55
|
+
# h.name_to_unicode(:G30)
|
44
56
|
# => 48
|
45
57
|
#
|
46
|
-
# h
|
58
|
+
# h.name_to_unicode(:"34")
|
59
|
+
# => 34
|
47
60
|
#
|
48
|
-
def
|
61
|
+
def name_to_unicode(name)
|
49
62
|
return nil unless name.is_a?(Symbol)
|
50
63
|
|
51
64
|
name = name.to_s.gsub('_', '').intern
|
52
65
|
str = name.to_s
|
53
66
|
|
54
|
-
if @
|
55
|
-
@
|
67
|
+
if @by_name.has_key?(name)
|
68
|
+
@by_name[name]
|
69
|
+
elsif str.match(/\AX[0-9a-fA-F]{2,4}\Z/)
|
70
|
+
"0x#{str[1,4]}".hex
|
56
71
|
elsif str.match(/\Auni[A-F\d]{4}\Z/)
|
57
72
|
"0x#{str[3,4]}".hex
|
58
73
|
elsif str.match(/\Au[A-F\d]{4,6}\Z/)
|
59
74
|
"0x#{str[1,6]}".hex
|
60
|
-
elsif str.match(/\A[A-Za-z]\d{1,
|
61
|
-
str[1,
|
62
|
-
elsif str.match(/\A[A-Za-z]{2}\d{2,
|
63
|
-
str[2,
|
75
|
+
elsif str.match(/\A[A-Za-z]\d{1,5}\Z/)
|
76
|
+
str[1,5].to_i
|
77
|
+
elsif str.match(/\A[A-Za-z]{2}\d{2,5}\Z/)
|
78
|
+
str[2,5].to_i
|
64
79
|
else
|
65
80
|
nil
|
66
81
|
end
|
67
82
|
end
|
68
83
|
|
84
|
+
# attempt to convert a Unicode code point to the equivalent PDF Name. Returns nil
|
85
|
+
# if no conversion is possible.
|
86
|
+
#
|
87
|
+
# h = GlyphHash.new
|
88
|
+
#
|
89
|
+
# h.unicode_to_name(65)
|
90
|
+
# => [:A]
|
91
|
+
#
|
92
|
+
# h.unicode_to_name(8364)
|
93
|
+
# => [:Euro]
|
94
|
+
#
|
95
|
+
# h.unicode_to_name(34)
|
96
|
+
# => [:"34"]
|
97
|
+
#
|
98
|
+
def unicode_to_name(codepoint)
|
99
|
+
@by_codepoint[codepoint.to_i] || []
|
100
|
+
end
|
101
|
+
|
69
102
|
private
|
70
103
|
|
71
104
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
72
105
|
# a text file supplied by Adobe at:
|
73
106
|
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
74
107
|
def load_adobe_glyph_mapping
|
75
|
-
|
108
|
+
keyed_by_name = {}
|
109
|
+
keyed_by_codepoint = {}
|
76
110
|
|
77
|
-
|
78
|
-
File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
|
111
|
+
File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
|
79
112
|
f.each do |l|
|
80
|
-
|
81
|
-
|
113
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
114
|
+
if name && code
|
115
|
+
cp = "0x#{code}".hex
|
116
|
+
keyed_by_name[name.to_sym] = cp
|
117
|
+
keyed_by_codepoint[cp] ||= []
|
118
|
+
keyed_by_codepoint[cp] << name.to_sym
|
119
|
+
end
|
82
120
|
end
|
83
121
|
end
|
84
122
|
|
85
|
-
|
123
|
+
[keyed_by_name.freeze, keyed_by_codepoint.freeze]
|
86
124
|
end
|
87
125
|
|
88
126
|
end
|