pdf-reader 1.1.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +87 -2
- data/{README.rdoc → README.md} +43 -31
- data/Rakefile +21 -16
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -3
- data/examples/callbacks.rb +2 -1
- data/examples/extract_images.rb +11 -6
- data/examples/fuzzy_paragraphs.rb +24 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +90 -63
- data/lib/pdf/reader/cid_widths.rb +63 -0
- data/lib/pdf/reader/cmap.rb +69 -38
- data/lib/pdf/reader/encoding.rb +74 -48
- data/lib/pdf/reader/error.rb +24 -4
- data/lib/pdf/reader/filter/ascii85.rb +28 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
- data/lib/pdf/reader/filter/depredict.rb +141 -0
- data/lib/pdf/reader/filter/flate.rb +53 -0
- data/lib/pdf/reader/filter/lzw.rb +21 -0
- data/lib/pdf/reader/filter/null.rb +18 -0
- data/lib/pdf/reader/filter/run_length.rb +45 -0
- data/lib/pdf/reader/filter.rb +15 -234
- data/lib/pdf/reader/font.rb +107 -43
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +26 -4
- data/lib/pdf/reader/glyph_hash.rb +56 -18
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +40 -16
- data/lib/pdf/reader/object_hash.rb +94 -40
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +34 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +48 -3
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +185 -70
- data/lib/pdf/reader/page_text_receiver.rb +70 -20
- data/lib/pdf/reader/pages_strategy.rb +4 -293
- data/lib/pdf/reader/parser.rb +37 -61
- data/lib/pdf/reader/print_receiver.rb +6 -0
- data/lib/pdf/reader/reference.rb +4 -1
- data/lib/pdf/reader/register_receiver.rb +17 -31
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +82 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +5 -2
- data/lib/pdf/reader/synchronized_cache.rb +33 -0
- data/lib/pdf/reader/text_run.rb +99 -0
- data/lib/pdf/reader/token.rb +4 -1
- data/lib/pdf/reader/transformation_matrix.rb +195 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
- data/lib/pdf/reader/width_calculator/composite.rb +28 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
- data/lib/pdf/reader/width_calculator.rb +12 -0
- data/lib/pdf/reader/xref.rb +41 -9
- data/lib/pdf/reader.rb +45 -104
- data/lib/pdf-reader.rb +4 -1
- metadata +220 -101
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -15
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -264
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
|
@@ -23,41 +26,29 @@
|
|
23
26
|
#
|
24
27
|
################################################################################
|
25
28
|
|
29
|
+
require 'pdf/reader/width_calculator'
|
30
|
+
|
26
31
|
class PDF::Reader
|
32
|
+
# Represents a single font PDF object and provides some useful methods
|
33
|
+
# for extracting info. Mainly used for converting text to UTF-8.
|
34
|
+
#
|
27
35
|
class Font
|
28
|
-
attr_accessor :
|
29
|
-
attr_reader :widths, :first_char, :
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(ohash
|
33
|
-
if ohash.nil? || obj.nil?
|
34
|
-
$stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
|
35
|
-
return
|
36
|
-
end
|
36
|
+
attr_accessor :subtype, :encoding, :descendantfonts, :tounicode
|
37
|
+
attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
|
38
|
+
:cid_widths, :cid_default_width
|
39
|
+
|
40
|
+
def initialize(ohash, obj)
|
37
41
|
@ohash = ohash
|
38
42
|
@tounicode = nil
|
39
43
|
|
40
44
|
extract_base_info(obj)
|
41
45
|
extract_descriptor(obj)
|
42
46
|
extract_descendants(obj)
|
47
|
+
@width_calc = build_width_calculator
|
43
48
|
|
44
49
|
@encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
|
45
50
|
end
|
46
51
|
|
47
|
-
def basefont=(font)
|
48
|
-
# setup a default encoding for the selected font. It can always be overridden
|
49
|
-
# with encoding= if required
|
50
|
-
case font
|
51
|
-
when "Symbol" then
|
52
|
-
@encoding = PDF::Reader::Encoding.new("SymbolEncoding")
|
53
|
-
when "ZapfDingbats" then
|
54
|
-
@encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
|
55
|
-
else
|
56
|
-
@encoding = nil
|
57
|
-
end
|
58
|
-
@basefont = font
|
59
|
-
end
|
60
|
-
|
61
52
|
def to_utf8(params)
|
62
53
|
if @tounicode
|
63
54
|
to_utf8_via_cmap(params)
|
@@ -66,39 +57,102 @@ class PDF::Reader
|
|
66
57
|
end
|
67
58
|
end
|
68
59
|
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
60
|
+
def unpack(data)
|
61
|
+
data.unpack(encoding.unpack)
|
62
|
+
end
|
63
|
+
|
64
|
+
# looks up the specified codepoint and returns a value that is in (pdf)
|
65
|
+
# glyph space, which is 1000 glyph units = 1 text space unit
|
66
|
+
def glyph_width(code_point)
|
67
|
+
if code_point.is_a?(String)
|
68
|
+
code_point = code_point.unpack(encoding.unpack).first
|
69
|
+
end
|
70
|
+
|
71
|
+
@cached_widths ||= {}
|
72
|
+
@cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
|
73
73
|
end
|
74
74
|
|
75
75
|
private
|
76
76
|
|
77
|
+
def default_encoding(font_name)
|
78
|
+
case font_name.to_s
|
79
|
+
when "Symbol" then
|
80
|
+
PDF::Reader::Encoding.new(:SymbolEncoding)
|
81
|
+
when "ZapfDingbats" then
|
82
|
+
PDF::Reader::Encoding.new(:ZapfDingbatsEncoding)
|
83
|
+
else
|
84
|
+
PDF::Reader::Encoding.new(:StandardEncoding)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def build_width_calculator
|
89
|
+
if @subtype == :Type0
|
90
|
+
PDF::Reader::WidthCalculator::TypeZero.new(self)
|
91
|
+
elsif @subtype == :Type1
|
92
|
+
if @font_descriptor.nil?
|
93
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
94
|
+
else
|
95
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree .new(self)
|
96
|
+
end
|
97
|
+
elsif @subtype == :Type3
|
98
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
99
|
+
elsif @subtype == :TrueType
|
100
|
+
if @font_descriptor
|
101
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
102
|
+
else
|
103
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
104
|
+
# local system and fall back to a substitute. For now, we go straight to a substitute
|
105
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
106
|
+
end
|
107
|
+
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
108
|
+
PDF::Reader::WidthCalculator::Composite.new(self)
|
109
|
+
else
|
110
|
+
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
77
114
|
def extract_base_info(obj)
|
78
115
|
@subtype = @ohash.object(obj[:Subtype])
|
79
116
|
@basefont = @ohash.object(obj[:BaseFont])
|
80
|
-
|
117
|
+
if @ohash.object(obj[:Encoding])
|
118
|
+
@encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
|
119
|
+
else
|
120
|
+
@encoding = default_encoding(@basefont)
|
121
|
+
end
|
81
122
|
@widths = @ohash.object(obj[:Widths]) || []
|
82
123
|
@first_char = @ohash.object(obj[:FirstChar])
|
124
|
+
@last_char = @ohash.object(obj[:LastChar])
|
125
|
+
|
126
|
+
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
127
|
+
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
128
|
+
@cid_widths = @ohash.object(obj[:W]) || []
|
129
|
+
@cid_default_width = @ohash.object(obj[:DW]) || 1000
|
130
|
+
|
83
131
|
if obj[:ToUnicode]
|
132
|
+
# ToUnicode is optional for Type1 and Type3
|
84
133
|
stream = @ohash.object(obj[:ToUnicode])
|
85
|
-
|
134
|
+
if stream.is_a?(PDF::Reader::Stream)
|
135
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
136
|
+
end
|
86
137
|
end
|
87
138
|
end
|
88
139
|
|
89
140
|
def extract_descriptor(obj)
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
141
|
+
if obj[:FontDescriptor]
|
142
|
+
# create a font descriptor object if we can, in other words, unless this is
|
143
|
+
# a CID Font
|
144
|
+
fd = @ohash.object(obj[:FontDescriptor])
|
145
|
+
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
146
|
+
else
|
147
|
+
@font_descriptor = nil
|
148
|
+
end
|
97
149
|
end
|
98
150
|
|
99
151
|
def extract_descendants(obj)
|
100
152
|
return unless obj[:DescendantFonts]
|
101
|
-
|
153
|
+
# per PDF 32000-1:2008 pp. 280 :DescendantFonts is:
|
154
|
+
# A one-element array specifying the CIDFont dictionary that is the
|
155
|
+
# descendant of this Type 0 font.
|
102
156
|
descendants = @ohash.object(obj[:DescendantFonts])
|
103
157
|
@descendantfonts = descendants.map { |desc|
|
104
158
|
PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
@@ -106,11 +160,16 @@ class PDF::Reader
|
|
106
160
|
end
|
107
161
|
|
108
162
|
def to_utf8_via_cmap(params)
|
109
|
-
|
163
|
+
case params
|
164
|
+
when Integer
|
165
|
+
[
|
166
|
+
@tounicode.decode(params) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
167
|
+
].flatten.pack("U*")
|
168
|
+
when String
|
110
169
|
params.unpack(encoding.unpack).map { |c|
|
111
170
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
|
-
}.pack("U*")
|
113
|
-
|
171
|
+
}.flatten.pack("U*")
|
172
|
+
when Array
|
114
173
|
params.collect { |param| to_utf8_via_cmap(param) }
|
115
174
|
else
|
116
175
|
params
|
@@ -118,11 +177,16 @@ class PDF::Reader
|
|
118
177
|
end
|
119
178
|
|
120
179
|
def to_utf8_via_encoding(params)
|
121
|
-
|
180
|
+
if encoding.kind_of?(String)
|
181
|
+
raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
|
182
|
+
end
|
122
183
|
|
123
|
-
|
184
|
+
case params
|
185
|
+
when Integer
|
186
|
+
encoding.int_to_utf8_string(params)
|
187
|
+
when String
|
124
188
|
encoding.to_utf8(params)
|
125
|
-
|
189
|
+
when Array
|
126
190
|
params.collect { |param| to_utf8_via_encoding(param) }
|
127
191
|
else
|
128
192
|
params
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'ttfunk'
|
5
|
+
|
6
|
+
class PDF::Reader
|
7
|
+
|
8
|
+
# Font descriptors are outlined in Section 9.8, PDF 32000-1:2008, pp 281-288
|
9
|
+
class FontDescriptor
|
10
|
+
|
11
|
+
attr_reader :font_name, :font_family, :font_stretch, :font_weight,
|
12
|
+
:font_bounding_box, :cap_height, :ascent, :descent, :leading,
|
13
|
+
:avg_width, :max_width, :missing_width, :italic_angle, :stem_v,
|
14
|
+
:x_height, :font_flags
|
15
|
+
|
16
|
+
def initialize(ohash, fd_hash)
|
17
|
+
@ascent = ohash.object(fd_hash[:Ascent]) || 0
|
18
|
+
@descent = ohash.object(fd_hash[:Descent]) || 0
|
19
|
+
@missing_width = ohash.object(fd_hash[:MissingWidth]) || 0
|
20
|
+
@font_bounding_box = ohash.object(fd_hash[:FontBBox]) || [0,0,0,0]
|
21
|
+
@avg_width = ohash.object(fd_hash[:AvgWidth]) || 0
|
22
|
+
@cap_height = ohash.object(fd_hash[:CapHeight]) || 0
|
23
|
+
@font_flags = ohash.object(fd_hash[:Flags]) || 0
|
24
|
+
@italic_angle = ohash.object(fd_hash[:ItalicAngle])
|
25
|
+
@font_name = ohash.object(fd_hash[:FontName]).to_s
|
26
|
+
@leading = ohash.object(fd_hash[:Leading]) || 0
|
27
|
+
@max_width = ohash.object(fd_hash[:MaxWidth]) || 0
|
28
|
+
@stem_v = ohash.object(fd_hash[:StemV])
|
29
|
+
@x_height = ohash.object(fd_hash[:XHeight])
|
30
|
+
@font_stretch = ohash.object(fd_hash[:FontStretch]) || :Normal
|
31
|
+
@font_weight = ohash.object(fd_hash[:FontWeight]) || 400
|
32
|
+
@font_family = ohash.object(fd_hash[:FontFamily])
|
33
|
+
|
34
|
+
# A FontDescriptor may have an embedded font program in FontFile
|
35
|
+
# (Type 1 Font Program), FontFile2 (TrueType font program), or
|
36
|
+
# FontFile3 (Other font program as defined by Subtype entry)
|
37
|
+
# Subtype entries:
|
38
|
+
# 1) Type1C: Type 1 Font Program in Compact Font Format
|
39
|
+
# 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
|
40
|
+
# 3) OpenType: OpenType Font Program
|
41
|
+
# see Section 9.9, PDF 32000-1:2008, pp 288-292
|
42
|
+
@font_program_stream = ohash.object(fd_hash[:FontFile2])
|
43
|
+
#TODO handle FontFile and FontFile3
|
44
|
+
|
45
|
+
@is_ttf = true if @font_program_stream
|
46
|
+
end
|
47
|
+
|
48
|
+
def glyph_width(char_code)
|
49
|
+
if @is_ttf
|
50
|
+
if ttf_program_stream.cmap.unicode.length > 0
|
51
|
+
glyph_id = ttf_program_stream.cmap.unicode.first[char_code]
|
52
|
+
else
|
53
|
+
glyph_id = char_code
|
54
|
+
end
|
55
|
+
char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
|
56
|
+
if char_metric
|
57
|
+
return char_metric.advance_width
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# PDF states that a glyph is 1000 units wide, true type doesn't enforce
|
63
|
+
# any behavior, but uses units/em to define how wide the 'M' is (the widest letter)
|
64
|
+
def glyph_to_pdf_scale_factor
|
65
|
+
if @is_ttf
|
66
|
+
@glyph_to_pdf_sf ||= (1.0 / ttf_program_stream.header.units_per_em) * 1000.0
|
67
|
+
else
|
68
|
+
@glyph_to_pdf_sf ||= 1.0
|
69
|
+
end
|
70
|
+
@glyph_to_pdf_sf
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def ttf_program_stream
|
76
|
+
@ttf_program_stream ||= TTFunk::File.new(@font_program_stream.unfiltered_data)
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
@@ -1,4 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'digest/md5'
|
2
5
|
|
3
6
|
module PDF
|
4
7
|
class Reader
|
@@ -15,9 +18,10 @@ module PDF
|
|
15
18
|
|
16
19
|
attr_reader :xobject
|
17
20
|
|
18
|
-
def initialize(page, xobject)
|
21
|
+
def initialize(page, xobject, options = {})
|
19
22
|
@page = page
|
20
23
|
@objects = page.objects
|
24
|
+
@cache = options[:cache] || {}
|
21
25
|
@xobject = @objects.deref(xobject)
|
22
26
|
end
|
23
27
|
|
@@ -65,12 +69,30 @@ module PDF
|
|
65
69
|
end
|
66
70
|
end
|
67
71
|
|
72
|
+
def content_stream_md5
|
73
|
+
@content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
|
74
|
+
end
|
75
|
+
|
76
|
+
def cached_tokens_key
|
77
|
+
@cached_tokens_key ||= "tokens-#{content_stream_md5}"
|
78
|
+
end
|
79
|
+
|
80
|
+
def tokens
|
81
|
+
@cache[cached_tokens_key] ||= begin
|
82
|
+
buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
|
83
|
+
parser = Parser.new(buffer, @objects)
|
84
|
+
result = []
|
85
|
+
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
86
|
+
result << token
|
87
|
+
end
|
88
|
+
result
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
68
92
|
def content_stream(receivers, instructions)
|
69
|
-
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
70
|
-
parser = Parser.new(buffer, @objects)
|
71
93
|
params = []
|
72
94
|
|
73
|
-
|
95
|
+
tokens.each do |token|
|
74
96
|
if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
|
75
97
|
callback(receivers, PagesStrategy::OPERATORS[token], params)
|
76
98
|
params.clear
|
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2011 James Healy (jimmy@deefa.com)
|
@@ -24,9 +27,15 @@
|
|
24
27
|
################################################################################
|
25
28
|
|
26
29
|
class PDF::Reader
|
30
|
+
# A Hash-like object that can convert glyph names into a unicode codepoint.
|
31
|
+
# The mapping is read from a data file on disk the first time it's needed.
|
32
|
+
#
|
27
33
|
class GlyphHash # :nodoc:
|
28
34
|
def initialize
|
29
|
-
|
35
|
+
# only parse the glyph list once, and cache the results (for performance)
|
36
|
+
adobe = @@cache ||= load_adobe_glyph_mapping
|
37
|
+
@by_name = adobe.first
|
38
|
+
@by_codepoint = adobe.last
|
30
39
|
end
|
31
40
|
|
32
41
|
# attempt to convert a PDF Name to a unicode codepoint. Returns nil
|
@@ -34,55 +43,84 @@ class PDF::Reader
|
|
34
43
|
#
|
35
44
|
# h = GlyphHash.new
|
36
45
|
#
|
37
|
-
# h
|
46
|
+
# h.name_to_unicode(:A)
|
38
47
|
# => 65
|
39
48
|
#
|
40
|
-
# h
|
49
|
+
# h.name_to_unicode(:Euro)
|
41
50
|
# => 8364
|
42
51
|
#
|
43
|
-
# h
|
52
|
+
# h.name_to_unicode(:X4A)
|
53
|
+
# => 74
|
54
|
+
#
|
55
|
+
# h.name_to_unicode(:G30)
|
44
56
|
# => 48
|
45
57
|
#
|
46
|
-
# h
|
58
|
+
# h.name_to_unicode(:34)
|
59
|
+
# => 34
|
47
60
|
#
|
48
|
-
def
|
61
|
+
def name_to_unicode(name)
|
49
62
|
return nil unless name.is_a?(Symbol)
|
50
63
|
|
51
64
|
name = name.to_s.gsub('_', '').intern
|
52
65
|
str = name.to_s
|
53
66
|
|
54
|
-
if @
|
55
|
-
@
|
67
|
+
if @by_name.has_key?(name)
|
68
|
+
@by_name[name]
|
69
|
+
elsif str.match(/\AX[0-9a-fA-F]{2,4}\Z/)
|
70
|
+
"0x#{str[1,4]}".hex
|
56
71
|
elsif str.match(/\Auni[A-F\d]{4}\Z/)
|
57
72
|
"0x#{str[3,4]}".hex
|
58
73
|
elsif str.match(/\Au[A-F\d]{4,6}\Z/)
|
59
74
|
"0x#{str[1,6]}".hex
|
60
|
-
elsif str.match(/\A[A-Za-z]\d{1,
|
61
|
-
str[1,
|
62
|
-
elsif str.match(/\A[A-Za-z]{2}\d{2,
|
63
|
-
str[2,
|
75
|
+
elsif str.match(/\A[A-Za-z]\d{1,5}\Z/)
|
76
|
+
str[1,5].to_i
|
77
|
+
elsif str.match(/\A[A-Za-z]{2}\d{2,5}\Z/)
|
78
|
+
str[2,5].to_i
|
64
79
|
else
|
65
80
|
nil
|
66
81
|
end
|
67
82
|
end
|
68
83
|
|
84
|
+
# attempt to convert a Unicode code point to the equivalent PDF Name. Returns nil
|
85
|
+
# if no conversion is possible.
|
86
|
+
#
|
87
|
+
# h = GlyphHash.new
|
88
|
+
#
|
89
|
+
# h.unicode_to_name(65)
|
90
|
+
# => [:A]
|
91
|
+
#
|
92
|
+
# h.unicode_to_name(8364)
|
93
|
+
# => [:Euro]
|
94
|
+
#
|
95
|
+
# h.unicode_to_name(34)
|
96
|
+
# => [:34]
|
97
|
+
#
|
98
|
+
def unicode_to_name(codepoint)
|
99
|
+
@by_codepoint[codepoint.to_i] || []
|
100
|
+
end
|
101
|
+
|
69
102
|
private
|
70
103
|
|
71
104
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
72
105
|
# a text file supplied by Adobe at:
|
73
106
|
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
74
107
|
def load_adobe_glyph_mapping
|
75
|
-
|
108
|
+
keyed_by_name = {}
|
109
|
+
keyed_by_codepoint = {}
|
76
110
|
|
77
|
-
|
78
|
-
File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
|
111
|
+
File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
|
79
112
|
f.each do |l|
|
80
|
-
|
81
|
-
|
113
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
114
|
+
if name && code
|
115
|
+
cp = "0x#{code}".hex
|
116
|
+
keyed_by_name[name.to_sym] = cp
|
117
|
+
keyed_by_codepoint[cp] ||= []
|
118
|
+
keyed_by_codepoint[cp] << name.to_sym
|
119
|
+
end
|
82
120
|
end
|
83
121
|
end
|
84
122
|
|
85
|
-
|
123
|
+
[keyed_by_name.freeze, keyed_by_codepoint.freeze]
|
86
124
|
end
|
87
125
|
|
88
126
|
end
|