pdf-reader 2.2.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,8 +7,9 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
|
-
@options = options
|
12
|
+
@options = options
|
11
13
|
end
|
12
14
|
|
13
15
|
################################################################################
|
@@ -34,7 +36,7 @@ class PDF::Reader
|
|
34
36
|
################################################################################
|
35
37
|
def tiff_depredict(data)
|
36
38
|
data = data.unpack("C*")
|
37
|
-
unfiltered =
|
39
|
+
unfiltered = ''
|
38
40
|
bpc = @options[:BitsPerComponent] || 8
|
39
41
|
pixel_bits = bpc * @options[:Colors]
|
40
42
|
pixel_bytes = pixel_bits / 8
|
@@ -51,11 +53,11 @@ class PDF::Reader
|
|
51
53
|
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
52
54
|
row_data[index] = (byte + left) % 256
|
53
55
|
end
|
54
|
-
unfiltered += row_data
|
56
|
+
unfiltered += row_data.pack("C*")
|
55
57
|
pos += line_len
|
56
58
|
end
|
57
59
|
|
58
|
-
unfiltered
|
60
|
+
unfiltered
|
59
61
|
end
|
60
62
|
################################################################################
|
61
63
|
def png_depredict(data)
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|
@@ -123,7 +125,7 @@ class PDF::Reader
|
|
123
125
|
row_data[index] = (byte + paeth) % 256
|
124
126
|
end
|
125
127
|
else
|
126
|
-
raise
|
128
|
+
raise MalformedPDFError, "Invalid filter algorithm #{filter}"
|
127
129
|
end
|
128
130
|
|
129
131
|
s = []
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
|
@@ -8,6 +9,10 @@ class PDF::Reader
|
|
8
9
|
module Filter # :nodoc:
|
9
10
|
# implementation of the Flate (zlib) stream filter
|
10
11
|
class Flate
|
12
|
+
|
13
|
+
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
14
|
+
ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
|
15
|
+
|
11
16
|
def initialize(options = {})
|
12
17
|
@options = options
|
13
18
|
end
|
@@ -15,25 +20,34 @@ class PDF::Reader
|
|
15
20
|
################################################################################
|
16
21
|
# Decode the specified data with the Zlib compression algorithm
|
17
22
|
def filter(data)
|
18
|
-
deflated =
|
23
|
+
deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
|
24
|
+
|
25
|
+
if deflated.nil?
|
26
|
+
raise MalformedPDFError,
|
27
|
+
"Error while inflating a compressed stream (no suitable inflation algorithm found)"
|
28
|
+
end
|
29
|
+
Depredict.new(@options).filter(deflated)
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def zlib_inflate(data)
|
19
35
|
begin
|
20
|
-
|
21
|
-
rescue Zlib::
|
36
|
+
return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
37
|
+
rescue Zlib::Error
|
22
38
|
# by default, Ruby's Zlib assumes the data it's inflating
|
23
|
-
# is RFC1951 deflated data, wrapped in a
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# See
|
28
|
-
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
29
|
-
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
30
|
-
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
39
|
+
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
40
|
+
# fails, swallow the exception and attempt to inflate the data as a raw
|
41
|
+
# RFC1951 stream.
|
31
42
|
end
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
43
|
+
|
44
|
+
begin
|
45
|
+
return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
|
46
|
+
rescue Zlib::Error
|
47
|
+
# swallow this one too, so we can try some other fallback options
|
48
|
+
end
|
49
|
+
|
50
|
+
nil
|
37
51
|
end
|
38
52
|
end
|
39
53
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the run length stream filter
|
8
9
|
class RunLength
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
|
|
20
22
|
length = data.getbyte(pos)
|
21
23
|
pos += 1
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
unless length.nil?
|
26
|
+
case
|
27
|
+
# nothing
|
28
|
+
when length == 128
|
29
|
+
break
|
30
|
+
when length < 128
|
31
|
+
# When the length is < 128, we copy the following length+1 bytes
|
32
|
+
# literally.
|
33
|
+
out << data[pos, length + 1]
|
34
|
+
pos += length
|
35
|
+
else
|
36
|
+
# When the length is > 128, we copy the next byte (257 - length)
|
37
|
+
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
38
|
+
# "\x00\x00\x00\x00\x00\x00\x00".
|
39
|
+
previous_byte = data[pos, 1] || ""
|
40
|
+
out << previous_byte * (257 - length)
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
pos += 1
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -41,17 +42,16 @@ class PDF::Reader
|
|
41
42
|
# returned untouched. At this stage PDF::Reader has no need to decode images.
|
42
43
|
#
|
43
44
|
def self.with(name, options = {})
|
44
|
-
case name
|
45
|
-
when :ASCII85Decode then PDF::Reader::Filter::Ascii85.new(options)
|
46
|
-
when :ASCIIHexDecode then PDF::Reader::Filter::AsciiHex.new(options)
|
47
|
-
when :CCITTFaxDecode then PDF::Reader::Filter::Null.new(options)
|
48
|
-
when :DCTDecode then PDF::Reader::Filter::Null.new(options)
|
49
|
-
when :FlateDecode
|
50
|
-
when :
|
51
|
-
when :
|
52
|
-
when :
|
53
|
-
when :
|
54
|
-
when :RunLengthDecode then PDF::Reader::Filter::RunLength.new(options)
|
45
|
+
case name
|
46
|
+
when :ASCII85Decode, :A85 then PDF::Reader::Filter::Ascii85.new(options)
|
47
|
+
when :ASCIIHexDecode, :AHx then PDF::Reader::Filter::AsciiHex.new(options)
|
48
|
+
when :CCITTFaxDecode, :CCF then PDF::Reader::Filter::Null.new(options)
|
49
|
+
when :DCTDecode, :DCT then PDF::Reader::Filter::Null.new(options)
|
50
|
+
when :FlateDecode, :Fl then PDF::Reader::Filter::Flate.new(options)
|
51
|
+
when :JBIG2Decode then PDF::Reader::Filter::Null.new(options)
|
52
|
+
when :JPXDecode then PDF::Reader::Filter::Null.new(options)
|
53
|
+
when :LZWDecode, :LZW then PDF::Reader::Filter::Lzw.new(options)
|
54
|
+
when :RunLengthDecode, :RL then PDF::Reader::Filter::RunLength.new(options)
|
55
55
|
else
|
56
56
|
raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
57
57
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -42,6 +43,7 @@ class PDF::Reader
|
|
42
43
|
@tounicode = nil
|
43
44
|
|
44
45
|
extract_base_info(obj)
|
46
|
+
extract_type3_info(obj)
|
45
47
|
extract_descriptor(obj)
|
46
48
|
extract_descendants(obj)
|
47
49
|
@width_calc = build_width_calculator
|
@@ -72,8 +74,44 @@ class PDF::Reader
|
|
72
74
|
@cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
|
73
75
|
end
|
74
76
|
|
77
|
+
# In most cases glyph width is converted into text space with a simple divide by 1000.
|
78
|
+
#
|
79
|
+
# However, Type3 fonts provide their own FontMatrix that's used for the transformation.
|
80
|
+
#
|
81
|
+
def glyph_width_in_text_space(code_point)
|
82
|
+
glyph_width_in_glyph_space = glyph_width(code_point)
|
83
|
+
|
84
|
+
if @subtype == :Type3
|
85
|
+
x1, y1 = font_matrix_transform(0,0)
|
86
|
+
x2, y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
|
87
|
+
(x2 - x1).abs.round(2)
|
88
|
+
else
|
89
|
+
glyph_width_in_glyph_space / 1000.0
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
75
93
|
private
|
76
94
|
|
95
|
+
# Only valid for Type3 fonts
|
96
|
+
def font_matrix_transform(x, y)
|
97
|
+
return x, y if @font_matrix.nil?
|
98
|
+
|
99
|
+
matrix = TransformationMatrix.new(
|
100
|
+
@font_matrix[0], @font_matrix[1],
|
101
|
+
@font_matrix[2], @font_matrix[3],
|
102
|
+
@font_matrix[4], @font_matrix[5],
|
103
|
+
)
|
104
|
+
|
105
|
+
if x == 0 && y == 0
|
106
|
+
[matrix.e, matrix.f]
|
107
|
+
else
|
108
|
+
[
|
109
|
+
(matrix.a * x) + (matrix.c * y) + (matrix.e),
|
110
|
+
(matrix.b * x) + (matrix.d * y) + (matrix.f)
|
111
|
+
]
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
77
115
|
def default_encoding(font_name)
|
78
116
|
case font_name.to_s
|
79
117
|
when "Symbol" then
|
@@ -97,7 +135,13 @@ class PDF::Reader
|
|
97
135
|
elsif @subtype == :Type3
|
98
136
|
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
99
137
|
elsif @subtype == :TrueType
|
100
|
-
|
138
|
+
if @font_descriptor
|
139
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
140
|
+
else
|
141
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
142
|
+
# local system and fallback to a substitute. For now, we go straight to a substitute
|
143
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
144
|
+
end
|
101
145
|
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
102
146
|
PDF::Reader::WidthCalculator::Composite.new(self)
|
103
147
|
else
|
@@ -105,27 +149,47 @@ class PDF::Reader
|
|
105
149
|
end
|
106
150
|
end
|
107
151
|
|
108
|
-
def
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
152
|
+
def build_encoding(obj)
|
153
|
+
if obj[:Encoding].is_a?(Symbol)
|
154
|
+
# one of the standard encodings, referenced by name
|
155
|
+
# TODO pass in a standard shape, always a Hash
|
156
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
157
|
+
elsif obj[:Encoding].is_a?(Hash) || obj[:Encoding].is_a?(PDF::Reader::Stream)
|
158
|
+
PDF::Reader::Encoding.new(obj[:Encoding])
|
159
|
+
elsif obj[:Encoding].nil?
|
160
|
+
default_encoding(@basefont)
|
113
161
|
else
|
114
|
-
|
162
|
+
raise MalformedPDFError, "Unexpected type for Encoding (#{obj[:Encoding].class})"
|
115
163
|
end
|
116
|
-
|
117
|
-
|
118
|
-
|
164
|
+
end
|
165
|
+
|
166
|
+
def extract_base_info(obj)
|
167
|
+
@subtype = @ohash.deref_name(obj[:Subtype])
|
168
|
+
@basefont = @ohash.deref_name(obj[:BaseFont])
|
169
|
+
@encoding = build_encoding(obj)
|
170
|
+
@widths = @ohash.deref_array_of_numbers(obj[:Widths]) || []
|
171
|
+
@first_char = @ohash.deref_integer(obj[:FirstChar])
|
172
|
+
@last_char = @ohash.deref_integer(obj[:LastChar])
|
119
173
|
|
120
174
|
# CID Fonts are not required to have a W or DW entry, if they don't exist,
|
121
175
|
# the default cid width = 1000, see Section 9.7.4.1 PDF 32000-1:2008 pp 269
|
122
|
-
@cid_widths = @ohash.
|
123
|
-
@cid_default_width = @ohash.
|
176
|
+
@cid_widths = @ohash.deref_array(obj[:W]) || []
|
177
|
+
@cid_default_width = @ohash.deref_number(obj[:DW]) || 1000
|
124
178
|
|
125
179
|
if obj[:ToUnicode]
|
126
180
|
# ToUnicode is optional for Type1 and Type3
|
127
|
-
stream = @ohash.
|
128
|
-
|
181
|
+
stream = @ohash.deref_stream(obj[:ToUnicode])
|
182
|
+
if stream
|
183
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
def extract_type3_info(obj)
|
189
|
+
if @subtype == :Type3
|
190
|
+
@font_matrix = @ohash.deref_array_of_numbers(obj[:FontMatrix]) || [
|
191
|
+
0.001, 0, 0, 0.001, 0, 0
|
192
|
+
]
|
129
193
|
end
|
130
194
|
end
|
131
195
|
|
@@ -133,7 +197,7 @@ class PDF::Reader
|
|
133
197
|
if obj[:FontDescriptor]
|
134
198
|
# create a font descriptor object if we can, in other words, unless this is
|
135
199
|
# a CID Font
|
136
|
-
fd = @ohash.
|
200
|
+
fd = @ohash.deref_hash(obj[:FontDescriptor])
|
137
201
|
@font_descriptor = PDF::Reader::FontDescriptor.new(@ohash, fd)
|
138
202
|
else
|
139
203
|
@font_descriptor = nil
|
@@ -141,14 +205,17 @@ class PDF::Reader
|
|
141
205
|
end
|
142
206
|
|
143
207
|
def extract_descendants(obj)
|
144
|
-
return unless obj[:DescendantFonts]
|
145
208
|
# per PDF 32000-1:2008 pp. 280 :DescendentFonts is:
|
146
209
|
# A one-element array specifying the CIDFont dictionary that is the
|
147
210
|
# descendant of this Type 0 font.
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
211
|
+
if obj[:DescendantFonts]
|
212
|
+
descendants = @ohash.deref_array(obj[:DescendantFonts])
|
213
|
+
@descendantfonts = descendants.map { |desc|
|
214
|
+
PDF::Reader::Font.new(@ohash, @ohash.deref_hash(desc))
|
215
|
+
}
|
216
|
+
else
|
217
|
+
@descendantfonts = []
|
218
|
+
end
|
152
219
|
end
|
153
220
|
|
154
221
|
def to_utf8_via_cmap(params)
|
@@ -162,9 +229,7 @@ class PDF::Reader
|
|
162
229
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
163
230
|
}.flatten.pack("U*")
|
164
231
|
when Array
|
165
|
-
params.collect { |param| to_utf8_via_cmap(param) }
|
166
|
-
else
|
167
|
-
params
|
232
|
+
params.collect { |param| to_utf8_via_cmap(param) }.join("")
|
168
233
|
end
|
169
234
|
end
|
170
235
|
|
@@ -179,9 +244,7 @@ class PDF::Reader
|
|
179
244
|
when String
|
180
245
|
encoding.to_utf8(params)
|
181
246
|
when Array
|
182
|
-
params.collect { |param| to_utf8_via_encoding(param) }
|
183
|
-
else
|
184
|
-
params
|
247
|
+
params.collect { |param| to_utf8_via_encoding(param) }.join("")
|
185
248
|
end
|
186
249
|
end
|
187
250
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ttfunk'
|
@@ -14,22 +15,23 @@ class PDF::Reader
|
|
14
15
|
:x_height, :font_flags
|
15
16
|
|
16
17
|
def initialize(ohash, fd_hash)
|
17
|
-
|
18
|
-
@
|
19
|
-
@
|
20
|
-
@
|
21
|
-
@
|
22
|
-
@
|
23
|
-
@
|
24
|
-
@
|
25
|
-
@
|
26
|
-
@
|
27
|
-
@
|
28
|
-
@
|
29
|
-
@
|
30
|
-
@
|
31
|
-
@
|
32
|
-
@
|
18
|
+
# TODO change these to typed derefs
|
19
|
+
@ascent = ohash.deref_number(fd_hash[:Ascent]) || 0
|
20
|
+
@descent = ohash.deref_number(fd_hash[:Descent]) || 0
|
21
|
+
@missing_width = ohash.deref_number(fd_hash[:MissingWidth]) || 0
|
22
|
+
@font_bounding_box = ohash.deref_array_of_numbers(fd_hash[:FontBBox]) || [0,0,0,0]
|
23
|
+
@avg_width = ohash.deref_number(fd_hash[:AvgWidth]) || 0
|
24
|
+
@cap_height = ohash.deref_number(fd_hash[:CapHeight]) || 0
|
25
|
+
@font_flags = ohash.deref_integer(fd_hash[:Flags]) || 0
|
26
|
+
@italic_angle = ohash.deref_number(fd_hash[:ItalicAngle])
|
27
|
+
@font_name = ohash.deref_name(fd_hash[:FontName]).to_s
|
28
|
+
@leading = ohash.deref_number(fd_hash[:Leading]) || 0
|
29
|
+
@max_width = ohash.deref_number(fd_hash[:MaxWidth]) || 0
|
30
|
+
@stem_v = ohash.deref_number(fd_hash[:StemV])
|
31
|
+
@x_height = ohash.deref_number(fd_hash[:XHeight])
|
32
|
+
@font_stretch = ohash.deref_name(fd_hash[:FontStretch]) || :Normal
|
33
|
+
@font_weight = ohash.deref_number(fd_hash[:FontWeight]) || 400
|
34
|
+
@font_family = ohash.deref_string(fd_hash[:FontFamily])
|
33
35
|
|
34
36
|
# A FontDescriptor may have an embedded font program in FontFile
|
35
37
|
# (Type 1 Font Program), FontFile2 (TrueType font program), or
|
@@ -39,7 +41,7 @@ class PDF::Reader
|
|
39
41
|
# 2) CIDFontType0C: Type 0 Font Program in Compact Font Format
|
40
42
|
# 3) OpenType: OpenType Font Program
|
41
43
|
# see Section 9.9, PDF 32000-1:2008, pp 288-292
|
42
|
-
@font_program_stream = ohash.
|
44
|
+
@font_program_stream = ohash.deref_stream(fd_hash[:FontFile2])
|
43
45
|
#TODO handle FontFile and FontFile3
|
44
46
|
|
45
47
|
@is_ttf = true if @font_program_stream
|
@@ -54,7 +56,9 @@ class PDF::Reader
|
|
54
56
|
end
|
55
57
|
char_metric = ttf_program_stream.horizontal_metrics.metrics[glyph_id]
|
56
58
|
if char_metric
|
57
|
-
|
59
|
+
char_metric.advance_width
|
60
|
+
else
|
61
|
+
0
|
58
62
|
end
|
59
63
|
end
|
60
64
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'digest/md5'
|
@@ -14,15 +15,24 @@ module PDF
|
|
14
15
|
# This behaves and looks much like a limited PDF::Reader::Page class.
|
15
16
|
#
|
16
17
|
class FormXObject
|
17
|
-
|
18
|
+
extend Forwardable
|
18
19
|
|
19
20
|
attr_reader :xobject
|
20
21
|
|
22
|
+
def_delegators :resources, :color_spaces
|
23
|
+
def_delegators :resources, :fonts
|
24
|
+
def_delegators :resources, :graphic_states
|
25
|
+
def_delegators :resources, :patterns
|
26
|
+
def_delegators :resources, :procedure_sets
|
27
|
+
def_delegators :resources, :properties
|
28
|
+
def_delegators :resources, :shadings
|
29
|
+
def_delegators :resources, :xobjects
|
30
|
+
|
21
31
|
def initialize(page, xobject, options = {})
|
22
32
|
@page = page
|
23
33
|
@objects = page.objects
|
24
34
|
@cache = options[:cache] || {}
|
25
|
-
@xobject = @objects.
|
35
|
+
@xobject = @objects.deref_stream(xobject)
|
26
36
|
end
|
27
37
|
|
28
38
|
# return a hash of fonts used on this form.
|
@@ -33,9 +43,9 @@ module PDF
|
|
33
43
|
# to most available metrics for each font.
|
34
44
|
#
|
35
45
|
def font_objects
|
36
|
-
raw_fonts = @objects.
|
46
|
+
raw_fonts = @objects.deref_hash(fonts)
|
37
47
|
::Hash[raw_fonts.map { |label, font|
|
38
|
-
[label, PDF::Reader::Font.new(@objects, @objects.
|
48
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
|
39
49
|
}]
|
40
50
|
end
|
41
51
|
|
@@ -45,6 +55,9 @@ module PDF
|
|
45
55
|
# See the comments on PDF::Reader::Page#walk for more detail.
|
46
56
|
#
|
47
57
|
def walk(*receivers)
|
58
|
+
receivers = receivers.map { |receiver|
|
59
|
+
ValidatingReceiver.new(receiver)
|
60
|
+
}
|
48
61
|
content_stream(receivers, raw_content)
|
49
62
|
end
|
50
63
|
|
@@ -60,7 +73,7 @@ module PDF
|
|
60
73
|
# Returns the resources that accompany this form.
|
61
74
|
#
|
62
75
|
def resources
|
63
|
-
@resources ||= @objects.
|
76
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(@xobject.hash[:Resources]) || {})
|
64
77
|
end
|
65
78
|
|
66
79
|
def callback(receivers, name, params=[])
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,10 +33,18 @@ class PDF::Reader
|
|
32
33
|
#
|
33
34
|
class GlyphHash # :nodoc:
|
34
35
|
def initialize
|
36
|
+
@@by_codepoint_cache ||= nil
|
37
|
+
@@by_name_cache ||= nil
|
38
|
+
|
35
39
|
# only parse the glyph list once, and cache the results (for performance)
|
36
|
-
|
37
|
-
|
38
|
-
|
40
|
+
if @@by_codepoint_cache != nil && @@by_name_cache != nil
|
41
|
+
@by_name = @@by_name_cache
|
42
|
+
@by_codepoint = @@by_codepoint_cache
|
43
|
+
else
|
44
|
+
by_name, by_codepoint = load_adobe_glyph_mapping
|
45
|
+
@by_name = @@by_name_cache ||= by_name
|
46
|
+
@by_codepoint = @@by_codepoint_cache ||= by_codepoint
|
47
|
+
end
|
39
48
|
end
|
40
49
|
|
41
50
|
# attempt to convert a PDF Name to a unicode codepoint. Returns nil
|
@@ -103,24 +112,30 @@ class PDF::Reader
|
|
103
112
|
|
104
113
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
105
114
|
# a text file supplied by Adobe at:
|
106
|
-
#
|
115
|
+
# https://github.com/adobe-type-tools/agl-aglfn
|
107
116
|
def load_adobe_glyph_mapping
|
108
117
|
keyed_by_name = {}
|
109
118
|
keyed_by_codepoint = {}
|
110
119
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
120
|
+
paths = [
|
121
|
+
File.dirname(__FILE__) + "/glyphlist.txt",
|
122
|
+
File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
|
123
|
+
]
|
124
|
+
paths.each do |path|
|
125
|
+
File.open(path, "r:BINARY") do |f|
|
126
|
+
f.each do |l|
|
127
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
128
|
+
if name && code
|
129
|
+
cp = "0x#{code}".hex
|
130
|
+
keyed_by_name[name.to_sym] = cp
|
131
|
+
keyed_by_codepoint[cp] ||= []
|
132
|
+
keyed_by_codepoint[cp] << name.to_sym
|
133
|
+
end
|
119
134
|
end
|
120
135
|
end
|
121
136
|
end
|
122
137
|
|
123
|
-
|
138
|
+
return keyed_by_name.freeze, keyed_by_codepoint.freeze
|
124
139
|
end
|
125
140
|
|
126
141
|
end
|