pdf-reader 2.14.1 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +15 -0
- data/lib/pdf/reader/advanced_text_run_filter.rb +17 -2
- data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
- data/lib/pdf/reader/buffer.rb +35 -17
- data/lib/pdf/reader/cid_widths.rb +7 -1
- data/lib/pdf/reader/cmap.rb +14 -3
- data/lib/pdf/reader/encoding.rb +37 -12
- data/lib/pdf/reader/error.rb +6 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +4 -0
- data/lib/pdf/reader/filter/flate.rb +5 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +90 -22
- data/lib/pdf/reader/font_descriptor.rb +76 -23
- data/lib/pdf/reader/form_xobject.rb +11 -0
- data/lib/pdf/reader/glyph_hash.rb +34 -9
- data/lib/pdf/reader/key_builder_v5.rb +17 -9
- data/lib/pdf/reader/lzw.rb +17 -6
- data/lib/pdf/reader/no_text_filter.rb +1 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +7 -2
- data/lib/pdf/reader/object_hash.rb +116 -9
- data/lib/pdf/reader/object_stream.rb +19 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +7 -1
- data/lib/pdf/reader/page.rb +41 -7
- data/lib/pdf/reader/page_layout.rb +25 -8
- data/lib/pdf/reader/page_state.rb +5 -2
- data/lib/pdf/reader/page_text_receiver.rb +6 -2
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +51 -10
- data/lib/pdf/reader/point.rb +9 -2
- data/lib/pdf/reader/print_receiver.rb +2 -6
- data/lib/pdf/reader/rc4_security_handler.rb +2 -0
- data/lib/pdf/reader/rectangle.rb +24 -1
- data/lib/pdf/reader/reference.rb +10 -1
- data/lib/pdf/reader/register_receiver.rb +15 -2
- data/lib/pdf/reader/resources.rb +9 -0
- data/lib/pdf/reader/security_handler_factory.rb +13 -0
- data/lib/pdf/reader/standard_key_builder.rb +37 -23
- data/lib/pdf/reader/stream.rb +9 -3
- data/lib/pdf/reader/synchronized_cache.rb +5 -2
- data/lib/pdf/reader/text_run.rb +28 -1
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +33 -2
- data/lib/pdf/reader/type_check.rb +10 -3
- data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
- data/lib/pdf/reader/validating_receiver.rb +29 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +10 -3
- data/lib/pdf/reader/width_calculator/composite.rb +5 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +5 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +3 -1
- data/lib/pdf/reader/width_calculator/type_zero.rb +2 -0
- data/lib/pdf/reader/xref.rb +28 -7
- data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
- data/lib/pdf/reader.rb +18 -2
- data/rbi/pdf-reader.rbi +1502 -1594
- metadata +17 -11
@@ -7,10 +7,12 @@ class PDF::Reader
|
|
7
7
|
# Security handler for when we don't support the flavour of encryption
|
8
8
|
# used in a PDF.
|
9
9
|
class UnimplementedSecurityHandler
|
10
|
+
#: (Hash[Symbol, untyped]) -> bool
|
10
11
|
def self.supports?(encrypt)
|
11
12
|
true
|
12
13
|
end
|
13
14
|
|
15
|
+
#: (String, PDF::Reader::Reference) -> String
|
14
16
|
def decrypt(buf, ref)
|
15
17
|
raise PDF::Reader::EncryptedPDFError, "Unsupported encryption style"
|
16
18
|
end
|
@@ -16,10 +16,12 @@ module PDF
|
|
16
16
|
# Not all operators have type safety implemented yet, but we can expand the number over time.
|
17
17
|
class ValidatingReceiver
|
18
18
|
|
19
|
+
#: (untyped) -> void
|
19
20
|
def initialize(wrapped)
|
20
21
|
@wrapped = wrapped
|
21
22
|
end
|
22
23
|
|
24
|
+
#: (PDF::Reader::Page) -> void
|
23
25
|
def page=(page)
|
24
26
|
call_wrapped(:page=, page)
|
25
27
|
end
|
@@ -27,10 +29,12 @@ module PDF
|
|
27
29
|
#####################################################
|
28
30
|
# Graphics State Operators
|
29
31
|
#####################################################
|
32
|
+
#: (*untyped) -> void
|
30
33
|
def save_graphics_state(*args)
|
31
34
|
call_wrapped(:save_graphics_state)
|
32
35
|
end
|
33
36
|
|
37
|
+
#: (*untyped) -> void
|
34
38
|
def restore_graphics_state(*args)
|
35
39
|
call_wrapped(:restore_graphics_state)
|
36
40
|
end
|
@@ -39,6 +43,7 @@ module PDF
|
|
39
43
|
# Matrix Operators
|
40
44
|
#####################################################
|
41
45
|
|
46
|
+
#: (*untyped) -> void
|
42
47
|
def concatenate_matrix(*args)
|
43
48
|
a, b, c, d, e, f = *args
|
44
49
|
call_wrapped(
|
@@ -56,10 +61,12 @@ module PDF
|
|
56
61
|
# Text Object Operators
|
57
62
|
#####################################################
|
58
63
|
|
64
|
+
#: (*untyped) -> void
|
59
65
|
def begin_text_object(*args)
|
60
66
|
call_wrapped(:begin_text_object)
|
61
67
|
end
|
62
68
|
|
69
|
+
#: (*untyped) -> void
|
63
70
|
def end_text_object(*args)
|
64
71
|
call_wrapped(:end_text_object)
|
65
72
|
end
|
@@ -67,6 +74,7 @@ module PDF
|
|
67
74
|
#####################################################
|
68
75
|
# Text State Operators
|
69
76
|
#####################################################
|
77
|
+
#: (*untyped) -> void
|
70
78
|
def set_character_spacing(*args)
|
71
79
|
char_spacing, _ = *args
|
72
80
|
call_wrapped(
|
@@ -75,6 +83,7 @@ module PDF
|
|
75
83
|
)
|
76
84
|
end
|
77
85
|
|
86
|
+
#: (*untyped) -> void
|
78
87
|
def set_horizontal_text_scaling(*args)
|
79
88
|
h_scaling, _ = *args
|
80
89
|
call_wrapped(
|
@@ -83,6 +92,7 @@ module PDF
|
|
83
92
|
)
|
84
93
|
end
|
85
94
|
|
95
|
+
#: (*untyped) -> void
|
86
96
|
def set_text_font_and_size(*args)
|
87
97
|
label, size, _ = *args
|
88
98
|
call_wrapped(
|
@@ -92,6 +102,7 @@ module PDF
|
|
92
102
|
)
|
93
103
|
end
|
94
104
|
|
105
|
+
#: (*untyped) -> void
|
95
106
|
def set_text_leading(*args)
|
96
107
|
leading, _ = *args
|
97
108
|
call_wrapped(
|
@@ -100,6 +111,7 @@ module PDF
|
|
100
111
|
)
|
101
112
|
end
|
102
113
|
|
114
|
+
#: (*untyped) -> void
|
103
115
|
def set_text_rendering_mode(*args)
|
104
116
|
mode, _ = *args
|
105
117
|
call_wrapped(
|
@@ -108,6 +120,7 @@ module PDF
|
|
108
120
|
)
|
109
121
|
end
|
110
122
|
|
123
|
+
#: (*untyped) -> void
|
111
124
|
def set_text_rise(*args)
|
112
125
|
rise, _ = *args
|
113
126
|
call_wrapped(
|
@@ -116,6 +129,7 @@ module PDF
|
|
116
129
|
)
|
117
130
|
end
|
118
131
|
|
132
|
+
#: (*untyped) -> void
|
119
133
|
def set_word_spacing(*args)
|
120
134
|
word_spacing, _ = *args
|
121
135
|
call_wrapped(
|
@@ -128,6 +142,7 @@ module PDF
|
|
128
142
|
# Text Positioning Operators
|
129
143
|
#####################################################
|
130
144
|
|
145
|
+
#: (*untyped) -> void
|
131
146
|
def move_text_position(*args) # Td
|
132
147
|
x, y, _ = *args
|
133
148
|
call_wrapped(
|
@@ -137,6 +152,7 @@ module PDF
|
|
137
152
|
)
|
138
153
|
end
|
139
154
|
|
155
|
+
#: (*untyped) -> void
|
140
156
|
def move_text_position_and_set_leading(*args) # TD
|
141
157
|
x, y, _ = *args
|
142
158
|
call_wrapped(
|
@@ -146,6 +162,7 @@ module PDF
|
|
146
162
|
)
|
147
163
|
end
|
148
164
|
|
165
|
+
#: (*untyped) -> void
|
149
166
|
def set_text_matrix_and_text_line_matrix(*args) # Tm
|
150
167
|
a, b, c, d, e, f = *args
|
151
168
|
call_wrapped(
|
@@ -159,6 +176,7 @@ module PDF
|
|
159
176
|
)
|
160
177
|
end
|
161
178
|
|
179
|
+
#: (*untyped) -> void
|
162
180
|
def move_to_start_of_next_line(*args) # T*
|
163
181
|
call_wrapped(:move_to_start_of_next_line)
|
164
182
|
end
|
@@ -166,6 +184,7 @@ module PDF
|
|
166
184
|
#####################################################
|
167
185
|
# Text Showing Operators
|
168
186
|
#####################################################
|
187
|
+
#: (*untyped) -> void
|
169
188
|
def show_text(*args) # Tj (AWAY)
|
170
189
|
string, _ = *args
|
171
190
|
call_wrapped(
|
@@ -174,6 +193,7 @@ module PDF
|
|
174
193
|
)
|
175
194
|
end
|
176
195
|
|
196
|
+
#: (*untyped) -> void
|
177
197
|
def show_text_with_positioning(*args) # TJ [(A) 120 (WA) 20 (Y)]
|
178
198
|
params, _ = *args
|
179
199
|
unless params.is_a?(Array)
|
@@ -186,6 +206,7 @@ module PDF
|
|
186
206
|
)
|
187
207
|
end
|
188
208
|
|
209
|
+
#: (*untyped) -> void
|
189
210
|
def move_to_next_line_and_show_text(*args) # '
|
190
211
|
string, _ = *args
|
191
212
|
call_wrapped(
|
@@ -194,6 +215,7 @@ module PDF
|
|
194
215
|
)
|
195
216
|
end
|
196
217
|
|
218
|
+
#: (*untyped) -> void
|
197
219
|
def set_spacing_next_line_show_text(*args) # "
|
198
220
|
aw, ac, string = *args
|
199
221
|
call_wrapped(
|
@@ -208,6 +230,7 @@ module PDF
|
|
208
230
|
# Form XObject Operators
|
209
231
|
#####################################################
|
210
232
|
|
233
|
+
#: (*untyped) -> void
|
211
234
|
def invoke_xobject(*args)
|
212
235
|
label, _ = *args
|
213
236
|
|
@@ -221,16 +244,19 @@ module PDF
|
|
221
244
|
# Inline Image Operators
|
222
245
|
#####################################################
|
223
246
|
|
247
|
+
#: (*untyped) -> void
|
224
248
|
def begin_inline_image(*args)
|
225
249
|
call_wrapped(:begin_inline_image)
|
226
250
|
end
|
227
251
|
|
252
|
+
#: (*untyped) -> void
|
228
253
|
def begin_inline_image_data(*args)
|
229
254
|
# We can't use call_wrapped() here because sorbet won't allow splat args with a dynamic
|
230
255
|
# number of elements
|
231
256
|
@wrapped.begin_inline_image_data(*args) if @wrapped.respond_to?(:begin_inline_image_data)
|
232
257
|
end
|
233
258
|
|
259
|
+
#: (*untyped) -> void
|
234
260
|
def end_inline_image(*args)
|
235
261
|
data, _ = *args
|
236
262
|
|
@@ -244,16 +270,19 @@ module PDF
|
|
244
270
|
# Final safety net for any operators that don't have type checking enabled yet
|
245
271
|
#####################################################
|
246
272
|
|
273
|
+
#: (untyped) -> bool
|
247
274
|
def respond_to?(meth)
|
248
275
|
@wrapped.respond_to?(meth)
|
249
276
|
end
|
250
277
|
|
278
|
+
#: (Symbol, *untyped) -> void
|
251
279
|
def method_missing(methodname, *args)
|
252
280
|
@wrapped.send(methodname, *args)
|
253
281
|
end
|
254
282
|
|
255
283
|
private
|
256
284
|
|
285
|
+
#: (untyped, *untyped) -> void
|
257
286
|
def call_wrapped(methodname, *args)
|
258
287
|
@wrapped.send(methodname, *args) if @wrapped.respond_to?(methodname)
|
259
288
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
require 'afm'
|
@@ -12,6 +12,7 @@ class PDF::Reader
|
|
12
12
|
# the reader is expected to have its own copy of the font metrics.
|
13
13
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
14
14
|
class BuiltIn
|
15
|
+
@@all_metrics = nil #: PDF::Reader::SynchronizedCache | nil
|
15
16
|
|
16
17
|
BUILTINS = [
|
17
18
|
:Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
|
@@ -19,11 +20,13 @@ class PDF::Reader
|
|
19
20
|
:Symbol,
|
20
21
|
:"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
|
21
22
|
:ZapfDingbats
|
22
|
-
]
|
23
|
+
] #: Array[Symbol]
|
23
24
|
|
25
|
+
#: (PDF::Reader::Font) -> void
|
24
26
|
def initialize(font)
|
25
27
|
@font = font
|
26
28
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
29
|
+
@metrics = nil #: AFM::Font?
|
27
30
|
|
28
31
|
basefont = extract_basefont(font.basefont)
|
29
32
|
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
@@ -35,8 +38,10 @@ class PDF::Reader
|
|
35
38
|
end
|
36
39
|
end
|
37
40
|
|
41
|
+
#: (Integer?) -> Numeric
|
38
42
|
def glyph_width(code_point)
|
39
|
-
return 0 if code_point.nil? || code_point < 0
|
43
|
+
return 0 if code_point.nil? || code_point < 0 || @metrics.nil?
|
44
|
+
|
40
45
|
|
41
46
|
names = @font.encoding.int_to_name(code_point)
|
42
47
|
metrics = names.map { |name|
|
@@ -52,11 +57,13 @@ class PDF::Reader
|
|
52
57
|
|
53
58
|
private
|
54
59
|
|
60
|
+
#: (Integer) -> bool
|
55
61
|
def control_character?(code_point)
|
56
62
|
match = @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
57
63
|
match ? true : false
|
58
64
|
end
|
59
65
|
|
66
|
+
#: (Symbol?) -> String
|
60
67
|
def extract_basefont(font_name)
|
61
68
|
if BUILTINS.include?(font_name)
|
62
69
|
font_name.to_s
|
@@ -12,11 +12,15 @@ class PDF::Reader
|
|
12
12
|
# see Section 9.7.4.1, PDF 32000-1:2008, pp 269-270
|
13
13
|
class Composite
|
14
14
|
|
15
|
+
#: (PDF::Reader::Font) -> void
|
15
16
|
def initialize(font)
|
16
17
|
@font = font
|
17
|
-
@widths = PDF::Reader::CidWidths.new(
|
18
|
+
@widths = PDF::Reader::CidWidths.new(
|
19
|
+
@font.cid_default_width, @font.cid_widths
|
20
|
+
) #: PDF::Reader::CidWidths
|
18
21
|
end
|
19
22
|
|
23
|
+
#: (Integer?) -> Numeric
|
20
24
|
def glyph_width(code_point)
|
21
25
|
return 0 if code_point.nil? || code_point < 0
|
22
26
|
|
@@ -7,16 +7,18 @@ class PDF::Reader
|
|
7
7
|
# Calculates the width of a glyph in a TrueType font
|
8
8
|
class TrueType
|
9
9
|
|
10
|
+
#: (PDF::Reader::Font) -> void
|
10
11
|
def initialize(font)
|
11
12
|
@font = font
|
12
13
|
|
13
14
|
if fd = @font.font_descriptor
|
14
|
-
@missing_width = fd.missing_width
|
15
|
+
@missing_width = fd.missing_width #: Numeric
|
15
16
|
else
|
16
17
|
@missing_width = 0
|
17
18
|
end
|
18
19
|
end
|
19
20
|
|
21
|
+
#: (Integer?) -> Numeric
|
20
22
|
def glyph_width(code_point)
|
21
23
|
return 0 if code_point.nil? || code_point < 0
|
22
24
|
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
|
@@ -25,6 +27,7 @@ class PDF::Reader
|
|
25
27
|
private
|
26
28
|
|
27
29
|
#TODO convert Type3 units 1000 units => 1 text space unit
|
30
|
+
#: (Integer) -> Numeric?
|
28
31
|
def glyph_width_from_font(code_point)
|
29
32
|
return if @font.widths.nil? || @font.widths.count == 0
|
30
33
|
|
@@ -38,6 +41,7 @@ class PDF::Reader
|
|
38
41
|
end
|
39
42
|
end
|
40
43
|
|
44
|
+
#: (Integer) -> Numeric?
|
41
45
|
def glyph_width_from_descriptor(code_point)
|
42
46
|
# true type fonts will have most of their information contained
|
43
47
|
# with-in a program inside the font descriptor, however the widths
|
@@ -7,16 +7,18 @@ class PDF::Reader
|
|
7
7
|
# Calculates the width of a glyph in a Type One or Type Three
|
8
8
|
class TypeOneOrThree
|
9
9
|
|
10
|
+
#: (PDF::Reader::Font) -> void
|
10
11
|
def initialize(font)
|
11
12
|
@font = font
|
12
13
|
|
13
14
|
if fd = @font.font_descriptor
|
14
|
-
@missing_width = fd.missing_width
|
15
|
+
@missing_width = fd.missing_width #: Numeric
|
15
16
|
else
|
16
17
|
@missing_width = 0
|
17
18
|
end
|
18
19
|
end
|
19
20
|
|
21
|
+
#: (Integer?) -> Numeric
|
20
22
|
def glyph_width(code_point)
|
21
23
|
return 0 if code_point.nil? || code_point < 0
|
22
24
|
return 0 if @font.widths.nil? || @font.widths.count == 0
|
@@ -11,10 +11,12 @@ class PDF::Reader
|
|
11
11
|
# the descendant font
|
12
12
|
class TypeZero
|
13
13
|
|
14
|
+
#: (PDF::Reader::Font) -> void
|
14
15
|
def initialize(font)
|
15
16
|
@font = font
|
16
17
|
end
|
17
18
|
|
19
|
+
#: (Integer?) -> Numeric
|
18
20
|
def glyph_width(code_point)
|
19
21
|
return 0 if code_point.nil? || code_point < 0
|
20
22
|
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -46,8 +46,11 @@ class PDF::Reader
|
|
46
46
|
# the Enumerable mixin. The key difference is no []= method - the hash
|
47
47
|
# is read only.
|
48
48
|
#
|
49
|
+
#: [Elem]
|
49
50
|
class XRef
|
50
51
|
include Enumerable
|
52
|
+
|
53
|
+
#: Hash[Symbol, untyped]
|
51
54
|
attr_reader :trailer
|
52
55
|
|
53
56
|
################################################################################
|
@@ -55,16 +58,19 @@ class PDF::Reader
|
|
55
58
|
#
|
56
59
|
# io - must be an IO object, generally either a file or a StringIO
|
57
60
|
#
|
61
|
+
#: (IO | Tempfile | StringIO) -> void
|
58
62
|
def initialize(io)
|
59
63
|
@io = io
|
60
|
-
@junk_offset = calc_junk_offset(io) || 0
|
61
|
-
@xref = {}
|
62
|
-
@trailer = load_offsets
|
64
|
+
@junk_offset = calc_junk_offset(io) || 0 #: Integer
|
65
|
+
@xref = {} #: Hash[Integer, Hash[Integer, Integer | PDF::Reader::Reference]]
|
66
|
+
@trailer = load_offsets #: Hash[Symbol, untyped]
|
63
67
|
end
|
64
68
|
|
65
69
|
################################################################################
|
66
70
|
# return the number of objects in this file. Objects with multiple generations are
|
67
71
|
# only counted once.
|
72
|
+
#
|
73
|
+
#: () -> untyped
|
68
74
|
def size
|
69
75
|
@xref.size
|
70
76
|
end
|
@@ -72,6 +78,7 @@ class PDF::Reader
|
|
72
78
|
# returns the byte offset for the specified PDF object.
|
73
79
|
#
|
74
80
|
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
81
|
+
#: (untyped) -> untyped
|
75
82
|
def [](ref)
|
76
83
|
@xref.fetch(ref.id, {}).fetch(ref.gen)
|
77
84
|
rescue
|
@@ -79,6 +86,9 @@ class PDF::Reader
|
|
79
86
|
end
|
80
87
|
################################################################################
|
81
88
|
# iterate over each object in the xref table
|
89
|
+
#
|
90
|
+
# @override(allow_incompatible: true)
|
91
|
+
#: () { (PDF::Reader::Reference) -> untyped } -> void
|
82
92
|
def each(&block)
|
83
93
|
ids = @xref.keys.sort
|
84
94
|
ids.each do |id|
|
@@ -97,6 +107,7 @@ class PDF::Reader
|
|
97
107
|
# After seeking to the offset, processing is handed off to either load_xref_table()
|
98
108
|
# or load_xref_stream() based on what we find there.
|
99
109
|
#
|
110
|
+
#: (?Integer?) -> Hash[Symbol, untyped]
|
100
111
|
def load_offsets(offset = nil)
|
101
112
|
offset ||= new_buffer.find_first_xref_offset
|
102
113
|
offset += @junk_offset
|
@@ -117,7 +128,9 @@ class PDF::Reader
|
|
117
128
|
# to handle the case where an XRef Stream has the Length specified via an
|
118
129
|
# indirect object
|
119
130
|
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
120
|
-
|
131
|
+
if stream.is_a?(PDF::Reader::Stream)
|
132
|
+
return load_xref_stream(stream)
|
133
|
+
end
|
121
134
|
end
|
122
135
|
|
123
136
|
raise PDF::Reader::MalformedPDFError,
|
@@ -126,6 +139,8 @@ class PDF::Reader
|
|
126
139
|
################################################################################
|
127
140
|
# Assumes the underlying buffer is positioned at the start of a traditional
|
128
141
|
# Xref table and processes it into memory.
|
142
|
+
#
|
143
|
+
#: (PDF::Reader::Buffer) -> Hash[Symbol, untyped]
|
129
144
|
def load_xref_table(buf)
|
130
145
|
params = []
|
131
146
|
|
@@ -169,8 +184,9 @@ class PDF::Reader
|
|
169
184
|
################################################################################
|
170
185
|
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
171
186
|
#
|
187
|
+
#: (PDF::Reader::Stream) -> Hash[Symbol, untyped]
|
172
188
|
def load_xref_stream(stream)
|
173
|
-
unless stream.
|
189
|
+
unless stream.hash[:Type] == :XRef
|
174
190
|
raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
|
175
191
|
end
|
176
192
|
trailer = Hash[stream.hash.select { |key, value|
|
@@ -216,8 +232,9 @@ class PDF::Reader
|
|
216
232
|
# XRef streams pack info into integers 1-N bytes wide. Depending on the number of
|
217
233
|
# bytes they need to be converted to an int in different ways.
|
218
234
|
#
|
235
|
+
#: (String?) -> Integer
|
219
236
|
def unpack_bytes(bytes)
|
220
|
-
if bytes.
|
237
|
+
res = if bytes.nil? || bytes == ""
|
221
238
|
0
|
222
239
|
elsif bytes.size == 1
|
223
240
|
bytes.unpack("C")[0]
|
@@ -232,6 +249,7 @@ class PDF::Reader
|
|
232
249
|
else
|
233
250
|
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
234
251
|
end
|
252
|
+
TypeCheck.cast_to_int!(res)
|
235
253
|
end
|
236
254
|
################################################################################
|
237
255
|
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
@@ -239,12 +257,14 @@ class PDF::Reader
|
|
239
257
|
# We create multiple buffers so we can be tokenising multiple sections of the file
|
240
258
|
# at the same time without worrying about clearing the buffers contents.
|
241
259
|
#
|
260
|
+
#: (?Integer) -> PDF::Reader::Buffer
|
242
261
|
def new_buffer(offset = 0)
|
243
262
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
244
263
|
end
|
245
264
|
################################################################################
|
246
265
|
# Stores an offset value for a particular PDF object ID and revision number
|
247
266
|
#
|
267
|
+
#: (Integer, Integer, Integer | PDF::Reader::Reference) -> (Integer | PDF::Reader::Reference)
|
248
268
|
def store(id, gen, offset)
|
249
269
|
(@xref[id] ||= {})[gen] ||= offset
|
250
270
|
end
|
@@ -258,6 +278,7 @@ class PDF::Reader
|
|
258
278
|
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
259
279
|
# header appear somewhere within the first 1024 bytes of the file
|
260
280
|
#
|
281
|
+
#: (IO | Tempfile | StringIO) -> Integer?
|
261
282
|
def calc_junk_offset(io)
|
262
283
|
io.rewind
|
263
284
|
offset = io.pos
|
data/lib/pdf/reader.rb
CHANGED
@@ -95,6 +95,7 @@ module PDF
|
|
95
95
|
class Reader
|
96
96
|
|
97
97
|
# lowlevel hash-like access to all objects in the underlying PDF
|
98
|
+
#: PDF::Reader::ObjectHash
|
98
99
|
attr_reader :objects
|
99
100
|
|
100
101
|
# creates a new document reader for the provided PDF.
|
@@ -115,14 +116,18 @@ module PDF
|
|
115
116
|
# Using this method directly is supported, but it's more common to use
|
116
117
|
# `PDF::Reader.open`
|
117
118
|
#
|
119
|
+
#: (String | Tempfile | IO | StringIO, ?Hash[untyped, untyped]) -> void
|
118
120
|
def initialize(input, opts = {})
|
119
|
-
@cache = PDF::Reader::ObjectCache.new
|
121
|
+
@cache = PDF::Reader::ObjectCache.new #: PDF::Reader::ObjectCache
|
120
122
|
opts.merge!(:cache => @cache)
|
121
|
-
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
123
|
+
@objects = PDF::Reader::ObjectHash.new(input, opts) #: PDF::Reader::ObjectHash
|
124
|
+
@page_count = nil #: Integer | nil
|
125
|
+
@root = nil #: Hash[Symbol, untyped] | nil
|
122
126
|
end
|
123
127
|
|
124
128
|
# Return a Hash with some basic information about the PDF file
|
125
129
|
#
|
130
|
+
#: () -> Hash[untyped, untyped]?
|
126
131
|
def info
|
127
132
|
dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
|
128
133
|
doc_strings_to_utf8(dict)
|
@@ -131,6 +136,7 @@ module PDF
|
|
131
136
|
# Return a String with extra XML metadata provided by the author of the PDF file. Not
|
132
137
|
# always present.
|
133
138
|
#
|
139
|
+
#: () -> String?
|
134
140
|
def metadata
|
135
141
|
stream = @objects.deref_stream(root[:Metadata])
|
136
142
|
if stream.nil?
|
@@ -144,6 +150,7 @@ module PDF
|
|
144
150
|
|
145
151
|
# The number of pages in this PDF
|
146
152
|
#
|
153
|
+
#: () -> Integer
|
147
154
|
def page_count
|
148
155
|
pages = @objects.deref_hash(root[:Pages])
|
149
156
|
unless pages.kind_of?(::Hash)
|
@@ -154,6 +161,7 @@ module PDF
|
|
154
161
|
|
155
162
|
# The PDF version this file uses
|
156
163
|
#
|
164
|
+
#: () -> Float
|
157
165
|
def pdf_version
|
158
166
|
@objects.pdf_version
|
159
167
|
end
|
@@ -171,6 +179,7 @@ module PDF
|
|
171
179
|
# puts reader.pdf_version
|
172
180
|
# end
|
173
181
|
#
|
182
|
+
#: (String | Tempfile | IO, ?Hash[untyped, untyped]) { (PDF::Reader) -> void } -> untyped
|
174
183
|
def self.open(input, opts = {}, &block)
|
175
184
|
yield PDF::Reader.new(input, opts)
|
176
185
|
end
|
@@ -189,6 +198,7 @@ module PDF
|
|
189
198
|
# See the docs for PDF::Reader::Page to read more about the
|
190
199
|
# methods available on each page
|
191
200
|
#
|
201
|
+
#: () -> Array[PDF::Reader::Page]
|
192
202
|
def pages
|
193
203
|
return [] if page_count <= 0
|
194
204
|
|
@@ -213,6 +223,7 @@ module PDF
|
|
213
223
|
# See the docs for PDF::Reader::Page to read more about the
|
214
224
|
# methods available on each page
|
215
225
|
#
|
226
|
+
#: (Integer) -> PDF::Reader::Page
|
216
227
|
def page(num)
|
217
228
|
num = num.to_i
|
218
229
|
if num < 1 || num > self.page_count
|
@@ -225,6 +236,7 @@ module PDF
|
|
225
236
|
|
226
237
|
# recursively convert strings from outside a content stream into UTF-8
|
227
238
|
#
|
239
|
+
#: (untyped) -> untyped
|
228
240
|
def doc_strings_to_utf8(obj)
|
229
241
|
case obj
|
230
242
|
when ::Hash then
|
@@ -246,6 +258,7 @@ module PDF
|
|
246
258
|
end
|
247
259
|
end
|
248
260
|
|
261
|
+
#: (String) -> bool
|
249
262
|
def has_utf16_bom?(str)
|
250
263
|
first_bytes = str[0,2]
|
251
264
|
|
@@ -256,6 +269,7 @@ module PDF
|
|
256
269
|
|
257
270
|
# TODO find a PDF I can use to spec this behaviour
|
258
271
|
#
|
272
|
+
#: (String) -> String
|
259
273
|
def pdfdoc_to_utf8(obj)
|
260
274
|
obj.force_encoding("utf-8")
|
261
275
|
obj
|
@@ -264,6 +278,7 @@ module PDF
|
|
264
278
|
# one day we'll all run on a 1.9 compatible VM and I can just do this with
|
265
279
|
# String#encode
|
266
280
|
#
|
281
|
+
#: (String) -> String
|
267
282
|
def utf16_to_utf8(obj)
|
268
283
|
str = obj[2, obj.size].to_s
|
269
284
|
str = str.unpack("n*").pack("U*")
|
@@ -271,6 +286,7 @@ module PDF
|
|
271
286
|
str
|
272
287
|
end
|
273
288
|
|
289
|
+
#: () -> Hash[Symbol, untyped]
|
274
290
|
def root
|
275
291
|
@root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
|
276
292
|
end
|