pdf-reader 2.6.0 → 2.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +30 -1
- data/Rakefile +1 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +36 -33
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +65 -50
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +16 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +8 -6
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +72 -16
- data/lib/pdf/reader/font_descriptor.rb +19 -17
- data/lib/pdf/reader/form_xobject.rb +15 -5
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +252 -44
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +99 -19
- data/lib/pdf/reader/page_layout.rb +28 -32
- data/lib/pdf/reader/page_state.rb +12 -11
- data/lib/pdf/reader/page_text_receiver.rb +57 -10
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +26 -8
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +2 -1
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +21 -3
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +46 -15
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1978 -0
- metadata +22 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -103,13 +104,18 @@ class PDF::Reader
|
|
103
104
|
buf = new_buffer(offset)
|
104
105
|
tok_one = buf.token
|
105
106
|
|
107
|
+
# we have a traditional xref table
|
106
108
|
return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
|
107
109
|
|
108
110
|
tok_two = buf.token
|
109
111
|
tok_three = buf.token
|
110
112
|
|
113
|
+
# we have an XRef stream
|
111
114
|
if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
|
112
115
|
buf = new_buffer(offset)
|
116
|
+
# Maybe we should be parsing the ObjectHash second argument to the Parser here,
|
117
|
+
# to handle the case where an XRef Stream has the Length specified via an
|
118
|
+
# indirect object
|
113
119
|
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
114
120
|
return load_xref_stream(stream)
|
115
121
|
end
|
@@ -125,6 +131,10 @@ class PDF::Reader
|
|
125
131
|
|
126
132
|
while !params.include?("trailer") && !params.include?(nil)
|
127
133
|
if params.size == 2
|
134
|
+
unless params[0].to_s.match(/\A\d+\z/)
|
135
|
+
raise MalformedPDFError, "invalid xref table, expected object ID"
|
136
|
+
end
|
137
|
+
|
128
138
|
objid, count = params[0].to_i, params[1].to_i
|
129
139
|
count.times do
|
130
140
|
offset = buf.token.to_i
|
@@ -142,7 +152,7 @@ class PDF::Reader
|
|
142
152
|
params << buf.token
|
143
153
|
end
|
144
154
|
|
145
|
-
trailer = Parser.new(buf
|
155
|
+
trailer = Parser.new(buf).parse_token
|
146
156
|
|
147
157
|
unless trailer.kind_of?(Hash)
|
148
158
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
|
@@ -167,8 +177,16 @@ class PDF::Reader
|
|
167
177
|
[:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
|
168
178
|
}]
|
169
179
|
|
170
|
-
widths
|
171
|
-
|
180
|
+
widths = stream.hash[:W]
|
181
|
+
|
182
|
+
PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
|
183
|
+
|
184
|
+
entry_length = widths.inject(0) { |s, w|
|
185
|
+
unless w.is_a?(Integer)
|
186
|
+
w = 0
|
187
|
+
end
|
188
|
+
s + w
|
189
|
+
}
|
172
190
|
raw_data = StringIO.new(stream.unfiltered_data)
|
173
191
|
if stream.hash[:Index]
|
174
192
|
index = stream.hash[:Index]
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -111,19 +112,27 @@ module PDF
|
|
111
112
|
#
|
112
113
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
113
114
|
#
|
115
|
+
# Using this method directly is supported, but it's more common to use
|
116
|
+
# `PDF::Reader.open`
|
117
|
+
#
|
114
118
|
def initialize(input, opts = {})
|
115
119
|
@cache = PDF::Reader::ObjectCache.new
|
116
120
|
opts.merge!(:cache => @cache)
|
117
121
|
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
122
|
end
|
119
123
|
|
124
|
+
# Return a Hash with some basic information about the PDF file
|
125
|
+
#
|
120
126
|
def info
|
121
|
-
dict = @objects.
|
127
|
+
dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
|
122
128
|
doc_strings_to_utf8(dict)
|
123
129
|
end
|
124
130
|
|
131
|
+
# Return a Hash with extra metadata provided by the author of the PDF file. Not
|
132
|
+
# always present.
|
133
|
+
#
|
125
134
|
def metadata
|
126
|
-
stream = @objects.
|
135
|
+
stream = @objects.deref_stream(root[:Metadata])
|
127
136
|
if stream.nil?
|
128
137
|
nil
|
129
138
|
else
|
@@ -133,20 +142,24 @@ module PDF
|
|
133
142
|
end
|
134
143
|
end
|
135
144
|
|
145
|
+
# To number of pages in this PDF
|
146
|
+
#
|
136
147
|
def page_count
|
137
|
-
pages = @objects.
|
148
|
+
pages = @objects.deref_hash(root[:Pages])
|
138
149
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
150
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
151
|
end
|
141
|
-
@page_count ||= @objects.
|
152
|
+
@page_count ||= @objects.deref_integer(pages[:Count]) || 0
|
142
153
|
end
|
143
154
|
|
155
|
+
# The PDF version this file uses
|
156
|
+
#
|
144
157
|
def pdf_version
|
145
158
|
@objects.pdf_version
|
146
159
|
end
|
147
160
|
|
148
|
-
# syntactic sugar for opening a PDF file. Accepts the
|
149
|
-
# as new().
|
161
|
+
# syntactic sugar for opening a PDF file and the most common approach. Accepts the
|
162
|
+
# same arguments as new().
|
150
163
|
#
|
151
164
|
# PDF::Reader.open("somefile.pdf") do |reader|
|
152
165
|
# puts reader.pdf_version
|
@@ -177,6 +190,8 @@ module PDF
|
|
177
190
|
# methods available on each page
|
178
191
|
#
|
179
192
|
def pages
|
193
|
+
return [] if page_count <= 0
|
194
|
+
|
180
195
|
(1..self.page_count).map do |num|
|
181
196
|
begin
|
182
197
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
@@ -221,16 +236,24 @@ module PDF
|
|
221
236
|
when Array then
|
222
237
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
238
|
when String then
|
224
|
-
if obj
|
239
|
+
if has_utf16_bom?(obj)
|
225
240
|
utf16_to_utf8(obj)
|
226
241
|
else
|
227
242
|
pdfdoc_to_utf8(obj)
|
228
243
|
end
|
229
244
|
else
|
230
|
-
|
245
|
+
obj
|
231
246
|
end
|
232
247
|
end
|
233
248
|
|
249
|
+
def has_utf16_bom?(str)
|
250
|
+
first_bytes = str[0,2]
|
251
|
+
|
252
|
+
return false if first_bytes.nil?
|
253
|
+
|
254
|
+
first_bytes.unpack("C*") == [254, 255]
|
255
|
+
end
|
256
|
+
|
234
257
|
# TODO find a PDF I can use to spec this behaviour
|
235
258
|
#
|
236
259
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +265,7 @@ module PDF
|
|
242
265
|
# String#encode
|
243
266
|
#
|
244
267
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
268
|
+
str = obj[2, obj.size].to_s
|
246
269
|
str = str.unpack("n*").pack("U*")
|
247
270
|
str.force_encoding("utf-8")
|
248
271
|
str
|
@@ -250,7 +273,7 @@ module PDF
|
|
250
273
|
|
251
274
|
def root
|
252
275
|
@root ||= begin
|
253
|
-
obj = @objects.
|
276
|
+
obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
|
254
277
|
unless obj.kind_of?(::Hash)
|
255
278
|
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
256
279
|
end
|
@@ -262,8 +285,9 @@ module PDF
|
|
262
285
|
end
|
263
286
|
################################################################################
|
264
287
|
|
265
|
-
require 'pdf/reader/
|
288
|
+
require 'pdf/reader/resources'
|
266
289
|
require 'pdf/reader/buffer'
|
290
|
+
require 'pdf/reader/bounding_rectangle_runs_filter'
|
267
291
|
require 'pdf/reader/cid_widths'
|
268
292
|
require 'pdf/reader/cmap'
|
269
293
|
require 'pdf/reader/encoding'
|
@@ -286,18 +310,25 @@ require 'pdf/reader/object_hash'
|
|
286
310
|
require 'pdf/reader/object_stream'
|
287
311
|
require 'pdf/reader/pages_strategy'
|
288
312
|
require 'pdf/reader/parser'
|
313
|
+
require 'pdf/reader/point'
|
289
314
|
require 'pdf/reader/print_receiver'
|
315
|
+
require 'pdf/reader/rectangle'
|
290
316
|
require 'pdf/reader/reference'
|
291
317
|
require 'pdf/reader/register_receiver'
|
292
318
|
require 'pdf/reader/null_security_handler'
|
293
|
-
require 'pdf/reader/
|
294
|
-
require 'pdf/reader/
|
319
|
+
require 'pdf/reader/security_handler_factory'
|
320
|
+
require 'pdf/reader/standard_key_builder'
|
321
|
+
require 'pdf/reader/key_builder_v5'
|
322
|
+
require 'pdf/reader/aes_v2_security_handler'
|
323
|
+
require 'pdf/reader/aes_v3_security_handler'
|
324
|
+
require 'pdf/reader/rc4_security_handler'
|
295
325
|
require 'pdf/reader/unimplemented_security_handler'
|
296
326
|
require 'pdf/reader/stream'
|
297
327
|
require 'pdf/reader/text_run'
|
328
|
+
require 'pdf/reader/type_check'
|
298
329
|
require 'pdf/reader/page_state'
|
299
330
|
require 'pdf/reader/page_text_receiver'
|
300
331
|
require 'pdf/reader/token'
|
301
332
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
333
|
require 'pdf/reader/page'
|
334
|
+
require 'pdf/reader/validating_receiver'
|