pdf-reader 2.9.2 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +39 -0
- data/README.md +33 -33
- data/Rakefile +2 -2
- data/lib/pdf/reader/advanced_text_run_filter.rb +152 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
- data/lib/pdf/reader/buffer.rb +39 -22
- data/lib/pdf/reader/cid_widths.rb +14 -6
- data/lib/pdf/reader/cmap.rb +16 -5
- data/lib/pdf/reader/encoding.rb +42 -18
- data/lib/pdf/reader/error.rb +6 -4
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +6 -2
- data/lib/pdf/reader/filter/flate.rb +5 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +99 -32
- data/lib/pdf/reader/font_descriptor.rb +79 -24
- data/lib/pdf/reader/form_xobject.rb +15 -1
- data/lib/pdf/reader/glyph_hash.rb +41 -8
- data/lib/pdf/reader/key_builder_v5.rb +17 -9
- data/lib/pdf/reader/lzw.rb +42 -16
- data/lib/pdf/reader/no_text_filter.rb +15 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +7 -2
- data/lib/pdf/reader/object_hash.rb +129 -16
- data/lib/pdf/reader/object_stream.rb +22 -5
- data/lib/pdf/reader/overlapping_runs_filter.rb +8 -2
- data/lib/pdf/reader/page.rb +66 -13
- data/lib/pdf/reader/page_layout.rb +26 -9
- data/lib/pdf/reader/page_state.rb +12 -3
- data/lib/pdf/reader/page_text_receiver.rb +16 -2
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +52 -13
- data/lib/pdf/reader/point.rb +9 -2
- data/lib/pdf/reader/print_receiver.rb +2 -6
- data/lib/pdf/reader/rc4_security_handler.rb +2 -0
- data/lib/pdf/reader/rectangle.rb +24 -1
- data/lib/pdf/reader/reference.rb +13 -3
- data/lib/pdf/reader/register_receiver.rb +15 -2
- data/lib/pdf/reader/resources.rb +12 -2
- data/lib/pdf/reader/security_handler_factory.rb +13 -0
- data/lib/pdf/reader/standard_key_builder.rb +37 -23
- data/lib/pdf/reader/stream.rb +9 -3
- data/lib/pdf/reader/synchronized_cache.rb +6 -3
- data/lib/pdf/reader/text_run.rb +33 -3
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +41 -10
- data/lib/pdf/reader/type_check.rb +53 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
- data/lib/pdf/reader/validating_receiver.rb +29 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +13 -5
- data/lib/pdf/reader/width_calculator/composite.rb +11 -3
- data/lib/pdf/reader/width_calculator/true_type.rb +14 -12
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +8 -5
- data/lib/pdf/reader/width_calculator/type_zero.rb +8 -3
- data/lib/pdf/reader/xref.rb +31 -10
- data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
- data/lib/pdf/reader.rb +24 -12
- data/rbi/pdf-reader.rbi +1504 -1480
- metadata +34 -17
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -46,8 +46,11 @@ class PDF::Reader
|
|
46
46
|
# the Enumerable mixin. The key difference is no []= method - the hash
|
47
47
|
# is read only.
|
48
48
|
#
|
49
|
+
#: [Elem]
|
49
50
|
class XRef
|
50
51
|
include Enumerable
|
52
|
+
|
53
|
+
#: Hash[Symbol, untyped]
|
51
54
|
attr_reader :trailer
|
52
55
|
|
53
56
|
################################################################################
|
@@ -55,16 +58,19 @@ class PDF::Reader
|
|
55
58
|
#
|
56
59
|
# io - must be an IO object, generally either a file or a StringIO
|
57
60
|
#
|
61
|
+
#: (IO | Tempfile | StringIO) -> void
|
58
62
|
def initialize(io)
|
59
63
|
@io = io
|
60
|
-
@junk_offset = calc_junk_offset(io) || 0
|
61
|
-
@xref = {}
|
62
|
-
@trailer = load_offsets
|
64
|
+
@junk_offset = calc_junk_offset(io) || 0 #: Integer
|
65
|
+
@xref = {} #: Hash[Integer, Hash[Integer, Integer | PDF::Reader::Reference]]
|
66
|
+
@trailer = load_offsets #: Hash[Symbol, untyped]
|
63
67
|
end
|
64
68
|
|
65
69
|
################################################################################
|
66
70
|
# return the number of objects in this file. Objects with multiple generations are
|
67
71
|
# only counted once.
|
72
|
+
#
|
73
|
+
#: () -> untyped
|
68
74
|
def size
|
69
75
|
@xref.size
|
70
76
|
end
|
@@ -72,18 +78,22 @@ class PDF::Reader
|
|
72
78
|
# returns the byte offset for the specified PDF object.
|
73
79
|
#
|
74
80
|
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
81
|
+
#: (untyped) -> untyped
|
75
82
|
def [](ref)
|
76
|
-
@xref
|
83
|
+
@xref.fetch(ref.id, {}).fetch(ref.gen)
|
77
84
|
rescue
|
78
85
|
raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
|
79
86
|
end
|
80
87
|
################################################################################
|
81
88
|
# iterate over each object in the xref table
|
89
|
+
#
|
90
|
+
# @override(allow_incompatible: true)
|
91
|
+
#: () { (PDF::Reader::Reference) -> untyped } -> void
|
82
92
|
def each(&block)
|
83
93
|
ids = @xref.keys.sort
|
84
94
|
ids.each do |id|
|
85
|
-
gen = @xref
|
86
|
-
yield PDF::Reader::Reference.new(id, gen)
|
95
|
+
gen = @xref.fetch(id, {}).keys.sort[-1]
|
96
|
+
yield PDF::Reader::Reference.new(id, gen.to_i)
|
87
97
|
end
|
88
98
|
end
|
89
99
|
################################################################################
|
@@ -97,6 +107,7 @@ class PDF::Reader
|
|
97
107
|
# After seeking to the offset, processing is handed off to either load_xref_table()
|
98
108
|
# or load_xref_stream() based on what we find there.
|
99
109
|
#
|
110
|
+
#: (?Integer?) -> Hash[Symbol, untyped]
|
100
111
|
def load_offsets(offset = nil)
|
101
112
|
offset ||= new_buffer.find_first_xref_offset
|
102
113
|
offset += @junk_offset
|
@@ -117,7 +128,9 @@ class PDF::Reader
|
|
117
128
|
# to handle the case where an XRef Stream has the Length specified via an
|
118
129
|
# indirect object
|
119
130
|
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
120
|
-
|
131
|
+
if stream.is_a?(PDF::Reader::Stream)
|
132
|
+
return load_xref_stream(stream)
|
133
|
+
end
|
121
134
|
end
|
122
135
|
|
123
136
|
raise PDF::Reader::MalformedPDFError,
|
@@ -126,6 +139,8 @@ class PDF::Reader
|
|
126
139
|
################################################################################
|
127
140
|
# Assumes the underlying buffer is positioned at the start of a traditional
|
128
141
|
# Xref table and processes it into memory.
|
142
|
+
#
|
143
|
+
#: (PDF::Reader::Buffer) -> Hash[Symbol, untyped]
|
129
144
|
def load_xref_table(buf)
|
130
145
|
params = []
|
131
146
|
|
@@ -169,8 +184,9 @@ class PDF::Reader
|
|
169
184
|
################################################################################
|
170
185
|
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
171
186
|
#
|
187
|
+
#: (PDF::Reader::Stream) -> Hash[Symbol, untyped]
|
172
188
|
def load_xref_stream(stream)
|
173
|
-
unless stream.
|
189
|
+
unless stream.hash[:Type] == :XRef
|
174
190
|
raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
|
175
191
|
end
|
176
192
|
trailer = Hash[stream.hash.select { |key, value|
|
@@ -216,8 +232,9 @@ class PDF::Reader
|
|
216
232
|
# XRef streams pack info into integers 1-N bytes wide. Depending on the number of
|
217
233
|
# bytes they need to be converted to an int in different ways.
|
218
234
|
#
|
235
|
+
#: (String?) -> Integer
|
219
236
|
def unpack_bytes(bytes)
|
220
|
-
if bytes.
|
237
|
+
res = if bytes.nil? || bytes == ""
|
221
238
|
0
|
222
239
|
elsif bytes.size == 1
|
223
240
|
bytes.unpack("C")[0]
|
@@ -232,6 +249,7 @@ class PDF::Reader
|
|
232
249
|
else
|
233
250
|
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
234
251
|
end
|
252
|
+
TypeCheck.cast_to_int!(res)
|
235
253
|
end
|
236
254
|
################################################################################
|
237
255
|
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
@@ -239,12 +257,14 @@ class PDF::Reader
|
|
239
257
|
# We create multiple buffers so we can be tokenising multiple sections of the file
|
240
258
|
# at the same time without worrying about clearing the buffers contents.
|
241
259
|
#
|
260
|
+
#: (?Integer) -> PDF::Reader::Buffer
|
242
261
|
def new_buffer(offset = 0)
|
243
262
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
244
263
|
end
|
245
264
|
################################################################################
|
246
265
|
# Stores an offset value for a particular PDF object ID and revision number
|
247
266
|
#
|
267
|
+
#: (Integer, Integer, Integer | PDF::Reader::Reference) -> (Integer | PDF::Reader::Reference)
|
248
268
|
def store(id, gen, offset)
|
249
269
|
(@xref[id] ||= {})[gen] ||= offset
|
250
270
|
end
|
@@ -258,6 +278,7 @@ class PDF::Reader
|
|
258
278
|
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
259
279
|
# header appear somewhere within the first 1024 bytes of the file
|
260
280
|
#
|
281
|
+
#: (IO | Tempfile | StringIO) -> Integer?
|
261
282
|
def calc_junk_offset(io)
|
262
283
|
io.rewind
|
263
284
|
offset = io.pos
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -95,6 +95,7 @@ module PDF
|
|
95
95
|
class Reader
|
96
96
|
|
97
97
|
# lowlevel hash-like access to all objects in the underlying PDF
|
98
|
+
#: PDF::Reader::ObjectHash
|
98
99
|
attr_reader :objects
|
99
100
|
|
100
101
|
# creates a new document reader for the provided PDF.
|
@@ -115,22 +116,27 @@ module PDF
|
|
115
116
|
# Using this method directly is supported, but it's more common to use
|
116
117
|
# `PDF::Reader.open`
|
117
118
|
#
|
119
|
+
#: (String | Tempfile | IO | StringIO, ?Hash[untyped, untyped]) -> void
|
118
120
|
def initialize(input, opts = {})
|
119
|
-
@cache = PDF::Reader::ObjectCache.new
|
121
|
+
@cache = PDF::Reader::ObjectCache.new #: PDF::Reader::ObjectCache
|
120
122
|
opts.merge!(:cache => @cache)
|
121
|
-
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
123
|
+
@objects = PDF::Reader::ObjectHash.new(input, opts) #: PDF::Reader::ObjectHash
|
124
|
+
@page_count = nil #: Integer | nil
|
125
|
+
@root = nil #: Hash[Symbol, untyped] | nil
|
122
126
|
end
|
123
127
|
|
124
128
|
# Return a Hash with some basic information about the PDF file
|
125
129
|
#
|
130
|
+
#: () -> Hash[untyped, untyped]?
|
126
131
|
def info
|
127
132
|
dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
|
128
133
|
doc_strings_to_utf8(dict)
|
129
134
|
end
|
130
135
|
|
131
|
-
# Return a
|
136
|
+
# Return a String with extra XML metadata provided by the author of the PDF file. Not
|
132
137
|
# always present.
|
133
138
|
#
|
139
|
+
#: () -> String?
|
134
140
|
def metadata
|
135
141
|
stream = @objects.deref_stream(root[:Metadata])
|
136
142
|
if stream.nil?
|
@@ -144,6 +150,7 @@ module PDF
|
|
144
150
|
|
145
151
|
# The number of pages in this PDF
|
146
152
|
#
|
153
|
+
#: () -> Integer
|
147
154
|
def page_count
|
148
155
|
pages = @objects.deref_hash(root[:Pages])
|
149
156
|
unless pages.kind_of?(::Hash)
|
@@ -154,6 +161,7 @@ module PDF
|
|
154
161
|
|
155
162
|
# The PDF version this file uses
|
156
163
|
#
|
164
|
+
#: () -> Float
|
157
165
|
def pdf_version
|
158
166
|
@objects.pdf_version
|
159
167
|
end
|
@@ -171,6 +179,7 @@ module PDF
|
|
171
179
|
# puts reader.pdf_version
|
172
180
|
# end
|
173
181
|
#
|
182
|
+
#: (String | Tempfile | IO, ?Hash[untyped, untyped]) { (PDF::Reader) -> void } -> untyped
|
174
183
|
def self.open(input, opts = {}, &block)
|
175
184
|
yield PDF::Reader.new(input, opts)
|
176
185
|
end
|
@@ -182,13 +191,14 @@ module PDF
|
|
182
191
|
#
|
183
192
|
# reader.pages.each do |page|
|
184
193
|
# puts page.fonts
|
185
|
-
# puts page.
|
194
|
+
# puts page.rectangles
|
186
195
|
# puts page.text
|
187
196
|
# end
|
188
197
|
#
|
189
198
|
# See the docs for PDF::Reader::Page to read more about the
|
190
199
|
# methods available on each page
|
191
200
|
#
|
201
|
+
#: () -> Array[PDF::Reader::Page]
|
192
202
|
def pages
|
193
203
|
return [] if page_count <= 0
|
194
204
|
|
@@ -213,6 +223,7 @@ module PDF
|
|
213
223
|
# See the docs for PDF::Reader::Page to read more about the
|
214
224
|
# methods available on each page
|
215
225
|
#
|
226
|
+
#: (Integer) -> PDF::Reader::Page
|
216
227
|
def page(num)
|
217
228
|
num = num.to_i
|
218
229
|
if num < 1 || num > self.page_count
|
@@ -225,6 +236,7 @@ module PDF
|
|
225
236
|
|
226
237
|
# recursively convert strings from outside a content stream into UTF-8
|
227
238
|
#
|
239
|
+
#: (untyped) -> untyped
|
228
240
|
def doc_strings_to_utf8(obj)
|
229
241
|
case obj
|
230
242
|
when ::Hash then
|
@@ -246,6 +258,7 @@ module PDF
|
|
246
258
|
end
|
247
259
|
end
|
248
260
|
|
261
|
+
#: (String) -> bool
|
249
262
|
def has_utf16_bom?(str)
|
250
263
|
first_bytes = str[0,2]
|
251
264
|
|
@@ -256,6 +269,7 @@ module PDF
|
|
256
269
|
|
257
270
|
# TODO find a PDF I can use to spec this behaviour
|
258
271
|
#
|
272
|
+
#: (String) -> String
|
259
273
|
def pdfdoc_to_utf8(obj)
|
260
274
|
obj.force_encoding("utf-8")
|
261
275
|
obj
|
@@ -264,6 +278,7 @@ module PDF
|
|
264
278
|
# one day we'll all run on a 1.9 compatible VM and I can just do this with
|
265
279
|
# String#encode
|
266
280
|
#
|
281
|
+
#: (String) -> String
|
267
282
|
def utf16_to_utf8(obj)
|
268
283
|
str = obj[2, obj.size].to_s
|
269
284
|
str = str.unpack("n*").pack("U*")
|
@@ -271,14 +286,9 @@ module PDF
|
|
271
286
|
str
|
272
287
|
end
|
273
288
|
|
289
|
+
#: () -> Hash[Symbol, untyped]
|
274
290
|
def root
|
275
|
-
@root ||=
|
276
|
-
obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
|
277
|
-
unless obj.kind_of?(::Hash)
|
278
|
-
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
279
|
-
end
|
280
|
-
obj
|
281
|
-
end
|
291
|
+
@root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
|
282
292
|
end
|
283
293
|
|
284
294
|
end
|
@@ -286,6 +296,7 @@ end
|
|
286
296
|
################################################################################
|
287
297
|
|
288
298
|
require 'pdf/reader/resources'
|
299
|
+
require 'pdf/reader/advanced_text_run_filter'
|
289
300
|
require 'pdf/reader/buffer'
|
290
301
|
require 'pdf/reader/bounding_rectangle_runs_filter'
|
291
302
|
require 'pdf/reader/cid_widths'
|
@@ -315,6 +326,7 @@ require 'pdf/reader/print_receiver'
|
|
315
326
|
require 'pdf/reader/rectangle'
|
316
327
|
require 'pdf/reader/reference'
|
317
328
|
require 'pdf/reader/register_receiver'
|
329
|
+
require 'pdf/reader/no_text_filter'
|
318
330
|
require 'pdf/reader/null_security_handler'
|
319
331
|
require 'pdf/reader/security_handler_factory'
|
320
332
|
require 'pdf/reader/standard_key_builder'
|