pdf-reader 2.2.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -72,7 +73,7 @@ class PDF::Reader
|
|
72
73
|
#
|
73
74
|
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
74
75
|
def [](ref)
|
75
|
-
@xref
|
76
|
+
@xref.fetch(ref.id, {}).fetch(ref.gen)
|
76
77
|
rescue
|
77
78
|
raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
|
78
79
|
end
|
@@ -81,8 +82,8 @@ class PDF::Reader
|
|
81
82
|
def each(&block)
|
82
83
|
ids = @xref.keys.sort
|
83
84
|
ids.each do |id|
|
84
|
-
gen = @xref
|
85
|
-
yield PDF::Reader::Reference.new(id, gen)
|
85
|
+
gen = @xref.fetch(id, {}).keys.sort[-1]
|
86
|
+
yield PDF::Reader::Reference.new(id, gen.to_i)
|
86
87
|
end
|
87
88
|
end
|
88
89
|
################################################################################
|
@@ -103,13 +104,18 @@ class PDF::Reader
|
|
103
104
|
buf = new_buffer(offset)
|
104
105
|
tok_one = buf.token
|
105
106
|
|
107
|
+
# we have a traditional xref table
|
106
108
|
return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
|
107
109
|
|
108
110
|
tok_two = buf.token
|
109
111
|
tok_three = buf.token
|
110
112
|
|
113
|
+
# we have an XRef stream
|
111
114
|
if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
|
112
115
|
buf = new_buffer(offset)
|
116
|
+
# Maybe we should be parsing the ObjectHash second argument to the Parser here,
|
117
|
+
# to handle the case where an XRef Stream has the Length specified via an
|
118
|
+
# indirect object
|
113
119
|
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
114
120
|
return load_xref_stream(stream)
|
115
121
|
end
|
@@ -125,12 +131,19 @@ class PDF::Reader
|
|
125
131
|
|
126
132
|
while !params.include?("trailer") && !params.include?(nil)
|
127
133
|
if params.size == 2
|
134
|
+
unless params[0].to_s.match(/\A\d+\z/)
|
135
|
+
raise MalformedPDFError, "invalid xref table, expected object ID"
|
136
|
+
end
|
137
|
+
|
128
138
|
objid, count = params[0].to_i, params[1].to_i
|
129
139
|
count.times do
|
130
140
|
offset = buf.token.to_i
|
131
141
|
generation = buf.token.to_i
|
132
142
|
state = buf.token
|
133
143
|
|
144
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
145
|
+
# TODO should this fix be logged?
|
146
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
147
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
148
|
objid += 1
|
136
149
|
params.clear
|
@@ -139,14 +152,16 @@ class PDF::Reader
|
|
139
152
|
params << buf.token
|
140
153
|
end
|
141
154
|
|
142
|
-
trailer = Parser.new(buf
|
155
|
+
trailer = Parser.new(buf).parse_token
|
143
156
|
|
144
157
|
unless trailer.kind_of?(Hash)
|
145
158
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
|
146
159
|
end
|
147
160
|
|
148
161
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
162
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
163
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
164
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
165
|
|
151
166
|
trailer
|
152
167
|
end
|
@@ -162,8 +177,16 @@ class PDF::Reader
|
|
162
177
|
[:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
|
163
178
|
}]
|
164
179
|
|
165
|
-
widths
|
166
|
-
|
180
|
+
widths = stream.hash[:W]
|
181
|
+
|
182
|
+
PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
|
183
|
+
|
184
|
+
entry_length = widths.inject(0) { |s, w|
|
185
|
+
unless w.is_a?(Integer)
|
186
|
+
w = 0
|
187
|
+
end
|
188
|
+
s + w
|
189
|
+
}
|
167
190
|
raw_data = StringIO.new(stream.unfiltered_data)
|
168
191
|
if stream.hash[:Index]
|
169
192
|
index = stream.hash[:Index]
|
@@ -230,18 +253,21 @@ class PDF::Reader
|
|
230
253
|
# should always be 0, but all sort of crazy junk is prefixed to PDF files
|
231
254
|
# in the real world.
|
232
255
|
#
|
233
|
-
# Checks up to
|
256
|
+
# Checks up to 1024 chars into the file,
|
257
|
+
# returns nil if no PDF data detected.
|
258
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
259
|
+
# header appear somewhere within the first 1024 bytes of the file
|
234
260
|
#
|
235
261
|
def calc_junk_offset(io)
|
236
262
|
io.rewind
|
237
263
|
offset = io.pos
|
238
|
-
until (c = io.readchar) == '%' || c == 37 || offset >
|
264
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
239
265
|
offset += 1
|
240
266
|
end
|
241
267
|
io.rewind
|
242
|
-
offset <
|
268
|
+
offset < 1024 ? offset : nil
|
243
269
|
rescue EOFError
|
244
|
-
|
270
|
+
nil
|
245
271
|
end
|
246
272
|
end
|
247
273
|
################################################################################
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# There's no point rendering zero-width characters
|
7
|
+
class ZeroWidthRunsFilter
|
8
|
+
|
9
|
+
def self.exclude_zero_width_runs(runs)
|
10
|
+
runs.reject { |run| run.width == 0 }
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -111,19 +112,27 @@ module PDF
|
|
111
112
|
#
|
112
113
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
113
114
|
#
|
115
|
+
# Using this method directly is supported, but it's more common to use
|
116
|
+
# `PDF::Reader.open`
|
117
|
+
#
|
114
118
|
def initialize(input, opts = {})
|
115
119
|
@cache = PDF::Reader::ObjectCache.new
|
116
120
|
opts.merge!(:cache => @cache)
|
117
121
|
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
122
|
end
|
119
123
|
|
124
|
+
# Return a Hash with some basic information about the PDF file
|
125
|
+
#
|
120
126
|
def info
|
121
|
-
dict = @objects.
|
127
|
+
dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
|
122
128
|
doc_strings_to_utf8(dict)
|
123
129
|
end
|
124
130
|
|
131
|
+
# Return a String with extra XML metadata provided by the author of the PDF file. Not
|
132
|
+
# always present.
|
133
|
+
#
|
125
134
|
def metadata
|
126
|
-
stream = @objects.
|
135
|
+
stream = @objects.deref_stream(root[:Metadata])
|
127
136
|
if stream.nil?
|
128
137
|
nil
|
129
138
|
else
|
@@ -133,20 +142,24 @@ module PDF
|
|
133
142
|
end
|
134
143
|
end
|
135
144
|
|
145
|
+
# To number of pages in this PDF
|
146
|
+
#
|
136
147
|
def page_count
|
137
|
-
pages = @objects.
|
148
|
+
pages = @objects.deref_hash(root[:Pages])
|
138
149
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
150
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
151
|
end
|
141
|
-
@page_count ||= @objects.
|
152
|
+
@page_count ||= @objects.deref_integer(pages[:Count]) || 0
|
142
153
|
end
|
143
154
|
|
155
|
+
# The PDF version this file uses
|
156
|
+
#
|
144
157
|
def pdf_version
|
145
158
|
@objects.pdf_version
|
146
159
|
end
|
147
160
|
|
148
|
-
# syntactic sugar for opening a PDF file. Accepts the
|
149
|
-
# as new().
|
161
|
+
# syntactic sugar for opening a PDF file and the most common approach. Accepts the
|
162
|
+
# same arguments as new().
|
150
163
|
#
|
151
164
|
# PDF::Reader.open("somefile.pdf") do |reader|
|
152
165
|
# puts reader.pdf_version
|
@@ -169,7 +182,7 @@ module PDF
|
|
169
182
|
#
|
170
183
|
# reader.pages.each do |page|
|
171
184
|
# puts page.fonts
|
172
|
-
# puts page.
|
185
|
+
# puts page.rectangles
|
173
186
|
# puts page.text
|
174
187
|
# end
|
175
188
|
#
|
@@ -177,10 +190,12 @@ module PDF
|
|
177
190
|
# methods available on each page
|
178
191
|
#
|
179
192
|
def pages
|
193
|
+
return [] if page_count <= 0
|
194
|
+
|
180
195
|
(1..self.page_count).map do |num|
|
181
196
|
begin
|
182
197
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
183
|
-
rescue InvalidPageError
|
198
|
+
rescue InvalidPageError
|
184
199
|
raise MalformedPDFError, "Missing data for page: #{num}"
|
185
200
|
end
|
186
201
|
end
|
@@ -221,16 +236,24 @@ module PDF
|
|
221
236
|
when Array then
|
222
237
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
238
|
when String then
|
224
|
-
if obj
|
239
|
+
if has_utf16_bom?(obj)
|
225
240
|
utf16_to_utf8(obj)
|
226
241
|
else
|
227
242
|
pdfdoc_to_utf8(obj)
|
228
243
|
end
|
229
244
|
else
|
230
|
-
|
245
|
+
obj
|
231
246
|
end
|
232
247
|
end
|
233
248
|
|
249
|
+
def has_utf16_bom?(str)
|
250
|
+
first_bytes = str[0,2]
|
251
|
+
|
252
|
+
return false if first_bytes.nil?
|
253
|
+
|
254
|
+
first_bytes.unpack("C*") == [254, 255]
|
255
|
+
end
|
256
|
+
|
234
257
|
# TODO find a PDF I can use to spec this behaviour
|
235
258
|
#
|
236
259
|
def pdfdoc_to_utf8(obj)
|
@@ -242,28 +265,23 @@ module PDF
|
|
242
265
|
# String#encode
|
243
266
|
#
|
244
267
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
268
|
+
str = obj[2, obj.size].to_s
|
246
269
|
str = str.unpack("n*").pack("U*")
|
247
270
|
str.force_encoding("utf-8")
|
248
271
|
str
|
249
272
|
end
|
250
273
|
|
251
274
|
def root
|
252
|
-
@root ||=
|
253
|
-
obj = @objects.deref(@objects.trailer[:Root])
|
254
|
-
unless obj.kind_of?(::Hash)
|
255
|
-
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
256
|
-
end
|
257
|
-
obj
|
258
|
-
end
|
275
|
+
@root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
|
259
276
|
end
|
260
277
|
|
261
278
|
end
|
262
279
|
end
|
263
280
|
################################################################################
|
264
281
|
|
265
|
-
require 'pdf/reader/
|
282
|
+
require 'pdf/reader/resources'
|
266
283
|
require 'pdf/reader/buffer'
|
284
|
+
require 'pdf/reader/bounding_rectangle_runs_filter'
|
267
285
|
require 'pdf/reader/cid_widths'
|
268
286
|
require 'pdf/reader/cmap'
|
269
287
|
require 'pdf/reader/encoding'
|
@@ -286,19 +304,26 @@ require 'pdf/reader/object_hash'
|
|
286
304
|
require 'pdf/reader/object_stream'
|
287
305
|
require 'pdf/reader/pages_strategy'
|
288
306
|
require 'pdf/reader/parser'
|
307
|
+
require 'pdf/reader/point'
|
289
308
|
require 'pdf/reader/print_receiver'
|
309
|
+
require 'pdf/reader/rectangle'
|
290
310
|
require 'pdf/reader/reference'
|
291
311
|
require 'pdf/reader/register_receiver'
|
312
|
+
require 'pdf/reader/no_text_filter'
|
292
313
|
require 'pdf/reader/null_security_handler'
|
293
|
-
require 'pdf/reader/
|
294
|
-
require 'pdf/reader/
|
314
|
+
require 'pdf/reader/security_handler_factory'
|
315
|
+
require 'pdf/reader/standard_key_builder'
|
316
|
+
require 'pdf/reader/key_builder_v5'
|
317
|
+
require 'pdf/reader/aes_v2_security_handler'
|
318
|
+
require 'pdf/reader/aes_v3_security_handler'
|
319
|
+
require 'pdf/reader/rc4_security_handler'
|
295
320
|
require 'pdf/reader/unimplemented_security_handler'
|
296
321
|
require 'pdf/reader/stream'
|
297
322
|
require 'pdf/reader/text_run'
|
323
|
+
require 'pdf/reader/type_check'
|
298
324
|
require 'pdf/reader/page_state'
|
299
325
|
require 'pdf/reader/page_text_receiver'
|
300
326
|
require 'pdf/reader/token'
|
301
327
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
328
|
require 'pdf/reader/page'
|
304
|
-
require 'pdf/
|
329
|
+
require 'pdf/reader/validating_receiver'
|