pdf-reader 2.5.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +42 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +90 -46
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +65 -50
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +19 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +11 -9
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +2 -1
- data/lib/pdf/reader/font.rb +72 -16
- data/lib/pdf/reader/font_descriptor.rb +19 -17
- data/lib/pdf/reader/form_xobject.rb +15 -5
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +252 -44
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +99 -19
- data/lib/pdf/reader/page_layout.rb +36 -37
- data/lib/pdf/reader/page_state.rb +12 -11
- data/lib/pdf/reader/page_text_receiver.rb +57 -10
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +23 -12
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +2 -1
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +27 -4
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +46 -15
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1978 -0
- metadata +21 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -103,13 +104,18 @@ class PDF::Reader
|
|
103
104
|
buf = new_buffer(offset)
|
104
105
|
tok_one = buf.token
|
105
106
|
|
107
|
+
# we have a traditional xref table
|
106
108
|
return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
|
107
109
|
|
108
110
|
tok_two = buf.token
|
109
111
|
tok_three = buf.token
|
110
112
|
|
113
|
+
# we have an XRef stream
|
111
114
|
if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
|
112
115
|
buf = new_buffer(offset)
|
116
|
+
# Maybe we should be passing the ObjectHash as the second argument to the Parser here,
|
117
|
+
# to handle the case where an XRef Stream has the Length specified via an
|
118
|
+
# indirect object
|
113
119
|
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
114
120
|
return load_xref_stream(stream)
|
115
121
|
end
|
@@ -125,12 +131,19 @@ class PDF::Reader
|
|
125
131
|
|
126
132
|
while !params.include?("trailer") && !params.include?(nil)
|
127
133
|
if params.size == 2
|
134
|
+
unless params[0].to_s.match(/\A\d+\z/)
|
135
|
+
raise MalformedPDFError, "invalid xref table, expected object ID"
|
136
|
+
end
|
137
|
+
|
128
138
|
objid, count = params[0].to_i, params[1].to_i
|
129
139
|
count.times do
|
130
140
|
offset = buf.token.to_i
|
131
141
|
generation = buf.token.to_i
|
132
142
|
state = buf.token
|
133
143
|
|
144
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
145
|
+
# TODO should this fix be logged?
|
146
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
147
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
148
|
objid += 1
|
136
149
|
params.clear
|
@@ -139,14 +152,16 @@ class PDF::Reader
|
|
139
152
|
params << buf.token
|
140
153
|
end
|
141
154
|
|
142
|
-
trailer = Parser.new(buf
|
155
|
+
trailer = Parser.new(buf).parse_token
|
143
156
|
|
144
157
|
unless trailer.kind_of?(Hash)
|
145
158
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
|
146
159
|
end
|
147
160
|
|
148
161
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
162
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
163
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
164
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
165
|
|
151
166
|
trailer
|
152
167
|
end
|
@@ -162,8 +177,16 @@ class PDF::Reader
|
|
162
177
|
[:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
|
163
178
|
}]
|
164
179
|
|
165
|
-
widths
|
166
|
-
|
180
|
+
widths = stream.hash[:W]
|
181
|
+
|
182
|
+
PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
|
183
|
+
|
184
|
+
entry_length = widths.inject(0) { |s, w|
|
185
|
+
unless w.is_a?(Integer)
|
186
|
+
w = 0
|
187
|
+
end
|
188
|
+
s + w
|
189
|
+
}
|
167
190
|
raw_data = StringIO.new(stream.unfiltered_data)
|
168
191
|
if stream.hash[:Index]
|
169
192
|
index = stream.hash[:Index]
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# There's no point rendering zero-width characters
|
7
|
+
class ZeroWidthRunsFilter
|
8
|
+
|
9
|
+
def self.exclude_zero_width_runs(runs)
|
10
|
+
runs.reject { |run| run.width == 0 }
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -111,19 +112,27 @@ module PDF
|
|
111
112
|
#
|
112
113
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
113
114
|
#
|
115
|
+
# Using this method directly is supported, but it's more common to use
|
116
|
+
# `PDF::Reader.open`
|
117
|
+
#
|
114
118
|
def initialize(input, opts = {})
|
115
119
|
@cache = PDF::Reader::ObjectCache.new
|
116
120
|
opts.merge!(:cache => @cache)
|
117
121
|
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
122
|
end
|
119
123
|
|
124
|
+
# Return a Hash with some basic information about the PDF file
|
125
|
+
#
|
120
126
|
def info
|
121
|
-
dict = @objects.
|
127
|
+
dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
|
122
128
|
doc_strings_to_utf8(dict)
|
123
129
|
end
|
124
130
|
|
131
|
+
# Return a Hash with extra metadata provided by the author of the PDF file. Not
|
132
|
+
# always present.
|
133
|
+
#
|
125
134
|
def metadata
|
126
|
-
stream = @objects.
|
135
|
+
stream = @objects.deref_stream(root[:Metadata])
|
127
136
|
if stream.nil?
|
128
137
|
nil
|
129
138
|
else
|
@@ -133,20 +142,24 @@ module PDF
|
|
133
142
|
end
|
134
143
|
end
|
135
144
|
|
145
|
+
# The number of pages in this PDF
|
146
|
+
#
|
136
147
|
def page_count
|
137
|
-
pages = @objects.
|
148
|
+
pages = @objects.deref_hash(root[:Pages])
|
138
149
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
150
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
151
|
end
|
141
|
-
@page_count ||= @objects.
|
152
|
+
@page_count ||= @objects.deref_integer(pages[:Count]) || 0
|
142
153
|
end
|
143
154
|
|
155
|
+
# The PDF version this file uses
|
156
|
+
#
|
144
157
|
def pdf_version
|
145
158
|
@objects.pdf_version
|
146
159
|
end
|
147
160
|
|
148
|
-
# syntactic sugar for opening a PDF file. Accepts the
|
149
|
-
# as new().
|
161
|
+
# syntactic sugar for opening a PDF file and the most common approach. Accepts the
|
162
|
+
# same arguments as new().
|
150
163
|
#
|
151
164
|
# PDF::Reader.open("somefile.pdf") do |reader|
|
152
165
|
# puts reader.pdf_version
|
@@ -177,6 +190,8 @@ module PDF
|
|
177
190
|
# methods available on each page
|
178
191
|
#
|
179
192
|
def pages
|
193
|
+
return [] if page_count <= 0
|
194
|
+
|
180
195
|
(1..self.page_count).map do |num|
|
181
196
|
begin
|
182
197
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
@@ -221,16 +236,24 @@ module PDF
|
|
221
236
|
when Array then
|
222
237
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
238
|
when String then
|
224
|
-
if obj
|
239
|
+
if has_utf16_bom?(obj)
|
225
240
|
utf16_to_utf8(obj)
|
226
241
|
else
|
227
242
|
pdfdoc_to_utf8(obj)
|
228
243
|
end
|
229
244
|
else
|
230
|
-
|
245
|
+
obj
|
231
246
|
end
|
232
247
|
end
|
233
248
|
|
249
|
+
def has_utf16_bom?(str)
|
250
|
+
first_bytes = str[0,2]
|
251
|
+
|
252
|
+
return false if first_bytes.nil?
|
253
|
+
|
254
|
+
first_bytes.unpack("C*") == [254, 255]
|
255
|
+
end
|
256
|
+
|
234
257
|
# TODO find a PDF I can use to spec this behaviour
|
235
258
|
#
|
236
259
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +265,7 @@ module PDF
|
|
242
265
|
# String#encode
|
243
266
|
#
|
244
267
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
268
|
+
str = obj[2, obj.size].to_s
|
246
269
|
str = str.unpack("n*").pack("U*")
|
247
270
|
str.force_encoding("utf-8")
|
248
271
|
str
|
@@ -250,7 +273,7 @@ module PDF
|
|
250
273
|
|
251
274
|
def root
|
252
275
|
@root ||= begin
|
253
|
-
obj = @objects.
|
276
|
+
obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
|
254
277
|
unless obj.kind_of?(::Hash)
|
255
278
|
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
256
279
|
end
|
@@ -262,8 +285,9 @@ module PDF
|
|
262
285
|
end
|
263
286
|
################################################################################
|
264
287
|
|
265
|
-
require 'pdf/reader/
|
288
|
+
require 'pdf/reader/resources'
|
266
289
|
require 'pdf/reader/buffer'
|
290
|
+
require 'pdf/reader/bounding_rectangle_runs_filter'
|
267
291
|
require 'pdf/reader/cid_widths'
|
268
292
|
require 'pdf/reader/cmap'
|
269
293
|
require 'pdf/reader/encoding'
|
@@ -286,18 +310,25 @@ require 'pdf/reader/object_hash'
|
|
286
310
|
require 'pdf/reader/object_stream'
|
287
311
|
require 'pdf/reader/pages_strategy'
|
288
312
|
require 'pdf/reader/parser'
|
313
|
+
require 'pdf/reader/point'
|
289
314
|
require 'pdf/reader/print_receiver'
|
315
|
+
require 'pdf/reader/rectangle'
|
290
316
|
require 'pdf/reader/reference'
|
291
317
|
require 'pdf/reader/register_receiver'
|
292
318
|
require 'pdf/reader/null_security_handler'
|
293
|
-
require 'pdf/reader/
|
294
|
-
require 'pdf/reader/
|
319
|
+
require 'pdf/reader/security_handler_factory'
|
320
|
+
require 'pdf/reader/standard_key_builder'
|
321
|
+
require 'pdf/reader/key_builder_v5'
|
322
|
+
require 'pdf/reader/aes_v2_security_handler'
|
323
|
+
require 'pdf/reader/aes_v3_security_handler'
|
324
|
+
require 'pdf/reader/rc4_security_handler'
|
295
325
|
require 'pdf/reader/unimplemented_security_handler'
|
296
326
|
require 'pdf/reader/stream'
|
297
327
|
require 'pdf/reader/text_run'
|
328
|
+
require 'pdf/reader/type_check'
|
298
329
|
require 'pdf/reader/page_state'
|
299
330
|
require 'pdf/reader/page_text_receiver'
|
300
331
|
require 'pdf/reader/token'
|
301
332
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
333
|
require 'pdf/reader/page'
|
334
|
+
require 'pdf/reader/validating_receiver'
|