pdf-reader 1.1.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +87 -2
- data/{README.rdoc → README.md} +43 -31
- data/Rakefile +21 -16
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -3
- data/examples/callbacks.rb +2 -1
- data/examples/extract_images.rb +11 -6
- data/examples/fuzzy_paragraphs.rb +24 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +90 -63
- data/lib/pdf/reader/cid_widths.rb +63 -0
- data/lib/pdf/reader/cmap.rb +69 -38
- data/lib/pdf/reader/encoding.rb +74 -48
- data/lib/pdf/reader/error.rb +24 -4
- data/lib/pdf/reader/filter/ascii85.rb +28 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
- data/lib/pdf/reader/filter/depredict.rb +141 -0
- data/lib/pdf/reader/filter/flate.rb +53 -0
- data/lib/pdf/reader/filter/lzw.rb +21 -0
- data/lib/pdf/reader/filter/null.rb +18 -0
- data/lib/pdf/reader/filter/run_length.rb +45 -0
- data/lib/pdf/reader/filter.rb +15 -234
- data/lib/pdf/reader/font.rb +107 -43
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +26 -4
- data/lib/pdf/reader/glyph_hash.rb +56 -18
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +40 -16
- data/lib/pdf/reader/object_hash.rb +94 -40
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +34 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +48 -3
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +185 -70
- data/lib/pdf/reader/page_text_receiver.rb +70 -20
- data/lib/pdf/reader/pages_strategy.rb +4 -293
- data/lib/pdf/reader/parser.rb +37 -61
- data/lib/pdf/reader/print_receiver.rb +6 -0
- data/lib/pdf/reader/reference.rb +4 -1
- data/lib/pdf/reader/register_receiver.rb +17 -31
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +82 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +5 -2
- data/lib/pdf/reader/synchronized_cache.rb +33 -0
- data/lib/pdf/reader/text_run.rb +99 -0
- data/lib/pdf/reader/token.rb +4 -1
- data/lib/pdf/reader/transformation_matrix.rb +195 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
- data/lib/pdf/reader/width_calculator/composite.rb +28 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
- data/lib/pdf/reader/width_calculator.rb +12 -0
- data/lib/pdf/reader/xref.rb +41 -9
- data/lib/pdf/reader.rb +45 -104
- data/lib/pdf-reader.rb +4 -1
- metadata +220 -101
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -15
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
|
6
|
+
# Security handler for when we don't support the flavour of encryption
|
7
|
+
# used in a PDF.
|
8
|
+
class UnimplementedSecurityHandler
|
9
|
+
def self.supports?(encrypt)
|
10
|
+
true
|
11
|
+
end
|
12
|
+
|
13
|
+
def decrypt(buf, ref)
|
14
|
+
raise PDF::Reader::EncryptedPDFError, "Unsupported encryption style"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'afm'
|
5
|
+
require 'pdf/reader/synchronized_cache'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
module WidthCalculator
|
9
|
+
|
10
|
+
# Type1 fonts can be one of 14 "built in" standard fonts. In these cases,
|
11
|
+
# the reader is expected to have its own copy of the font metrics.
|
12
|
+
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
13
|
+
class BuiltIn
|
14
|
+
|
15
|
+
BUILTINS = [
|
16
|
+
:Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
|
17
|
+
:Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
|
18
|
+
:Symbol,
|
19
|
+
:"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
|
20
|
+
:ZapfDingbats
|
21
|
+
]
|
22
|
+
|
23
|
+
def initialize(font)
|
24
|
+
@font = font
|
25
|
+
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
26
|
+
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
29
|
+
|
30
|
+
if File.file?(metrics_path)
|
31
|
+
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
32
|
+
else
|
33
|
+
raise ArgumentError, "No built-in metrics for #{font.basefont}"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def glyph_width(code_point)
|
38
|
+
return 0 if code_point.nil? || code_point < 0
|
39
|
+
|
40
|
+
names = @font.encoding.int_to_name(code_point)
|
41
|
+
metrics = names.map { |name|
|
42
|
+
@metrics.char_metrics[name.to_s]
|
43
|
+
}.compact.first
|
44
|
+
|
45
|
+
if metrics
|
46
|
+
metrics[:wx]
|
47
|
+
else
|
48
|
+
@font.widths[code_point - 1] || 0
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
|
54
|
+
def control_character?(code_point)
|
55
|
+
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
56
|
+
end
|
57
|
+
|
58
|
+
def extract_basefont(font_name)
|
59
|
+
if BUILTINS.include?(font_name)
|
60
|
+
font_name
|
61
|
+
else
|
62
|
+
"Times-Roman"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# CIDFontType0 or CIDFontType2 use DW (integer) and W (array) to determine
|
7
|
+
# codepoint widths, note that CIDFontType2 will contain a true type font
|
8
|
+
# program which could be used to calculate width, however, a conforming writer
|
9
|
+
# is supposed to convert the widths for the codepoints used into the W array
|
10
|
+
# so that it can be used.
|
11
|
+
# see Section 9.7.4.1, PDF 32000-1:2008, pp 269-270
|
12
|
+
class Composite
|
13
|
+
|
14
|
+
def initialize(font)
|
15
|
+
@font = font
|
16
|
+
@widths = PDF::Reader::CidWidths.new(@font.cid_default_width, @font.cid_widths)
|
17
|
+
end
|
18
|
+
|
19
|
+
def glyph_width(code_point)
|
20
|
+
return 0 if code_point.nil? || code_point < 0
|
21
|
+
|
22
|
+
w = @widths[code_point]
|
23
|
+
# 0 is a valid width
|
24
|
+
return w.to_f unless w.nil?
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# Calculates the width of a glyph in a TrueType font
|
7
|
+
class TrueType
|
8
|
+
|
9
|
+
def initialize(font)
|
10
|
+
@font = font
|
11
|
+
|
12
|
+
if @font.font_descriptor
|
13
|
+
@missing_width = @font.font_descriptor.missing_width
|
14
|
+
else
|
15
|
+
@missing_width = 0
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def glyph_width(code_point)
|
20
|
+
return 0 if code_point.nil? || code_point < 0
|
21
|
+
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
#TODO convert Type3 units 1000 units => 1 text space unit
|
27
|
+
def glyph_width_from_font(code_point)
|
28
|
+
return if @font.widths.nil? || @font.widths.count == 0
|
29
|
+
|
30
|
+
# in ruby a negative index is valid, and will go from the end of the array
|
31
|
+
# which is undesirable in this case.
|
32
|
+
if @font.first_char <= code_point
|
33
|
+
@font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
|
34
|
+
else
|
35
|
+
@missing_width.to_f
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def glyph_width_from_descriptor(code_point)
|
40
|
+
return unless @font.font_descriptor
|
41
|
+
|
42
|
+
# true type fonts will have most of their information contained
|
43
|
+
# with-in a program inside the font descriptor, however the widths
|
44
|
+
# may not be in standard PDF glyph widths (1000 units => 1 text space unit)
|
45
|
+
# so this width will need to be scaled
|
46
|
+
w = @font.font_descriptor.glyph_width(code_point)
|
47
|
+
if w
|
48
|
+
w.to_f * @font.font_descriptor.glyph_to_pdf_scale_factor
|
49
|
+
else
|
50
|
+
nil
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# Calculates the width of a glyph in a Type One or Type Three font
|
7
|
+
class TypeOneOrThree
|
8
|
+
|
9
|
+
def initialize(font)
|
10
|
+
@font = font
|
11
|
+
|
12
|
+
if @font.font_descriptor
|
13
|
+
@missing_width = @font.font_descriptor.missing_width
|
14
|
+
else
|
15
|
+
@missing_width = 0
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def glyph_width(code_point)
|
20
|
+
return 0 if code_point.nil? || code_point < 0
|
21
|
+
return 0 if @font.widths.nil? || @font.widths.count == 0
|
22
|
+
|
23
|
+
# in ruby a negative index is valid, and will go from the end of the array
|
24
|
+
# which is undesirable in this case.
|
25
|
+
if @font.first_char <= code_point
|
26
|
+
@font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
|
27
|
+
else
|
28
|
+
@missing_width.to_f
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# Type0 (or Composite) fonts are a "root font" that rely on a "descendant font"
|
7
|
+
# to do the heavy lifting. The "descendant font" is a CID-Keyed font.
|
8
|
+
# see Section 9.7.1, PDF 32000-1:2008, pp 267
|
9
|
+
# so if we are calculating a Type0 font width, we just pass off to
|
10
|
+
# the descendant font
|
11
|
+
class TypeZero
|
12
|
+
|
13
|
+
def initialize(font)
|
14
|
+
@font = font
|
15
|
+
@descendant_font = @font.descendantfonts.first
|
16
|
+
end
|
17
|
+
|
18
|
+
def glyph_width(code_point)
|
19
|
+
return 0 if code_point.nil? || code_point < 0
|
20
|
+
|
21
|
+
@descendant_font.glyph_width(code_point).to_f
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# PDF files may define fonts in a number of ways. Each approach means we must
|
5
|
+
# calculate glyph widths differently, so this set of classes conform to an
|
6
|
+
# interface that will perform the appropriate calculations.
|
7
|
+
|
8
|
+
require 'pdf/reader/width_calculator/built_in'
|
9
|
+
require 'pdf/reader/width_calculator/composite'
|
10
|
+
require 'pdf/reader/width_calculator/true_type'
|
11
|
+
require 'pdf/reader/width_calculator/type_zero'
|
12
|
+
require 'pdf/reader/width_calculator/type_one_or_three'
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
@@ -51,11 +54,13 @@ class PDF::Reader
|
|
51
54
|
#
|
52
55
|
# io - must be an IO object, generally either a file or a StringIO
|
53
56
|
#
|
54
|
-
def initialize
|
57
|
+
def initialize(io)
|
55
58
|
@io = io
|
59
|
+
@junk_offset = calc_junk_offset(io) || 0
|
56
60
|
@xref = {}
|
57
61
|
@trailer = load_offsets
|
58
62
|
end
|
63
|
+
|
59
64
|
################################################################################
|
60
65
|
# return the number of objects in this file. Objects with multiple generations are
|
61
66
|
# only counted once.
|
@@ -93,6 +98,7 @@ class PDF::Reader
|
|
93
98
|
#
|
94
99
|
def load_offsets(offset = nil)
|
95
100
|
offset ||= new_buffer.find_first_xref_offset
|
101
|
+
offset += @junk_offset
|
96
102
|
|
97
103
|
buf = new_buffer(offset)
|
98
104
|
tok_one = buf.token
|
@@ -108,7 +114,8 @@ class PDF::Reader
|
|
108
114
|
return load_xref_stream(stream)
|
109
115
|
end
|
110
116
|
|
111
|
-
raise PDF::Reader::MalformedPDFError,
|
117
|
+
raise PDF::Reader::MalformedPDFError,
|
118
|
+
"xref table not found at offset #{offset} (#{tok_one} != xref)"
|
112
119
|
end
|
113
120
|
################################################################################
|
114
121
|
# Assumes the underlying buffer is positioned at the start of a traditional
|
@@ -124,7 +131,7 @@ class PDF::Reader
|
|
124
131
|
generation = buf.token.to_i
|
125
132
|
state = buf.token
|
126
133
|
|
127
|
-
store(objid, generation, offset) if state == "n" && offset > 0
|
134
|
+
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
128
135
|
objid += 1
|
129
136
|
params.clear
|
130
137
|
end
|
@@ -134,7 +141,9 @@ class PDF::Reader
|
|
134
141
|
|
135
142
|
trailer = Parser.new(buf, self).parse_token
|
136
143
|
|
137
|
-
|
144
|
+
unless trailer.kind_of?(Hash)
|
145
|
+
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
|
146
|
+
end
|
138
147
|
|
139
148
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
140
149
|
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
@@ -143,7 +152,7 @@ class PDF::Reader
|
|
143
152
|
end
|
144
153
|
|
145
154
|
################################################################################
|
146
|
-
# Read
|
155
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
156
|
#
|
148
157
|
def load_xref_stream(stream)
|
149
158
|
unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
|
@@ -169,7 +178,7 @@ class PDF::Reader
|
|
169
178
|
f2 = unpack_bytes(entry[widths[0],widths[1]])
|
170
179
|
f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
|
171
180
|
if f1 == 1 && f2 > 0
|
172
|
-
store(objid, f3, f2)
|
181
|
+
store(objid, f3, f2 + @junk_offset)
|
173
182
|
elsif f1 == 2 && f2 > 0
|
174
183
|
store(objid, 0, PDF::Reader::Reference.new(f2, 0))
|
175
184
|
end
|
@@ -195,15 +204,17 @@ class PDF::Reader
|
|
195
204
|
("\x00" + bytes).unpack("N")[0]
|
196
205
|
elsif bytes.size == 4
|
197
206
|
bytes.unpack("N")[0]
|
207
|
+
elsif bytes.size == 8
|
208
|
+
bytes.unpack("Q>")[0]
|
198
209
|
else
|
199
|
-
raise UnsupportedFeatureError, "Unable to unpack xref stream entries
|
210
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
200
211
|
end
|
201
212
|
end
|
202
213
|
################################################################################
|
203
214
|
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
204
215
|
#
|
205
216
|
# We create multiple buffers so we can be tokenising multiple sections of the file
|
206
|
-
# at the same time without
|
217
|
+
# at the same time without worrying about clearing the buffer's contents.
|
207
218
|
#
|
208
219
|
def new_buffer(offset = 0)
|
209
220
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
@@ -211,9 +222,30 @@ class PDF::Reader
|
|
211
222
|
################################################################################
|
212
223
|
# Stores an offset value for a particular PDF object ID and revision number
|
213
224
|
#
|
214
|
-
def store
|
225
|
+
def store(id, gen, offset)
|
215
226
|
(@xref[id] ||= {})[gen] ||= offset
|
216
227
|
end
|
228
|
+
################################################################################
|
229
|
+
# Returns the offset of the PDF document in the +stream+. In theory this
|
230
|
+
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
|
231
|
+
# in the real world.
|
232
|
+
#
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
237
|
+
#
|
238
|
+
def calc_junk_offset(io)
|
239
|
+
io.rewind
|
240
|
+
offset = io.pos
|
241
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
242
|
+
offset += 1
|
243
|
+
end
|
244
|
+
io.rewind
|
245
|
+
offset < 1024 ? offset : nil
|
246
|
+
rescue EOFError
|
247
|
+
nil
|
248
|
+
end
|
217
249
|
end
|
218
250
|
################################################################################
|
219
251
|
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
@@ -25,9 +28,6 @@
|
|
25
28
|
################################################################################
|
26
29
|
|
27
30
|
require 'stringio'
|
28
|
-
require 'zlib'
|
29
|
-
|
30
|
-
require 'ascii85'
|
31
31
|
|
32
32
|
module PDF
|
33
33
|
################################################################################
|
@@ -111,10 +111,10 @@ module PDF
|
|
111
111
|
#
|
112
112
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
113
113
|
#
|
114
|
-
def initialize(input
|
115
|
-
|
116
|
-
|
117
|
-
|
114
|
+
def initialize(input, opts = {})
|
115
|
+
@cache = PDF::Reader::ObjectCache.new
|
116
|
+
opts.merge!(:cache => @cache)
|
117
|
+
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
118
|
end
|
119
119
|
|
120
120
|
def info
|
@@ -128,13 +128,16 @@ module PDF
|
|
128
128
|
nil
|
129
129
|
else
|
130
130
|
xml = stream.unfiltered_data
|
131
|
-
xml.force_encoding("utf-8")
|
131
|
+
xml.force_encoding("utf-8")
|
132
132
|
xml
|
133
133
|
end
|
134
134
|
end
|
135
135
|
|
136
136
|
def page_count
|
137
137
|
pages = @objects.deref(root[:Pages])
|
138
|
+
unless pages.kind_of?(::Hash)
|
139
|
+
raise MalformedPDFError, 'Pages structure is missing'
|
140
|
+
end
|
138
141
|
@page_count ||= @objects.deref(pages[:Count])
|
139
142
|
end
|
140
143
|
|
@@ -159,53 +162,6 @@ module PDF
|
|
159
162
|
yield PDF::Reader.new(input, opts)
|
160
163
|
end
|
161
164
|
|
162
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
163
|
-
# eventually be removed
|
164
|
-
#
|
165
|
-
#
|
166
|
-
# Parse the file with the given name, sending events to the given receiver.
|
167
|
-
#
|
168
|
-
def self.file(name, receivers, opts = {})
|
169
|
-
File.open(name,"rb") do |f|
|
170
|
-
new.parse(f, receivers, opts)
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
175
|
-
# eventually be removed
|
176
|
-
#
|
177
|
-
# Parse the given string, sending events to the given receiver.
|
178
|
-
#
|
179
|
-
def self.string(str, receivers, opts = {})
|
180
|
-
StringIO.open(str) do |s|
|
181
|
-
new.parse(s, receivers, opts)
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
186
|
-
# eventually be removed
|
187
|
-
#
|
188
|
-
# Parse the file with the given name, returning an unmarshalled ruby version of
|
189
|
-
# represents the requested pdf object
|
190
|
-
#
|
191
|
-
def self.object_file(name, id, gen = 0)
|
192
|
-
File.open(name,"rb") { |f|
|
193
|
-
new.object(f, id.to_i, gen.to_i)
|
194
|
-
}
|
195
|
-
end
|
196
|
-
|
197
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
198
|
-
# eventually be removed
|
199
|
-
#
|
200
|
-
# Parse the given string, returning an unmarshalled ruby version of represents
|
201
|
-
# the requested pdf object
|
202
|
-
#
|
203
|
-
def self.object_string(str, id, gen = 0)
|
204
|
-
StringIO.open(str) { |s|
|
205
|
-
new.object(s, id.to_i, gen.to_i)
|
206
|
-
}
|
207
|
-
end
|
208
|
-
|
209
165
|
# returns an array of PDF::Reader::Page objects, one for each
|
210
166
|
# page in the source PDF.
|
211
167
|
#
|
@@ -221,9 +177,13 @@ module PDF
|
|
221
177
|
# methods available on each page
|
222
178
|
#
|
223
179
|
def pages
|
224
|
-
(1..self.page_count).map
|
225
|
-
|
226
|
-
|
180
|
+
(1..self.page_count).map do |num|
|
181
|
+
begin
|
182
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
183
|
+
rescue InvalidPageError
|
184
|
+
raise MalformedPDFError, "Missing data for page: #{num}"
|
185
|
+
end
|
186
|
+
end
|
227
187
|
end
|
228
188
|
|
229
189
|
# returns a single PDF::Reader::Page for the specified page.
|
@@ -240,38 +200,10 @@ module PDF
|
|
240
200
|
#
|
241
201
|
def page(num)
|
242
202
|
num = num.to_i
|
243
|
-
|
244
|
-
|
245
|
-
end
|
246
|
-
|
247
|
-
|
248
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
249
|
-
# eventually be removed
|
250
|
-
#
|
251
|
-
# Given an IO object that contains PDF data, parse it.
|
252
|
-
#
|
253
|
-
def parse(io, receivers, opts = {})
|
254
|
-
ohash = ObjectHash.new(io)
|
255
|
-
|
256
|
-
options = {:pages => true, :raw_text => false, :metadata => true}
|
257
|
-
options.merge!(opts)
|
258
|
-
|
259
|
-
strategies.each do |s|
|
260
|
-
s.new(ohash, receivers, options).process
|
203
|
+
if num < 1 || num > self.page_count
|
204
|
+
raise InvalidPageError, "Valid pages are 1 .. #{self.page_count}"
|
261
205
|
end
|
262
|
-
|
263
|
-
self
|
264
|
-
end
|
265
|
-
|
266
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
267
|
-
# eventually be removed
|
268
|
-
#
|
269
|
-
# Given an IO object that contains PDF data, return the contents of a single object
|
270
|
-
#
|
271
|
-
def object (io, id, gen)
|
272
|
-
@objects = ObjectHash.new(io)
|
273
|
-
|
274
|
-
@objects.deref(Reference.new(id, gen))
|
206
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
275
207
|
end
|
276
208
|
|
277
209
|
private
|
@@ -295,14 +227,14 @@ module PDF
|
|
295
227
|
pdfdoc_to_utf8(obj)
|
296
228
|
end
|
297
229
|
else
|
298
|
-
obj
|
230
|
+
@objects.deref(obj)
|
299
231
|
end
|
300
232
|
end
|
301
233
|
|
302
234
|
# TODO find a PDF I can use to spec this behaviour
|
303
235
|
#
|
304
236
|
def pdfdoc_to_utf8(obj)
|
305
|
-
obj.force_encoding("utf-8")
|
237
|
+
obj.force_encoding("utf-8")
|
306
238
|
obj
|
307
239
|
end
|
308
240
|
|
@@ -312,19 +244,18 @@ module PDF
|
|
312
244
|
def utf16_to_utf8(obj)
|
313
245
|
str = obj[2, obj.size]
|
314
246
|
str = str.unpack("n*").pack("U*")
|
315
|
-
str.force_encoding("utf-8")
|
247
|
+
str.force_encoding("utf-8")
|
316
248
|
str
|
317
249
|
end
|
318
250
|
|
319
|
-
def strategies
|
320
|
-
@strategies ||= [
|
321
|
-
::PDF::Reader::MetadataStrategy,
|
322
|
-
::PDF::Reader::PagesStrategy
|
323
|
-
]
|
324
|
-
end
|
325
|
-
|
326
251
|
def root
|
327
|
-
@root ||=
|
252
|
+
@root ||= begin
|
253
|
+
obj = @objects.deref(@objects.trailer[:Root])
|
254
|
+
unless obj.kind_of?(::Hash)
|
255
|
+
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
256
|
+
end
|
257
|
+
obj
|
258
|
+
end
|
328
259
|
end
|
329
260
|
|
330
261
|
end
|
@@ -332,17 +263,24 @@ end
|
|
332
263
|
################################################################################
|
333
264
|
|
334
265
|
require 'pdf/reader/resource_methods'
|
335
|
-
require 'pdf/reader/abstract_strategy'
|
336
266
|
require 'pdf/reader/buffer'
|
267
|
+
require 'pdf/reader/cid_widths'
|
337
268
|
require 'pdf/reader/cmap'
|
338
269
|
require 'pdf/reader/encoding'
|
339
270
|
require 'pdf/reader/error'
|
340
271
|
require 'pdf/reader/filter'
|
272
|
+
require 'pdf/reader/filter/ascii85'
|
273
|
+
require 'pdf/reader/filter/ascii_hex'
|
274
|
+
require 'pdf/reader/filter/depredict'
|
275
|
+
require 'pdf/reader/filter/flate'
|
276
|
+
require 'pdf/reader/filter/lzw'
|
277
|
+
require 'pdf/reader/filter/null'
|
278
|
+
require 'pdf/reader/filter/run_length'
|
341
279
|
require 'pdf/reader/font'
|
280
|
+
require 'pdf/reader/font_descriptor'
|
342
281
|
require 'pdf/reader/form_xobject'
|
343
282
|
require 'pdf/reader/glyph_hash'
|
344
283
|
require 'pdf/reader/lzw'
|
345
|
-
require 'pdf/reader/metadata_strategy'
|
346
284
|
require 'pdf/reader/object_cache'
|
347
285
|
require 'pdf/reader/object_hash'
|
348
286
|
require 'pdf/reader/object_stream'
|
@@ -351,12 +289,15 @@ require 'pdf/reader/parser'
|
|
351
289
|
require 'pdf/reader/print_receiver'
|
352
290
|
require 'pdf/reader/reference'
|
353
291
|
require 'pdf/reader/register_receiver'
|
292
|
+
require 'pdf/reader/null_security_handler'
|
354
293
|
require 'pdf/reader/standard_security_handler'
|
294
|
+
require 'pdf/reader/standard_security_handler_v5'
|
295
|
+
require 'pdf/reader/unimplemented_security_handler'
|
355
296
|
require 'pdf/reader/stream'
|
356
|
-
require 'pdf/reader/
|
297
|
+
require 'pdf/reader/text_run'
|
357
298
|
require 'pdf/reader/page_state'
|
358
299
|
require 'pdf/reader/page_text_receiver'
|
359
300
|
require 'pdf/reader/token'
|
360
301
|
require 'pdf/reader/xref'
|
302
|
+
require 'pdf/reader/orientation_detector'
|
361
303
|
require 'pdf/reader/page'
|
362
|
-
require 'pdf/hash'
|