pdf-reader 1.1.1 → 2.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG +87 -2
- data/{README.rdoc → README.md} +43 -31
- data/Rakefile +21 -16
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -3
- data/examples/callbacks.rb +2 -1
- data/examples/extract_images.rb +11 -6
- data/examples/fuzzy_paragraphs.rb +24 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
- data/lib/pdf/reader/afm/Courier.afm +342 -0
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -0
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
- data/lib/pdf/reader/buffer.rb +90 -63
- data/lib/pdf/reader/cid_widths.rb +63 -0
- data/lib/pdf/reader/cmap.rb +69 -38
- data/lib/pdf/reader/encoding.rb +74 -48
- data/lib/pdf/reader/error.rb +24 -4
- data/lib/pdf/reader/filter/ascii85.rb +28 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
- data/lib/pdf/reader/filter/depredict.rb +141 -0
- data/lib/pdf/reader/filter/flate.rb +53 -0
- data/lib/pdf/reader/filter/lzw.rb +21 -0
- data/lib/pdf/reader/filter/null.rb +18 -0
- data/lib/pdf/reader/filter/run_length.rb +45 -0
- data/lib/pdf/reader/filter.rb +15 -234
- data/lib/pdf/reader/font.rb +107 -43
- data/lib/pdf/reader/font_descriptor.rb +80 -0
- data/lib/pdf/reader/form_xobject.rb +26 -4
- data/lib/pdf/reader/glyph_hash.rb +56 -18
- data/lib/pdf/reader/lzw.rb +6 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +40 -16
- data/lib/pdf/reader/object_hash.rb +94 -40
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +34 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +48 -3
- data/lib/pdf/reader/page_layout.rb +125 -0
- data/lib/pdf/reader/page_state.rb +185 -70
- data/lib/pdf/reader/page_text_receiver.rb +70 -20
- data/lib/pdf/reader/pages_strategy.rb +4 -293
- data/lib/pdf/reader/parser.rb +37 -61
- data/lib/pdf/reader/print_receiver.rb +6 -0
- data/lib/pdf/reader/reference.rb +4 -1
- data/lib/pdf/reader/register_receiver.rb +17 -31
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +82 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +5 -2
- data/lib/pdf/reader/synchronized_cache.rb +33 -0
- data/lib/pdf/reader/text_run.rb +99 -0
- data/lib/pdf/reader/token.rb +4 -1
- data/lib/pdf/reader/transformation_matrix.rb +195 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
- data/lib/pdf/reader/width_calculator/composite.rb +28 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
- data/lib/pdf/reader/width_calculator.rb +12 -0
- data/lib/pdf/reader/xref.rb +41 -9
- data/lib/pdf/reader.rb +45 -104
- data/lib/pdf-reader.rb +4 -1
- metadata +220 -101
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -15
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
|
6
|
+
# Security handler for when we don't support the flavour of encryption
|
7
|
+
# used in a PDF.
|
8
|
+
class UnimplementedSecurityHandler
|
9
|
+
# Always answers true; the +encrypt+ argument is not inspected.
def self.supports?(encrypt)
|
10
|
+
true
|
11
|
+
end
|
12
|
+
|
13
|
+
# Never decrypts anything: unconditionally raises
# PDF::Reader::EncryptedPDFError. Neither +buf+ nor +ref+ is read.
def decrypt(buf, ref)
|
14
|
+
raise PDF::Reader::EncryptedPDFError, "Unsupported encryption style"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'afm'
|
5
|
+
require 'pdf/reader/synchronized_cache'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
module WidthCalculator
|
9
|
+
|
10
|
+
# Type1 fonts can be one of 14 "built in" standard fonts. In these cases,
|
11
|
+
# the reader is expected to have its own copy of the font metrics.
|
12
|
+
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
13
|
+
class BuiltIn

  # Names of the fonts that have a bundled AFM metrics file.
  BUILTINS = [
    :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
    :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
    :Symbol,
    :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
    :ZapfDingbats
  ]

  # font - the font object whose glyph widths will be requested. Its basefont
  #        selects the metrics file; any basefont that is not listed in
  #        BUILTINS uses the Times-Roman metrics.
  #
  # Parsed metrics are stored in a cache shared by all instances, keyed by
  # the path of the metrics file, so each AFM file is only parsed once.
  #
  # Raises ArgumentError if the metrics file does not exist on disk.
  def initialize(font)
    @font = font
    @@all_metrics ||= PDF::Reader::SynchronizedCache.new

    basefont = extract_basefont(font.basefont)
    metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")

    if File.file?(metrics_path)
      @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
    else
      raise ArgumentError, "No built-in metrics for #{font.basefont}"
    end
  end

  # Returns the width of the glyph for code_point.
  #
  # The glyph names the font's encoding gives for the code point are looked
  # up in the AFM metrics and the first match wins. When no name matches, the
  # font's own widths array is consulted, and 0 is returned as a last resort.
  #
  # A nil or negative code_point always has a width of 0.
  def glyph_width(code_point)
    return 0 if code_point.nil? || code_point < 0

    names = @font.encoding.int_to_name(code_point)
    metrics = names.map { |name|
      @metrics.char_metrics[name.to_s]
    }.compact.first

    if metrics
      metrics[:wx]
    else
      fallback_width(code_point)
    end
  end

  private

  # Width taken from the font's own widths array, or 0 when there is none.
  def fallback_width(code_point)
    widths = @font.widths
    index = code_point - 1

    # in ruby a negative index is valid and counts from the end of the array,
    # so code point 0 must not be allowed to pick up the last width.
    return 0 if widths.nil? || index < 0

    widths[index] || 0
  end

  def control_character?(code_point)
    @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
  end

  # Returns font_name unchanged when it is one of BUILTINS, otherwise the
  # name of the default font.
  def extract_basefont(font_name)
    if BUILTINS.include?(font_name)
      font_name
    else
      "Times-Roman"
    end
  end
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# CIDFontType0 or CIDFontType2 use DW (integer) and W (array) to determine
|
7
|
+
# codepoint widths, note that CIDFontType2 will contain a true type font
|
8
|
+
# program which could be used to calculate width, however, a conforming writer
|
9
|
+
# is supposed to convert the widths for the codepoints used into the W array
|
10
|
+
# so that it can be used.
|
11
|
+
# see Section 9.7.4.1, PDF 32000-1:2008, pp 269-270
|
12
|
+
class Composite

  # font - the CID-keyed font object. Its cid_default_width and cid_widths
  #        values are handed to PDF::Reader::CidWidths to build the lookup
  #        table used by #glyph_width.
  def initialize(font)
    @font = font
    @widths = PDF::Reader::CidWidths.new(@font.cid_default_width, @font.cid_widths)
  end

  # Returns the width for code_point as a Float, or 0 when code_point is
  # nil, negative, or has no entry in the widths table.
  def glyph_width(code_point)
    return 0 if code_point.nil? || code_point < 0

    w = @widths[code_point]
    # 0 is a valid width, so only a missing entry (nil) falls back to 0 here.
    # Always answering with a number keeps this calculator in line with the
    # other width calculators.
    w.nil? ? 0 : w.to_f
  end
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# Calculates the width of a glyph in a TrueType font
|
7
|
+
class TrueType

  # font - the font object to calculate widths for. When it has a font
  #        descriptor, the descriptor's missing_width is used for code points
  #        outside the widths array; otherwise that default is 0.
  def initialize(font)
    @font = font

    descriptor = @font.font_descriptor
    @missing_width = descriptor ? descriptor.missing_width : 0
  end

  # Returns the width of the glyph for code_point.
  #
  # The font's widths array is preferred; the font descriptor is only
  # consulted when the font has no widths at all. 0 is returned when neither
  # source yields a width, or when code_point is nil or negative.
  def glyph_width(code_point)
    return 0 if code_point.nil? || code_point < 0
    glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
  end

  private

  # Width from the font's widths array as a Float, or nil when the font has
  # no widths.
  #
  # TODO: convert Type3 units 1000 units => 1 text space unit
  def glyph_width_from_font(code_point)
    return if @font.widths.nil? || @font.widths.count == 0

    first_char = @font.first_char

    # in ruby a negative index is valid, and will go from the end of the array
    # which is undesirable in this case. A font with widths but no first char
    # can't be indexed at all, so it is treated like an out of range code point
    # instead of raising NoMethodError on nil.
    if first_char && first_char <= code_point
      @font.widths.fetch(code_point - first_char, @missing_width).to_f
    else
      @missing_width.to_f
    end
  end

  # Width from the font descriptor scaled to PDF glyph units, or nil when
  # there is no descriptor or it has no width for code_point.
  def glyph_width_from_descriptor(code_point)
    descriptor = @font.font_descriptor
    return unless descriptor

    # true type fonts will have most of their information contained
    # with-in a program inside the font descriptor, however the widths
    # may not be in standard PDF glyph widths (1000 units => 1 text space unit)
    # so this width will need to be scaled
    w = descriptor.glyph_width(code_point)
    w ? w.to_f * descriptor.glyph_to_pdf_scale_factor : nil
  end
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# Calculates the width of a glyph in a Type One or Type Three
|
7
|
+
class TypeOneOrThree

  # font - the font object to calculate widths for. When it has a font
  #        descriptor, the descriptor's missing_width is used for code points
  #        outside the widths array; otherwise that default is 0.
  def initialize(font)
    @font = font

    descriptor = @font.font_descriptor
    @missing_width = descriptor ? descriptor.missing_width : 0
  end

  # Returns the width of the glyph for code_point.
  #
  # The result is 0 when code_point is nil or negative, or when the font has
  # no widths. Otherwise it is a Float: the entry from the font's widths
  # array, or the missing width when code_point falls outside the array.
  def glyph_width(code_point)
    return 0 if code_point.nil? || code_point < 0
    return 0 if @font.widths.nil? || @font.widths.count == 0

    first_char = @font.first_char

    # in ruby a negative index is valid, and will go from the end of the array
    # which is undesirable in this case. A font with widths but no first char
    # can't be indexed at all, so it is treated like an out of range code point
    # instead of raising NoMethodError on nil.
    if first_char && first_char <= code_point
      @font.widths.fetch(code_point - first_char, @missing_width).to_f
    else
      @missing_width.to_f
    end
  end
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
class PDF::Reader
|
5
|
+
module WidthCalculator
|
6
|
+
# Type0 (or Composite) fonts are a "root font" that rely on a "descendant font"
|
7
|
+
# to do the heavy lifting. The "descendant font" is a CID-Keyed font.
|
8
|
+
# see Section 9.7.1, PDF 32000-1:2008, pp 267
|
9
|
+
# so if we are calculating a Type0 font width, we just pass off to
|
10
|
+
# the descendant font
|
11
|
+
class TypeZero

  # font - the Type0 font object. Only the first entry of its descendantfonts
  #        list is used.
  def initialize(font)
    @font = font
    @descendant_font = @font.descendantfonts.first
  end

  # Returns the width the descendant font reports for code_point, as a Float.
  #
  # Returns 0 when code_point is nil or negative, or when the font has no
  # descendant font to ask.
  def glyph_width(code_point)
    return 0 if code_point.nil? || code_point < 0

    # an empty descendantfonts list leaves nothing to delegate to
    return 0 if @descendant_font.nil?

    @descendant_font.glyph_width(code_point).to_f
  end
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# PDF files may define fonts in a number of ways. Each approach means we must
|
5
|
+
# calculate glyph widths differently, so this set of classes conform to an
|
6
|
+
# interface that will perform the appropriate calculations.
|
7
|
+
|
8
|
+
require 'pdf/reader/width_calculator/built_in'
|
9
|
+
require 'pdf/reader/width_calculator/composite'
|
10
|
+
require 'pdf/reader/width_calculator/true_type'
|
11
|
+
require 'pdf/reader/width_calculator/type_zero'
|
12
|
+
require 'pdf/reader/width_calculator/type_one_or_three'
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
@@ -51,11 +54,13 @@ class PDF::Reader
|
|
51
54
|
#
|
52
55
|
# io - must be an IO object, generally either a file or a StringIO
|
53
56
|
#
|
54
|
-
def initialize
|
57
|
+
def initialize(io)
|
55
58
|
@io = io
|
59
|
+
@junk_offset = calc_junk_offset(io) || 0
|
56
60
|
@xref = {}
|
57
61
|
@trailer = load_offsets
|
58
62
|
end
|
63
|
+
|
59
64
|
################################################################################
|
60
65
|
# return the number of objects in this file. Objects with multiple generations are
|
61
66
|
# only counted once.
|
@@ -93,6 +98,7 @@ class PDF::Reader
|
|
93
98
|
#
|
94
99
|
def load_offsets(offset = nil)
|
95
100
|
offset ||= new_buffer.find_first_xref_offset
|
101
|
+
offset += @junk_offset
|
96
102
|
|
97
103
|
buf = new_buffer(offset)
|
98
104
|
tok_one = buf.token
|
@@ -108,7 +114,8 @@ class PDF::Reader
|
|
108
114
|
return load_xref_stream(stream)
|
109
115
|
end
|
110
116
|
|
111
|
-
raise PDF::Reader::MalformedPDFError,
|
117
|
+
raise PDF::Reader::MalformedPDFError,
|
118
|
+
"xref table not found at offset #{offset} (#{tok_one} != xref)"
|
112
119
|
end
|
113
120
|
################################################################################
|
114
121
|
# Assumes the underlying buffer is positioned at the start of a traditional
|
@@ -124,7 +131,7 @@ class PDF::Reader
|
|
124
131
|
generation = buf.token.to_i
|
125
132
|
state = buf.token
|
126
133
|
|
127
|
-
store(objid, generation, offset) if state == "n" && offset > 0
|
134
|
+
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
128
135
|
objid += 1
|
129
136
|
params.clear
|
130
137
|
end
|
@@ -134,7 +141,9 @@ class PDF::Reader
|
|
134
141
|
|
135
142
|
trailer = Parser.new(buf, self).parse_token
|
136
143
|
|
137
|
-
|
144
|
+
unless trailer.kind_of?(Hash)
|
145
|
+
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
|
146
|
+
end
|
138
147
|
|
139
148
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
140
149
|
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
@@ -143,7 +152,7 @@ class PDF::Reader
|
|
143
152
|
end
|
144
153
|
|
145
154
|
################################################################################
|
146
|
-
# Read
|
155
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
156
|
#
|
148
157
|
def load_xref_stream(stream)
|
149
158
|
unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
|
@@ -169,7 +178,7 @@ class PDF::Reader
|
|
169
178
|
f2 = unpack_bytes(entry[widths[0],widths[1]])
|
170
179
|
f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
|
171
180
|
if f1 == 1 && f2 > 0
|
172
|
-
store(objid, f3, f2)
|
181
|
+
store(objid, f3, f2 + @junk_offset)
|
173
182
|
elsif f1 == 2 && f2 > 0
|
174
183
|
store(objid, 0, PDF::Reader::Reference.new(f2, 0))
|
175
184
|
end
|
@@ -195,15 +204,17 @@ class PDF::Reader
|
|
195
204
|
("\x00" + bytes).unpack("N")[0]
|
196
205
|
elsif bytes.size == 4
|
197
206
|
bytes.unpack("N")[0]
|
207
|
+
elsif bytes.size == 8
|
208
|
+
bytes.unpack("Q>")[0]
|
198
209
|
else
|
199
|
-
raise UnsupportedFeatureError, "Unable to unpack xref stream entries
|
210
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
200
211
|
end
|
201
212
|
end
|
202
213
|
################################################################################
|
203
214
|
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
204
215
|
#
|
205
216
|
# We create multiple buffers so we can be tokenising multiple sections of the file
|
206
|
-
# at the same time without
|
217
|
+
# at the same time without worrying about clearing the buffers contents.
|
207
218
|
#
|
208
219
|
def new_buffer(offset = 0)
|
209
220
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
@@ -211,9 +222,30 @@ class PDF::Reader
|
|
211
222
|
################################################################################
|
212
223
|
# Stores an offset value for a particular PDF object ID and revision number
|
213
224
|
#
|
214
|
-
def store
|
225
|
+
def store(id, gen, offset)
|
215
226
|
(@xref[id] ||= {})[gen] ||= offset
|
216
227
|
end
|
228
|
+
################################################################################
|
229
|
+
# Returns the offset of the PDF document in the +stream+. In theory this
|
230
|
+
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
|
231
|
+
# in the real world.
|
232
|
+
#
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
237
|
+
#
|
238
|
+
# Returns the byte offset of the first '%' within the first 1024 bytes of
# +io+, or nil when there is none (including when +io+ is empty).
#
# io - an IO-like object that responds to rewind and read.
#
# +io+ is left rewound to its start in every case.
def calc_junk_offset(io)
  io.rewind

  # A read with an explicit length returns raw bytes, so the index found is a
  # byte offset that is safe to seek to. Reading char by char would count
  # characters, which under-reports the offset when the junk prefix holds
  # multibyte characters.
  head = io.read(1024)
  io.rewind

  head && head.index("%")
end
|
217
249
|
end
|
218
250
|
################################################################################
|
219
251
|
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
1
4
|
################################################################################
|
2
5
|
#
|
3
6
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
@@ -25,9 +28,6 @@
|
|
25
28
|
################################################################################
|
26
29
|
|
27
30
|
require 'stringio'
|
28
|
-
require 'zlib'
|
29
|
-
|
30
|
-
require 'ascii85'
|
31
31
|
|
32
32
|
module PDF
|
33
33
|
################################################################################
|
@@ -111,10 +111,10 @@ module PDF
|
|
111
111
|
#
|
112
112
|
# reader = PDF::Reader.new("somefile.pdf", :password => "apples")
|
113
113
|
#
|
114
|
-
def initialize(input
|
115
|
-
|
116
|
-
|
117
|
-
|
114
|
+
def initialize(input, opts = {})
|
115
|
+
@cache = PDF::Reader::ObjectCache.new
|
116
|
+
opts.merge!(:cache => @cache)
|
117
|
+
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
118
118
|
end
|
119
119
|
|
120
120
|
def info
|
@@ -128,13 +128,16 @@ module PDF
|
|
128
128
|
nil
|
129
129
|
else
|
130
130
|
xml = stream.unfiltered_data
|
131
|
-
xml.force_encoding("utf-8")
|
131
|
+
xml.force_encoding("utf-8")
|
132
132
|
xml
|
133
133
|
end
|
134
134
|
end
|
135
135
|
|
136
136
|
def page_count
|
137
137
|
pages = @objects.deref(root[:Pages])
|
138
|
+
unless pages.kind_of?(::Hash)
|
139
|
+
raise MalformedPDFError, 'Pages structure is missing'
|
140
|
+
end
|
138
141
|
@page_count ||= @objects.deref(pages[:Count])
|
139
142
|
end
|
140
143
|
|
@@ -159,53 +162,6 @@ module PDF
|
|
159
162
|
yield PDF::Reader.new(input, opts)
|
160
163
|
end
|
161
164
|
|
162
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
163
|
-
# eventually be removed
|
164
|
-
#
|
165
|
-
#
|
166
|
-
# Parse the file with the given name, sending events to the given receiver.
|
167
|
-
#
|
168
|
-
def self.file(name, receivers, opts = {})
|
169
|
-
File.open(name,"rb") do |f|
|
170
|
-
new.parse(f, receivers, opts)
|
171
|
-
end
|
172
|
-
end
|
173
|
-
|
174
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
175
|
-
# eventually be removed
|
176
|
-
#
|
177
|
-
# Parse the given string, sending events to the given receiver.
|
178
|
-
#
|
179
|
-
def self.string(str, receivers, opts = {})
|
180
|
-
StringIO.open(str) do |s|
|
181
|
-
new.parse(s, receivers, opts)
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
186
|
-
# eventually be removed
|
187
|
-
#
|
188
|
-
# Parse the file with the given name, returning an unmarshalled ruby version of
|
189
|
-
# represents the requested pdf object
|
190
|
-
#
|
191
|
-
def self.object_file(name, id, gen = 0)
|
192
|
-
File.open(name,"rb") { |f|
|
193
|
-
new.object(f, id.to_i, gen.to_i)
|
194
|
-
}
|
195
|
-
end
|
196
|
-
|
197
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
198
|
-
# eventually be removed
|
199
|
-
#
|
200
|
-
# Parse the given string, returning an unmarshalled ruby version of represents
|
201
|
-
# the requested pdf object
|
202
|
-
#
|
203
|
-
def self.object_string(str, id, gen = 0)
|
204
|
-
StringIO.open(str) { |s|
|
205
|
-
new.object(s, id.to_i, gen.to_i)
|
206
|
-
}
|
207
|
-
end
|
208
|
-
|
209
165
|
# returns an array of PDF::Reader::Page objects, one for each
|
210
166
|
# page in the source PDF.
|
211
167
|
#
|
@@ -221,9 +177,13 @@ module PDF
|
|
221
177
|
# methods available on each page
|
222
178
|
#
|
223
179
|
def pages
|
224
|
-
(1..self.page_count).map
|
225
|
-
|
226
|
-
|
180
|
+
(1..self.page_count).map do |num|
|
181
|
+
begin
|
182
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
183
|
+
rescue InvalidPageError
|
184
|
+
raise MalformedPDFError, "Missing data for page: #{num}"
|
185
|
+
end
|
186
|
+
end
|
227
187
|
end
|
228
188
|
|
229
189
|
# returns a single PDF::Reader::Page for the specified page.
|
@@ -240,38 +200,10 @@ module PDF
|
|
240
200
|
#
|
241
201
|
def page(num)
|
242
202
|
num = num.to_i
|
243
|
-
|
244
|
-
|
245
|
-
end
|
246
|
-
|
247
|
-
|
248
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
249
|
-
# eventually be removed
|
250
|
-
#
|
251
|
-
# Given an IO object that contains PDF data, parse it.
|
252
|
-
#
|
253
|
-
def parse(io, receivers, opts = {})
|
254
|
-
ohash = ObjectHash.new(io)
|
255
|
-
|
256
|
-
options = {:pages => true, :raw_text => false, :metadata => true}
|
257
|
-
options.merge!(opts)
|
258
|
-
|
259
|
-
strategies.each do |s|
|
260
|
-
s.new(ohash, receivers, options).process
|
203
|
+
if num < 1 || num > self.page_count
|
204
|
+
raise InvalidPageError, "Valid pages are 1 .. #{self.page_count}"
|
261
205
|
end
|
262
|
-
|
263
|
-
self
|
264
|
-
end
|
265
|
-
|
266
|
-
# DEPRECATED: this method was deprecated in version 1.0.0 and will
|
267
|
-
# eventually be removed
|
268
|
-
#
|
269
|
-
# Given an IO object that contains PDF data, return the contents of a single object
|
270
|
-
#
|
271
|
-
def object (io, id, gen)
|
272
|
-
@objects = ObjectHash.new(io)
|
273
|
-
|
274
|
-
@objects.deref(Reference.new(id, gen))
|
206
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
275
207
|
end
|
276
208
|
|
277
209
|
private
|
@@ -295,14 +227,14 @@ module PDF
|
|
295
227
|
pdfdoc_to_utf8(obj)
|
296
228
|
end
|
297
229
|
else
|
298
|
-
obj
|
230
|
+
@objects.deref(obj)
|
299
231
|
end
|
300
232
|
end
|
301
233
|
|
302
234
|
# TODO find a PDF I can use to spec this behaviour
|
303
235
|
#
|
304
236
|
def pdfdoc_to_utf8(obj)
|
305
|
-
obj.force_encoding("utf-8")
|
237
|
+
obj.force_encoding("utf-8")
|
306
238
|
obj
|
307
239
|
end
|
308
240
|
|
@@ -312,19 +244,18 @@ module PDF
|
|
312
244
|
def utf16_to_utf8(obj)
|
313
245
|
str = obj[2, obj.size]
|
314
246
|
str = str.unpack("n*").pack("U*")
|
315
|
-
str.force_encoding("utf-8")
|
247
|
+
str.force_encoding("utf-8")
|
316
248
|
str
|
317
249
|
end
|
318
250
|
|
319
|
-
def strategies
|
320
|
-
@strategies ||= [
|
321
|
-
::PDF::Reader::MetadataStrategy,
|
322
|
-
::PDF::Reader::PagesStrategy
|
323
|
-
]
|
324
|
-
end
|
325
|
-
|
326
251
|
def root
|
327
|
-
@root ||=
|
252
|
+
@root ||= begin
|
253
|
+
obj = @objects.deref(@objects.trailer[:Root])
|
254
|
+
unless obj.kind_of?(::Hash)
|
255
|
+
raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
|
256
|
+
end
|
257
|
+
obj
|
258
|
+
end
|
328
259
|
end
|
329
260
|
|
330
261
|
end
|
@@ -332,17 +263,24 @@ end
|
|
332
263
|
################################################################################
|
333
264
|
|
334
265
|
require 'pdf/reader/resource_methods'
|
335
|
-
require 'pdf/reader/abstract_strategy'
|
336
266
|
require 'pdf/reader/buffer'
|
267
|
+
require 'pdf/reader/cid_widths'
|
337
268
|
require 'pdf/reader/cmap'
|
338
269
|
require 'pdf/reader/encoding'
|
339
270
|
require 'pdf/reader/error'
|
340
271
|
require 'pdf/reader/filter'
|
272
|
+
require 'pdf/reader/filter/ascii85'
|
273
|
+
require 'pdf/reader/filter/ascii_hex'
|
274
|
+
require 'pdf/reader/filter/depredict'
|
275
|
+
require 'pdf/reader/filter/flate'
|
276
|
+
require 'pdf/reader/filter/lzw'
|
277
|
+
require 'pdf/reader/filter/null'
|
278
|
+
require 'pdf/reader/filter/run_length'
|
341
279
|
require 'pdf/reader/font'
|
280
|
+
require 'pdf/reader/font_descriptor'
|
342
281
|
require 'pdf/reader/form_xobject'
|
343
282
|
require 'pdf/reader/glyph_hash'
|
344
283
|
require 'pdf/reader/lzw'
|
345
|
-
require 'pdf/reader/metadata_strategy'
|
346
284
|
require 'pdf/reader/object_cache'
|
347
285
|
require 'pdf/reader/object_hash'
|
348
286
|
require 'pdf/reader/object_stream'
|
@@ -351,12 +289,15 @@ require 'pdf/reader/parser'
|
|
351
289
|
require 'pdf/reader/print_receiver'
|
352
290
|
require 'pdf/reader/reference'
|
353
291
|
require 'pdf/reader/register_receiver'
|
292
|
+
require 'pdf/reader/null_security_handler'
|
354
293
|
require 'pdf/reader/standard_security_handler'
|
294
|
+
require 'pdf/reader/standard_security_handler_v5'
|
295
|
+
require 'pdf/reader/unimplemented_security_handler'
|
355
296
|
require 'pdf/reader/stream'
|
356
|
-
require 'pdf/reader/
|
297
|
+
require 'pdf/reader/text_run'
|
357
298
|
require 'pdf/reader/page_state'
|
358
299
|
require 'pdf/reader/page_text_receiver'
|
359
300
|
require 'pdf/reader/token'
|
360
301
|
require 'pdf/reader/xref'
|
302
|
+
require 'pdf/reader/orientation_detector'
|
361
303
|
require 'pdf/reader/page'
|
362
|
-
require 'pdf/hash'
|