pdf-reader 0.8.6 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/lib/pdf/reader/parser.rb
CHANGED
@@ -32,10 +32,10 @@ class PDF::Reader
|
|
32
32
|
# Create a new parser around a PDF::Reader::Buffer object
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
-
#
|
36
|
-
def initialize (buffer,
|
35
|
+
# ohash - a PDF::Reader::ObjectHash object that can return objects from the PDF file
|
36
|
+
def initialize (buffer, ohash=nil)
|
37
37
|
@buffer = buffer
|
38
|
-
@
|
38
|
+
@ohash = ohash
|
39
39
|
end
|
40
40
|
################################################################################
|
41
41
|
# Reads the next token from the underlying buffer and converts it to an appropriate
|
@@ -46,23 +46,22 @@ class PDF::Reader
|
|
46
46
|
token = @buffer.token
|
47
47
|
|
48
48
|
case token
|
49
|
-
when PDF::Reader::Reference
|
50
|
-
when
|
51
|
-
when "
|
52
|
-
when "
|
53
|
-
when "
|
54
|
-
when "
|
55
|
-
when "
|
56
|
-
when "
|
57
|
-
when "
|
58
|
-
when "
|
59
|
-
when "
|
60
|
-
when "
|
61
|
-
when ">>", "]", ">", ")" then return Token.new(token)
|
49
|
+
when PDF::Reader::Reference, nil then return token
|
50
|
+
when "/" then return pdf_name()
|
51
|
+
when "<<" then return dictionary()
|
52
|
+
when "[" then return array()
|
53
|
+
when "(" then return string()
|
54
|
+
when "<" then return hex_string()
|
55
|
+
when "true" then return true
|
56
|
+
when "false" then return false
|
57
|
+
when "null" then return nil
|
58
|
+
when "obj", "endobj", "stream", "endstream" then return Token.new(token)
|
59
|
+
when "stream", "endstream" then return Token.new(token)
|
60
|
+
when ">>", "]", ">", ")" then return Token.new(token)
|
62
61
|
else
|
63
|
-
if operators.has_key?(token)
|
64
|
-
elsif token =~ /\d*\.\d/
|
65
|
-
else
|
62
|
+
if operators.has_key?(token) then return Token.new(token)
|
63
|
+
elsif token =~ /\d*\.\d/ then return token.to_f
|
64
|
+
else return token.to_i
|
66
65
|
end
|
67
66
|
end
|
68
67
|
end
|
@@ -151,30 +150,68 @@ class PDF::Reader
|
|
151
150
|
return "" if str == ")"
|
152
151
|
Error.assert_equal(parse_token, ")")
|
153
152
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
str.
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
153
|
+
ret = ""
|
154
|
+
idx = 0
|
155
|
+
|
156
|
+
while idx < str.size
|
157
|
+
chr = str[idx,1]
|
158
|
+
jump = 1
|
159
|
+
|
160
|
+
if chr == "\\"
|
161
|
+
jump = 2
|
162
|
+
case str[idx+1, 1]
|
163
|
+
when "" then jump = 1
|
164
|
+
when "n" then chr = "\n"
|
165
|
+
when "r" then chr = "\r"
|
166
|
+
when "t" then chr = "\t"
|
167
|
+
when "b" then chr = "\b"
|
168
|
+
when "f" then chr = "\f"
|
169
|
+
when "(" then chr = "("
|
170
|
+
when ")" then chr = ")"
|
171
|
+
when "\\" then chr = "\\"
|
172
|
+
when "\n" then
|
173
|
+
chr = ""
|
174
|
+
jump = 2
|
175
|
+
else
|
176
|
+
if str[idx+1,3].match(/\d{3}/)
|
177
|
+
jump = 4
|
178
|
+
chr = str[idx+1,3].oct.chr
|
179
|
+
elsif str[idx+1,2].match(/\d{2}/)
|
180
|
+
jump = 3
|
181
|
+
chr = ("0"+str[idx+1,2]).oct.chr
|
182
|
+
elsif str[idx+1,1].match(/\d/)
|
183
|
+
jump = 2
|
184
|
+
chr = ("00"+str[idx+1,1]).oct.chr
|
185
|
+
else
|
186
|
+
jump = 1
|
187
|
+
chr = ""
|
188
|
+
end
|
189
|
+
|
190
|
+
end
|
191
|
+
elsif chr == "\r" && str[idx+1,1] == "\n"
|
192
|
+
chr = "\n"
|
193
|
+
jump = 2
|
194
|
+
elsif chr == "\n" && str[idx+1,1] == "\r"
|
195
|
+
chr = "\n"
|
196
|
+
jump = 2
|
197
|
+
elsif chr == "\r"
|
198
|
+
chr = "\n"
|
199
|
+
end
|
200
|
+
ret << chr
|
201
|
+
idx += jump
|
167
202
|
end
|
168
|
-
|
169
|
-
str.gsub!(/\\([^\\])/,'\1')
|
170
|
-
|
171
|
-
str
|
203
|
+
ret
|
172
204
|
end
|
173
205
|
################################################################################
|
174
206
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
175
207
|
def stream (dict)
|
176
208
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
177
|
-
|
209
|
+
if @ohash
|
210
|
+
length = @ohash.object(dict[:Length])
|
211
|
+
else
|
212
|
+
length = dict[:Length] || 0
|
213
|
+
end
|
214
|
+
data = @buffer.read(length, :skip_eol => true)
|
178
215
|
|
179
216
|
Error.str_assert(parse_token, "endstream")
|
180
217
|
Error.str_assert(parse_token, "endobj")
|
@@ -1,4 +1,21 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
|
4
|
+
|
1
5
|
class PDF::Reader
|
6
|
+
|
7
|
+
# An example receiver that just records all callbacks generated by parsing
|
8
|
+
# a PDF file.
|
9
|
+
#
|
10
|
+
# Useful for testing the contents of a file in an rspec/test-unit suite.
|
11
|
+
#
|
12
|
+
# Usage:
|
13
|
+
#
|
14
|
+
# receiver = PDF::Reader::RegisterReceiver.new
|
15
|
+
# PDF::Reader.file("somefile.pdf", receiver)
|
16
|
+
# callback = receiver.first_occurance_of(:show_text)
|
17
|
+
# callback[:args].first.should == "Hello World"
|
18
|
+
#
|
2
19
|
class RegisterReceiver
|
3
20
|
|
4
21
|
attr_accessor :callbacks
|
@@ -31,6 +48,10 @@ class PDF::Reader
|
|
31
48
|
return ret
|
32
49
|
end
|
33
50
|
|
51
|
+
def all_args(methodname)
|
52
|
+
all(methodname).map { |cb| cb[:args] }
|
53
|
+
end
|
54
|
+
|
34
55
|
# return the details for the first time the specified callback was fired
|
35
56
|
def first_occurance_of(methodname)
|
36
57
|
callbacks.each do |cb|
|
data/lib/pdf/reader/stream.rb
CHANGED
@@ -50,7 +50,11 @@ class PDF::Reader
|
|
50
50
|
options = []
|
51
51
|
|
52
52
|
if hash.has_key?(:DecodeParms)
|
53
|
-
|
53
|
+
if hash[:DecodeParms].is_a?(Hash)
|
54
|
+
options = [hash[:DecodeParms]]
|
55
|
+
else
|
56
|
+
options = hash[:DecodeParms]
|
57
|
+
end
|
54
58
|
end
|
55
59
|
|
56
60
|
Array(hash[:Filter]).each_with_index do |filter, index|
|
@@ -96,7 +96,9 @@ class PDF::Reader
|
|
96
96
|
end
|
97
97
|
################################################################################
|
98
98
|
# PDF operator Tm
|
99
|
-
def set_text_matrix_and_text_line_matrix (
|
99
|
+
def set_text_matrix_and_text_line_matrix (*args)
|
100
|
+
# these variable names look bad, but they're from the PDF spec
|
101
|
+
a, b, c, d, e, f = *args
|
100
102
|
calculate_line_and_location(f)
|
101
103
|
end
|
102
104
|
################################################################################
|
data/lib/pdf/reader/token.rb
CHANGED
@@ -28,7 +28,7 @@ class PDF::Reader
|
|
28
28
|
# An internal PDF::Reader class that represents a single token from a PDF file.
|
29
29
|
#
|
30
30
|
# Behaves exactly like a Ruby String - it basically exists for convenience.
|
31
|
-
class Token < String
|
31
|
+
class Token < String # :nodoc:
|
32
32
|
################################################################################
|
33
33
|
# Creates a new token with the specified value
|
34
34
|
def initialize (val)
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -25,76 +25,48 @@
|
|
25
25
|
|
26
26
|
class PDF::Reader
|
27
27
|
################################################################################
|
28
|
-
# An internal PDF::Reader class that represents the
|
28
|
+
# An internal PDF::Reader class that represents the XRef table in a PDF file as a
|
29
|
+
# hash-like object.
|
30
|
+
#
|
29
31
|
# An Xref table is a map of object identifiers and byte offsets. Any time a particular
|
30
32
|
# object needs to be found, the Xref table is used to find where it is stored in the
|
31
33
|
# file.
|
34
|
+
#
|
35
|
+
# Hash keys are object ids, values are either:
|
36
|
+
#
|
37
|
+
# * a byte offset where the object starts (regular PDF objects)
|
38
|
+
# * a PDF::Reader::Reference instance that points to a stream that contains the
|
39
|
+
# desired object (PDF objects embedded in an object stream)
|
40
|
+
#
|
41
|
+
# The class behaves much like a standard Ruby hash, including the use of
|
42
|
+
# the Enumerable mixin. The key difference is no []= method - the hash
|
43
|
+
# is read only.
|
44
|
+
#
|
32
45
|
class XRef
|
46
|
+
include Enumerable
|
47
|
+
attr_reader :trailer
|
48
|
+
|
33
49
|
################################################################################
|
34
|
-
# create a new Xref table based on the contents of the supplied
|
50
|
+
# create a new Xref table based on the contents of the supplied io object
|
51
|
+
#
|
52
|
+
# io - must be an IO object, generally either a file or a StringIO
|
53
|
+
#
|
35
54
|
def initialize (io)
|
36
55
|
@io = io
|
37
56
|
@xref = {}
|
57
|
+
@trailer = load_offsets
|
38
58
|
end
|
59
|
+
################################################################################
|
60
|
+
# return the number of objects in this file. Objects with multiple generations are
|
61
|
+
# only counted once.
|
39
62
|
def size
|
40
63
|
@xref.size
|
41
64
|
end
|
42
65
|
################################################################################
|
43
|
-
# returns the PDF version of the current document. Technically this isn't part of the XRef
|
44
|
-
# table, but it is one of the lowest level data items in the file, so we've lumped it in
|
45
|
-
# with the cross reference code.
|
46
|
-
def pdf_version
|
47
|
-
@io.seek(0)
|
48
|
-
m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
|
49
|
-
raise MalformedPDFError, 'invalid PDF version' if version.nil?
|
50
|
-
return version.to_f
|
51
|
-
end
|
52
|
-
################################################################################
|
53
|
-
# Read the xref table from the underlying buffer. If offset is specified the table
|
54
|
-
# will be loaded from there, otherwise the default offset will be located and used.
|
55
|
-
#
|
56
|
-
# Will fail silently if there is no xref table at the requested offset.
|
57
|
-
def load (offset = nil)
|
58
|
-
offset ||= new_buffer.find_first_xref_offset
|
59
|
-
|
60
|
-
buf = new_buffer(offset)
|
61
|
-
token = buf.token
|
62
|
-
|
63
|
-
if token == "xref" || token == "ref"
|
64
|
-
load_xref_table(buf)
|
65
|
-
elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
|
66
|
-
raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
|
67
|
-
else
|
68
|
-
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
69
|
-
end
|
70
|
-
end
|
71
|
-
################################################################################
|
72
|
-
# Return a string containing the contents of an entire PDF object. The object is requested
|
73
|
-
# by specifying a PDF::Reader::Reference object that contains the objects ID and revision
|
74
|
-
# number
|
75
|
-
#
|
76
|
-
# If the object is a stream, that is returned as well
|
77
|
-
def object (ref)
|
78
|
-
return ref unless ref.kind_of?(Reference)
|
79
|
-
buf = new_buffer(offset_for(ref))
|
80
|
-
obj = Parser.new(buf, self).object(ref.id, ref.gen)
|
81
|
-
return obj
|
82
|
-
end
|
83
|
-
# returns the type of object a ref points to
|
84
|
-
def obj_type(ref)
|
85
|
-
obj = object(ref)
|
86
|
-
obj.class.to_s.to_sym
|
87
|
-
end
|
88
|
-
# returns true if the supplied references points to an object with a stream
|
89
|
-
def stream?(ref)
|
90
|
-
obj, stream = @xref.object(ref)
|
91
|
-
stream ? true : false
|
92
|
-
end
|
93
|
-
################################################################################
|
94
66
|
# returns the byte offset for the specified PDF object.
|
95
67
|
#
|
96
68
|
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
97
|
-
def
|
69
|
+
def [](ref)
|
98
70
|
@xref[ref.id][ref.gen]
|
99
71
|
rescue
|
100
72
|
raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
|
@@ -105,20 +77,42 @@ class PDF::Reader
|
|
105
77
|
ids = @xref.keys.sort
|
106
78
|
ids.each do |id|
|
107
79
|
gen = @xref[id].keys.sort[-1]
|
108
|
-
|
109
|
-
yield ref, object(ref)
|
80
|
+
yield PDF::Reader::Reference.new(id, gen)
|
110
81
|
end
|
111
82
|
end
|
112
83
|
################################################################################
|
113
|
-
# Stores an offset value for a particular PDF object ID and revision number
|
114
|
-
def store (id, gen, offset)
|
115
|
-
(@xref[id] ||= {})[gen] ||= offset
|
116
|
-
end
|
117
|
-
################################################################################
|
118
84
|
private
|
119
85
|
################################################################################
|
120
|
-
#
|
121
|
-
#
|
86
|
+
# Read a xref table from the underlying buffer.
|
87
|
+
#
|
88
|
+
# If offset is specified the table will be loaded from there, otherwise the
|
89
|
+
# default offset will be located and used.
|
90
|
+
#
|
91
|
+
# After seeking to the offset, processing is handed off to either load_xref_table()
|
92
|
+
# or load_xref_stream() based on what we find there.
|
93
|
+
#
|
94
|
+
def load_offsets(offset = nil)
|
95
|
+
offset ||= new_buffer.find_first_xref_offset
|
96
|
+
|
97
|
+
buf = new_buffer(offset)
|
98
|
+
tok_one = buf.token
|
99
|
+
|
100
|
+
return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
|
101
|
+
|
102
|
+
tok_two = buf.token
|
103
|
+
tok_three = buf.token
|
104
|
+
|
105
|
+
if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
|
106
|
+
buf = new_buffer(offset)
|
107
|
+
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
108
|
+
return load_xref_stream(stream)
|
109
|
+
end
|
110
|
+
|
111
|
+
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
|
112
|
+
end
|
113
|
+
################################################################################
|
114
|
+
# Assumes the underlying buffer is positioned at the start of a traditional
|
115
|
+
# Xref table and processes it into memory.
|
122
116
|
def load_xref_table(buf)
|
123
117
|
params = []
|
124
118
|
|
@@ -142,14 +136,82 @@ class PDF::Reader
|
|
142
136
|
|
143
137
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
|
144
138
|
|
145
|
-
|
139
|
+
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
140
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
146
141
|
|
147
142
|
trailer
|
148
143
|
end
|
149
144
|
|
145
|
+
################################################################################
|
146
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
|
+
#
|
148
|
+
def load_xref_stream(stream)
|
149
|
+
unless stream.hash[:Type] == :XRef
|
150
|
+
raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
|
151
|
+
end
|
152
|
+
trailer = {}
|
153
|
+
trailer[:Root] = stream.hash[:Root] if stream.hash[:Root]
|
154
|
+
trailer[:Info] = stream.hash[:Info] if stream.hash[:Info]
|
155
|
+
trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
|
156
|
+
|
157
|
+
widths = stream.hash[:W]
|
158
|
+
entry_length = widths.inject(0) { |s, w| s + w }
|
159
|
+
raw_data = stream.unfiltered_data
|
160
|
+
if stream.hash[:Index]
|
161
|
+
index = stream.hash[:Index][0]
|
162
|
+
else
|
163
|
+
index = 0
|
164
|
+
end
|
165
|
+
stream.hash[:Size].times do |i|
|
166
|
+
entry = raw_data[i*entry_length, entry_length] || ""
|
167
|
+
f1 = unpack_bytes(entry[0,widths[0]])
|
168
|
+
f2 = unpack_bytes(entry[widths[0],widths[1]])
|
169
|
+
f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
|
170
|
+
if f1 == 1
|
171
|
+
store(index + i, f3, f2)
|
172
|
+
elsif f1 == 2
|
173
|
+
store(index + i, 0, PDF::Reader::Reference.new(f2, 0))
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
178
|
+
|
179
|
+
trailer
|
180
|
+
end
|
181
|
+
################################################################################
|
182
|
+
# XRef streams pack info into integers 1-N bytes wide. Depending on the number of
|
183
|
+
# bytes they need to be converted to an int in different ways.
|
184
|
+
#
|
185
|
+
def unpack_bytes(bytes)
|
186
|
+
if bytes.to_s.size == 0
|
187
|
+
0
|
188
|
+
elsif bytes.size == 1
|
189
|
+
bytes.unpack("C")[0]
|
190
|
+
elsif bytes.size == 2
|
191
|
+
bytes.unpack("n")[0]
|
192
|
+
elsif bytes.size == 3
|
193
|
+
("\x00" + bytes).unpack("N")[0]
|
194
|
+
elsif bytes.size == 4
|
195
|
+
bytes.unpack("N")[0]
|
196
|
+
else
|
197
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
|
198
|
+
end
|
199
|
+
end
|
200
|
+
################################################################################
|
201
|
+
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
202
|
+
#
|
203
|
+
# We create multiple buffers so we can be tokenising multiple sections of the file
|
204
|
+
# at the same time without worrying about clearing the buffer's contents.
|
205
|
+
#
|
150
206
|
def new_buffer(offset = 0)
|
151
207
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
152
208
|
end
|
209
|
+
################################################################################
|
210
|
+
# Stores an offset value for a particular PDF object ID and revision number
|
211
|
+
#
|
212
|
+
def store (id, gen, offset)
|
213
|
+
(@xref[id] ||= {})[gen] ||= offset
|
214
|
+
end
|
153
215
|
end
|
154
216
|
################################################################################
|
155
217
|
end
|