pdf-reader 0.8.6 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/lib/pdf/reader/parser.rb
CHANGED
@@ -32,10 +32,10 @@ class PDF::Reader
|
|
32
32
|
# Create a new parser around a PDF::Reader::Buffer object
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
-
#
|
36
|
-
def initialize (buffer,
|
35
|
+
# ohash - a PDF::Reader::ObjectHash object that can return objects from the PDF file
|
36
|
+
def initialize (buffer, ohash=nil)
|
37
37
|
@buffer = buffer
|
38
|
-
@
|
38
|
+
@ohash = ohash
|
39
39
|
end
|
40
40
|
################################################################################
|
41
41
|
# Reads the next token from the underlying buffer and converts it to an appropriate
|
@@ -46,23 +46,22 @@ class PDF::Reader
|
|
46
46
|
token = @buffer.token
|
47
47
|
|
48
48
|
case token
|
49
|
-
when PDF::Reader::Reference
|
50
|
-
when
|
51
|
-
when "
|
52
|
-
when "
|
53
|
-
when "
|
54
|
-
when "
|
55
|
-
when "
|
56
|
-
when "
|
57
|
-
when "
|
58
|
-
when "
|
59
|
-
when "
|
60
|
-
when "
|
61
|
-
when ">>", "]", ">", ")" then return Token.new(token)
|
49
|
+
when PDF::Reader::Reference, nil then return token
|
50
|
+
when "/" then return pdf_name()
|
51
|
+
when "<<" then return dictionary()
|
52
|
+
when "[" then return array()
|
53
|
+
when "(" then return string()
|
54
|
+
when "<" then return hex_string()
|
55
|
+
when "true" then return true
|
56
|
+
when "false" then return false
|
57
|
+
when "null" then return nil
|
58
|
+
when "obj", "endobj", "stream", "endstream" then return Token.new(token)
|
59
|
+
when "stream", "endstream" then return Token.new(token)
|
60
|
+
when ">>", "]", ">", ")" then return Token.new(token)
|
62
61
|
else
|
63
|
-
if operators.has_key?(token)
|
64
|
-
elsif token =~ /\d*\.\d/
|
65
|
-
else
|
62
|
+
if operators.has_key?(token) then return Token.new(token)
|
63
|
+
elsif token =~ /\d*\.\d/ then return token.to_f
|
64
|
+
else return token.to_i
|
66
65
|
end
|
67
66
|
end
|
68
67
|
end
|
@@ -151,30 +150,68 @@ class PDF::Reader
|
|
151
150
|
return "" if str == ")"
|
152
151
|
Error.assert_equal(parse_token, ")")
|
153
152
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
str.
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
153
|
+
ret = ""
|
154
|
+
idx = 0
|
155
|
+
|
156
|
+
while idx < str.size
|
157
|
+
chr = str[idx,1]
|
158
|
+
jump = 1
|
159
|
+
|
160
|
+
if chr == "\\"
|
161
|
+
jump = 2
|
162
|
+
case str[idx+1, 1]
|
163
|
+
when "" then jump = 1
|
164
|
+
when "n" then chr = "\n"
|
165
|
+
when "r" then chr = "\r"
|
166
|
+
when "t" then chr = "\t"
|
167
|
+
when "b" then chr = "\b"
|
168
|
+
when "f" then chr = "\f"
|
169
|
+
when "(" then chr = "("
|
170
|
+
when ")" then chr = ")"
|
171
|
+
when "\\" then chr = "\\"
|
172
|
+
when "\n" then
|
173
|
+
chr = ""
|
174
|
+
jump = 2
|
175
|
+
else
|
176
|
+
if str[idx+1,3].match(/\d{3}/)
|
177
|
+
jump = 4
|
178
|
+
chr = str[idx+1,3].oct.chr
|
179
|
+
elsif str[idx+1,2].match(/\d{2}/)
|
180
|
+
jump = 3
|
181
|
+
chr = ("0"+str[idx+1,2]).oct.chr
|
182
|
+
elsif str[idx+1,1].match(/\d/)
|
183
|
+
jump = 2
|
184
|
+
chr = ("00"+str[idx+1,1]).oct.chr
|
185
|
+
else
|
186
|
+
jump = 1
|
187
|
+
chr = ""
|
188
|
+
end
|
189
|
+
|
190
|
+
end
|
191
|
+
elsif chr == "\r" && str[idx+1,1] == "\n"
|
192
|
+
chr = "\n"
|
193
|
+
jump = 2
|
194
|
+
elsif chr == "\n" && str[idx+1,1] == "\r"
|
195
|
+
chr = "\n"
|
196
|
+
jump = 2
|
197
|
+
elsif chr == "\r"
|
198
|
+
chr = "\n"
|
199
|
+
end
|
200
|
+
ret << chr
|
201
|
+
idx += jump
|
167
202
|
end
|
168
|
-
|
169
|
-
str.gsub!(/\\([^\\])/,'\1')
|
170
|
-
|
171
|
-
str
|
203
|
+
ret
|
172
204
|
end
|
173
205
|
################################################################################
|
174
206
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
175
207
|
def stream (dict)
|
176
208
|
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
|
177
|
-
|
209
|
+
if @ohash
|
210
|
+
length = @ohash.object(dict[:Length])
|
211
|
+
else
|
212
|
+
length = dict[:Length] || 0
|
213
|
+
end
|
214
|
+
data = @buffer.read(length, :skip_eol => true)
|
178
215
|
|
179
216
|
Error.str_assert(parse_token, "endstream")
|
180
217
|
Error.str_assert(parse_token, "endobj")
|
@@ -1,4 +1,21 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
|
4
|
+
|
1
5
|
class PDF::Reader
|
6
|
+
|
7
|
+
# An example receiver that just records all callbacks generated by parsing
|
8
|
+
# a PDF file.
|
9
|
+
#
|
10
|
+
# Useful for testing the contents of a file in an rspec/test-unit suite.
|
11
|
+
#
|
12
|
+
# Usage:
|
13
|
+
#
|
14
|
+
# receiver = PDF::Reader::RegisterReceiver.new
|
15
|
+
# PDF::Reader.file("somefile.pdf", receiver)
|
16
|
+
# callback = receiver.first_occurance_of(:show_text)
|
17
|
+
# callback[:args].first.should == "Hello World"
|
18
|
+
#
|
2
19
|
class RegisterReceiver
|
3
20
|
|
4
21
|
attr_accessor :callbacks
|
@@ -31,6 +48,10 @@ class PDF::Reader
|
|
31
48
|
return ret
|
32
49
|
end
|
33
50
|
|
51
|
+
def all_args(methodname)
|
52
|
+
all(methodname).map { |cb| cb[:args] }
|
53
|
+
end
|
54
|
+
|
34
55
|
# return the details for the first time the specified callback was fired
|
35
56
|
def first_occurance_of(methodname)
|
36
57
|
callbacks.each do |cb|
|
data/lib/pdf/reader/stream.rb
CHANGED
@@ -50,7 +50,11 @@ class PDF::Reader
|
|
50
50
|
options = []
|
51
51
|
|
52
52
|
if hash.has_key?(:DecodeParms)
|
53
|
-
|
53
|
+
if hash[:DecodeParms].is_a?(Hash)
|
54
|
+
options = [hash[:DecodeParms]]
|
55
|
+
else
|
56
|
+
options = hash[:DecodeParms]
|
57
|
+
end
|
54
58
|
end
|
55
59
|
|
56
60
|
Array(hash[:Filter]).each_with_index do |filter, index|
|
@@ -96,7 +96,9 @@ class PDF::Reader
|
|
96
96
|
end
|
97
97
|
################################################################################
|
98
98
|
# PDF operator Tm
|
99
|
-
def set_text_matrix_and_text_line_matrix (
|
99
|
+
def set_text_matrix_and_text_line_matrix (*args)
|
100
|
+
# these variable names look bad, but they're from the PDF spec
|
101
|
+
a, b, c, d, e, f = *args
|
100
102
|
calculate_line_and_location(f)
|
101
103
|
end
|
102
104
|
################################################################################
|
data/lib/pdf/reader/token.rb
CHANGED
@@ -28,7 +28,7 @@ class PDF::Reader
|
|
28
28
|
# An internal PDF::Reader class that represents a single token from a PDF file.
|
29
29
|
#
|
30
30
|
# Behaves exactly like a Ruby String - it basically exists for convenience.
|
31
|
-
class Token < String
|
31
|
+
class Token < String # :nodoc:
|
32
32
|
################################################################################
|
33
33
|
# Creates a new token with the specified value
|
34
34
|
def initialize (val)
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -25,76 +25,48 @@
|
|
25
25
|
|
26
26
|
class PDF::Reader
|
27
27
|
################################################################################
|
28
|
-
# An internal PDF::Reader class that represents the
|
28
|
+
# An internal PDF::Reader class that represents the XRef table in a PDF file as a
|
29
|
+
# hash-like object.
|
30
|
+
#
|
29
31
|
# An Xref table is a map of object identifiers and byte offsets. Any time a particular
|
30
32
|
# object needs to be found, the Xref table is used to find where it is stored in the
|
31
33
|
# file.
|
34
|
+
#
|
35
|
+
# Hash keys are object ids, values are either:
|
36
|
+
#
|
37
|
+
# * a byte offset where the object starts (regular PDF objects)
|
38
|
+
# * a PDF::Reader::Reference instance that points to a stream that contains the
|
39
|
+
# desired object (PDF objects embedded in an object stream)
|
40
|
+
#
|
41
|
+
# The class behaves much like a standard Ruby hash, including the use of
|
42
|
+
# the Enumerable mixin. The key difference is no []= method - the hash
|
43
|
+
# is read only.
|
44
|
+
#
|
32
45
|
class XRef
|
46
|
+
include Enumerable
|
47
|
+
attr_reader :trailer
|
48
|
+
|
33
49
|
################################################################################
|
34
|
-
# create a new Xref table based on the contents of the supplied
|
50
|
+
# create a new Xref table based on the contents of the supplied io object
|
51
|
+
#
|
52
|
+
# io - must be an IO object, generally either a file or a StringIO
|
53
|
+
#
|
35
54
|
def initialize (io)
|
36
55
|
@io = io
|
37
56
|
@xref = {}
|
57
|
+
@trailer = load_offsets
|
38
58
|
end
|
59
|
+
################################################################################
|
60
|
+
# return the number of objects in this file. Objects with multiple generations are
|
61
|
+
# only counted once.
|
39
62
|
def size
|
40
63
|
@xref.size
|
41
64
|
end
|
42
65
|
################################################################################
|
43
|
-
# returns the PDF version of the current document. Technically this isn't part of the XRef
|
44
|
-
# table, but it is one of the lowest level data items in the file, so we've lumped it in
|
45
|
-
# with the cross reference code.
|
46
|
-
def pdf_version
|
47
|
-
@io.seek(0)
|
48
|
-
m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
|
49
|
-
raise MalformedPDFError, 'invalid PDF version' if version.nil?
|
50
|
-
return version.to_f
|
51
|
-
end
|
52
|
-
################################################################################
|
53
|
-
# Read the xref table from the underlying buffer. If offset is specified the table
|
54
|
-
# will be loaded from there, otherwise the default offset will be located and used.
|
55
|
-
#
|
56
|
-
# Will fail silently if there is no xref table at the requested offset.
|
57
|
-
def load (offset = nil)
|
58
|
-
offset ||= new_buffer.find_first_xref_offset
|
59
|
-
|
60
|
-
buf = new_buffer(offset)
|
61
|
-
token = buf.token
|
62
|
-
|
63
|
-
if token == "xref" || token == "ref"
|
64
|
-
load_xref_table(buf)
|
65
|
-
elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
|
66
|
-
raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
|
67
|
-
else
|
68
|
-
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
|
69
|
-
end
|
70
|
-
end
|
71
|
-
################################################################################
|
72
|
-
# Return a string containing the contents of an entire PDF object. The object is requested
|
73
|
-
# by specifying a PDF::Reader::Reference object that contains the objects ID and revision
|
74
|
-
# number
|
75
|
-
#
|
76
|
-
# If the object is a stream, that is returned as well
|
77
|
-
def object (ref)
|
78
|
-
return ref unless ref.kind_of?(Reference)
|
79
|
-
buf = new_buffer(offset_for(ref))
|
80
|
-
obj = Parser.new(buf, self).object(ref.id, ref.gen)
|
81
|
-
return obj
|
82
|
-
end
|
83
|
-
# returns the type of object a ref points to
|
84
|
-
def obj_type(ref)
|
85
|
-
obj = object(ref)
|
86
|
-
obj.class.to_s.to_sym
|
87
|
-
end
|
88
|
-
# returns true if the supplied references points to an object with a stream
|
89
|
-
def stream?(ref)
|
90
|
-
obj, stream = @xref.object(ref)
|
91
|
-
stream ? true : false
|
92
|
-
end
|
93
|
-
################################################################################
|
94
66
|
# returns the byte offset for the specified PDF object.
|
95
67
|
#
|
96
68
|
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
97
|
-
def
|
69
|
+
def [](ref)
|
98
70
|
@xref[ref.id][ref.gen]
|
99
71
|
rescue
|
100
72
|
raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
|
@@ -105,20 +77,42 @@ class PDF::Reader
|
|
105
77
|
ids = @xref.keys.sort
|
106
78
|
ids.each do |id|
|
107
79
|
gen = @xref[id].keys.sort[-1]
|
108
|
-
|
109
|
-
yield ref, object(ref)
|
80
|
+
yield PDF::Reader::Reference.new(id, gen)
|
110
81
|
end
|
111
82
|
end
|
112
83
|
################################################################################
|
113
|
-
# Stores an offset value for a particular PDF object ID and revision number
|
114
|
-
def store (id, gen, offset)
|
115
|
-
(@xref[id] ||= {})[gen] ||= offset
|
116
|
-
end
|
117
|
-
################################################################################
|
118
84
|
private
|
119
85
|
################################################################################
|
120
|
-
#
|
121
|
-
#
|
86
|
+
# Read a xref table from the underlying buffer.
|
87
|
+
#
|
88
|
+
# If offset is specified the table will be loaded from there, otherwise the
|
89
|
+
# default offset will be located and used.
|
90
|
+
#
|
91
|
+
# After seeking to the offset, processing is handed off to either load_xref_table()
|
92
|
+
# or load_xref_stream() based on what we find there.
|
93
|
+
#
|
94
|
+
def load_offsets(offset = nil)
|
95
|
+
offset ||= new_buffer.find_first_xref_offset
|
96
|
+
|
97
|
+
buf = new_buffer(offset)
|
98
|
+
tok_one = buf.token
|
99
|
+
|
100
|
+
return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
|
101
|
+
|
102
|
+
tok_two = buf.token
|
103
|
+
tok_three = buf.token
|
104
|
+
|
105
|
+
if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
|
106
|
+
buf = new_buffer(offset)
|
107
|
+
stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
|
108
|
+
return load_xref_stream(stream)
|
109
|
+
end
|
110
|
+
|
111
|
+
raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
|
112
|
+
end
|
113
|
+
################################################################################
|
114
|
+
# Assumes the underlying buffer is positioned at the start of a traditional
|
115
|
+
# Xref table and processes it into memory.
|
122
116
|
def load_xref_table(buf)
|
123
117
|
params = []
|
124
118
|
|
@@ -142,14 +136,82 @@ class PDF::Reader
|
|
142
136
|
|
143
137
|
raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
|
144
138
|
|
145
|
-
|
139
|
+
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
140
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
146
141
|
|
147
142
|
trailer
|
148
143
|
end
|
149
144
|
|
145
|
+
################################################################################
|
146
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
|
+
#
|
148
|
+
def load_xref_stream(stream)
|
149
|
+
unless stream.hash[:Type] == :XRef
|
150
|
+
raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
|
151
|
+
end
|
152
|
+
trailer = {}
|
153
|
+
trailer[:Root] = stream.hash[:Root] if stream.hash[:Root]
|
154
|
+
trailer[:Info] = stream.hash[:Info] if stream.hash[:Info]
|
155
|
+
trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
|
156
|
+
|
157
|
+
widths = stream.hash[:W]
|
158
|
+
entry_length = widths.inject(0) { |s, w| s + w }
|
159
|
+
raw_data = stream.unfiltered_data
|
160
|
+
if stream.hash[:Index]
|
161
|
+
index = stream.hash[:Index][0]
|
162
|
+
else
|
163
|
+
index = 0
|
164
|
+
end
|
165
|
+
stream.hash[:Size].times do |i|
|
166
|
+
entry = raw_data[i*entry_length, entry_length] || ""
|
167
|
+
f1 = unpack_bytes(entry[0,widths[0]])
|
168
|
+
f2 = unpack_bytes(entry[widths[0],widths[1]])
|
169
|
+
f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
|
170
|
+
if f1 == 1
|
171
|
+
store(index + i, f3, f2)
|
172
|
+
elsif f1 == 2
|
173
|
+
store(index + i, 0, PDF::Reader::Reference.new(f2, 0))
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
|
178
|
+
|
179
|
+
trailer
|
180
|
+
end
|
181
|
+
################################################################################
|
182
|
+
# XRef streams pack info into integers 1-N bytes wide. Depending on the number of
|
183
|
+
# bytes they need to be converted to an int in different ways.
|
184
|
+
#
|
185
|
+
def unpack_bytes(bytes)
|
186
|
+
if bytes.to_s.size == 0
|
187
|
+
0
|
188
|
+
elsif bytes.size == 1
|
189
|
+
bytes.unpack("C")[0]
|
190
|
+
elsif bytes.size == 2
|
191
|
+
bytes.unpack("n")[0]
|
192
|
+
elsif bytes.size == 3
|
193
|
+
("\x00" + bytes).unpack("N")[0]
|
194
|
+
elsif bytes.size == 4
|
195
|
+
bytes.unpack("N")[0]
|
196
|
+
else
|
197
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
|
198
|
+
end
|
199
|
+
end
|
200
|
+
################################################################################
|
201
|
+
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
202
|
+
#
|
203
|
+
# We create multiple buffers so we can be tokenising multiple sections of the file
|
204
|
+
# at the same time without worrying about clearing the buffer's contents.
|
205
|
+
#
|
150
206
|
def new_buffer(offset = 0)
|
151
207
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
152
208
|
end
|
209
|
+
################################################################################
|
210
|
+
# Stores an offset value for a particular PDF object ID and revision number
|
211
|
+
#
|
212
|
+
def store (id, gen, offset)
|
213
|
+
(@xref[id] ||= {})[gen] ||= offset
|
214
|
+
end
|
153
215
|
end
|
154
216
|
################################################################################
|
155
217
|
end
|