pdf-reader 0.8.6 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -32,10 +32,10 @@ class PDF::Reader
32
32
  # Create a new parser around a PDF::Reader::Buffer object
33
33
  #
34
34
  # buffer - a PDF::Reader::Buffer object that contains PDF data
35
- # xref - a PDF::Reader::XRef object that represents the document's object offsets
36
- def initialize (buffer, xref=nil)
35
+ # ohash - a PDF::Reader::ObjectHash object that can return objects from the PDF file
36
+ def initialize (buffer, ohash=nil)
37
37
  @buffer = buffer
38
- @xref = xref
38
+ @ohash = ohash
39
39
  end
40
40
  ################################################################################
41
41
  # Reads the next token from the underlying buffer and converts it to an appropriate
@@ -46,23 +46,22 @@ class PDF::Reader
46
46
  token = @buffer.token
47
47
 
48
48
  case token
49
- when PDF::Reader::Reference then return token
50
- when nil then return nil
51
- when "/" then return pdf_name()
52
- when "<<" then return dictionary()
53
- when "[" then return array()
54
- when "(" then return string()
55
- when "<" then return hex_string()
56
- when "true" then return true
57
- when "false" then return false
58
- when "null" then return nil
59
- when "obj", "endobj" then return Token.new(token)
60
- when "stream", "endstream" then return Token.new(token)
61
- when ">>", "]", ">", ")" then return Token.new(token)
49
+ when PDF::Reader::Reference, nil then return token
50
+ when "/" then return pdf_name()
51
+ when "<<" then return dictionary()
52
+ when "[" then return array()
53
+ when "(" then return string()
54
+ when "<" then return hex_string()
55
+ when "true" then return true
56
+ when "false" then return false
57
+ when "null" then return nil
58
+ when "obj", "endobj", "stream", "endstream" then return Token.new(token)
59
+ when "stream", "endstream" then return Token.new(token)
60
+ when ">>", "]", ">", ")" then return Token.new(token)
62
61
  else
63
- if operators.has_key?(token) then return Token.new(token)
64
- elsif token =~ /\d*\.\d/ then return token.to_f
65
- else return token.to_i
62
+ if operators.has_key?(token) then return Token.new(token)
63
+ elsif token =~ /\d*\.\d/ then return token.to_f
64
+ else return token.to_i
66
65
  end
67
66
  end
68
67
  end
@@ -151,30 +150,68 @@ class PDF::Reader
151
150
  return "" if str == ")"
152
151
  Error.assert_equal(parse_token, ")")
153
152
 
154
- str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
155
- str.gsub!("\\n","\n")
156
- str.gsub!("\\r","\r")
157
- str.gsub!("\\t","\t")
158
- str.gsub!("\\b","\b")
159
- str.gsub!("\\f","\f")
160
- str.gsub!("\\(","(")
161
- str.gsub!("\\)",")")
162
- str.gsub!("\\\\","\\")
163
- str.gsub!(/\\\n/m,"")
164
-
165
- str.scan(/\\\d{1,3}/).each do |octal|
166
- str.gsub!(octal, octal[1,3].oct.chr)
153
+ ret = ""
154
+ idx = 0
155
+
156
+ while idx < str.size
157
+ chr = str[idx,1]
158
+ jump = 1
159
+
160
+ if chr == "\\"
161
+ jump = 2
162
+ case str[idx+1, 1]
163
+ when "" then jump = 1
164
+ when "n" then chr = "\n"
165
+ when "r" then chr = "\r"
166
+ when "t" then chr = "\t"
167
+ when "b" then chr = "\b"
168
+ when "f" then chr = "\f"
169
+ when "(" then chr = "("
170
+ when ")" then chr = ")"
171
+ when "\\" then chr = "\\"
172
+ when "\n" then
173
+ chr = ""
174
+ jump = 2
175
+ else
176
+ if str[idx+1,3].match(/\d{3}/)
177
+ jump = 4
178
+ chr = str[idx+1,3].oct.chr
179
+ elsif str[idx+1,2].match(/\d{2}/)
180
+ jump = 3
181
+ chr = ("0"+str[idx+1,2]).oct.chr
182
+ elsif str[idx+1,1].match(/\d/)
183
+ jump = 2
184
+ chr = ("00"+str[idx+1,1]).oct.chr
185
+ else
186
+ jump = 1
187
+ chr = ""
188
+ end
189
+
190
+ end
191
+ elsif chr == "\r" && str[idx+1,1] == "\n"
192
+ chr = "\n"
193
+ jump = 2
194
+ elsif chr == "\n" && str[idx+1,1] == "\r"
195
+ chr = "\n"
196
+ jump = 2
197
+ elsif chr == "\r"
198
+ chr = "\n"
199
+ end
200
+ ret << chr
201
+ idx += jump
167
202
  end
168
-
169
- str.gsub!(/\\([^\\])/,'\1')
170
-
171
- str
203
+ ret
172
204
  end
173
205
  ################################################################################
174
206
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
175
207
  def stream (dict)
176
208
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
177
- data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
209
+ if @ohash
210
+ length = @ohash.object(dict[:Length])
211
+ else
212
+ length = dict[:Length] || 0
213
+ end
214
+ data = @buffer.read(length, :skip_eol => true)
178
215
 
179
216
  Error.str_assert(parse_token, "endstream")
180
217
  Error.str_assert(parse_token, "endobj")
@@ -8,7 +8,6 @@ class PDF::Reader
8
8
  end
9
9
 
10
10
  def respond_to?(meth)
11
- return false if [:begin_inline_image_data].include?(meth)
12
11
  true
13
12
  end
14
13
 
@@ -1,4 +1,21 @@
1
+ # coding: utf-8
2
+
3
+ # Copyright (C) 2010 James Healy (jimmy@deefa.com)
4
+
1
5
  class PDF::Reader
6
+
7
+ # An example receiver that just records all callbacks generated by parsing
8
+ # a PDF file.
9
+ #
10
+ # Useful for testing the contents of a file in an rspec/test-unit suite.
11
+ #
12
+ # Usage:
13
+ #
14
+ # receiver = PDF::Reader::RegisterReceiver.new
15
+ # PDF::Reader.file("somefile.pdf", receiver)
16
+ # callback = receiver.first_occurance_of(:show_text)
17
+ # callback[:args].first.should == "Hello World"
18
+ #
2
19
  class RegisterReceiver
3
20
 
4
21
  attr_accessor :callbacks
@@ -31,6 +48,10 @@ class PDF::Reader
31
48
  return ret
32
49
  end
33
50
 
51
+ def all_args(methodname)
52
+ all(methodname).map { |cb| cb[:args] }
53
+ end
54
+
34
55
  # return the details for the first time the specified callback was fired
35
56
  def first_occurance_of(methodname)
36
57
  callbacks.each do |cb|
@@ -50,7 +50,11 @@ class PDF::Reader
50
50
  options = []
51
51
 
52
52
  if hash.has_key?(:DecodeParms)
53
- options = Array(hash[:DecodeParms])
53
+ if hash[:DecodeParms].is_a?(Hash)
54
+ options = [hash[:DecodeParms]]
55
+ else
56
+ options = hash[:DecodeParms]
57
+ end
54
58
  end
55
59
 
56
60
  Array(hash[:Filter]).each_with_index do |filter, index|
@@ -96,7 +96,9 @@ class PDF::Reader
96
96
  end
97
97
  ################################################################################
98
98
  # PDF operator Tm
99
- def set_text_matrix_and_text_line_matrix (a, b, c, d, e, f)
99
+ def set_text_matrix_and_text_line_matrix (*args)
100
+ # these variable names look bad, but they're from the PDF spec
101
+ a, b, c, d, e, f = *args
100
102
  calculate_line_and_location(f)
101
103
  end
102
104
  ################################################################################
@@ -28,7 +28,7 @@ class PDF::Reader
28
28
  # An internal PDF::Reader class that represents a single token from a PDF file.
29
29
  #
30
30
  # Behaves exactly like a Ruby String - it basically exists for convenience.
31
- class Token < String
31
+ class Token < String # :nodoc:
32
32
  ################################################################################
33
33
  # Creates a new token with the specified value
34
34
  def initialize (val)
@@ -25,76 +25,48 @@
25
25
 
26
26
  class PDF::Reader
27
27
  ################################################################################
28
- # An internal PDF::Reader class that represents the Xref table in a PDF file
28
+ # An internal PDF::Reader class that represents the XRef table in a PDF file as a
29
+ # hash-like object.
30
+ #
29
31
  # An Xref table is a map of object identifiers and byte offsets. Any time a particular
30
32
  # object needs to be found, the Xref table is used to find where it is stored in the
31
33
  # file.
34
+ #
35
+ # Hash keys are object ids, values are either:
36
+ #
37
+ # * a byte offset where the object starts (regular PDF objects)
38
+ # * a PDF::Reader::Reference instance that points to a stream that contains the
39
+ # desired object (PDF objects embedded in an object stream)
40
+ #
41
+ # The class behaves much like a standard Ruby hash, including the use of
42
+ # the Enumerable mixin. The key difference is no []= method - the hash
43
+ # is read only.
44
+ #
32
45
  class XRef
46
+ include Enumerable
47
+ attr_reader :trailer
48
+
33
49
  ################################################################################
34
- # create a new Xref table based on the contents of the supplied PDF::Reader::Buffer object
50
+ # create a new Xref table based on the contents of the supplied io object
51
+ #
52
+ # io - must be an IO object, generally either a file or a StringIO
53
+ #
35
54
  def initialize (io)
36
55
  @io = io
37
56
  @xref = {}
57
+ @trailer = load_offsets
38
58
  end
59
+ ################################################################################
60
+ # return the number of objects in this file. Objects with multiple generations are
61
+ # only counted once.
39
62
  def size
40
63
  @xref.size
41
64
  end
42
65
  ################################################################################
43
- # returns the PDF version of the current document. Technically this isn't part of the XRef
44
- # table, but it is one of the lowest level data items in the file, so we've lumped it in
45
- # with the cross reference code.
46
- def pdf_version
47
- @io.seek(0)
48
- m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
49
- raise MalformedPDFError, 'invalid PDF version' if version.nil?
50
- return version.to_f
51
- end
52
- ################################################################################
53
- # Read the xref table from the underlying buffer. If offset is specified the table
54
- # will be loaded from there, otherwise the default offset will be located and used.
55
- #
56
- # Will fail silently if there is no xref table at the requested offset.
57
- def load (offset = nil)
58
- offset ||= new_buffer.find_first_xref_offset
59
-
60
- buf = new_buffer(offset)
61
- token = buf.token
62
-
63
- if token == "xref" || token == "ref"
64
- load_xref_table(buf)
65
- elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
66
- raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
67
- else
68
- raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
69
- end
70
- end
71
- ################################################################################
72
- # Return a string containing the contents of an entire PDF object. The object is requested
73
- # by specifying a PDF::Reader::Reference object that contains the objects ID and revision
74
- # number
75
- #
76
- # If the object is a stream, that is returned as well
77
- def object (ref)
78
- return ref unless ref.kind_of?(Reference)
79
- buf = new_buffer(offset_for(ref))
80
- obj = Parser.new(buf, self).object(ref.id, ref.gen)
81
- return obj
82
- end
83
- # returns the type of object a ref points to
84
- def obj_type(ref)
85
- obj = object(ref)
86
- obj.class.to_s.to_sym
87
- end
88
- # returns true if the supplied references points to an object with a stream
89
- def stream?(ref)
90
- obj, stream = @xref.object(ref)
91
- stream ? true : false
92
- end
93
- ################################################################################
94
66
  # returns the byte offset for the specified PDF object.
95
67
  #
96
68
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
97
- def offset_for (ref)
69
+ def [](ref)
98
70
  @xref[ref.id][ref.gen]
99
71
  rescue
100
72
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
@@ -105,20 +77,42 @@ class PDF::Reader
105
77
  ids = @xref.keys.sort
106
78
  ids.each do |id|
107
79
  gen = @xref[id].keys.sort[-1]
108
- ref = PDF::Reader::Reference.new(id, gen)
109
- yield ref, object(ref)
80
+ yield PDF::Reader::Reference.new(id, gen)
110
81
  end
111
82
  end
112
83
  ################################################################################
113
- # Stores an offset value for a particular PDF object ID and revision number
114
- def store (id, gen, offset)
115
- (@xref[id] ||= {})[gen] ||= offset
116
- end
117
- ################################################################################
118
84
  private
119
85
  ################################################################################
120
- # Assumes the underlying buffer is positioned at the start of an Xref table and
121
- # processes it into memory.
86
+ # Read a xref table from the underlying buffer.
87
+ #
88
+ # If offset is specified the table will be loaded from there, otherwise the
89
+ # default offset will be located and used.
90
+ #
91
+ # After seeking to the offset, processing is handed off to either load_xref_table()
92
+ # or load_xref_stream() based on what we find there.
93
+ #
94
+ def load_offsets(offset = nil)
95
+ offset ||= new_buffer.find_first_xref_offset
96
+
97
+ buf = new_buffer(offset)
98
+ tok_one = buf.token
99
+
100
+ return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
101
+
102
+ tok_two = buf.token
103
+ tok_three = buf.token
104
+
105
+ if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
106
+ buf = new_buffer(offset)
107
+ stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
108
+ return load_xref_stream(stream)
109
+ end
110
+
111
+ raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
112
+ end
113
+ ################################################################################
114
+ # Assumes the underlying buffer is positioned at the start of a traditional
115
+ # Xref table and processes it into memory.
122
116
  def load_xref_table(buf)
123
117
  params = []
124
118
 
@@ -142,14 +136,82 @@ class PDF::Reader
142
136
 
143
137
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
144
138
 
145
- load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
139
+ load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
140
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
146
141
 
147
142
  trailer
148
143
  end
149
144
 
145
+ ################################################################################
146
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
+ #
148
+ def load_xref_stream(stream)
149
+ unless stream.hash[:Type] == :XRef
150
+ raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
151
+ end
152
+ trailer = {}
153
+ trailer[:Root] = stream.hash[:Root] if stream.hash[:Root]
154
+ trailer[:Info] = stream.hash[:Info] if stream.hash[:Info]
155
+ trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
156
+
157
+ widths = stream.hash[:W]
158
+ entry_length = widths.inject(0) { |s, w| s + w }
159
+ raw_data = stream.unfiltered_data
160
+ if stream.hash[:Index]
161
+ index = stream.hash[:Index][0]
162
+ else
163
+ index = 0
164
+ end
165
+ stream.hash[:Size].times do |i|
166
+ entry = raw_data[i*entry_length, entry_length] || ""
167
+ f1 = unpack_bytes(entry[0,widths[0]])
168
+ f2 = unpack_bytes(entry[widths[0],widths[1]])
169
+ f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
170
+ if f1 == 1
171
+ store(index + i, f3, f2)
172
+ elsif f1 == 2
173
+ store(index + i, 0, PDF::Reader::Reference.new(f2, 0))
174
+ end
175
+ end
176
+
177
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
178
+
179
+ trailer
180
+ end
181
+ ################################################################################
182
+ # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
183
+ # bytes they need to be converted to an int in different ways.
184
+ #
185
+ def unpack_bytes(bytes)
186
+ if bytes.to_s.size == 0
187
+ 0
188
+ elsif bytes.size == 1
189
+ bytes.unpack("C")[0]
190
+ elsif bytes.size == 2
191
+ bytes.unpack("n")[0]
192
+ elsif bytes.size == 3
193
+ ("\x00" + bytes).unpack("N")[0]
194
+ elsif bytes.size == 4
195
+ bytes.unpack("N")[0]
196
+ else
197
+ raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
198
+ end
199
+ end
200
+ ################################################################################
201
+ # Wrap the io stream we're working with in a buffer that can tokenise it for us.
202
+ #
203
+ # We create multiple buffers so we can be tokenising multiple sections of the file
204
+ # at the same time without worrying about clearing the buffers contents.
205
+ #
150
206
  def new_buffer(offset = 0)
151
207
  PDF::Reader::Buffer.new(@io, :seek => offset)
152
208
  end
209
+ ################################################################################
210
+ # Stores an offset value for a particular PDF object ID and revision number
211
+ #
212
+ def store (id, gen, offset)
213
+ (@xref[id] ||= {})[gen] ||= offset
214
+ end
153
215
  end
154
216
  ################################################################################
155
217
  end