pdf-reader 0.8.6 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,10 +32,10 @@ class PDF::Reader
32
32
  # Create a new parser around a PDF::Reader::Buffer object
33
33
  #
34
34
  # buffer - a PDF::Reader::Buffer object that contains PDF data
35
- # xref - a PDF::Reader::XRef object that represents the document's object offsets
36
- def initialize (buffer, xref=nil)
35
+ # ohash - a PDF::Reader::ObjectHash object that can return objects from the PDF file
36
+ def initialize (buffer, ohash=nil)
37
37
  @buffer = buffer
38
- @xref = xref
38
+ @ohash = ohash
39
39
  end
40
40
  ################################################################################
41
41
  # Reads the next token from the underlying buffer and converts it to an appropriate
@@ -46,23 +46,22 @@ class PDF::Reader
46
46
  token = @buffer.token
47
47
 
48
48
  case token
49
- when PDF::Reader::Reference then return token
50
- when nil then return nil
51
- when "/" then return pdf_name()
52
- when "<<" then return dictionary()
53
- when "[" then return array()
54
- when "(" then return string()
55
- when "<" then return hex_string()
56
- when "true" then return true
57
- when "false" then return false
58
- when "null" then return nil
59
- when "obj", "endobj" then return Token.new(token)
60
- when "stream", "endstream" then return Token.new(token)
61
- when ">>", "]", ">", ")" then return Token.new(token)
49
+ when PDF::Reader::Reference, nil then return token
50
+ when "/" then return pdf_name()
51
+ when "<<" then return dictionary()
52
+ when "[" then return array()
53
+ when "(" then return string()
54
+ when "<" then return hex_string()
55
+ when "true" then return true
56
+ when "false" then return false
57
+ when "null" then return nil
58
+ when "obj", "endobj", "stream", "endstream" then return Token.new(token)
59
+ when "stream", "endstream" then return Token.new(token)
60
+ when ">>", "]", ">", ")" then return Token.new(token)
62
61
  else
63
- if operators.has_key?(token) then return Token.new(token)
64
- elsif token =~ /\d*\.\d/ then return token.to_f
65
- else return token.to_i
62
+ if operators.has_key?(token) then return Token.new(token)
63
+ elsif token =~ /\d*\.\d/ then return token.to_f
64
+ else return token.to_i
66
65
  end
67
66
  end
68
67
  end
@@ -151,30 +150,68 @@ class PDF::Reader
151
150
  return "" if str == ")"
152
151
  Error.assert_equal(parse_token, ")")
153
152
 
154
- str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
155
- str.gsub!("\\n","\n")
156
- str.gsub!("\\r","\r")
157
- str.gsub!("\\t","\t")
158
- str.gsub!("\\b","\b")
159
- str.gsub!("\\f","\f")
160
- str.gsub!("\\(","(")
161
- str.gsub!("\\)",")")
162
- str.gsub!("\\\\","\\")
163
- str.gsub!(/\\\n/m,"")
164
-
165
- str.scan(/\\\d{1,3}/).each do |octal|
166
- str.gsub!(octal, octal[1,3].oct.chr)
153
+ ret = ""
154
+ idx = 0
155
+
156
+ while idx < str.size
157
+ chr = str[idx,1]
158
+ jump = 1
159
+
160
+ if chr == "\\"
161
+ jump = 2
162
+ case str[idx+1, 1]
163
+ when "" then jump = 1
164
+ when "n" then chr = "\n"
165
+ when "r" then chr = "\r"
166
+ when "t" then chr = "\t"
167
+ when "b" then chr = "\b"
168
+ when "f" then chr = "\f"
169
+ when "(" then chr = "("
170
+ when ")" then chr = ")"
171
+ when "\\" then chr = "\\"
172
+ when "\n" then
173
+ chr = ""
174
+ jump = 2
175
+ else
176
+ if str[idx+1,3].match(/\d{3}/)
177
+ jump = 4
178
+ chr = str[idx+1,3].oct.chr
179
+ elsif str[idx+1,2].match(/\d{2}/)
180
+ jump = 3
181
+ chr = ("0"+str[idx+1,2]).oct.chr
182
+ elsif str[idx+1,1].match(/\d/)
183
+ jump = 2
184
+ chr = ("00"+str[idx+1,1]).oct.chr
185
+ else
186
+ jump = 1
187
+ chr = ""
188
+ end
189
+
190
+ end
191
+ elsif chr == "\r" && str[idx+1,1] == "\n"
192
+ chr = "\n"
193
+ jump = 2
194
+ elsif chr == "\n" && str[idx+1,1] == "\r"
195
+ chr = "\n"
196
+ jump = 2
197
+ elsif chr == "\r"
198
+ chr = "\n"
199
+ end
200
+ ret << chr
201
+ idx += jump
167
202
  end
168
-
169
- str.gsub!(/\\([^\\])/,'\1')
170
-
171
- str
203
+ ret
172
204
  end
173
205
  ################################################################################
174
206
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
175
207
  def stream (dict)
176
208
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
177
- data = @buffer.read(@xref.object(dict[:Length]), :skip_eol => true)
209
+ if @ohash
210
+ length = @ohash.object(dict[:Length])
211
+ else
212
+ length = dict[:Length] || 0
213
+ end
214
+ data = @buffer.read(length, :skip_eol => true)
178
215
 
179
216
  Error.str_assert(parse_token, "endstream")
180
217
  Error.str_assert(parse_token, "endobj")
@@ -8,7 +8,6 @@ class PDF::Reader
8
8
  end
9
9
 
10
10
  def respond_to?(meth)
11
- return false if [:begin_inline_image_data].include?(meth)
12
11
  true
13
12
  end
14
13
 
@@ -1,4 +1,21 @@
1
+ # coding: utf-8
2
+
3
+ # Copyright (C) 2010 James Healy (jimmy@deefa.com)
4
+
1
5
  class PDF::Reader
6
+
7
+ # An example receiver that just records all callbacks generated by parsing
8
+ # a PDF file.
9
+ #
10
+ # Useful for testing the contents of a file in an rspec/test-unit suite.
11
+ #
12
+ # Usage:
13
+ #
14
+ # receiver = PDF::Reader::RegisterReceiver.new
15
+ # PDF::Reader.file("somefile.pdf", receiver)
16
+ # callback = receiver.first_occurance_of(:show_text)
17
+ # callback[:args].first.should == "Hello World"
18
+ #
2
19
  class RegisterReceiver
3
20
 
4
21
  attr_accessor :callbacks
@@ -31,6 +48,10 @@ class PDF::Reader
31
48
  return ret
32
49
  end
33
50
 
51
+ def all_args(methodname)
52
+ all(methodname).map { |cb| cb[:args] }
53
+ end
54
+
34
55
  # return the details for the first time the specified callback was fired
35
56
  def first_occurance_of(methodname)
36
57
  callbacks.each do |cb|
@@ -50,7 +50,11 @@ class PDF::Reader
50
50
  options = []
51
51
 
52
52
  if hash.has_key?(:DecodeParms)
53
- options = Array(hash[:DecodeParms])
53
+ if hash[:DecodeParms].is_a?(Hash)
54
+ options = [hash[:DecodeParms]]
55
+ else
56
+ options = hash[:DecodeParms]
57
+ end
54
58
  end
55
59
 
56
60
  Array(hash[:Filter]).each_with_index do |filter, index|
@@ -96,7 +96,9 @@ class PDF::Reader
96
96
  end
97
97
  ################################################################################
98
98
  # PDF operator Tm
99
- def set_text_matrix_and_text_line_matrix (a, b, c, d, e, f)
99
+ def set_text_matrix_and_text_line_matrix (*args)
100
+ # these variable names look bad, but they're from the PDF spec
101
+ a, b, c, d, e, f = *args
100
102
  calculate_line_and_location(f)
101
103
  end
102
104
  ################################################################################
@@ -28,7 +28,7 @@ class PDF::Reader
28
28
  # An internal PDF::Reader class that represents a single token from a PDF file.
29
29
  #
30
30
  # Behaves exactly like a Ruby String - it basically exists for convenience.
31
- class Token < String
31
+ class Token < String # :nodoc:
32
32
  ################################################################################
33
33
  # Creates a new token with the specified value
34
34
  def initialize (val)
@@ -25,76 +25,48 @@
25
25
 
26
26
  class PDF::Reader
27
27
  ################################################################################
28
- # An internal PDF::Reader class that represents the Xref table in a PDF file
28
+ # An internal PDF::Reader class that represents the XRef table in a PDF file as a
29
+ # hash-like object.
30
+ #
29
31
  # An Xref table is a map of object identifiers and byte offsets. Any time a particular
30
32
  # object needs to be found, the Xref table is used to find where it is stored in the
31
33
  # file.
34
+ #
35
+ # Hash keys are object ids, values are either:
36
+ #
37
+ # * a byte offset where the object starts (regular PDF objects)
38
+ # * a PDF::Reader::Reference instance that points to a stream that contains the
39
+ # desired object (PDF objects embedded in an object stream)
40
+ #
41
+ # The class behaves much like a standard Ruby hash, including the use of
42
+ # the Enumerable mixin. The key difference is no []= method - the hash
43
+ # is read only.
44
+ #
32
45
  class XRef
46
+ include Enumerable
47
+ attr_reader :trailer
48
+
33
49
  ################################################################################
34
- # create a new Xref table based on the contents of the supplied PDF::Reader::Buffer object
50
+ # create a new Xref table based on the contents of the supplied io object
51
+ #
52
+ # io - must be an IO object, generally either a file or a StringIO
53
+ #
35
54
  def initialize (io)
36
55
  @io = io
37
56
  @xref = {}
57
+ @trailer = load_offsets
38
58
  end
59
+ ################################################################################
60
+ # return the number of objects in this file. Objects with multiple generations are
61
+ # only counted once.
39
62
  def size
40
63
  @xref.size
41
64
  end
42
65
  ################################################################################
43
- # returns the PDF version of the current document. Technically this isn't part of the XRef
44
- # table, but it is one of the lowest level data items in the file, so we've lumped it in
45
- # with the cross reference code.
46
- def pdf_version
47
- @io.seek(0)
48
- m, version = *@io.read(8).match(/%PDF-(\d.\d)/)
49
- raise MalformedPDFError, 'invalid PDF version' if version.nil?
50
- return version.to_f
51
- end
52
- ################################################################################
53
- # Read the xref table from the underlying buffer. If offset is specified the table
54
- # will be loaded from there, otherwise the default offset will be located and used.
55
- #
56
- # Will fail silently if there is no xref table at the requested offset.
57
- def load (offset = nil)
58
- offset ||= new_buffer.find_first_xref_offset
59
-
60
- buf = new_buffer(offset)
61
- token = buf.token
62
-
63
- if token == "xref" || token == "ref"
64
- load_xref_table(buf)
65
- elsif token.to_i >= 0 && buf.token.to_i >= 0 && buf.token == "obj"
66
- raise PDF::Reader::UnsupportedFeatureError, "XRef streams are not supported in PDF::Reader yet"
67
- else
68
- raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
69
- end
70
- end
71
- ################################################################################
72
- # Return a string containing the contents of an entire PDF object. The object is requested
73
- # by specifying a PDF::Reader::Reference object that contains the objects ID and revision
74
- # number
75
- #
76
- # If the object is a stream, that is returned as well
77
- def object (ref)
78
- return ref unless ref.kind_of?(Reference)
79
- buf = new_buffer(offset_for(ref))
80
- obj = Parser.new(buf, self).object(ref.id, ref.gen)
81
- return obj
82
- end
83
- # returns the type of object a ref points to
84
- def obj_type(ref)
85
- obj = object(ref)
86
- obj.class.to_s.to_sym
87
- end
88
- # returns true if the supplied references points to an object with a stream
89
- def stream?(ref)
90
- obj, stream = @xref.object(ref)
91
- stream ? true : false
92
- end
93
- ################################################################################
94
66
  # returns the byte offset for the specified PDF object.
95
67
  #
96
68
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
97
- def offset_for (ref)
69
+ def [](ref)
98
70
  @xref[ref.id][ref.gen]
99
71
  rescue
100
72
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
@@ -105,20 +77,42 @@ class PDF::Reader
105
77
  ids = @xref.keys.sort
106
78
  ids.each do |id|
107
79
  gen = @xref[id].keys.sort[-1]
108
- ref = PDF::Reader::Reference.new(id, gen)
109
- yield ref, object(ref)
80
+ yield PDF::Reader::Reference.new(id, gen)
110
81
  end
111
82
  end
112
83
  ################################################################################
113
- # Stores an offset value for a particular PDF object ID and revision number
114
- def store (id, gen, offset)
115
- (@xref[id] ||= {})[gen] ||= offset
116
- end
117
- ################################################################################
118
84
  private
119
85
  ################################################################################
120
- # Assumes the underlying buffer is positioned at the start of an Xref table and
121
- # processes it into memory.
86
+ # Read an xref table from the underlying buffer.
87
+ #
88
+ # If offset is specified the table will be loaded from there, otherwise the
89
+ # default offset will be located and used.
90
+ #
91
+ # After seeking to the offset, processing is handed off to either load_xref_table()
92
+ # or load_xref_stream() based on what we find there.
93
+ #
94
+ def load_offsets(offset = nil)
95
+ offset ||= new_buffer.find_first_xref_offset
96
+
97
+ buf = new_buffer(offset)
98
+ tok_one = buf.token
99
+
100
+ return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
101
+
102
+ tok_two = buf.token
103
+ tok_three = buf.token
104
+
105
+ if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
106
+ buf = new_buffer(offset)
107
+ stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
108
+ return load_xref_stream(stream)
109
+ end
110
+
111
+ raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
112
+ end
113
+ ################################################################################
114
+ # Assumes the underlying buffer is positioned at the start of a traditional
115
+ # Xref table and processes it into memory.
122
116
  def load_xref_table(buf)
123
117
  params = []
124
118
 
@@ -142,14 +136,82 @@ class PDF::Reader
142
136
 
143
137
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
144
138
 
145
- load(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
139
+ load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
140
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
146
141
 
147
142
  trailer
148
143
  end
149
144
 
145
+ ################################################################################
146
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
+ #
148
+ def load_xref_stream(stream)
149
+ unless stream.hash[:Type] == :XRef
150
+ raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
151
+ end
152
+ trailer = {}
153
+ trailer[:Root] = stream.hash[:Root] if stream.hash[:Root]
154
+ trailer[:Info] = stream.hash[:Info] if stream.hash[:Info]
155
+ trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
156
+
157
+ widths = stream.hash[:W]
158
+ entry_length = widths.inject(0) { |s, w| s + w }
159
+ raw_data = stream.unfiltered_data
160
+ if stream.hash[:Index]
161
+ index = stream.hash[:Index][0]
162
+ else
163
+ index = 0
164
+ end
165
+ stream.hash[:Size].times do |i|
166
+ entry = raw_data[i*entry_length, entry_length] || ""
167
+ f1 = unpack_bytes(entry[0,widths[0]])
168
+ f2 = unpack_bytes(entry[widths[0],widths[1]])
169
+ f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
170
+ if f1 == 1
171
+ store(index + i, f3, f2)
172
+ elsif f1 == 2
173
+ store(index + i, 0, PDF::Reader::Reference.new(f2, 0))
174
+ end
175
+ end
176
+
177
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
178
+
179
+ trailer
180
+ end
181
+ ################################################################################
182
+ # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
183
+ # bytes they need to be converted to an int in different ways.
184
+ #
185
+ def unpack_bytes(bytes)
186
+ if bytes.to_s.size == 0
187
+ 0
188
+ elsif bytes.size == 1
189
+ bytes.unpack("C")[0]
190
+ elsif bytes.size == 2
191
+ bytes.unpack("n")[0]
192
+ elsif bytes.size == 3
193
+ ("\x00" + bytes).unpack("N")[0]
194
+ elsif bytes.size == 4
195
+ bytes.unpack("N")[0]
196
+ else
197
+ raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
198
+ end
199
+ end
200
+ ################################################################################
201
+ # Wrap the io stream we're working with in a buffer that can tokenise it for us.
202
+ #
203
+ # We create multiple buffers so we can be tokenising multiple sections of the file
204
+ # at the same time without worrying about clearing the buffer's contents.
205
+ #
150
206
  def new_buffer(offset = 0)
151
207
  PDF::Reader::Buffer.new(@io, :seek => offset)
152
208
  end
209
+ ################################################################################
210
+ # Stores an offset value for a particular PDF object ID and revision number
211
+ #
212
+ def store (id, gen, offset)
213
+ (@xref[id] ||= {})[gen] ||= offset
214
+ end
153
215
  end
154
216
  ################################################################################
155
217
  end