pdf-reader 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,13 @@
1
+ v0.6.2 (22nd March 2008)
2
+ - Catch low level errors when applying filters to a content stream and raise a MalformedPDFError instead.
3
+ - Added support for processing inline images
4
+ - Support for parsing XRef tables that have multiple subsections
5
+ - Added a few callbacks to improve the way we supply information on page resources
6
+ - Ignore whitespace in hex strings, as required by the spec (section 3.2.3)
7
+ - Use our "unknown character box" when a single character in an Identity-H string fails to decode
8
+ - Support ToUnicode CMaps that use the bfrange operator
9
+ - Tweaked tokenising code to ensure whitespace doesn't get in the way
10
+
1
11
  v0.6.1 (12th March 2008)
2
12
  - Tweaked behaviour when we encounter Identity-H encoded text that doesn't have a ToUnicode mapping. We
3
13
  just replace each character with a little box.
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
6
6
  require "rake/gempackagetask"
7
7
  require 'spec/rake/spectask'
8
8
 
9
- PKG_VERSION = "0.6.1"
9
+ PKG_VERSION = "0.6.2"
10
10
  PKG_NAME = "pdf-reader"
11
11
  PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
12
12
 
data/TODO CHANGED
@@ -4,7 +4,6 @@ v0.7
4
4
  - maybe a third option to Reader.parse?
5
5
  parse(io, receiver, {:pages => true, :fonts => false, :metadata => true, :bookmarks => false})
6
6
  - detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
7
- - When parsing a CMap into a ruby object, recognise ranged mappings defined by begincodespacerange (see spec, section 5.9.2)
8
7
  - Provide a way to get raw access to a particular object. Good for testing purposes
9
8
 
10
9
  v0.8
@@ -14,10 +13,14 @@ v0.8
14
13
 
15
14
  v0.9
16
15
  - Support for CJK text (convert to UTF-8 like all other encodings. See Section 5.9 of the PDF spec)
16
+ - Will require significantly improved handling of CMaps, including creating a bunch of predefined ones
17
17
  - Add a way to extract raster images
18
-
18
+ - see XObjects section of spec (section 4.7)
19
+ - Add a way to extract font data?
19
20
 
20
21
  Sometime
22
+ - Work out why specs/data/zlib*.pdf isn't parsed correctly when all the major PDF viewers can display it correctly
23
+
21
24
  - Ship some extra receivers in the standard package, particularly ones that are useful for running
22
25
  rspec over generated PDF files
23
26
 
@@ -33,3 +36,5 @@ Sometime
33
36
  - Identity-V (I *think* this relates to vertical text. Not sure how we'd support it sensibly)
34
37
 
35
38
  - Investigate how R->L text is handled
39
+
40
+ - Add support for object streams (spec section 3.4.6)
data/lib/pdf/reader.rb CHANGED
@@ -87,7 +87,6 @@ require 'pdf/reader/text_receiver'
87
87
  require 'pdf/reader/token'
88
88
  require 'pdf/reader/xref'
89
89
 
90
-
91
90
  class PDF::Reader
92
91
  ################################################################################
93
92
  # Initialize a new PDF::Reader
@@ -56,6 +56,24 @@ class PDF::Reader
56
56
  out
57
57
  end
58
58
  ################################################################################
59
+ # Reads from the buffer until the specified token is found, or the end of the buffer
60
+ #
61
+ # bytes - the bytes to search for.
62
+ def read_until(bytes)
63
+ out = ""
64
+ size = bytes.size
65
+
66
+ loop do
67
+ out << @io.read(1)
68
+ if out[-1 * size,size].eql?(bytes)
69
+ out = out[0, out.size - size]
70
+ seek(pos - size)
71
+ break
72
+ end
73
+ end
74
+ out
75
+ end
76
+ ################################################################################
59
77
  # returns true if the underlying IO object is at end and the internal buffer
60
78
  # is empty
61
79
  def eof?
@@ -71,21 +89,21 @@ class PDF::Reader
71
89
  end
72
90
  ################################################################################
73
91
  # PDF files are processed by tokenising the content into a series of objects and commands.
74
- # This prepares the buffer for use by rerading the next line of tokens into memory.
92
+ # This prepares the buffer for use by reading the next line of tokens into memory.
75
93
  def ready_token (with_strip=true, skip_blanks=true)
76
94
  while @buffer.nil? or @buffer.empty?
77
95
  @buffer = @io.readline
78
96
  @buffer.sub!(/%.*$/, '')
79
97
  @buffer.chomp!
80
- @buffer.lstrip! if with_strip
81
98
  break unless skip_blanks
82
99
  end
100
+ @buffer.lstrip! if with_strip
83
101
  end
84
102
  ################################################################################
85
103
  # return the next token from the underlying IO stream
86
104
  def token
87
105
  ready_token
88
-
106
+
89
107
  i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size
90
108
 
91
109
  token_chars =
@@ -28,12 +28,24 @@ class PDF::Reader
28
28
 
29
29
  def initialize(data)
30
30
  @map = {}
31
- inmap = false
31
+ in_char_mode = false
32
+ in_range_mode = false
33
+
32
34
  data.each_line do |l|
33
- inmap = true if l.include?("beginbfchar")
34
- if inmap
35
- m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
36
- @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
35
+ if l.include?("beginbfchar")
36
+ in_char_mode = true
37
+ elsif l.include?("endbfchar")
38
+ in_char_mode = false
39
+ elsif l.include?("beginbfrange")
40
+ in_range_mode = true
41
+ elsif l.include?("endbfrange")
42
+ in_range_mode = false
43
+ end
44
+
45
+ if in_char_mode
46
+ process_bfchar_line(l)
47
+ elsif in_range_mode
48
+ process_bfrange_line(l)
37
49
  end
38
50
  end
39
51
  end
@@ -44,5 +56,29 @@ class PDF::Reader
44
56
  @map[c]
45
57
  end
46
58
 
59
+ private
60
+
61
+ def process_bfchar_line(l)
62
+ m, find, replace = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
63
+ @map["0x#{find}".hex] = "0x#{replace}".hex if find && replace
64
+ end
65
+
66
+ def process_bfrange_line(l)
67
+ m, start_code, end_code, dst = *l.match(/<([0-9a-fA-F]+)> <([0-9a-fA-F]+)> <([0-9a-fA-F]+)>/)
68
+ if start_code && end_code && dst
69
+ start_code = "0x#{start_code}".hex
70
+ end_code = "0x#{end_code}".hex
71
+ dst = "0x#{dst}".hex
72
+ incr = 0
73
+
74
+ # add all values in the range to our mapping
75
+ (start_code..end_code).each do |val|
76
+ @map[val] = dst + incr
77
+ incr += 1
78
+ # ensure a single range does not exceed 255 chars
79
+ raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if incr > 255
80
+ end
81
+ end
82
+ end
47
83
  end
48
84
  end
@@ -23,6 +23,7 @@
23
23
  #
24
24
  ################################################################################
25
25
  require 'stringio'
26
+ #require 'enumerable'
26
27
 
27
28
  class PDF::Reader
28
29
  ################################################################################
@@ -144,6 +145,25 @@ class PDF::Reader
144
145
  # - end_page_container
145
146
  # - begin_page
146
147
  # - end_page
148
+ #
149
+ # == Resource Callbacks
150
+ #
151
+ # Each page and page_container can contain a range of resources required for the page,
152
+ # including things like fonts and images. The following callbacks may appear
153
+ # after begin_page_container and begin_page if the relevant resources exist
154
+ # on a page:
155
+ #
156
+ # In most cases, these callbacks associate a name with each resource, allowing it
157
+ # to be referred to by name in the page content. For example, an XObject can hold an image.
158
+ # If it gets mapped to the name "IM1", then it can be placed on the page using
159
+ # invoke_xobject "IM1".
160
+ #
161
+ # - resource_procset
162
+ # - resource_xobject
163
+ # - resource_extgstate
164
+ # - resource_colorspace
165
+ # - resource_pattern
166
+ # - resource_font
147
167
  class Content
148
168
  OPERATORS = {
149
169
  'b' => :close_fill_stroke,
@@ -240,20 +260,27 @@ class PDF::Reader
240
260
  # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
241
261
  # its content
242
262
  def walk_pages (page)
243
- resolve_resources(@xref.object(page['Resources'])) if page['Resources']
263
+
264
+ if page['Resources']
265
+ res = page['Resources']
266
+ page.delete('Resources')
267
+ end
244
268
 
245
269
  # extract page content
246
270
  if page['Type'] == "Pages"
247
271
  callback(:begin_page_container, [page])
272
+ walk_resources(@xref.object(res)) if res
248
273
  page['Kids'].each {|child| walk_pages(@xref.object(child))}
249
274
  callback(:end_page_container)
250
275
  elsif page['Type'] == "Page"
251
276
  callback(:begin_page, [page])
277
+ walk_resources(@xref.object(res)) if res
252
278
  @page = page
253
279
  @params = []
254
280
 
255
281
  page['Contents'].to_a.each do |cstream|
256
- content_stream(@xref.object(cstream))
282
+ obj, stream = @xref.object(cstream)
283
+ content_stream(stream)
257
284
  end if page.has_key?('Contents') and page['Contents']
258
285
 
259
286
  callback(:end_page)
@@ -274,9 +301,19 @@ class PDF::Reader
274
301
  if token.kind_of?(Token) and OPERATORS.has_key?(token)
275
302
  @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
276
303
 
277
- # convert any text to utf-8
304
+ # handle special cases in response to certain operators
278
305
  if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
306
+ # convert any text to utf-8
279
307
  @params = @fonts[@current_font].to_utf8(@params)
308
+ elsif token == "ID"
309
+ # inline image data, first convert the current params into a more familiar hash
310
+ map = {}
311
+ @params.each_slice(2) do |a|
312
+ map[a.first] = a.last
313
+ end
314
+ @params = [map]
315
+ # read the raw image data from the buffer without tokenising
316
+ @params << @buffer.read_until("EI")
280
317
  end
281
318
  callback(OPERATORS[token], @params)
282
319
  @params.clear
@@ -289,7 +326,43 @@ class PDF::Reader
289
326
  rescue EOFError => e
290
327
  end
291
328
  ################################################################################
292
- def resolve_resources(resources)
329
+ def walk_resources(resources)
330
+ resources = resolve_references(resources)
331
+
332
+ # extract any procset information
333
+ if resources['ProcSet']
334
+ callback(:resource_procset, resources['ProcSet'])
335
+ end
336
+
337
+ # extract any xobject information
338
+ if resources['XObject']
339
+ @xref.object(resources['XObject']).each do |name, val|
340
+ obj, stream = @xref.object(val)
341
+ callback(:resource_xobject, [name, obj, stream])
342
+ end
343
+ end
344
+
345
+ # extract any extgstate information
346
+ if resources['ExtGState']
347
+ @xref.object(resources['ExtGState']).each do |name, val|
348
+ callback(:resource_extgstate, [name, @xref.object(val)])
349
+ end
350
+ end
351
+
352
+ # extract any colorspace information
353
+ if resources['ColorSpace']
354
+ @xref.object(resources['ColorSpace']).each do |name, val|
355
+ callback(:resource_colorspace, [name, @xref.object(val)])
356
+ end
357
+ end
358
+
359
+ # extract any pattern information
360
+ if resources['Pattern']
361
+ @xref.object(resources['Pattern']).each do |name, val|
362
+ callback(:resource_pattern, [name, @xref.object(val)])
363
+ end
364
+ end
365
+
293
366
  # extract any font information
294
367
  if resources['Font']
295
368
  @xref.object(resources['Font']).each do |label, desc|
@@ -301,15 +374,29 @@ class PDF::Reader
301
374
  @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc['Encoding']))
302
375
  @fonts[label].descendantfonts = desc['DescendantFonts'] if desc['DescendantFonts']
303
376
  if desc['ToUnicode']
304
- @fonts[label].tounicode = desc['ToUnicode']
305
- @fonts[label].tounicode = @xref.object(@fonts[label].tounicode)
377
+ obj, cmap = @xref.object(desc['ToUnicode'])
378
+
379
+ # this stream is a cmap
380
+ begin
381
+ @fonts[label].tounicode = PDF::Reader::CMap.new(cmap)
382
+ rescue
383
+ # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
384
+ end
306
385
  end
386
+ callback(:resource_font, [label, @fonts[label]])
307
387
  end
308
388
  end
309
- #@fonts.each do |key,val|
310
- # puts "#{key}: #{val.inspect}"
311
- # puts
312
- #end
389
+ end
390
+ ################################################################################
391
+ # Convert any PDF::Reader::Reference objects into the real object they point to
392
+ def resolve_references(obj)
393
+ case obj
394
+ when PDF::Reader::Reference then resolve_references(@xref.object(obj))
395
+ when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
396
+ when Array then obj.collect { |item| resolve_references(item) }
397
+ else
398
+ obj
399
+ end
313
400
  end
314
401
  ################################################################################
315
402
  # calls the name callback method on the receiver class with params as the arguments
@@ -111,12 +111,13 @@ class PDF::Reader
111
111
  # iterate over string, reading it in 2 byte chunks and interpreting those
112
112
  # chunks as ints
113
113
  str.unpack("n*").each do |c|
114
+
114
115
  # convert the int to a unicode codepoint if possible.
115
116
  # without a ToUnicode CMap, it's impossible to reliably convert this text
116
117
  # to unicode, so just replace each character with a little box. Big smacks
117
118
  # the the PDF producing app.
118
- if map
119
- array_enc << map.decode(c)
119
+ if map && (code = map.decode(c))
120
+ array_enc << code
120
121
  else
121
122
  array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
122
123
  end
@@ -40,20 +40,27 @@ class PDF::Reader
40
40
 
41
41
  case name
42
42
  when "FlateDecode" then @filter = :flate
43
- else raise UnsupportedFeatureError, "Unknown filter: #{name}"
43
+ #else raise UnsupportedFeatureError, "Unknown filter: #{name}"
44
44
  end
45
45
  end
46
46
  ################################################################################
47
47
  # attempts to decode the specified data with the current filter
48
48
  def filter (data)
49
+ # leave the data untouched if we don't support the required filter
50
+ return data if @filter.nil?
51
+
52
+ # decode the data
49
53
  self.send(@filter, data)
50
54
  end
51
55
  ################################################################################
52
56
  # Decode the specified data with the Zlib compression algorithm
53
57
  def flate (data)
54
- z = Zlib::Inflate.new
55
- z << data
56
- z.inflate(nil)
58
+ begin
59
+ z = Zlib::Inflate.new
60
+ z.inflate(data)
61
+ rescue Exception => e
62
+ raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
63
+ end
57
64
  end
58
65
  ################################################################################
59
66
  end
@@ -9,10 +9,10 @@
9
9
  # distribute, sublicense, and/or sell copies of the Software, and to
10
10
  # permit persons to whom the Software is furnished to do so, subject to
11
11
  # the following conditions:
12
- #
12
+ #
13
13
  # The above copyright notice and this permission notice shall be
14
14
  # included in all copies or substantial portions of the Software.
15
- #
15
+ #
16
16
  # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
17
  # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
18
  # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -58,9 +58,9 @@ class PDF::Reader
58
58
  when "obj", "endobj" then return Token.new(token)
59
59
  when "stream", "endstream" then return Token.new(token)
60
60
  when ">>", "]", ">" then return Token.new(token)
61
- else
61
+ else
62
62
  if operators.has_key?(token) then return Token.new(token)
63
- else return token.to_f
63
+ else return token.to_f
64
64
  end
65
65
  end
66
66
  end
@@ -72,7 +72,7 @@ class PDF::Reader
72
72
  loop do
73
73
  key = parse_token
74
74
  break if key.kind_of?(Token) and key == ">>"
75
- raise MalformedPDFError, "PDF malformed, dictionary key is not a name" unless key.kind_of?(Name)
75
+ raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Name)
76
76
 
77
77
  value = parse_token
78
78
  value.kind_of?(Token) and Error.str_assert_not(value, ">>")
@@ -97,9 +97,15 @@ class PDF::Reader
97
97
  ################################################################################
98
98
  # Reads a PDF hex string from the buffer and converts it to a Ruby String
99
99
  def hex_string
100
- str = @buffer.token
101
- Error.str_assert(@buffer.token, ">")
100
+ str = ""
101
+
102
+ loop do
103
+ token = @buffer.token
104
+ break if token == ">"
105
+ str << token
106
+ end
102
107
 
108
+ # add a missing digit if required, as required by the spec
103
109
  str << "0" unless str.size % 2 == 0
104
110
  str.scan(/../).map {|i| i.hex.chr}.join
105
111
  end
@@ -151,11 +157,12 @@ class PDF::Reader
151
157
 
152
158
  @buffer.head(to_remove, false)
153
159
  end
154
-
155
160
  str
156
161
  end
157
162
  ################################################################################
158
163
  # Reads an entire PDF object from the buffer and returns it as a Ruby String.
164
+ # If the object is a content stream, returns both the stream and the dictionary
165
+ # that describes it
159
166
  #
160
167
  # id - the object ID to return
161
168
  # gen - the object revision number to return
@@ -166,11 +173,10 @@ class PDF::Reader
166
173
 
167
174
  obj = parse_token
168
175
  post_obj = parse_token
169
-
170
176
  case post_obj
171
177
  when "endobj" then return obj
172
- when "stream" then return stream(obj)
173
- else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
178
+ when "stream" then return obj, stream(obj)
179
+ else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
174
180
  end
175
181
  end
176
182
  ################################################################################
@@ -178,6 +184,7 @@ class PDF::Reader
178
184
  def stream (dict)
179
185
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?('Length')
180
186
  data = @buffer.read(@xref.object(dict['Length']))
187
+
181
188
  Error.str_assert(parse_token, "endstream")
182
189
  Error.str_assert(parse_token, "endobj")
183
190
 
@@ -193,9 +200,6 @@ class PDF::Reader
193
200
  end
194
201
  end
195
202
 
196
- # this stream is a cmap
197
- data = PDF::Reader::CMap.new(data) if data.include?("begincmap") && data.include?("endcmap")
198
-
199
203
  data
200
204
  end
201
205
  ################################################################################
@@ -42,44 +42,69 @@ class PDF::Reader
42
42
  #
43
43
  # Raises a MalformedPDFError if there is no xref table at the requested offset.
44
44
  def load (offset = nil)
45
- @buffer.seek(offset || @buffer.find_first_xref_offset)
45
+ offset ||= @buffer.find_first_xref_offset
46
+ @buffer.seek(offset)
46
47
  token = @buffer.token
47
-
48
- if token == "xref"
48
+
49
+ if token == "xref" || token == "ref"
49
50
  load_xref_table
51
+ else
52
+ raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{token} != xref)"
50
53
  end
51
54
  end
52
55
  ################################################################################
53
56
  # Return a string containing the contents of an entire PDF object. The object is requested
54
57
  # by specifying a PDF::Reader::Reference object that contains the objects ID and revision
55
58
  # number
59
+ #
60
+ # If the object is a stream, that is returned as well
56
61
  def object (ref, save_pos = true)
57
62
  return ref unless ref.kind_of?(Reference)
58
63
  pos = @buffer.pos if save_pos
59
- parser = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
64
+ obj, stream = Parser.new(@buffer.seek(offset_for(ref)), self).object(ref.id, ref.gen)
60
65
  @buffer.seek(pos) if save_pos
61
- parser
66
+ if stream
67
+ return obj, stream
68
+ else
69
+ return obj
70
+ end
62
71
  end
63
72
  ################################################################################
64
73
  # Assumes the underlying buffer is positioned at the start of an Xref table and
65
74
  # processes it into memory.
66
75
  def load_xref_table
67
- objid, count = @buffer.token.to_i, @buffer.token.to_i
76
+ tok_one = tok_two = nil
77
+
78
+ begin
79
+ # loop over all subsections of the xref table
80
+ # In a well formed PDF, the 'trailer' token will indicate
81
+ # the end of the table. However we need to be careful in case
82
+ # we're processing a malformed pdf that is missing the trailer.
83
+ loop do
84
+ tok_one, tok_two = @buffer.token, @buffer.token
85
+ if tok_one != "trailer" && !tok_one.match(/\d+/)
86
+ raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
87
+ end
88
+ break if tok_one == "trailer" or tok_one.nil?
89
+ objid, count = tok_one.to_i, tok_two.to_i
68
90
 
69
- count.times do
70
- offset = @buffer.token.to_i
71
- generation = @buffer.token.to_i
72
- state = @buffer.token
91
+ count.times do
92
+ offset = @buffer.token.to_i
93
+ generation = @buffer.token.to_i
94
+ state = @buffer.token
73
95
 
74
- store(objid, generation, offset) if state == "n"
75
- objid += 1
96
+ store(objid, generation, offset) if state == "n"
97
+ objid += 1
98
+ end
99
+ end
100
+ rescue EOFError => e
101
+ raise MalformedPDFError, "PDF malformed, missing trailer after cross reference"
76
102
  end
77
103
 
78
- raise MalformedPDFError, "PDF malformed, missing trailer after cross reference" unless @buffer.token == "trailer"
79
- raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless @buffer.token == "<<"
104
+ raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless tok_two == "<<"
80
105
 
81
106
  trailer = Parser.new(@buffer, self).dictionary
82
- load(trailer['Prev']) if trailer.has_key?('Prev')
107
+ load(trailer['Prev'].to_i) if trailer.has_key?('Prev')
83
108
 
84
109
  trailer
85
110
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.1
4
+ version: 0.6.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Jones
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-03-12 00:00:00 +11:00
12
+ date: 2008-03-22 00:00:00 +11:00
13
13
  default_executable:
14
14
  dependencies: []
15
15