pdf-reader 0.8.6 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,53 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
class MetadataStrategy < AbstractStrategy # :nodoc:

  # Returns the symbolic name of this strategy.
  def self.to_sym
    :metadata
  end

  # Emits the document-level callbacks, in this order:
  #
  # - :pdf_version  - always, with ohash.pdf_version
  # - :metadata     - only when info? is true, with the decoded info hash
  # - :xml_metadata - only when the root dictionary has a :Metadata entry
  # - :page_count   - only when pages? is true
  #
  # Returns false straight away unless options[:metadata] is truthy.
  #
  # NOTE(review): options, ohash, root, info, info?, pages, pages?, callback
  # and decode_strings are not defined in this class; presumably they are
  # inherited from AbstractStrategy - confirm there.
  def process
    return false unless options[:metadata]

    # may be useful to some people
    callback(:pdf_version, ohash.pdf_version)

    # the classic info dictionary
    callback(:metadata, [decoded_info]) if info?

    # the newer XML metadata stream
    callback(:xml_metadata, [xml_metadata]) if xml_metadata?

    # total number of pages; :Count may be an indirect reference
    return unless pages?

    page_total = ohash.object(pages[:Count])
    callback(:page_count, page_total.to_i)
  end

  private

  # Returns the unfiltered contents of the root :Metadata stream, or nil when
  # the root dictionary has no :Metadata entry. The result (including nil) is
  # memoized, hence the defined? guard rather than ||=.
  def xml_metadata
    return @xml_metadata if defined?(@xml_metadata)

    @xml_metadata =
      if root[:Metadata].nil?
        nil
      else
        data = ohash.object(root[:Metadata]).unfiltered_data
        # String#force_encoding only exists on rubies with encoding support
        data.force_encoding("utf-8") if data.respond_to?(:force_encoding)
        data
      end
  end

  # true when XML metadata is available, false otherwise.
  def xml_metadata?
    !!xml_metadata
  end

  # The info dictionary after being passed through decode_strings, memoized.
  def decoded_info
    @decoded_info ||= decode_strings(info)
  end

end
53
+ end
@@ -0,0 +1,275 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # Provides low level access to the objects in a PDF file via a hash-like
5
+ # object.
6
+ #
7
+ # A PDF file can be viewed as a large hash map. It is a series of objects
8
+ # stored at exact byte offsets, and a table that maps object IDs to byte
9
+ # offsets. Given an object ID, looking up an object is an O(1) operation.
10
+ #
11
+ # Each PDF object can be mapped to a ruby object, so by passing an object
12
+ # ID to the [] method, a ruby representation of that object will be
13
+ # retrieved.
14
+ #
15
+ # The class behaves much like a standard Ruby hash, including the use of
16
+ # the Enumerable mixin. The key difference is no []= method - the hash
17
+ # is read only.
18
+ #
19
+ # == Basic Usage
20
+ #
21
+ # h = PDF::Reader::ObjectHash.new("somefile.pdf")
22
+ # h[1]
23
+ # => 3469
24
+ #
25
+ # h[PDF::Reader::Reference.new(1,0)]
26
+ # => 3469
27
+ #
28
class ObjectHash
  include Enumerable

  attr_accessor :default
  attr_reader :trailer, :pdf_version

  # Creates a new ObjectHash object. input can be an IO-like object (anything
  # that responds to both seek and read) or the name of an existing file.
  # A file is read in full and wrapped in a StringIO.
  #
  # Raises ArgumentError for any other input.
  #
  def initialize(input)
    if input.respond_to?(:seek) && input.respond_to?(:read)
      @io = input
    elsif File.file?(input.to_s)
      path = input.to_s
      # File.binread is not available on older rubies
      data = File.respond_to?(:binread) ? File.binread(path) : File.read(path)
      @io = StringIO.new(data)
    else
      raise ArgumentError, "input must be an IO-like object or a filename"
    end
    @pdf_version = read_version
    @xref = PDF::Reader::XRef.new(@io)
    @trailer = @xref.trailer
  end

  # returns the type of object a ref points to, as a symbol made from the
  # ruby class name. Returns nil if the lookup raises.
  def obj_type(ref)
    self[ref].class.to_s.to_sym
  rescue
    nil
  end

  # returns true if the supplied reference points to an object with a stream.
  # Returns false if the lookup raises.
  def stream?(ref)
    self[ref].class == PDF::Reader::Stream
  rescue
    false
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object.
  #
  # If an int is used, the object with that ID and a generation number of 0 will
  # be returned.
  #
  # If a PDF::Reader::Reference object is used the exact ID and generation number
  # can be specified.
  #
  # Returns +default+ when key.to_i is zero or negative, or when an
  # InvalidObjectError is raised during the lookup. Returns nil when the xref
  # table has no usable entry for the key.
  #
  def [](key)
    return default if key.to_i <= 0
    begin
      unless key.kind_of?(PDF::Reader::Reference)
        key = PDF::Reader::Reference.new(key.to_i, 0)
      end
      location = xref[key]
      # Integer rather than Fixnum: Fixnum was deprecated in ruby 2.4 and
      # removed in 3.2, and on older rubies it is a subclass of Integer.
      if location.is_a?(Integer)
        # the xref entry is a byte offset into the file
        buf = new_buffer(location)
        Parser.new(buf, self).object(key.id, key.gen)
      elsif location.is_a?(PDF::Reader::Reference)
        # the xref entry names the object stream that contains the object
        object_streams[location] ||= PDF::Reader::ObjectStream.new(object(location))
        object_streams[location][key.id]
      end
    rescue InvalidObjectError
      return default
    end
  end

  # If key is a PDF::Reader::Reference object, lookup the corresponding
  # object in the PDF and return it. Otherwise return key untouched.
  #
  def object(key)
    key.is_a?(PDF::Reader::Reference) ? self[key] : key
  end

  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
  # object, exactly as for [].
  #
  # local_default is the object that will be returned if the requested key doesn't
  # exist.
  #
  # When the object is missing and no local_default is given, IndexError is
  # raised if key.to_i is zero or negative; otherwise nil is returned.
  #
  def fetch(key, local_default = nil)
    obj = self[key]
    return obj if obj
    return local_default if local_default

    raise IndexError, "#{key} is invalid" if key.to_i <= 0
  end

  # iterate over each key, value. Just like a ruby hash.
  #
  def each(&block)
    @xref.each do |ref|
      yield ref, self[ref]
    end
  end
  alias :each_pair :each

  # iterate over each key. Just like a ruby hash.
  #
  # Walks the xref table directly so that objects are not loaded and parsed
  # just to enumerate their references.
  #
  def each_key(&block)
    @xref.each do |ref|
      yield ref
    end
  end

  # iterate over each value. Just like a ruby hash.
  #
  def each_value(&block)
    each do |id, obj|
      yield obj
    end
  end

  # return the number of objects in the file, as reported by the xref table.
  def size
    xref.size
  end
  alias :length :size

  # return true if there are no objects in this file
  #
  def empty?
    size == 0
  end

  # return true if the specified key exists in the file. key
  # can be an int or a PDF::Reader::Reference
  #
  # A Reference must match exactly; anything else is compared by object ID
  # only.
  #
  def has_key?(check_key)
    # TODO update from O(n) to O(1)
    if check_key.kind_of?(PDF::Reader::Reference)
      each_key { |key| return true if check_key == key }
    else
      each_key { |key| return true if check_key.to_i == key.id }
    end
    false
  end
  alias :include? :has_key?
  alias :key? :has_key?
  alias :member? :has_key?

  # return true if the specified value exists in the file
  #
  def has_value?(value)
    # TODO update from O(n) to O(1)
    each_value do |obj|
      return true if obj == value
    end
    false
  end
  alias :value? :has_value?

  def to_s
    "<PDF::Reader::ObjectHash size: #{self.size}>"
  end

  # return an array of all keys in the file
  #
  def keys
    ret = []
    each_key { |k| ret << k }
    ret
  end

  # return an array of all values in the file
  #
  def values
    ret = []
    each_value { |v| ret << v }
    ret
  end

  # return an array of all values from the specified keys
  #
  def values_at(*ids)
    ids.map { |id| self[id] }
  end

  # return an array of arrays. Each sub array contains a key/value pair.
  #
  def to_a
    ret = []
    each do |id, obj|
      ret << [id, obj]
    end
    ret
  end

  # returns an array of PDF::Reader::References. Each reference in the
  # array points to a Page object, one for each page in the PDF. The first
  # reference is page 1, second reference is page 2, etc.
  #
  # Useful for apps that want to extract data from specific pages.
  #
  # The result is memoized; the root object is only looked up on first use.
  #
  def page_references
    @page_references ||= begin
      root = fetch(trailer[:Root])
      get_page_objects(root[:Pages]).flatten
    end
  end

  private

  # builds a buffer over the underlying IO, positioned at offset
  def new_buffer(offset = 0)
    PDF::Reader::Buffer.new(@io, :seek => offset)
  end

  def xref
    @xref
  end

  # cache of ObjectStream wrappers, keyed by the reference of the container
  def object_streams
    @object_stream ||= {}
  end

  # returns a nested array of object references for all pages in this object store.
  #
  # A :Page node yields its own reference, a :Pages node yields an array built
  # by recursing into its :Kids. Any other node yields nil.
  #
  def get_page_objects(ref)
    obj = fetch(ref)

    if obj[:Type] == :Page
      ref
    elsif obj[:Type] == :Pages
      # :Kids may itself be an indirect reference to the array
      object(obj[:Kids]).map { |kid| get_page_objects(kid) }
    end
  end

  # reads the version number from the first 10 bytes of the file and rewinds
  # the IO. Returns 0.0 when no version marker is found.
  def read_version
    @io.seek(0)
    # read returns nil on an empty IO, so coerce to a string before matching
    header = @io.read(10).to_s
    @io.seek(0)
    header[/PDF-(\d\.\d)/, 1].to_f
  end

end
275
+ end
@@ -0,0 +1,51 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ # provides a wrapper around a PDF stream object that contains other objects in it.
6
+ # This is done for added compression and is described as an "Object Stream" in the spec.
7
+ #
8
class ObjectStream # :nodoc:
  # stream must respond to hash (the stream dictionary) and unfiltered_data
  # (the decoded stream body).
  def initialize(stream)
    @dict = stream.hash
    @data = stream.unfiltered_data
  end

  # Parses and returns the object stored under objid, or nil when this stream
  # has no offset recorded for that id.
  def [](objid)
    position = offsets[objid]
    return nil if position.nil?

    io = StringIO.new(@data)
    buf = PDF::Reader::Buffer.new(io, :seek => position)
    PDF::Reader::Parser.new(buf).parse_token
  end

  # The value of the :N entry in the stream dictionary.
  def size
    @dict[:N]
  end

  private

  # Hash of object id => byte offset into @data. Built on first use by reading
  # +size+ pairs of integer tokens from the start of the data; the second
  # token of each pair is added to +first+.
  def offsets
    @offsets ||= {}
    return @offsets unless @offsets.empty?

    size.times do
      # order matters: the id token comes before its offset token
      objid = buffer.token.to_i
      @offsets[objid] = first + buffer.token.to_i
    end
    @offsets
  end

  # The value of the :First entry in the stream dictionary.
  def first
    @dict[:First]
  end

  # Memoized buffer over the stream data, used to read the offset pairs.
  def buffer
    @buffer ||= PDF::Reader::Buffer.new(StringIO.new(@data))
  end

end
49
+
50
+ end
51
+
@@ -22,12 +22,11 @@
22
22
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
23
  #
24
24
  ################################################################################
25
- require 'stringio'
26
25
 
27
26
  class PDF::Reader
28
27
  ################################################################################
29
- # Walks the PDF file and calls the appropriate callback methods when something of interest is
30
- # found.
28
+ # Walks the pages of the PDF file and calls the appropriate callback methods when
29
+ # something of interest is found.
31
30
  #
32
31
  # The callback methods should exist on the receiver object passed into the constructor. Whenever
33
32
  # some content is found that will trigger a callback, the receiver is checked to see if the callback
@@ -78,6 +77,14 @@ class PDF::Reader
78
77
  # - move_to_next_line_and_show_text
79
78
  # - set_spacing_next_line_show_text
80
79
  #
80
+ # If the :raw_text option was passed to the PDF::Reader class the following callbacks
81
+ # may also appear:
82
+ #
83
+ # - show_text_raw
84
+ # - show_text_with_positioning_raw
85
+ # - move_to_next_line_and_show_text_raw
86
+ # - set_spacing_next_line_show_text_raw
87
+ #
81
88
  # == Graphics Callbacks
82
89
  # - close_fill_stroke
83
90
  # - fill_stroke
@@ -168,7 +175,7 @@ class PDF::Reader
168
175
  # If it gets mapped to the name "IM1", then it can be placed on the page using
169
176
  # invoke_xobject "IM1".
170
177
  #
171
- class Content
178
+ class PagesStrategy< AbstractStrategy # :nodoc:
172
179
  OPERATORS = {
173
180
  'b' => :close_fill_stroke,
174
181
  'B' => :fill_stroke,
@@ -244,45 +251,19 @@ class PDF::Reader
244
251
  '\'' => :move_to_next_line_and_show_text,
245
252
  '"' => :set_spacing_next_line_show_text,
246
253
  }
247
- ################################################################################
248
- # Create a new PDF::Reader::Content object to process the contents of PDF file
249
- # - receiver - an object containing the required callback methods
250
- # - xref - a PDF::Reader::Xref object that contains references to all the objects in a PDF file
251
- def initialize (receiver, xref)
252
- @receiver = receiver
253
- @xref = xref
254
- end
255
- ################################################################################
256
- # Begin processing the document metadata
257
- def metadata (root, info)
258
- info = decode_strings(info)
259
-
260
- # may be useful to some people
261
- callback(:pdf_version, @xref.pdf_version)
262
-
263
- # ye olde metadata
264
- callback(:metadata, [info]) if info
265
-
266
- # new style xml metadata
267
- if root[:Metadata]
268
- stream = @xref.object(root[:Metadata])
269
- callback(:xml_metadata,stream.unfiltered_data)
270
- end
271
-
272
- # page count
273
- if (pages = @xref.object(root[:Pages]))
274
- if (count = @xref.object(pages[:Count]))
275
- callback(:page_count, count.to_i)
276
- end
277
- end
254
+ def self.to_sym
255
+ :pages
278
256
  end
279
257
  ################################################################################
280
258
  # Begin processing the document
281
- def document (root)
259
+ def process
260
+ return false unless options[:pages]
261
+
282
262
  callback(:begin_document, [root])
283
- walk_pages(@xref.object(root[:Pages]))
263
+ walk_pages(@ohash.object(root[:Pages]))
284
264
  callback(:end_document)
285
265
  end
266
+ private
286
267
  ################################################################################
287
268
  # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
288
269
  # its content
@@ -291,19 +272,19 @@ class PDF::Reader
291
272
  # extract page content
292
273
  if page[:Type] == :Pages
293
274
  callback(:begin_page_container, [page])
294
- res = @xref.object(page[:Resources])
275
+ res = @ohash.object(page[:Resources])
295
276
  resources.push res if res
296
- @xref.object(page[:Kids]).each {|child| walk_pages(@xref.object(child))}
277
+ @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
297
278
  resources.pop if res
298
279
  callback(:end_page_container)
299
280
  elsif page[:Type] == :Page
300
281
  callback(:begin_page, [page])
301
- res = @xref.object(page[:Resources])
282
+ res = @ohash.object(page[:Resources])
302
283
  resources.push res if res
303
284
  walk_resources(current_resources)
304
285
 
305
- if @xref.object(page[:Contents]).kind_of?(Array)
306
- contents = @xref.object(page[:Contents])
286
+ if @ohash.object(page[:Contents]).kind_of?(Array)
287
+ contents = @ohash.object(page[:Contents])
307
288
  else
308
289
  contents = [page[:Contents]]
309
290
  end
@@ -311,10 +292,8 @@ class PDF::Reader
311
292
  fonts = font_hash_from_resources(current_resources)
312
293
 
313
294
  if page.has_key?(:Contents) and page[:Contents]
314
- contents.each do |content|
315
- obj = @xref.object(content)
316
- content_stream(obj, fonts)
317
- end
295
+ direct_contents = contents.map { |content| @ohash.object(content) }
296
+ content_stream(direct_contents, fonts)
318
297
  end
319
298
 
320
299
  resources.pop if res
@@ -326,12 +305,12 @@ class PDF::Reader
326
305
  # like a regular page content stream.
327
306
  #
328
307
  def walk_xobject_form(label)
329
- xobjects = @xref.object(current_resources[:XObject]) || {}
330
- xobject = @xref.object(xobjects[label])
308
+ xobjects = @ohash.object(current_resources[:XObject]) || {}
309
+ xobject = @ohash.object(xobjects[label])
331
310
 
332
311
  if xobject && xobject.hash[:Subtype] == :Form
333
312
  callback(:begin_form_xobject)
334
- resources = @xref.object(xobject.hash[:Resources])
313
+ resources = @ohash.object(xobject.hash[:Resources])
335
314
  walk_resources(resources) if resources
336
315
  fonts = font_hash_from_resources(resources)
337
316
  content_stream(xobject, fonts)
@@ -352,30 +331,40 @@ class PDF::Reader
352
331
  ################################################################################
353
332
  # Reads a PDF content stream and calls all the appropriate callback methods for the operators
354
333
  # it contains
334
+ #
355
335
  def content_stream (instructions, fonts = {})
356
- instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
357
- buffer = Buffer.new(StringIO.new(instructions))
358
- parser = Parser.new(buffer, @xref)
336
+ instructions = [instructions] unless instructions.kind_of?(Array)
337
+ instructions = instructions.map { |ins|
338
+ ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
339
+ }.join
340
+ buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
341
+ parser = Parser.new(buffer, @ohash)
359
342
  current_font = nil
360
343
  params = []
361
344
 
362
345
  while (token = parser.parse_token(OPERATORS))
363
346
  if token.kind_of?(Token) and OPERATORS.has_key?(token)
364
- current_font = params.first if OPERATORS[token] == :set_text_font_and_size
347
+ if OPERATORS[token] == :set_text_font_and_size
348
+ current_font = params.first
349
+ if fonts[current_font].nil?
350
+ raise MalformedPDFError, "Unknown font #{current_font}"
351
+ end
352
+ end
365
353
 
366
354
  # handle special cases in response to certain operators
367
- if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
368
- # convert any text to utf-8
355
+ if OPERATORS[token].to_s.include?("show_text")
356
+ # convert any text to utf-8, but output the raw string if the user wants it
357
+ if options[:raw_text]
358
+ callback("#{OPERATORS[token]}_raw".to_sym, params)
359
+ end
369
360
  params = fonts[current_font].to_utf8(params)
370
361
  elsif token == "ID"
371
362
  # inline image data, first convert the current params into a more familiar hash
372
363
  map = {}
373
- params.each_slice(2) do |a|
374
- map[a.first] = a.last
364
+ params.each_slice(2) do |key, value|
365
+ map[key] = value
375
366
  end
376
- params = [map]
377
- # read the raw image data from the buffer without tokenising
378
- params << buffer.read_until("EI")
367
+ params = [map, buffer.token]
379
368
  end
380
369
 
381
370
  callback(OPERATORS[token], params)
@@ -407,29 +396,29 @@ class PDF::Reader
407
396
 
408
397
  # extract any xobject information
409
398
  if resources[:XObject]
410
- @xref.object(resources[:XObject]).each do |name, val|
411
- callback(:resource_xobject, [name, @xref.object(val)])
399
+ @ohash.object(resources[:XObject]).each do |name, val|
400
+ callback(:resource_xobject, [name, @ohash.object(val)])
412
401
  end
413
402
  end
414
403
 
415
404
  # extract any extgstate information
416
405
  if resources[:ExtGState]
417
- @xref.object(resources[:ExtGState]).each do |name, val|
418
- callback(:resource_extgstate, [name, @xref.object(val)])
406
+ @ohash.object(resources[:ExtGState]).each do |name, val|
407
+ callback(:resource_extgstate, [name, @ohash.object(val)])
419
408
  end
420
409
  end
421
410
 
422
411
  # extract any colorspace information
423
412
  if resources[:ColorSpace]
424
- @xref.object(resources[:ColorSpace]).each do |name, val|
425
- callback(:resource_colorspace, [name, @xref.object(val)])
413
+ @ohash.object(resources[:ColorSpace]).each do |name, val|
414
+ callback(:resource_colorspace, [name, @ohash.object(val)])
426
415
  end
427
416
  end
428
417
 
429
418
  # extract any pattern information
430
419
  if resources[:Pattern]
431
- @xref.object(resources[:Pattern]).each do |name, val|
432
- callback(:resource_pattern, [name, @xref.object(val)])
420
+ @ohash.object(resources[:Pattern]).each do |name, val|
421
+ callback(:resource_pattern, [name, @ohash.object(val)])
433
422
  end
434
423
  end
435
424
 
@@ -449,7 +438,7 @@ class PDF::Reader
449
438
  obj.hash = resolve_references(obj.hash)
450
439
  obj
451
440
  when PDF::Reader::Reference then
452
- resolve_references(@xref.object(obj))
441
+ resolve_references(@ohash.object(obj))
453
442
  when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
454
443
  when Array then obj.collect { |item| resolve_references(item) }
455
444
  else
@@ -457,53 +446,27 @@ class PDF::Reader
457
446
  end
458
447
  end
459
448
  ################################################################################
460
- # calls the name callback method on the receiver class with params as the arguments
461
- def callback (name, params=[])
462
- @receiver.send(name, *params) if @receiver.respond_to?(name)
463
- end
464
- ################################################################################
465
- private
466
449
  ################################################################################
467
450
  def font_hash_from_resources(resources)
468
451
  return {} unless resources.respond_to?(:[])
469
452
 
470
453
  fonts = {}
471
- resources = @xref.object(resources[:Font]) || {}
454
+ resources = @ohash.object(resources[:Font]) || {}
472
455
  resources.each do |label, desc|
473
- desc = @xref.object(desc)
456
+ desc = @ohash.object(desc)
474
457
  fonts[label] = PDF::Reader::Font.new
475
458
  fonts[label].label = label
476
459
  fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
477
460
  fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
478
- fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
461
+ fonts[label].encoding = PDF::Reader::Encoding.new(@ohash.object(desc[:Encoding]))
479
462
  fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
480
463
  if desc[:ToUnicode]
481
- # this stream is a cmap
482
- begin
483
- stream = desc[:ToUnicode]
484
- fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
485
- rescue
486
- # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
487
- end
464
+ stream = @ohash.object(desc[:ToUnicode])
465
+ fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
488
466
  end
489
467
  end
490
468
  fonts
491
469
  end
492
- # strings outside of page content should be in either PDFDocEncoding or UTF-16.
493
- def decode_strings(obj)
494
- case obj
495
- when String then
496
- if obj[0,2] == "\376\377"
497
- PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
498
- else
499
- PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
500
- end
501
- when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
502
- when Array then obj.collect { |item| decode_strings(item) }
503
- else
504
- obj
505
- end
506
- end
507
470
  def resources
508
471
  @resources ||= []
509
472
  end