fireinc-pdf-reader 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,475 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # Walks the pages of the PDF file and calls the appropriate callback methods when
29
+ # something of interest is found.
30
+ #
31
+ # The callback methods should exist on the receiver object passed into the constructor. Whenever
32
+ # some content is found that will trigger a callback, the receiver is checked to see if the callback
33
+ # is defined.
34
+ #
35
+ # If it is defined it will be called. If not, processing will continue.
36
+ #
37
+ # = Available Callbacks
38
+ # The following callbacks are available and should be methods defined on your receiver class. Only
39
+ # implement the ones you need - the rest will be ignored.
40
+ #
41
+ # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
42
+ # parameters, or where you don't need them, the *params argument can be left off. Some example callback
43
+ # method definitions are:
44
+ #
45
+ # def begin_document
46
+ # def end_page
47
+ # def show_text(string, *params)
48
+ # def fill_stroke(*params)
49
+ #
50
+ # You should be able to infer the basic command the callback is reporting based on the name. For
51
+ # further experimentation, define the callback with just a *params parameter, then print out the
52
+ # contents of the array using something like:
53
+ #
54
+ # puts params.inspect
55
+ #
56
+ # == Text Callbacks
57
+ #
58
+ # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
59
+ # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
60
+ # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
61
+ # string may not be byte-by-byte identical with the string that was originally written to the PDF.
62
+ #
63
+ # - end_text_object
64
+ # - move_to_start_of_next_line
65
+ # - set_character_spacing
66
+ # - move_text_position
67
+ # - move_text_position_and_set_leading
68
+ # - set_text_font_and_size
69
+ # - show_text
70
+ # - show_text_with_positioning
71
+ # - set_text_leading
72
+ # - set_text_matrix_and_text_line_matrix
73
+ # - set_text_rendering_mode
74
+ # - set_text_rise
75
+ # - set_word_spacing
76
+ # - set_horizontal_text_scaling
77
+ # - move_to_next_line_and_show_text
78
+ # - set_spacing_next_line_show_text
79
+ #
80
+ # If the :raw_text option was passed to the PDF::Reader class the following callbacks
81
+ # may also appear:
82
+ #
83
+ # - show_text_raw
84
+ # - show_text_with_positioning_raw
85
+ # - move_to_next_line_and_show_text_raw
86
+ # - set_spacing_next_line_show_text_raw
87
+ #
88
+ # == Graphics Callbacks
89
+ # - close_fill_stroke
90
+ # - fill_stroke
91
+ # - close_fill_stroke_with_even_odd
92
+ # - fill_stroke_with_even_odd
93
+ # - begin_marked_content_with_pl
94
+ # - begin_inline_image
95
+ # - begin_marked_content
96
+ # - begin_text_object
97
+ # - append_curved_segment
98
+ # - concatenate_matrix
99
+ # - set_stroke_color_space
100
+ # - set_nonstroke_color_space
101
+ # - set_line_dash
102
+ # - set_glyph_width
103
+ # - set_glyph_width_and_bounding_box
104
+ # - invoke_xobject
105
+ # - define_marked_content_with_pl
106
+ # - end_inline_image
107
+ # - end_marked_content
108
+ # - fill_path_with_nonzero
109
+ # - fill_path_with_nonzero
110
+ # - fill_path_with_even_odd
111
+ # - set_gray_for_stroking
112
+ # - set_gray_for_nonstroking
113
+ # - set_graphics_state_parameters
114
+ # - close_subpath
115
+ # - set_flatness_tolerance
116
+ # - begin_inline_image_data
117
+ # - set_line_join_style
118
+ # - set_line_cap_style
119
+ # - set_cmyk_color_for_stroking
120
+ # - set_cmyk_color_for_nonstroking
121
+ # - append_line
122
+ # - begin_new_subpath
123
+ # - set_miter_limit
124
+ # - define_marked_content_point
125
+ # - end_path
126
+ # - save_graphics_state
127
+ # - restore_graphics_state
128
+ # - append_rectangle
129
+ # - set_rgb_color_for_stroking
130
+ # - set_rgb_color_for_nonstroking
131
+ # - set_color_rendering_intent
132
+ # - close_and_stroke_path
133
+ # - stroke_path
134
+ # - set_color_for_stroking
135
+ # - set_color_for_nonstroking
136
+ # - set_color_for_stroking_and_special
137
+ # - set_color_for_nonstroking_and_special
138
+ # - paint_area_with_shading_pattern
139
+ # - append_curved_segment_initial_point_replicated
140
+ # - set_line_width
141
+ # - set_clipping_path_with_nonzero
142
+ # - set_clipping_path_with_even_odd
143
+ # - append_curved_segment_final_point_replicated
144
+ #
145
+ # == Misc Callbacks
146
+ # - begin_compatibility_section
147
+ # - end_compatibility_section
148
+ # - begin_document
149
+ # - end_document
150
+ # - begin_page_container
151
+ # - end_page_container
152
+ # - begin_page
153
+ # - end_page
154
+ # - metadata
155
+ # - xml_metadata
156
+ # - page_count
157
+ # - begin_form_xobject
158
+ # - end_form_xobject
159
+ #
160
+ # == Resource Callbacks
161
+ #
162
+ # Each page can contain (or inherit) a range of resources required for the page,
163
+ # including things like fonts and images. The following callbacks may appear
164
+ # after begin_page if the relevant resources exist on a page:
165
+ #
166
+ # - resource_procset
167
+ # - resource_xobject
168
+ # - resource_extgstate
169
+ # - resource_colorspace
170
+ # - resource_pattern
171
+ # - resource_font
172
+ #
173
+ # In most cases, these callbacks associate a name with each resource, allowing it
174
+ # to be referred to by name in the page content. For example, an XObject can hold an image.
175
+ # If it gets mapped to the name "IM1", then it can be placed on the page using
176
+ # invoke_xobject "IM1".
177
+ #
178
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
179
+ # eventually be removed
180
+ class PagesStrategy< AbstractStrategy # :nodoc:
181
# Maps every PDF content-stream operator to the name of the receiver callback
# that is fired when the operator is encountered. The table is frozen so the
# shared constant cannot be modified by accident at runtime.
OPERATORS = {
  'b'   => :close_fill_stroke,
  'B'   => :fill_stroke,
  'b*'  => :close_fill_stroke_with_even_odd,
  'B*'  => :fill_stroke_with_even_odd,
  'BDC' => :begin_marked_content_with_pl,
  'BI'  => :begin_inline_image,
  'BMC' => :begin_marked_content,
  'BT'  => :begin_text_object,
  'BX'  => :begin_compatibility_section,
  'c'   => :append_curved_segment,
  'cm'  => :concatenate_matrix,
  'CS'  => :set_stroke_color_space,
  'cs'  => :set_nonstroke_color_space,
  'd'   => :set_line_dash,
  'd0'  => :set_glyph_width,
  'd1'  => :set_glyph_width_and_bounding_box,
  'Do'  => :invoke_xobject,
  'DP'  => :define_marked_content_with_pl,
  'EI'  => :end_inline_image,
  'EMC' => :end_marked_content,
  'ET'  => :end_text_object,
  'EX'  => :end_compatibility_section,
  'f'   => :fill_path_with_nonzero,
  'F'   => :fill_path_with_nonzero,
  'f*'  => :fill_path_with_even_odd,
  'G'   => :set_gray_for_stroking,
  'g'   => :set_gray_for_nonstroking,
  'gs'  => :set_graphics_state_parameters,
  'h'   => :close_subpath,
  'i'   => :set_flatness_tolerance,
  'ID'  => :begin_inline_image_data,
  'j'   => :set_line_join_style,
  'J'   => :set_line_cap_style,
  'K'   => :set_cmyk_color_for_stroking,
  'k'   => :set_cmyk_color_for_nonstroking,
  'l'   => :append_line,
  'm'   => :begin_new_subpath,
  'M'   => :set_miter_limit,
  'MP'  => :define_marked_content_point,
  'n'   => :end_path,
  'q'   => :save_graphics_state,
  'Q'   => :restore_graphics_state,
  're'  => :append_rectangle,
  'RG'  => :set_rgb_color_for_stroking,
  'rg'  => :set_rgb_color_for_nonstroking,
  'ri'  => :set_color_rendering_intent,
  's'   => :close_and_stroke_path,
  'S'   => :stroke_path,
  'SC'  => :set_color_for_stroking,
  'sc'  => :set_color_for_nonstroking,
  'SCN' => :set_color_for_stroking_and_special,
  'scn' => :set_color_for_nonstroking_and_special,
  'sh'  => :paint_area_with_shading_pattern,
  'T*'  => :move_to_start_of_next_line,
  'Tc'  => :set_character_spacing,
  'Td'  => :move_text_position,
  'TD'  => :move_text_position_and_set_leading,
  'Tf'  => :set_text_font_and_size,
  'Tj'  => :show_text,
  'TJ'  => :show_text_with_positioning,
  'TL'  => :set_text_leading,
  'Tm'  => :set_text_matrix_and_text_line_matrix,
  'Tr'  => :set_text_rendering_mode,
  'Ts'  => :set_text_rise,
  'Tw'  => :set_word_spacing,
  'Tz'  => :set_horizontal_text_scaling,
  'v'   => :append_curved_segment_initial_point_replicated,
  'w'   => :set_line_width,
  'W'   => :set_clipping_path_with_nonzero,
  'W*'  => :set_clipping_path_with_even_odd,
  'y'   => :append_curved_segment_final_point_replicated,
  '\''  => :move_to_next_line_and_show_text,
  '"'   => :set_spacing_next_line_show_text,
}.freeze
256
# Symbolic name of this strategy.
def self.to_sym
  :pages
end
259
+ ################################################################################
260
# Entry point: walks the whole page tree, wrapped in the begin_document and
# end_document callbacks. Returns false without doing anything when the
# :pages option is not set.
def process
  if options[:pages]
    callback(:begin_document, [root])
    page_tree = @ohash.object(root[:Pages])
    walk_pages(page_tree)
    callback(:end_document)
  else
    false
  end
end
268
+ private
269
+ ################################################################################
270
# Recursively walks one node of the page tree.
#
# A :Pages node fires begin_page_container/end_page_container around a
# recursive walk of its kids. A :Page node fires begin_page/end_page around
# its resource callbacks and its content stream(s). In both cases the node's
# own resources (if any) are pushed onto the resource stack for the duration
# of the walk. Nodes of any other type are ignored.
def walk_pages(page)
  if page[:Type] == :Pages
    callback(:begin_page_container, [page])
    inherited = @ohash.object(page[:Resources])
    resources.push(inherited) if inherited
    kids = @ohash.object(page[:Kids])
    kids.each { |kid| walk_pages(@ohash.object(kid)) }
    resources.pop if inherited
    callback(:end_page_container)
  elsif page[:Type] == :Page
    callback(:begin_page, [page])
    own = @ohash.object(page[:Resources])
    resources.push(own) if own
    walk_resources(current_resources)

    # /Contents is either a single stream or an array of streams
    raw_contents = page[:Contents]
    resolved     = @ohash.object(raw_contents)
    contents     = resolved.kind_of?(Array) ? resolved : [raw_contents]

    fonts = font_hash_from_resources(current_resources)

    if page.has_key?(:Contents) && raw_contents
      streams = contents.map { |part| @ohash.object(part) }
      content_stream(streams, fonts)
    end

    resources.pop if own
    callback(:end_page)
  end
end
305
+ ################################################################################
306
# Retrieve the XObject for the supplied label and, if it is a Form, walk it
# like a regular page content stream (wrapped in begin_form_xobject and
# end_form_xobject callbacks). XObjects of any other subtype are ignored.
#
# The font table handed to the content stream is built from the merged
# resource stack rather than from the form's own resources alone, so a form
# without its own /Resources (or without a /Font entry in them) can still
# use the fonts of the enclosing page instead of failing with "Unknown font".
#
# label - the name the XObject is registered under in the current resources
def walk_xobject_form(label)
  xobjects = @ohash.object(current_resources[:XObject]) || {}
  xobject  = @ohash.object(xobjects[label])
  return unless xobject && xobject.hash[:Subtype] == :Form

  callback(:begin_form_xobject)
  xobj_resources = @ohash.object(xobject.hash[:Resources])
  if xobj_resources
    resources.push xobj_resources
    walk_resources(xobj_resources)
  end

  # current_resources now includes the form's own resources (if any) on top
  # of the inherited ones
  fonts = font_hash_from_resources(current_resources)
  content_stream(xobject, fonts)
  callback(:end_form_xobject)
  resources.pop if xobj_resources
end
326
+
327
+ ################################################################################
328
# Returns a new hash containing every entry on the resource stack, merged
# from bottom to top so that entries pushed later override earlier ones.
def current_resources
  resources.inject({}) { |merged, res| merged.merge!(res) }
end
337
+ ################################################################################
338
# Reads a PDF content stream and calls the appropriate callback for every
# operator it contains.
#
# instructions - a stream, a string, or an array of either; the pieces are
#                joined into one stream before parsing
# fonts        - hash of font label => font object, used to convert the
#                operands of the show_text family of operators to UTF-8
#
# Raises MalformedPDFError when an unknown font is selected, when text is
# shown before any font has been selected, or when the stream ends
# unexpectedly.
#
# NOTE(review): the loop stops on the first nil or false token, and
# Parser#parse_token returns nil for "null" and false for "false", so a
# literal null/false operand ends processing early - confirm whether that is
# intended.
def content_stream(instructions, fonts = {})
  instructions = [instructions] unless instructions.kind_of?(Array)
  instructions = instructions.map { |ins|
    ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
  }.join
  buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
  parser       = Parser.new(buffer, @ohash)
  current_font = nil
  params       = []

  while (token = parser.parse_token(OPERATORS))
    # anything that is not an operator is an operand for the next operator
    unless token.kind_of?(Token) && OPERATORS.has_key?(token)
      params << token
      next
    end

    operator = OPERATORS[token]

    if operator == :set_text_font_and_size
      current_font = params.first
      if fonts[current_font].nil?
        raise MalformedPDFError, "Unknown font #{current_font}"
      end
    end

    # handle special cases in response to certain operators
    if operator.to_s.include?("show_text")
      # convert any text to utf-8, but output the raw string if the user wants it
      callback("#{operator}_raw".to_sym, params) if options[:raw_text]

      font = fonts[current_font]
      if font.nil?
        # no Tf operator seen yet; report it as a broken file, not a NoMethodError
        raise MalformedPDFError, "Text shown before a font was selected"
      end
      params = font.to_utf8(params)
    elsif token == "ID"
      # inline image data, first convert the current params into a more familiar hash
      map = {}
      params.each_slice(2) { |key, value| map[key] = value }
      params = [map, buffer.token]
    end

    callback(operator, params)

    if operator == :invoke_xobject
      # grab the label before the operand list is reset
      xobject_label = params.first
      params.clear
      walk_xobject_form(xobject_label)
    else
      params.clear
    end
  end
rescue EOFError
  raise MalformedPDFError, "End Of File while processing a content stream"
end
392
+ ################################################################################
393
# Fires the resource_* callbacks for every resource found in the supplied
# resource dictionary. Does nothing when the argument does not respond to #[].
#
# Callback order: procset, xobject, extgstate, colorspace, pattern, font.
# Apart from resource_procset (which receives the ProcSet entry itself), each
# callback receives a [name, object] pair.
def walk_resources(resources)
  return unless resources.respond_to?(:[])

  resources = resolve_references(resources)

  # extract any procset information
  procset = resources[:ProcSet]
  callback(:resource_procset, procset) if procset

  # xobject, extgstate, colorspace and pattern entries are all name => object
  # dictionaries that are reported the same way
  [
    [:XObject,    :resource_xobject],
    [:ExtGState,  :resource_extgstate],
    [:ColorSpace, :resource_colorspace],
    [:Pattern,    :resource_pattern],
  ].each do |key, event|
    next unless resources[key]

    @ohash.object(resources[key]).each do |name, val|
      callback(event, [name, @ohash.object(val)])
    end
  end

  # extract any font information
  if resources[:Font]
    font_hash_from_resources(resources).each do |label, font|
      callback(:resource_font, [label, font])
    end
  end
end
439
+ ################################################################################
440
# Recursively replaces every PDF::Reader::Reference inside obj with the
# object it points to.
#
# Streams have their hash resolved in place and are returned as-is; hashes
# and arrays are rebuilt with resolved values; anything else is returned
# untouched.
def resolve_references(obj)
  if obj.kind_of?(PDF::Reader::Stream)
    obj.hash = resolve_references(obj.hash)
    obj
  elsif obj.kind_of?(PDF::Reader::Reference)
    resolve_references(@ohash.object(obj))
  elsif obj.kind_of?(Hash)
    resolved = {}
    obj.each { |key, val| resolved[key] = resolve_references(val) }
    resolved
  elsif obj.kind_of?(Array)
    obj.map { |item| resolve_references(item) }
  else
    obj
  end
end
457
+ ################################################################################
458
+ ################################################################################
459
# Builds a hash of font label => PDF::Reader::Font from the :Font entry of
# the supplied resource dictionary. Returns an empty hash when the argument
# does not respond to #[] or has no :Font entry.
def font_hash_from_resources(resources)
  return {} unless resources.respond_to?(:[])

  font_dict = @ohash.object(resources[:Font]) || {}
  fonts     = {}
  font_dict.each do |label, desc|
    fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
  end
  fonts
end
469
# The stack of resource dictionaries that are currently in effect, created
# lazily on first access.
def resources
  @resources = [] unless @resources
  @resources
end
472
+ end
473
+ ################################################################################
474
+ end
475
+ ################################################################################
@@ -0,0 +1,225 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that reads objects from the PDF file and converts
29
+ # them into usable Ruby objects (hashes, arrays, true, false, etc.)
30
+ class Parser
31
+ ################################################################################
32
# Create a new parser around a PDF::Reader::Buffer object
#
# buffer - a PDF::Reader::Buffer object that contains PDF data
# ohash  - a PDF::Reader::ObjectHash object that can return objects from the
#          PDF file (optional, defaults to nil)
def initialize(buffer, ohash = nil)
  @buffer, @ohash = buffer, ohash
end
40
+ ################################################################################
41
# Reads the next token from the underlying buffer and converts it to an
# appropriate Ruby object.
#
# operators - a hash of supported operators to read from the underlying
#             buffer. Tokens that are keys of this hash are returned as Token
#             objects.
#
# Returns nil at the end of the buffer (and for the PDF "null" keyword), a
# Token for keywords, delimiters and operators, and a parsed Ruby object for
# names, dictionaries, arrays, strings, booleans and numbers. Any other
# token is converted with to_f (when it contains a decimal point) or to_i.
def parse_token(operators = {})
  token = @buffer.token

  case token
  when PDF::Reader::Reference, nil then token
  when "/"     then pdf_name
  when "<<"    then dictionary
  when "["     then array
  when "("     then string
  when "<"     then hex_string
  when "true"  then true
  when "false" then false
  when "null"  then nil
  when "obj", "endobj", "stream", "endstream" then Token.new(token)
  when ">>", "]", ">", ")"                    then Token.new(token)
  else
    if operators.has_key?(token)
      Token.new(token)
    elsif token =~ /\d*\.\d/
      token.to_f
    else
      token.to_i
    end
  end
end
68
+ ################################################################################
69
# Reads an entire PDF object from the buffer and returns it as a Ruby
# object. The buffer must be positioned at the "<id> <gen> obj" header,
# which is verified against the arguments. If the object is followed by the
# "stream" keyword, the stream (built from the object's dictionary and the
# stream data) is returned instead.
#
# id  - the object ID to return
# gen - the object revision number to return
def object(id, gen)
  Error.assert_equal(parse_token, id)
  Error.assert_equal(parse_token, gen)
  Error.str_assert(parse_token, "obj")

  body    = parse_token
  trailer = parse_token
  trailer == "stream" ? stream(body) : body
end
88
+
89
+ private
90
+
91
+ ################################################################################
92
# Reads a PDF dictionary from the buffer (the opening "<<" has already been
# consumed) and converts it to a Ruby Hash keyed by Symbol.
#
# Raises MalformedPDFError if a key is not a PDF name.
def dictionary
  dict = {}

  until (key = parse_token).kind_of?(Token) && key == ">>"
    unless key.kind_of?(Symbol)
      raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name"
    end

    value = parse_token
    # a key must be followed by a value, not by the end of the dictionary
    Error.str_assert_not(value, ">>") if value.kind_of?(Token)
    dict[key] = value
  end

  dict
end
108
+ ################################################################################
109
# Reads a PDF name from the buffer and converts it to a Ruby Symbol.
#
# Every "#xx" sequence (two hex digits) is replaced by the character with
# that code. Decoding is done in a single pass so that decoded characters
# are never decoded a second time (e.g. "#2341" yields "#41", not "A"), and
# the token returned by the buffer is left unmodified.
def pdf_name
  tok = @buffer.token
  tok.gsub(/#([A-Fa-f0-9]{2})/) { |escape| escape[1, 2].hex.chr }.to_sym
end
118
+ ################################################################################
119
# Reads a PDF array from the buffer (the opening "[" has already been
# consumed) and converts it to a Ruby Array, collecting parsed items until
# the closing "]" token is seen.
def array
  items = []

  until (item = parse_token).kind_of?(Token) && item == "]"
    items << item
  end

  items
end
131
+ ################################################################################
132
# Reads a PDF hex string from the buffer (the opening "<" has already been
# consumed) and converts it to a Ruby String, one character per pair of hex
# digits.
def hex_string
  digits = ""

  while (chunk = @buffer.token) != ">"
    digits << chunk
  end

  # an odd number of digits means the final digit is assumed to be 0
  digits << "0" if digits.size.odd?
  digits.scan(/../).map { |pair| pair.hex.chr }.join
end
146
+ ################################################################################
147
# Reads a PDF literal string from the buffer (the opening "(" has already
# been consumed) and converts it to a Ruby String.
#
# Escape handling:
# * \n \r \t \b \f \( \) \\ become the character they denote
# * \ddd (one to three OCTAL digits) becomes the byte with that value;
#   high-order overflow (values above 0o377) is ignored
# * a backslash followed by an end-of-line marker (LF, CR or CR LF) is a
#   line continuation and produces nothing
# * a backslash followed by anything else is dropped and the following
#   character is kept
# * a backslash that is the very last character is kept literally
#
# Unescaped CR, CR LF and LF CR sequences are normalised to a single LF.
def string
  str = @buffer.token
  return "" if str == ")"
  Error.assert_equal(parse_token, ")")

  simple = {
    "n" => "\n", "r" => "\r", "t" => "\t", "b" => "\b", "f" => "\f",
    "(" => "(",  ")" => ")",  "\\" => "\\"
  }

  ret = ""
  idx = 0

  while idx < str.size
    chr  = str[idx, 1]
    nxt  = str[idx + 1, 1] # "" when chr is the last character
    jump = 1

    if chr == "\\"
      if nxt == ""
        jump = 1 # trailing backslash, emitted as-is
      elsif simple.has_key?(nxt)
        chr  = simple[nxt]
        jump = 2
      elsif nxt == "\n" || nxt == "\r"
        # line continuation: swallow the backslash and the whole EOL marker
        chr  = ""
        jump = (nxt == "\r" && str[idx + 2, 1] == "\n") ? 3 : 2
      elsif (octal = str[idx + 1, 3][/\A[0-7]{1,3}/])
        # mask to one byte so out-of-range values cannot raise RangeError
        chr  = (octal.oct & 0xff).chr
        jump = 1 + octal.size
      else
        # unknown escape: drop the backslash, keep the next character
        chr = ""
      end
    elsif chr == "\r" && nxt == "\n"
      chr  = "\n"
      jump = 2
    elsif chr == "\n" && nxt == "\r"
      chr  = "\n"
      jump = 2
    elsif chr == "\r"
      chr = "\n"
    end

    ret << chr
    idx += jump
  end

  ret
end
205
+ ################################################################################
206
# Reads the body of a PDF stream from the buffer and wraps it, together with
# its dictionary, in a PDF::Reader::Stream. The "endstream" and "endobj"
# keywords that follow the data are consumed and verified.
#
# dict - the stream dictionary; it must contain a :Length entry. When an
#        object hash is available the length is looked up through it.
#
# Raises MalformedPDFError if the dictionary has no :Length entry.
def stream(dict)
  unless dict.has_key?(:Length)
    raise MalformedPDFError, "PDF malformed, missing stream length"
  end

  length = @ohash ? @ohash.object(dict[:Length]) : (dict[:Length] || 0)
  data   = @buffer.read(length, :skip_eol => true)

  Error.str_assert(parse_token, "endstream")
  Error.str_assert(parse_token, "endobj")

  PDF::Reader::Stream.new(dict, data)
end
221
+ ################################################################################
222
+ end
223
+ ################################################################################
224
+ end
225
+ ################################################################################