fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,475 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # Walks the pages of the PDF file and calls the appropriate callback methods when
29
+ # something of interest is found.
30
+ #
31
+ # The callback methods should exist on the receiver object passed into the constructor. Whenever
32
+ # some content is found that will trigger a callback, the receiver is checked to see if the callback
33
+ # is defined.
34
+ #
35
+ # If it is defined it will be called. If not, processing will continue.
36
+ #
37
+ # = Available Callbacks
38
+ # The following callbacks are available and should be methods defined on your receiver class. Only
39
+ # implement the ones you need - the rest will be ignored.
40
+ #
41
+ # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
42
+ # parameters, or where you don't need them, the *params argument can be left off. Some example callback
43
+ # method definitions are:
44
+ #
45
+ # def begin_document
46
+ # def end_page
47
+ # def show_text(string, *params)
48
+ # def fill_stroke(*params)
49
+ #
50
+ # You should be able to infer the basic command the callback is reporting based on the name. For
51
+ # further experimentation, define the callback with just a *params parameter, then print out the
52
+ # contents of the array using something like:
53
+ #
54
+ # puts params.inspect
55
+ #
56
+ # == Text Callbacks
57
+ #
58
+ # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
59
+ # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
60
+ # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
61
+ # string may not be byte-by-byte identical with the string that was originally written to the PDF.
62
+ #
63
+ # - end_text_object
64
+ # - move_to_start_of_next_line
65
+ # - set_character_spacing
66
+ # - move_text_position
67
+ # - move_text_position_and_set_leading
68
+ # - set_text_font_and_size
69
+ # - show_text
70
+ # - show_text_with_positioning
71
+ # - set_text_leading
72
+ # - set_text_matrix_and_text_line_matrix
73
+ # - set_text_rendering_mode
74
+ # - set_text_rise
75
+ # - set_word_spacing
76
+ # - set_horizontal_text_scaling
77
+ # - move_to_next_line_and_show_text
78
+ # - set_spacing_next_line_show_text
79
+ #
80
+ # If the :raw_text option was passed to the PDF::Reader class the following callbacks
81
+ # may also appear:
82
+ #
83
+ # - show_text_raw
84
+ # - show_text_with_positioning_raw
85
+ # - move_to_next_line_and_show_text_raw
86
+ # - set_spacing_next_line_show_text_raw
87
+ #
88
+ # == Graphics Callbacks
89
+ # - close_fill_stroke
90
+ # - fill_stroke
91
+ # - close_fill_stroke_with_even_odd
92
+ # - fill_stroke_with_even_odd
93
+ # - begin_marked_content_with_pl
94
+ # - begin_inline_image
95
+ # - begin_marked_content
96
+ # - begin_text_object
97
+ # - append_curved_segment
98
+ # - concatenate_matrix
99
+ # - set_stroke_color_space
100
+ # - set_nonstroke_color_space
101
+ # - set_line_dash
102
+ # - set_glyph_width
103
+ # - set_glyph_width_and_bounding_box
104
+ # - invoke_xobject
105
+ # - define_marked_content_with_pl
106
+ # - end_inline_image
107
+ # - end_marked_content
108
+ # - fill_path_with_nonzero
109
+ # - fill_path_with_nonzero
110
+ # - fill_path_with_even_odd
111
+ # - set_gray_for_stroking
112
+ # - set_gray_for_nonstroking
113
+ # - set_graphics_state_parameters
114
+ # - close_subpath
115
+ # - set_flatness_tolerance
116
+ # - begin_inline_image_data
117
+ # - set_line_join_style
118
+ # - set_line_cap_style
119
+ # - set_cmyk_color_for_stroking
120
+ # - set_cmyk_color_for_nonstroking
121
+ # - append_line
122
+ # - begin_new_subpath
123
+ # - set_miter_limit
124
+ # - define_marked_content_point
125
+ # - end_path
126
+ # - save_graphics_state
127
+ # - restore_graphics_state
128
+ # - append_rectangle
129
+ # - set_rgb_color_for_stroking
130
+ # - set_rgb_color_for_nonstroking
131
+ # - set_color_rendering_intent
132
+ # - close_and_stroke_path
133
+ # - stroke_path
134
+ # - set_color_for_stroking
135
+ # - set_color_for_nonstroking
136
+ # - set_color_for_stroking_and_special
137
+ # - set_color_for_nonstroking_and_special
138
+ # - paint_area_with_shading_pattern
139
+ # - append_curved_segment_initial_point_replicated
140
+ # - set_line_width
141
+ # - set_clipping_path_with_nonzero
142
+ # - set_clipping_path_with_even_odd
143
+ # - append_curved_segment_final_point_replicated
144
+ #
145
+ # == Misc Callbacks
146
+ # - begin_compatibility_section
147
+ # - end_compatibility_section
148
+ # - begin_document
149
+ # - end_document
150
+ # - begin_page_container
151
+ # - end_page_container
152
+ # - begin_page
153
+ # - end_page
154
+ # - metadata
155
+ # - xml_metadata
156
+ # - page_count
157
+ # - begin_form_xobject
158
+ # - end_form_xobject
159
+ #
160
+ # == Resource Callbacks
161
+ #
162
+ # Each page can contain (or inherit) a range of resources required for the page,
163
+ # including things like fonts and images. The following callbacks may appear
164
+ # after begin_page if the relevant resources exist on a page:
165
+ #
166
+ # - resource_procset
167
+ # - resource_xobject
168
+ # - resource_extgstate
169
+ # - resource_colorspace
170
+ # - resource_pattern
171
+ # - resource_font
172
+ #
173
+ # In most cases, these callbacks associate a name with each resource, allowing it
174
+ # to be referred to by name in the page content. For example, an XObject can hold an image.
175
+ # If it gets mapped to the name "IM1", then it can be placed on the page using
176
+ # invoke_xobject "IM1".
177
+ #
178
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
179
+ # eventually be removed
180
+ class PagesStrategy< AbstractStrategy # :nodoc:
181
+ OPERATORS = { # maps each content-stream operator token to the callback symbol that content_stream dispatches for it
182
+ 'b' => :close_fill_stroke,
183
+ 'B' => :fill_stroke,
184
+ 'b*' => :close_fill_stroke_with_even_odd,
185
+ 'B*' => :fill_stroke_with_even_odd,
186
+ 'BDC' => :begin_marked_content_with_pl,
187
+ 'BI' => :begin_inline_image,
188
+ 'BMC' => :begin_marked_content,
189
+ 'BT' => :begin_text_object,
190
+ 'BX' => :begin_compatibility_section,
191
+ 'c' => :append_curved_segment,
192
+ 'cm' => :concatenate_matrix,
193
+ 'CS' => :set_stroke_color_space,
194
+ 'cs' => :set_nonstroke_color_space,
195
+ 'd' => :set_line_dash,
196
+ 'd0' => :set_glyph_width,
197
+ 'd1' => :set_glyph_width_and_bounding_box,
198
+ 'Do' => :invoke_xobject,
199
+ 'DP' => :define_marked_content_with_pl,
200
+ 'EI' => :end_inline_image,
201
+ 'EMC' => :end_marked_content,
202
+ 'ET' => :end_text_object,
203
+ 'EX' => :end_compatibility_section,
204
+ 'f' => :fill_path_with_nonzero,
205
+ 'F' => :fill_path_with_nonzero, # deliberately the same callback as 'f'
206
+ 'f*' => :fill_path_with_even_odd,
207
+ 'G' => :set_gray_for_stroking,
208
+ 'g' => :set_gray_for_nonstroking,
209
+ 'gs' => :set_graphics_state_parameters,
210
+ 'h' => :close_subpath,
211
+ 'i' => :set_flatness_tolerance,
212
+ 'ID' => :begin_inline_image_data,
213
+ 'j' => :set_line_join_style,
214
+ 'J' => :set_line_cap_style,
215
+ 'K' => :set_cmyk_color_for_stroking,
216
+ 'k' => :set_cmyk_color_for_nonstroking,
217
+ 'l' => :append_line,
218
+ 'm' => :begin_new_subpath,
219
+ 'M' => :set_miter_limit,
220
+ 'MP' => :define_marked_content_point,
221
+ 'n' => :end_path,
222
+ 'q' => :save_graphics_state,
223
+ 'Q' => :restore_graphics_state,
224
+ 're' => :append_rectangle,
225
+ 'RG' => :set_rgb_color_for_stroking,
226
+ 'rg' => :set_rgb_color_for_nonstroking,
227
+ 'ri' => :set_color_rendering_intent,
228
+ 's' => :close_and_stroke_path,
229
+ 'S' => :stroke_path,
230
+ 'SC' => :set_color_for_stroking,
231
+ 'sc' => :set_color_for_nonstroking,
232
+ 'SCN' => :set_color_for_stroking_and_special,
233
+ 'scn' => :set_color_for_nonstroking_and_special,
234
+ 'sh' => :paint_area_with_shading_pattern,
235
+ 'T*' => :move_to_start_of_next_line,
236
+ 'Tc' => :set_character_spacing,
237
+ 'Td' => :move_text_position,
238
+ 'TD' => :move_text_position_and_set_leading,
239
+ 'Tf' => :set_text_font_and_size,
240
+ 'Tj' => :show_text,
241
+ 'TJ' => :show_text_with_positioning,
242
+ 'TL' => :set_text_leading,
243
+ 'Tm' => :set_text_matrix_and_text_line_matrix,
244
+ 'Tr' => :set_text_rendering_mode,
245
+ 'Ts' => :set_text_rise,
246
+ 'Tw' => :set_word_spacing,
247
+ 'Tz' => :set_horizontal_text_scaling,
248
+ 'v' => :append_curved_segment_initial_point_replicated,
249
+ 'w' => :set_line_width,
250
+ 'W' => :set_clipping_path_with_nonzero,
251
+ 'W*' => :set_clipping_path_with_even_odd,
252
+ 'y' => :append_curved_segment_final_point_replicated,
253
+ '\'' => :move_to_next_line_and_show_text,
254
+ '"' => :set_spacing_next_line_show_text,
255
+ }
256
+ def self.to_sym # returns :pages, the same key that process looks up in options
257
+ :pages
258
+ end
259
+ ################################################################################
260
+ # Begin processing the document: fire begin_document, walk the page tree found at root[:Pages], then fire end_document. Returns false without firing anything unless options[:pages] is set.
261
+ def process
262
+ return false unless options[:pages]
263
+
264
+ callback(:begin_document, [root]) # options, root and callback are not defined in this class; presumably inherited from AbstractStrategy
265
+ walk_pages(@ohash.object(root[:Pages]))
266
+ callback(:end_document)
267
+ end
268
+ private
269
+ ################################################################################
270
+ # Recursively walk the page tree from the supplied node. A :Pages node fires begin/end_page_container around its :Kids;
271
+ # a :Page node fires begin_page, its resource callbacks, the callbacks for its content stream and end_page. Any other :Type is ignored.
272
+ def walk_pages (page)
273
+
274
+ # dispatch on the type of the node
275
+ if page[:Type] == :Pages
276
+ callback(:begin_page_container, [page])
277
+ res = @ohash.object(page[:Resources])
278
+ resources.push res if res # make this container's resources visible while its kids are walked
279
+ @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
280
+ resources.pop if res
281
+ callback(:end_page_container)
282
+ elsif page[:Type] == :Page
283
+ callback(:begin_page, [page])
284
+ res = @ohash.object(page[:Resources])
285
+ resources.push res if res
286
+ walk_resources(current_resources) # reports the merged (inherited plus own) resources
287
+
288
+ if @ohash.object(page[:Contents]).kind_of?(Array) # Contents may be a single stream or an array of them; normalise to an array
289
+ contents = @ohash.object(page[:Contents])
290
+ else
291
+ contents = [page[:Contents]]
292
+ end
293
+
294
+ fonts = font_hash_from_resources(current_resources)
295
+
296
+ if page.has_key?(:Contents) and page[:Contents] # NOTE(review): Contents was already resolved above, before this presence check
297
+ direct_contents = contents.map { |content| @ohash.object(content) }
298
+ content_stream(direct_contents, fonts)
299
+ end
300
+
301
+ resources.pop if res
302
+ callback(:end_page)
303
+ end
304
+ end
305
+ ################################################################################
306
+ # Retrieve the XObject for the supplied label and if it's a Form, walk it
307
+ # like a regular page content stream, wrapped in begin_form_xobject / end_form_xobject callbacks.
308
+ # label - the name the XObject is registered under in the current resources; unknown labels and non-Form XObjects are ignored
309
+ def walk_xobject_form(label)
310
+ xobjects = @ohash.object(current_resources[:XObject]) || {}
311
+ xobject = @ohash.object(xobjects[label])
312
+
313
+ if xobject && xobject.hash[:Subtype] == :Form
314
+ callback(:begin_form_xobject)
315
+ xobj_resources = @ohash.object(xobject.hash[:Resources])
316
+ if xobj_resources
317
+ resources.push xobj_resources
318
+ walk_resources(xobj_resources)
319
+ end
320
+ fonts = font_hash_from_resources(xobj_resources) # NOTE(review): only the form's own fonts, not current_resources; a form using a font inherited from the page would raise Unknown font in content_stream - confirm intent
321
+ content_stream(xobject, fonts)
322
+ callback(:end_form_xobject)
323
+ resources.pop if xobj_resources # NOTE(review): not reached if content_stream raises; no ensure protects the stack
324
+ end
325
+ end
326
+
327
+ ################################################################################
328
+ # Return one hash that merges every entry on the resources stack (pushed for Pages nodes, pages and form xobjects).
329
+ # Entries are merged in stack order, so a key in a later entry replaces the same key from an earlier one.
330
+ def current_resources
331
+ hash = {}
332
+ resources.each do |res|
333
+ hash.merge!(res)
334
+ end
335
+ hash
336
+ end
337
+ ################################################################################
338
+ # Reads a PDF content stream and calls all the appropriate callback methods for the operators
339
+ # it contains. Non-operator tokens are collected as the parameters of the next operator.
340
+ # instructions - a stream, a string, or an array of them (joined into one string); fonts - hash of font label => font object. Raises MalformedPDFError for an unknown font or on EOFError.
341
+ def content_stream (instructions, fonts = {})
342
+ instructions = [instructions] unless instructions.kind_of?(Array)
343
+ instructions = instructions.map { |ins|
344
+ ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
345
+ }.join
346
+ buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
347
+ parser = Parser.new(buffer, @ohash)
348
+ current_font = nil
349
+ params = []
350
+
351
+ while (token = parser.parse_token(OPERATORS)) # NOTE(review): parse_token returns false / nil for the literals false / null, so such an operand ends the loop early - confirm
352
+ if token.kind_of?(Token) and OPERATORS.has_key?(token)
353
+ if OPERATORS[token] == :set_text_font_and_size
354
+ current_font = params.first # first operand of the font operator is the font label
355
+ if fonts[current_font].nil?
356
+ raise MalformedPDFError, "Unknown font #{current_font}"
357
+ end
358
+ end
359
+
360
+ # handle special cases in response to certain operators
361
+ if OPERATORS[token].to_s.include?("show_text")
362
+ # convert any text to utf-8, but output the raw string if the user wants it
363
+ if options[:raw_text]
364
+ callback("#{OPERATORS[token]}_raw".to_sym, params)
365
+ end
366
+ params = fonts[current_font].to_utf8(params) # NOTE(review): current_font is still nil if text is shown before any font is selected; fonts[nil] is then presumably nil and this raises NoMethodError - confirm
367
+ elsif token == "ID"
368
+ # inline image data, first convert the current params into a more familiar hash
369
+ map = {}
370
+ params.each_slice(2) do |key, value|
371
+ map[key] = value
372
+ end
373
+ params = [map, buffer.token] # the next raw buffer token is passed along as the image data
374
+ end
375
+
376
+ callback(OPERATORS[token], params)
377
+
378
+ if OPERATORS[token] == :invoke_xobject
379
+ xobject_label = params.first # saved before params is cleared
380
+ params.clear
381
+ walk_xobject_form(xobject_label)
382
+ else
383
+ params.clear
384
+ end
385
+ else
386
+ params << token
387
+ end
388
+ end
389
+ rescue EOFError => e # NOTE(review): e is never used
390
+ raise MalformedPDFError, "End Of File while processing a content stream"
391
+ end
392
+ ################################################################################
393
+ def walk_resources(resources) # fires one resource_* callback per entry of the given resource dictionary; note the parameter shadows the resources stack accessor inside this method
394
+ return unless resources.respond_to?(:[])
395
+
396
+ resources = resolve_references(resources)
397
+
398
+ # extract any procset information
399
+ if resources[:ProcSet]
400
+ callback(:resource_procset, resources[:ProcSet]) # NOTE(review): passed as-is, while every other callback here gets an explicit argument array - confirm this is intended
401
+ end
402
+
403
+ # extract any xobject information
404
+ if resources[:XObject]
405
+ @ohash.object(resources[:XObject]).each do |name, val|
406
+ callback(:resource_xobject, [name, @ohash.object(val)])
407
+ end
408
+ end
409
+
410
+ # extract any extgstate information
411
+ if resources[:ExtGState]
412
+ @ohash.object(resources[:ExtGState]).each do |name, val|
413
+ callback(:resource_extgstate, [name, @ohash.object(val)])
414
+ end
415
+ end
416
+
417
+ # extract any colorspace information
418
+ if resources[:ColorSpace]
419
+ @ohash.object(resources[:ColorSpace]).each do |name, val|
420
+ callback(:resource_colorspace, [name, @ohash.object(val)])
421
+ end
422
+ end
423
+
424
+ # extract any pattern information
425
+ if resources[:Pattern]
426
+ @ohash.object(resources[:Pattern]).each do |name, val|
427
+ callback(:resource_pattern, [name, @ohash.object(val)])
428
+ end
429
+ end
430
+
431
+ # extract any font information
432
+ if resources[:Font]
433
+ fonts = font_hash_from_resources(resources)
434
+ fonts.each do |label, font|
435
+ callback(:resource_font, [label, font])
436
+ end
437
+ end
438
+ end
439
+ ################################################################################
440
+ # Recursively replace every PDF::Reader::Reference inside obj (including nested Hash, Array and Stream contents) with the object it points to
441
+ def resolve_references(obj)
442
+ case obj
443
+ when PDF::Reader::Stream then
444
+ obj.hash = resolve_references(obj.hash) # the stream is updated in place and returned
445
+ obj
446
+ when PDF::Reader::Reference then
447
+ resolve_references(@ohash.object(obj)) # NOTE(review): no visited set; a reference cycle would recurse without end - confirm such input cannot occur
448
+ when Hash then
449
+ arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
450
+ Hash[*arr]
451
+ when Array then
452
+ obj.collect { |item| resolve_references(item) }
453
+ else
454
+ obj
455
+ end
456
+ end
457
+ ################################################################################
458
+ ################################################################################
459
+ def font_hash_from_resources(resources) # builds a hash of font label => PDF::Reader::Font from the :Font entry of a resource dictionary
460
+ return {} unless resources.respond_to?(:[]) # e.g. nil resources give an empty hash
461
+
462
+ fonts = {}
463
+ resources = @ohash.object(resources[:Font]) || {} # from here on the local holds the font dictionary, not the whole resource dictionary
464
+ resources.each do |label, desc|
465
+ fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
466
+ end
467
+ fonts
468
+ end
469
+ def resources # lazily created array used as a stack of resource dictionaries (pushed and popped by walk_pages and walk_xobject_form)
470
+ @resources ||= []
471
+ end
472
+ end
473
+ ################################################################################
474
+ end
475
+ ################################################################################
@@ -0,0 +1,225 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that reads objects from the PDF file and converts
29
+ # them into usable Ruby objects (hashes, arrays, true, false, etc)
30
+ class Parser
31
+ ################################################################################
32
+ # Create a new parser around a PDF::Reader::Buffer object
33
+ #
34
+ # buffer - a PDF::Reader::Buffer object that contains PDF data
35
+ # ohash - optional PDF::Reader::ObjectHash used to look up objects; when nil, stream takes dict[:Length] as-is instead of looking it up
36
+ def initialize (buffer, ohash=nil)
37
+ @buffer = buffer
38
+ @ohash = ohash
39
+ end
40
+ ################################################################################
41
+ # Reads the next token from the underlying buffer and converts it to an appropriate
42
+ # object: a Symbol, Hash, Array, String, true/false/nil, a number, a Token for keywords and delimiters, or the token itself when it is a Reference or nil.
43
+ #
44
+ # operators - a hash of supported operators to read from the underlying buffer; a token found among its keys is returned as a Token.
45
+ def parse_token (operators={})
46
+ token = @buffer.token
47
+
48
+ case token
49
+ when PDF::Reader::Reference, nil then return token
50
+ when "/" then return pdf_name()
51
+ when "<<" then return dictionary()
52
+ when "[" then return array()
53
+ when "(" then return string()
54
+ when "<" then return hex_string()
55
+ when "true" then return true
56
+ when "false" then return false
57
+ when "null" then return nil
58
+ when "obj", "endobj", "stream", "endstream" then return Token.new(token)
59
+ when "stream", "endstream" then return Token.new(token) # NOTE(review): unreachable; both strings are already matched by the previous when
60
+ when ">>", "]", ">", ")" then return Token.new(token)
61
+ else
62
+ if operators.has_key?(token) then return Token.new(token)
63
+ elsif token =~ /\d*\.\d/ then return token.to_f # NOTE(review): unanchored pattern; any token containing a dot followed by a digit is converted with to_f
64
+ else return token.to_i # NOTE(review): every remaining token goes through to_i, so unrecognised non-numeric text silently becomes 0 - confirm
65
+ end
66
+ end
67
+ end
68
+ ################################################################################
69
+ # Reads an entire PDF object from the buffer and returns it as the Ruby value produced by parse_token.
70
+ # If the token after the object is the stream keyword, returns the result of stream(obj) instead:
71
+ # a PDF::Reader::Stream built from the dictionary and the data that follows it
72
+ #
73
+ # id - the object ID expected at the current buffer position
74
+ # gen - the object revision number expected after the ID
75
+ def object (id, gen)
76
+ Error.assert_equal(parse_token, id) # the three Error checks verify the "id gen obj" header; their failure behaviour is defined in Error, outside this listing
77
+ Error.assert_equal(parse_token, gen)
78
+ Error.str_assert(parse_token, "obj")
79
+
80
+ obj = parse_token
81
+ post_obj = parse_token
82
+ if post_obj == "stream" # post_obj is a Token here; presumably Token compares equal to a String - confirm in token.rb
83
+ stream(obj)
84
+ else
85
+ obj
86
+ end
87
+ end
88
+
89
+ private
90
+
91
+ ################################################################################
92
+ # reads a PDF dict from the buffer and converts it to a Ruby Hash keyed by Symbol. Raises MalformedPDFError when a key is not a name.
93
+ def dictionary
94
+ dict = {}
95
+
96
+ loop do
97
+ key = parse_token
98
+ break if key.kind_of?(Token) and key == ">>" # closing delimiter ends the dictionary
99
+ raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
100
+
101
+ value = parse_token
102
+ value.kind_of?(Token) and Error.str_assert_not(value, ">>") # a key directly followed by the closing delimiter has no value
103
+ dict[key] = value
104
+ end
105
+
106
+ dict
107
+ end
108
+ ################################################################################
109
+ # reads a PDF name from the buffer and converts it to a Ruby Symbol, replacing each #XX two-digit hex escape with the character it encodes
110
+ def pdf_name
111
+ tok = @buffer.token # the token string is modified in place below
112
+ tok.scan(/#([A-Fa-f0-9]{2})/).each do |find|
113
+ replace = find[0].hex.chr
114
+ tok.gsub!("#"+find[0], replace)
115
+ end # NOTE(review): replacing once per scanned match can decode twice: a hash sign produced from #23 and followed by two hex digits is treated as a new escape by a later pass (e.g. #2341#41) - confirm and consider a single block-form gsub
116
+ tok.to_sym
117
+ end
118
+ ################################################################################
119
+ # reads a PDF array from the buffer and converts it to a Ruby Array; items are parsed with parse_token, so nested arrays and dictionaries are handled
120
+ def array
121
+ a = []
122
+
123
+ loop do
124
+ item = parse_token
125
+ break if item.kind_of?(Token) and item == "]"
126
+ a << item # NOTE(review): a nil from parse_token is appended too; nothing here ends the loop if the closing bracket never arrives - confirm how Buffer signals end of input
127
+ end
128
+
129
+ a
130
+ end
131
+ ################################################################################
132
+ # Reads a PDF hex string from the buffer and converts it to a Ruby String, one character per pair of hex digits
133
+ def hex_string
134
+ str = ""
135
+
136
+ loop do
137
+ token = @buffer.token
138
+ break if token == ">"
139
+ str << token # NOTE(review): no guard for a nil token if input ends before the closing delimiter - confirm Buffer behaviour
140
+ end
141
+
142
+ # add a missing digit if required, as required by the spec
143
+ str << "0" unless str.size % 2 == 0
144
+ str.scan(/../).map {|i| i.hex.chr}.join
145
+ end
146
+ ################################################################################
147
+ # Reads a PDF String from the buffer and converts it to a Ruby String: backslash escapes (including octal codes) are decoded and CR, CRLF and LFCR line endings become a single newline
148
+ def string
149
+ str = @buffer.token
150
+ return "" if str == ")" # empty literal: the closing paren arrived immediately
151
+ Error.assert_equal(parse_token, ")")
152
+
153
+ ret = ""
154
+ idx = 0
155
+
156
+ while idx < str.size
157
+ chr = str[idx,1]
158
+ jump = 1 # how many input characters this iteration consumes
159
+
160
+ if chr == "\\"
161
+ jump = 2
162
+ case str[idx+1, 1]
163
+ when "" then jump = 1
164
+ when "n" then chr = "\n"
165
+ when "r" then chr = "\r"
166
+ when "t" then chr = "\t"
167
+ when "b" then chr = "\b"
168
+ when "f" then chr = "\f"
169
+ when "(" then chr = "("
170
+ when ")" then chr = ")"
171
+ when "\\" then chr = "\\"
172
+ when "\n" then
173
+ chr = "" # backslash followed by a newline: both are dropped
174
+ jump = 2
175
+ else
176
+ if str[idx+1,3].match(/\d{3}/) # NOTE(review): \d also accepts 8 and 9, which are not octal digits - confirm
177
+ jump = 4
178
+ chr = str[idx+1,3].oct.chr # NOTE(review): a three-digit value above 377 octal is outside the range Integer#chr accepts - confirm
179
+ elsif str[idx+1,2].match(/\d{2}/)
180
+ jump = 3
181
+ chr = ("0"+str[idx+1,2]).oct.chr
182
+ elsif str[idx+1,1].match(/\d/)
183
+ jump = 2
184
+ chr = ("00"+str[idx+1,1]).oct.chr
185
+ else
186
+ jump = 1 # unknown escape: drop only the backslash, the next character is handled on the next iteration
187
+ chr = ""
188
+ end
189
+
190
+ end
191
+ elsif chr == "\r" && str[idx+1,1] == "\n"
192
+ chr = "\n"
193
+ jump = 2
194
+ elsif chr == "\n" && str[idx+1,1] == "\r"
195
+ chr = "\n"
196
+ jump = 2
197
+ elsif chr == "\r"
198
+ chr = "\n"
199
+ end
200
+ ret << chr
201
+ idx += jump
202
+ end
203
+ ret
204
+ end
205
+ ################################################################################
206
+ # Reads the data of a PDF stream described by dict and returns a PDF::Reader::Stream wrapping dict and that data. Raises MalformedPDFError when dict has no :Length key.
207
+ def stream (dict)
208
+ raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
209
+ if @ohash
210
+ length = @ohash.object(dict[:Length]) # looked up through the object hash; presumably because Length may be an indirect reference - confirm
211
+ else
212
+ length = dict[:Length] || 0
213
+ end
214
+ data = @buffer.read(length, :skip_eol => true)
215
+
216
+ Error.str_assert(parse_token, "endstream") # the data must be followed by endstream and then endobj
217
+ Error.str_assert(parse_token, "endobj")
218
+
219
+ PDF::Reader::Stream.new(dict, data)
220
+ end
221
+ ################################################################################
222
+ end
223
+ ################################################################################
224
+ end
225
+ ################################################################################