fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,475 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# Walks the pages of the PDF file and calls the appropriate callback methods when
|
29
|
+
# something of interest is found.
|
30
|
+
#
|
31
|
+
# The callback methods should exist on the receiver object passed into the constructor. Whenever
|
32
|
+
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
33
|
+
# is defined.
|
34
|
+
#
|
35
|
+
# If it is defined it will be called. If not, processing will continue.
|
36
|
+
#
|
37
|
+
# = Available Callbacks
|
38
|
+
# The following callbacks are available and should be methods defined on your receiver class. Only
|
39
|
+
# implement the ones you need - the rest will be ignored.
|
40
|
+
#
|
41
|
+
# Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
|
42
|
+
# parameters, or where you don't need them, the *params argument can be left off. Some example callback
|
43
|
+
# method definitions are:
|
44
|
+
#
|
45
|
+
# def begin_document
|
46
|
+
# def end_page
|
47
|
+
# def show_text(string, *params)
|
48
|
+
# def fill_stroke(*params)
|
49
|
+
#
|
50
|
+
# You should be able to infer the basic command the callback is reporting based on the name. For
|
51
|
+
# further experimentation, define the callback with just a *params parameter, then print out the
|
52
|
+
# contents of the array using something like:
|
53
|
+
#
|
54
|
+
# puts params.inspect
|
55
|
+
#
|
56
|
+
# == Text Callbacks
|
57
|
+
#
|
58
|
+
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
59
|
+
# PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
|
60
|
+
# when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
|
61
|
+
# string may not be byte-by-byte identical with the string that was originally written to the PDF.
|
62
|
+
#
|
63
|
+
# - end_text_object
|
64
|
+
# - move_to_start_of_next_line
|
65
|
+
# - set_character_spacing
|
66
|
+
# - move_text_position
|
67
|
+
# - move_text_position_and_set_leading
|
68
|
+
# - set_text_font_and_size
|
69
|
+
# - show_text
|
70
|
+
# - show_text_with_positioning
|
71
|
+
# - set_text_leading
|
72
|
+
# - set_text_matrix_and_text_line_matrix
|
73
|
+
# - set_text_rendering_mode
|
74
|
+
# - set_text_rise
|
75
|
+
# - set_word_spacing
|
76
|
+
# - set_horizontal_text_scaling
|
77
|
+
# - move_to_next_line_and_show_text
|
78
|
+
# - set_spacing_next_line_show_text
|
79
|
+
#
|
80
|
+
# If the :raw_text option was passed to the PDF::Reader class the following callbacks
|
81
|
+
# may also appear:
|
82
|
+
#
|
83
|
+
# - show_text_raw
|
84
|
+
# - show_text_with_positioning_raw
|
85
|
+
# - move_to_next_line_and_show_text_raw
|
86
|
+
# - set_spacing_next_line_show_text_raw
|
87
|
+
#
|
88
|
+
# == Graphics Callbacks
|
89
|
+
# - close_fill_stroke
|
90
|
+
# - fill_stroke
|
91
|
+
# - close_fill_stroke_with_even_odd
|
92
|
+
# - fill_stroke_with_even_odd
|
93
|
+
# - begin_marked_content_with_pl
|
94
|
+
# - begin_inline_image
|
95
|
+
# - begin_marked_content
|
96
|
+
# - begin_text_object
|
97
|
+
# - append_curved_segment
|
98
|
+
# - concatenate_matrix
|
99
|
+
# - set_stroke_color_space
|
100
|
+
# - set_nonstroke_color_space
|
101
|
+
# - set_line_dash
|
102
|
+
# - set_glyph_width
|
103
|
+
# - set_glyph_width_and_bounding_box
|
104
|
+
# - invoke_xobject
|
105
|
+
# - define_marked_content_with_pl
|
106
|
+
# - end_inline_image
|
107
|
+
# - end_marked_content
|
108
|
+
# - fill_path_with_nonzero
|
109
|
+
# - fill_path_with_nonzero
|
110
|
+
# - fill_path_with_even_odd
|
111
|
+
# - set_gray_for_stroking
|
112
|
+
# - set_gray_for_nonstroking
|
113
|
+
# - set_graphics_state_parameters
|
114
|
+
# - close_subpath
|
115
|
+
# - set_flatness_tolerance
|
116
|
+
# - begin_inline_image_data
|
117
|
+
# - set_line_join_style
|
118
|
+
# - set_line_cap_style
|
119
|
+
# - set_cmyk_color_for_stroking
|
120
|
+
# - set_cmyk_color_for_nonstroking
|
121
|
+
# - append_line
|
122
|
+
# - begin_new_subpath
|
123
|
+
# - set_miter_limit
|
124
|
+
# - define_marked_content_point
|
125
|
+
# - end_path
|
126
|
+
# - save_graphics_state
|
127
|
+
# - restore_graphics_state
|
128
|
+
# - append_rectangle
|
129
|
+
# - set_rgb_color_for_stroking
|
130
|
+
# - set_rgb_color_for_nonstroking
|
131
|
+
# - set_color_rendering_intent
|
132
|
+
# - close_and_stroke_path
|
133
|
+
# - stroke_path
|
134
|
+
# - set_color_for_stroking
|
135
|
+
# - set_color_for_nonstroking
|
136
|
+
# - set_color_for_stroking_and_special
|
137
|
+
# - set_color_for_nonstroking_and_special
|
138
|
+
# - paint_area_with_shading_pattern
|
139
|
+
# - append_curved_segment_initial_point_replicated
|
140
|
+
# - set_line_width
|
141
|
+
# - set_clipping_path_with_nonzero
|
142
|
+
# - set_clipping_path_with_even_odd
|
143
|
+
# - append_curved_segment_final_point_replicated
|
144
|
+
#
|
145
|
+
# == Misc Callbacks
|
146
|
+
# - begin_compatibility_section
|
147
|
+
# - end_compatibility_section
|
148
|
+
# - begin_document
|
149
|
+
# - end_document
|
150
|
+
# - begin_page_container
|
151
|
+
# - end_page_container
|
152
|
+
# - begin_page
|
153
|
+
# - end_page
|
154
|
+
# - metadata
|
155
|
+
# - xml_metadata
|
156
|
+
# - page_count
|
157
|
+
# - begin_form_xobject
|
158
|
+
# - end_form_xobject
|
159
|
+
#
|
160
|
+
# == Resource Callbacks
|
161
|
+
#
|
162
|
+
# Each page can contain (or inherit) a range of resources required for the page,
|
163
|
+
# including things like fonts and images. The following callbacks may appear
|
164
|
+
# after begin_page if the relevant resources exist on a page:
|
165
|
+
#
|
166
|
+
# - resource_procset
|
167
|
+
# - resource_xobject
|
168
|
+
# - resource_extgstate
|
169
|
+
# - resource_colorspace
|
170
|
+
# - resource_pattern
|
171
|
+
# - resource_font
|
172
|
+
#
|
173
|
+
# In most cases, these callbacks associate a name with each resource, allowing it
|
174
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
175
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
176
|
+
# invoke_xobject "IM1".
|
177
|
+
#
|
178
|
+
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
179
|
+
# eventually be removed
|
180
|
+
class PagesStrategy< AbstractStrategy # :nodoc:
|
181
|
+
# Lookup table from a PDF content stream operator (as it appears in the
# stream) to the name of the receiver callback that reports it. It is handed
# to the parser so operators can be recognised and is consulted by
# content_stream to pick the callback.
#
# The table is frozen: it is shared by every instance and is only ever read.
OPERATORS = {
  'b'   => :close_fill_stroke,
  'B'   => :fill_stroke,
  'b*'  => :close_fill_stroke_with_even_odd,
  'B*'  => :fill_stroke_with_even_odd,
  'BDC' => :begin_marked_content_with_pl,
  'BI'  => :begin_inline_image,
  'BMC' => :begin_marked_content,
  'BT'  => :begin_text_object,
  'BX'  => :begin_compatibility_section,
  'c'   => :append_curved_segment,
  'cm'  => :concatenate_matrix,
  'CS'  => :set_stroke_color_space,
  'cs'  => :set_nonstroke_color_space,
  'd'   => :set_line_dash,
  'd0'  => :set_glyph_width,
  'd1'  => :set_glyph_width_and_bounding_box,
  'Do'  => :invoke_xobject,
  'DP'  => :define_marked_content_with_pl,
  'EI'  => :end_inline_image,
  'EMC' => :end_marked_content,
  'ET'  => :end_text_object,
  'EX'  => :end_compatibility_section,
  'f'   => :fill_path_with_nonzero,
  'F'   => :fill_path_with_nonzero,
  'f*'  => :fill_path_with_even_odd,
  'G'   => :set_gray_for_stroking,
  'g'   => :set_gray_for_nonstroking,
  'gs'  => :set_graphics_state_parameters,
  'h'   => :close_subpath,
  'i'   => :set_flatness_tolerance,
  'ID'  => :begin_inline_image_data,
  'j'   => :set_line_join_style,
  'J'   => :set_line_cap_style,
  'K'   => :set_cmyk_color_for_stroking,
  'k'   => :set_cmyk_color_for_nonstroking,
  'l'   => :append_line,
  'm'   => :begin_new_subpath,
  'M'   => :set_miter_limit,
  'MP'  => :define_marked_content_point,
  'n'   => :end_path,
  'q'   => :save_graphics_state,
  'Q'   => :restore_graphics_state,
  're'  => :append_rectangle,
  'RG'  => :set_rgb_color_for_stroking,
  'rg'  => :set_rgb_color_for_nonstroking,
  'ri'  => :set_color_rendering_intent,
  's'   => :close_and_stroke_path,
  'S'   => :stroke_path,
  'SC'  => :set_color_for_stroking,
  'sc'  => :set_color_for_nonstroking,
  'SCN' => :set_color_for_stroking_and_special,
  'scn' => :set_color_for_nonstroking_and_special,
  'sh'  => :paint_area_with_shading_pattern,
  'T*'  => :move_to_start_of_next_line,
  'Tc'  => :set_character_spacing,
  'Td'  => :move_text_position,
  'TD'  => :move_text_position_and_set_leading,
  'Tf'  => :set_text_font_and_size,
  'Tj'  => :show_text,
  'TJ'  => :show_text_with_positioning,
  'TL'  => :set_text_leading,
  'Tm'  => :set_text_matrix_and_text_line_matrix,
  'Tr'  => :set_text_rendering_mode,
  'Ts'  => :set_text_rise,
  'Tw'  => :set_word_spacing,
  'Tz'  => :set_horizontal_text_scaling,
  'v'   => :append_curved_segment_initial_point_replicated,
  'w'   => :set_line_width,
  'W'   => :set_clipping_path_with_nonzero,
  'W*'  => :set_clipping_path_with_even_odd,
  'y'   => :append_curved_segment_final_point_replicated,
  '\''  => :move_to_next_line_and_show_text,
  '"'   => :set_spacing_next_line_show_text,
}.freeze
|
256
|
+
# Symbol under which this strategy is known.
def self.to_sym; :pages; end
|
259
|
+
################################################################################
|
260
|
+
# Begin processing the document
|
261
|
+
# Entry point of the strategy. Does nothing and returns false unless the
# :pages option is set; otherwise brackets a walk of the page tree with the
# begin_document / end_document callbacks.
def process
  if options[:pages]
    callback(:begin_document, [root])
    page_tree = @ohash.object(root[:Pages])
    walk_pages(page_tree)
    callback(:end_document)
  else
    false
  end
end
|
268
|
+
private
|
269
|
+
################################################################################
|
270
|
+
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
271
|
+
# its content
|
272
|
+
# Recursively visits one node of the page tree.
#
# A :Pages node fires begin/end_page_container around its kids; a :Page node
# fires begin/end_page, reports its resources and then has its content
# stream(s) processed. Any :Resources dictionary found on the node is kept on
# the resources stack while the node (and everything below it) is visited.
# Nodes of any other type are ignored.
def walk_pages(page)
  node_type = page[:Type]

  if node_type == :Pages
    callback(:begin_page_container, [page])
    inherited = @ohash.object(page[:Resources])
    resources.push(inherited) if inherited
    @ohash.object(page[:Kids]).each do |kid|
      walk_pages(@ohash.object(kid))
    end
    resources.pop if inherited
    callback(:end_page_container)
  elsif node_type == :Page
    callback(:begin_page, [page])
    own = @ohash.object(page[:Resources])
    resources.push(own) if own
    walk_resources(current_resources)

    # :Contents is either a single stream or an array of streams; always
    # work with an array
    contents = @ohash.object(page[:Contents])
    contents = [page[:Contents]] unless contents.kind_of?(Array)

    fonts = font_hash_from_resources(current_resources)

    if page.has_key?(:Contents) && page[:Contents]
      streams = contents.map { |item| @ohash.object(item) }
      content_stream(streams, fonts)
    end

    resources.pop if own
    callback(:end_page)
  end
end
|
305
|
+
################################################################################
|
306
|
+
# Retrieve the XObject for the supplied label and if it's a Form, walk it
|
307
|
+
# like a regular page content stream.
|
308
|
+
#
|
309
|
+
# Looks up the XObject registered under +label+ in the current resources
# and, when its :Subtype is :Form, processes it like a page content stream,
# bracketed by the begin_form_xobject / end_form_xobject callbacks.
#
# label - the resource name the XObject was invoked with
#
# The form's own :Resources dictionary (if any) is pushed on the resources
# stack while the form is processed. The font table handed to
# content_stream is built from the merged current resources rather than from
# the form's dictionary alone, so a form without its own :Resources (or
# without a :Font entry) still sees the fonts of the page that invoked it
# instead of failing with "Unknown font".
def walk_xobject_form(label)
  xobjects = @ohash.object(current_resources[:XObject]) || {}
  xobject  = @ohash.object(xobjects[label])
  return unless xobject && xobject.hash[:Subtype] == :Form

  callback(:begin_form_xobject)
  xobj_resources = @ohash.object(xobject.hash[:Resources])
  if xobj_resources
    resources.push xobj_resources
    walk_resources(xobj_resources)
  end

  # current_resources already includes xobj_resources when it was pushed
  fonts = font_hash_from_resources(current_resources)
  content_stream(xobject, fonts)
  callback(:end_form_xobject)
  resources.pop if xobj_resources
end
|
326
|
+
|
327
|
+
################################################################################
|
328
|
+
# Return a merged hash of all resources that are current: those of the enclosing Pages nodes, the page and any xobject being walked.
|
329
|
+
#
|
330
|
+
# Folds the resources stack into a single new hash. Entries are merged in
# stack order, so keys from dictionaries pushed later replace earlier ones.
def current_resources
  resources.inject({}) { |merged, res| merged.merge!(res) }
end
|
337
|
+
################################################################################
|
338
|
+
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
339
|
+
# it contains
|
340
|
+
#
|
341
|
+
# Parses a content stream and fires the matching callback for every
# operator found in it.
#
# instructions - a PDF::Reader::Stream, something that responds to to_s, or
#                an Array of either; the pieces are concatenated before
#                parsing
# fonts        - hash of font label => font object (responds to to_utf8)
#
# Operands are collected until an operator is seen, then passed to the
# callback as its params. Raises MalformedPDFError when a font is selected
# that is not in +fonts+, when text is shown before any font was selected,
# or when the underlying data ends unexpectedly.
def content_stream(instructions, fonts = {})
  instructions = [instructions] unless instructions.kind_of?(Array)
  instructions = instructions.map { |ins|
    ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
  }.join
  buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
  parser = Parser.new(buffer, @ohash)
  current_font = nil
  params = []

  # Only nil ends the loop. A boolean false is a legitimate operand and must
  # not terminate processing of the stream.
  # NOTE(review): the parser also returns nil for a literal "null" operand,
  # which still cannot be told apart from end of data here.
  until (token = parser.parse_token(OPERATORS)).nil?
    unless token.kind_of?(Token) && OPERATORS.has_key?(token)
      params << token
      next
    end

    operator = OPERATORS[token]

    if operator == :set_text_font_and_size
      current_font = params.first
      if fonts[current_font].nil?
        raise MalformedPDFError, "Unknown font #{current_font}"
      end
    end

    # handle special cases in response to certain operators
    if operator.to_s.include?("show_text")
      # output the raw string first if the user wants it, then convert to utf-8
      callback("#{operator}_raw".to_sym, params) if options[:raw_text]
      font = fonts[current_font]
      if font.nil?
        raise MalformedPDFError, "Text shown before a font was selected"
      end
      params = font.to_utf8(params)
    elsif token == "ID"
      # inline image data: turn the key/value operands into a hash and pair
      # it with the next token read straight from the buffer
      map = {}
      params.each_slice(2) { |key, value| map[key] = value }
      params = [map, buffer.token]
    end

    callback(operator, params)

    if operator == :invoke_xobject
      # remember the label before the operands are discarded
      xobject_label = params.first
      params.clear
      walk_xobject_form(xobject_label)
    else
      params.clear
    end
  end
rescue EOFError
  raise MalformedPDFError, "End Of File while processing a content stream"
end
|
392
|
+
################################################################################
|
393
|
+
# Reports the contents of a resources dictionary through the resource_*
# callbacks. Anything that does not respond to [] is ignored.
#
# The :ProcSet value is passed as the callback params as-is; every entry of
# the :XObject, :ExtGState, :ColorSpace and :Pattern dictionaries is reported
# as [name, object]; fonts are reported as [label, font].
def walk_resources(resources)
  return unless resources.respond_to?(:[])

  resources = resolve_references(resources)

  procset = resources[:ProcSet]
  callback(:resource_procset, procset) if procset

  # the name => object dictionaries are all reported the same way
  [
    [:XObject,    :resource_xobject],
    [:ExtGState,  :resource_extgstate],
    [:ColorSpace, :resource_colorspace],
    [:Pattern,    :resource_pattern]
  ].each do |key, event|
    next unless resources[key]

    @ohash.object(resources[key]).each do |name, val|
      callback(event, [name, @ohash.object(val)])
    end
  end

  if resources[:Font]
    font_hash_from_resources(resources).each do |label, font|
      callback(:resource_font, [label, font])
    end
  end
end
|
439
|
+
################################################################################
|
440
|
+
# Convert any PDF::Reader::Reference objects into a real object
|
441
|
+
# Returns +obj+ with every PDF::Reader::Reference inside it replaced by the
# object it points to (looked up through @ohash), recursing into hashes,
# arrays and the dictionaries of streams.
#
# Hashes and arrays are rebuilt; a stream is updated in place (its hash is
# reassigned) and returned; any other object is returned untouched.
def resolve_references(obj)
  if obj.is_a?(PDF::Reader::Stream)
    obj.hash = resolve_references(obj.hash)
    obj
  elsif obj.is_a?(PDF::Reader::Reference)
    # the target may itself contain further references
    resolve_references(@ohash.object(obj))
  elsif obj.is_a?(Hash)
    obj.inject({}) do |resolved, (key, val)|
      resolved[key] = resolve_references(val)
      resolved
    end
  elsif obj.is_a?(Array)
    obj.map { |item| resolve_references(item) }
  else
    obj
  end
end
|
457
|
+
################################################################################
|
458
|
+
################################################################################
|
459
|
+
# Builds a hash of font label => PDF::Reader::Font from the :Font entry of a
# resources dictionary. Returns an empty hash when +resources+ does not
# respond to [] or has no fonts.
def font_hash_from_resources(resources)
  return {} unless resources.respond_to?(:[])

  font_dict = @ohash.object(resources[:Font]) || {}
  font_dict.inject({}) do |fonts, (label, desc)|
    fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
    fonts
  end
end
|
469
|
+
# The stack of resource dictionaries currently in effect, created empty on
# first use.
def resources
  @resources || (@resources = [])
end
|
472
|
+
end
|
473
|
+
################################################################################
|
474
|
+
end
|
475
|
+
################################################################################
|
@@ -0,0 +1,225 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that reads objects from the PDF file and converts
|
29
|
+
# them into useable ruby objects (hash's, arrays, true, false, etc)
|
30
|
+
class Parser
|
31
|
+
################################################################################
|
32
|
+
# Create a new parser around a PDF::Reader::Buffer object
|
33
|
+
#
|
34
|
+
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
+
# ohash - a PDF::Reader::ObjectHash object that can return objects from the PDF file
|
36
|
+
# Stores the buffer to read tokens from and the (optional) object hash that
# is used to look up objects referenced while parsing.
def initialize(buffer, ohash = nil)
  @buffer, @ohash = buffer, ohash
end
|
40
|
+
################################################################################
|
41
|
+
# Reads the next token from the underlying buffer and converts it to an appropriate
|
42
|
+
# object
|
43
|
+
#
|
44
|
+
# operators - a hash of supported operators to read from the underlying buffer.
|
45
|
+
def parse_token (operators={})
|
46
|
+
token = @buffer.token
|
47
|
+
|
48
|
+
case token
|
49
|
+
when PDF::Reader::Reference, nil then return token
|
50
|
+
when "/" then return pdf_name()
|
51
|
+
when "<<" then return dictionary()
|
52
|
+
when "[" then return array()
|
53
|
+
when "(" then return string()
|
54
|
+
when "<" then return hex_string()
|
55
|
+
when "true" then return true
|
56
|
+
when "false" then return false
|
57
|
+
when "null" then return nil
|
58
|
+
when "obj", "endobj", "stream", "endstream" then return Token.new(token)
|
59
|
+
when "stream", "endstream" then return Token.new(token)
|
60
|
+
when ">>", "]", ">", ")" then return Token.new(token)
|
61
|
+
else
|
62
|
+
if operators.has_key?(token) then return Token.new(token)
|
63
|
+
elsif token =~ /\d*\.\d/ then return token.to_f
|
64
|
+
else return token.to_i
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
################################################################################
|
69
|
+
# Reads an entire PDF object from the buffer and returns it as the corresponding Ruby object.
|
70
|
+
# If the object is a content stream, returns both the stream and the dictionary
|
71
|
+
# that describes it
|
72
|
+
#
|
73
|
+
# id - the object ID to return
|
74
|
+
# gen - the object revision number to return
|
75
|
+
# Reads the indirect object that starts at the current buffer position.
#
# id  - the object ID expected in the object header
# gen - the generation number expected in the object header
#
# The header ("<id> <gen> obj") is checked through the Error helpers. The
# parsed object is returned as is, unless it is followed by the "stream"
# keyword, in which case it is taken as the stream dictionary and the result
# of #stream is returned.
def object(id, gen)
  [id, gen].each { |expected| Error.assert_equal(parse_token, expected) }
  Error.str_assert(parse_token, "obj")

  body    = parse_token
  trailer = parse_token
  return body unless trailer == "stream"

  stream(body)
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
################################################################################
|
92
|
+
# reads a PDF dict from the buffer and converts it to a Ruby Hash.
|
93
|
+
# Reads key/value pairs up to the closing ">>" token and returns them as a
# Hash. Raises MalformedPDFError when a key is not a name (Symbol); a ">>"
# token in value position is checked through Error.str_assert_not.
def dictionary
  entries = {}

  loop do
    name = parse_token
    break if name.kind_of?(Token) && name == ">>"

    unless name.kind_of?(Symbol)
      raise MalformedPDFError, "Dictionary key (#{name.inspect}) is not a name"
    end

    val = parse_token
    Error.str_assert_not(val, ">>") if val.kind_of?(Token)
    entries[name] = val
  end

  entries
end
|
108
|
+
################################################################################
|
109
|
+
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
110
|
+
# Reads the next token from the buffer as a PDF name and returns it as a
# Symbol, with every "#xx" escape (two hex digits) replaced by the character
# with that code.
#
# The escapes are decoded in a single left-to-right pass, so text produced by
# decoding one escape is never decoded a second time (e.g. "#2341" is the
# name "#41", not "A").
def pdf_name
  tok = @buffer.token
  tok.gsub(/#([A-Fa-f0-9]{2})/) { $1.hex.chr }.to_sym
end
|
118
|
+
################################################################################
|
119
|
+
# reads a PDF array from the buffer and converts it to a Ruby Array.
|
120
|
+
# Collects parsed tokens into an Array until the closing "]" token is read.
def array
  items = []

  loop do
    element = parse_token
    closing = element.kind_of?(Token) && element == "]"
    break if closing

    items.push(element)
  end

  items
end
|
131
|
+
################################################################################
|
132
|
+
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
133
|
+
# Concatenates the buffer tokens up to the closing ">" and decodes them as
# pairs of hex digits into a String. An odd number of digits is padded with
# a trailing "0" first.
def hex_string
  digits = ""

  loop do
    chunk = @buffer.token
    break if chunk == ">"

    digits << chunk
  end

  # a missing final digit is assumed to be 0, as required by the spec
  digits << "0" if digits.size.odd?

  decoded = digits.scan(/../).collect { |pair| pair.hex.chr }
  decoded.join
end
|
146
|
+
################################################################################
|
147
|
+
# Reads a PDF String from the buffer and converts it to a Ruby String
|
148
|
+
# Reads a PDF literal string from the buffer and returns it as a Ruby String
# with escapes and line endings decoded:
#
# * \n \r \t \b \f \( \) \\ become the character they stand for
# * \d, \dd and \ddd (octal digits 0-7 only) become the character with that
#   code; overflow above one byte is discarded
# * a backslash directly before an end of line (LF, CR or CRLF) is a line
#   continuation: both the backslash and the line ending are dropped
# * a backslash before any other character is dropped
# * a backslash that is the very last character is kept
# * unescaped CR, CRLF and LFCR are returned as a single LF
#
# The buffer is expected to deliver the string body as one token followed by
# the closing ")".
def string
  str = @buffer.token
  return "" if str == ")"
  Error.assert_equal(parse_token, ")")

  ret = ""
  idx = 0

  while idx < str.size
    chr = str[idx, 1]
    jump = 1

    if chr == "\\"
      jump = 2
      case str[idx + 1, 1]
      when ""   then jump = 1
      when "n"  then chr = "\n"
      when "r"  then chr = "\r"
      when "t"  then chr = "\t"
      when "b"  then chr = "\b"
      when "f"  then chr = "\f"
      when "("  then chr = "("
      when ")"  then chr = ")"
      when "\\" then chr = "\\"
      when "\n"
        # line continuation
        chr = ""
      when "\r"
        # line continuation ending in CR or CRLF
        chr = ""
        jump = 3 if str[idx + 2, 1] == "\n"
      else
        octal = str[idx + 1, 3][/\A[0-7]{1,3}/]
        if octal
          jump = octal.size + 1
          # keep the low byte only so values above 0377 cannot raise
          chr = (octal.oct & 0xFF).chr
        else
          # unknown escape: drop the backslash, the next character is
          # handled by the following iteration
          jump = 1
          chr = ""
        end
      end
    elsif chr == "\r" && str[idx + 1, 1] == "\n"
      chr = "\n"
      jump = 2
    elsif chr == "\n" && str[idx + 1, 1] == "\r"
      chr = "\n"
      jump = 2
    elsif chr == "\r"
      chr = "\n"
    end

    ret << chr
    idx += jump
  end

  ret
end
|
205
|
+
################################################################################
|
206
|
+
# Reads the contents of a PDF Stream from the buffer and returns it as a PDF::Reader::Stream.
|
207
|
+
# Reads the data of a stream whose dictionary has already been parsed.
#
# dict - the stream dictionary; must contain a :Length key
#
# The length is looked up through @ohash when one is available, otherwise
# the dictionary value (or 0 when it is nil) is used. After the data the
# "endstream" and "endobj" keywords are checked through Error.str_assert.
# Returns a PDF::Reader::Stream built from the dictionary and the data read;
# raises MalformedPDFError when :Length is missing.
def stream(dict)
  unless dict.has_key?(:Length)
    raise MalformedPDFError, "PDF malformed, missing stream length"
  end

  length = @ohash ? @ohash.object(dict[:Length]) : (dict[:Length] || 0)
  data = @buffer.read(length, :skip_eol => true)

  %w[endstream endobj].each do |keyword|
    Error.str_assert(parse_token, keyword)
  end

  PDF::Reader::Stream.new(dict, data)
end
|
221
|
+
################################################################################
|
222
|
+
end
|
223
|
+
################################################################################
|
224
|
+
end
|
225
|
+
################################################################################
|