fireinc-pdf-reader 0.11.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,475 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# Walks the pages of the PDF file and calls the appropriate callback methods when
|
29
|
+
# something of interest is found.
|
30
|
+
#
|
31
|
+
# The callback methods should exist on the receiver object passed into the constructor. Whenever
|
32
|
+
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
33
|
+
# is defined.
|
34
|
+
#
|
35
|
+
# If it is defined it will be called. If not, processing will continue.
|
36
|
+
#
|
37
|
+
# = Available Callbacks
|
38
|
+
# The following callbacks are available and should be methods defined on your receiver class. Only
|
39
|
+
# implement the ones you need - the rest will be ignored.
|
40
|
+
#
|
41
|
+
# Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
|
42
|
+
# parameters, or where you don't need them, the *params argument can be left off. Some example callback
|
43
|
+
# method definitions are:
|
44
|
+
#
|
45
|
+
# def begin_document
|
46
|
+
# def end_page
|
47
|
+
# def show_text(string, *params)
|
48
|
+
# def fill_stroke(*params)
|
49
|
+
#
|
50
|
+
# You should be able to infer the basic command the callback is reporting based on the name. For
|
51
|
+
# further experimentation, define the callback with just a *params parameter, then print out the
|
52
|
+
# contents of the array using something like:
|
53
|
+
#
|
54
|
+
# puts params.inspect
|
55
|
+
#
|
56
|
+
# == Text Callbacks
|
57
|
+
#
|
58
|
+
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
59
|
+
# PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
|
60
|
+
# when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
|
61
|
+
# string may not be byte-by-byte identical with the string that was originally written to the PDF.
|
62
|
+
#
|
63
|
+
# - end_text_object
|
64
|
+
# - move_to_start_of_next_line
|
65
|
+
# - set_character_spacing
|
66
|
+
# - move_text_position
|
67
|
+
# - move_text_position_and_set_leading
|
68
|
+
# - set_text_font_and_size
|
69
|
+
# - show_text
|
70
|
+
# - show_text_with_positioning
|
71
|
+
# - set_text_leading
|
72
|
+
# - set_text_matrix_and_text_line_matrix
|
73
|
+
# - set_text_rendering_mode
|
74
|
+
# - set_text_rise
|
75
|
+
# - set_word_spacing
|
76
|
+
# - set_horizontal_text_scaling
|
77
|
+
# - move_to_next_line_and_show_text
|
78
|
+
# - set_spacing_next_line_show_text
|
79
|
+
#
|
80
|
+
# If the :raw_text option was passed to the PDF::Reader class the following callbacks
|
81
|
+
# may also appear:
|
82
|
+
#
|
83
|
+
# - show_text_raw
|
84
|
+
# - show_text_with_positioning_raw
|
85
|
+
# - move_to_next_line_and_show_text_raw
|
86
|
+
# - set_spacing_next_line_show_text_raw
|
87
|
+
#
|
88
|
+
# == Graphics Callbacks
|
89
|
+
# - close_fill_stroke
|
90
|
+
# - fill_stroke
|
91
|
+
# - close_fill_stroke_with_even_odd
|
92
|
+
# - fill_stroke_with_even_odd
|
93
|
+
# - begin_marked_content_with_pl
|
94
|
+
# - begin_inline_image
|
95
|
+
# - begin_marked_content
|
96
|
+
# - begin_text_object
|
97
|
+
# - append_curved_segment
|
98
|
+
# - concatenate_matrix
|
99
|
+
# - set_stroke_color_space
|
100
|
+
# - set_nonstroke_color_space
|
101
|
+
# - set_line_dash
|
102
|
+
# - set_glyph_width
|
103
|
+
# - set_glyph_width_and_bounding_box
|
104
|
+
# - invoke_xobject
|
105
|
+
# - define_marked_content_with_pl
|
106
|
+
# - end_inline_image
|
107
|
+
# - end_marked_content
|
108
|
+
# - fill_path_with_nonzero
|
109
|
+
# - fill_path_with_nonzero
|
110
|
+
# - fill_path_with_even_odd
|
111
|
+
# - set_gray_for_stroking
|
112
|
+
# - set_gray_for_nonstroking
|
113
|
+
# - set_graphics_state_parameters
|
114
|
+
# - close_subpath
|
115
|
+
# - set_flatness_tolerance
|
116
|
+
# - begin_inline_image_data
|
117
|
+
# - set_line_join_style
|
118
|
+
# - set_line_cap_style
|
119
|
+
# - set_cmyk_color_for_stroking
|
120
|
+
# - set_cmyk_color_for_nonstroking
|
121
|
+
# - append_line
|
122
|
+
# - begin_new_subpath
|
123
|
+
# - set_miter_limit
|
124
|
+
# - define_marked_content_point
|
125
|
+
# - end_path
|
126
|
+
# - save_graphics_state
|
127
|
+
# - restore_graphics_state
|
128
|
+
# - append_rectangle
|
129
|
+
# - set_rgb_color_for_stroking
|
130
|
+
# - set_rgb_color_for_nonstroking
|
131
|
+
# - set_color_rendering_intent
|
132
|
+
# - close_and_stroke_path
|
133
|
+
# - stroke_path
|
134
|
+
# - set_color_for_stroking
|
135
|
+
# - set_color_for_nonstroking
|
136
|
+
# - set_color_for_stroking_and_special
|
137
|
+
# - set_color_for_nonstroking_and_special
|
138
|
+
# - paint_area_with_shading_pattern
|
139
|
+
# - append_curved_segment_initial_point_replicated
|
140
|
+
# - set_line_width
|
141
|
+
# - set_clipping_path_with_nonzero
|
142
|
+
# - set_clipping_path_with_even_odd
|
143
|
+
# - append_curved_segment_final_point_replicated
|
144
|
+
#
|
145
|
+
# == Misc Callbacks
|
146
|
+
# - begin_compatibility_section
|
147
|
+
# - end_compatibility_section
|
148
|
+
# - begin_document
|
149
|
+
# - end_document
|
150
|
+
# - begin_page_container
|
151
|
+
# - end_page_container
|
152
|
+
# - begin_page
|
153
|
+
# - end_page
|
154
|
+
# - metadata
|
155
|
+
# - xml_metadata
|
156
|
+
# - page_count
|
157
|
+
# - begin_form_xobject
|
158
|
+
# - end_form_xobject
|
159
|
+
#
|
160
|
+
# == Resource Callbacks
|
161
|
+
#
|
162
|
+
# Each page can contain (or inherit) a range of resources required for the page,
|
163
|
+
# including things like fonts and images. The following callbacks may appear
|
164
|
+
# after begin_page if the relevant resources exist on a page:
|
165
|
+
#
|
166
|
+
# - resource_procset
|
167
|
+
# - resource_xobject
|
168
|
+
# - resource_extgstate
|
169
|
+
# - resource_colorspace
|
170
|
+
# - resource_pattern
|
171
|
+
# - resource_font
|
172
|
+
#
|
173
|
+
# In most cases, these callbacks associate a name with each resource, allowing it
|
174
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
175
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
176
|
+
# invoke_xobject "IM1".
|
177
|
+
#
|
178
|
+
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
179
|
+
# eventually be removed
|
180
|
+
class PagesStrategy< AbstractStrategy # :nodoc:
|
181
|
+
# Maps each PDF content-stream operator token to the name of the callback
# that is fired when that operator is encountered. Note that 'f' and 'F'
# intentionally share the same callback.
#
# The table is frozen so that it cannot be modified by accident at runtime.
OPERATORS = {
  'b'   => :close_fill_stroke,
  'B'   => :fill_stroke,
  'b*'  => :close_fill_stroke_with_even_odd,
  'B*'  => :fill_stroke_with_even_odd,
  'BDC' => :begin_marked_content_with_pl,
  'BI'  => :begin_inline_image,
  'BMC' => :begin_marked_content,
  'BT'  => :begin_text_object,
  'BX'  => :begin_compatibility_section,
  'c'   => :append_curved_segment,
  'cm'  => :concatenate_matrix,
  'CS'  => :set_stroke_color_space,
  'cs'  => :set_nonstroke_color_space,
  'd'   => :set_line_dash,
  'd0'  => :set_glyph_width,
  'd1'  => :set_glyph_width_and_bounding_box,
  'Do'  => :invoke_xobject,
  'DP'  => :define_marked_content_with_pl,
  'EI'  => :end_inline_image,
  'EMC' => :end_marked_content,
  'ET'  => :end_text_object,
  'EX'  => :end_compatibility_section,
  'f'   => :fill_path_with_nonzero,
  'F'   => :fill_path_with_nonzero,
  'f*'  => :fill_path_with_even_odd,
  'G'   => :set_gray_for_stroking,
  'g'   => :set_gray_for_nonstroking,
  'gs'  => :set_graphics_state_parameters,
  'h'   => :close_subpath,
  'i'   => :set_flatness_tolerance,
  'ID'  => :begin_inline_image_data,
  'j'   => :set_line_join_style,
  'J'   => :set_line_cap_style,
  'K'   => :set_cmyk_color_for_stroking,
  'k'   => :set_cmyk_color_for_nonstroking,
  'l'   => :append_line,
  'm'   => :begin_new_subpath,
  'M'   => :set_miter_limit,
  'MP'  => :define_marked_content_point,
  'n'   => :end_path,
  'q'   => :save_graphics_state,
  'Q'   => :restore_graphics_state,
  're'  => :append_rectangle,
  'RG'  => :set_rgb_color_for_stroking,
  'rg'  => :set_rgb_color_for_nonstroking,
  'ri'  => :set_color_rendering_intent,
  's'   => :close_and_stroke_path,
  'S'   => :stroke_path,
  'SC'  => :set_color_for_stroking,
  'sc'  => :set_color_for_nonstroking,
  'SCN' => :set_color_for_stroking_and_special,
  'scn' => :set_color_for_nonstroking_and_special,
  'sh'  => :paint_area_with_shading_pattern,
  'T*'  => :move_to_start_of_next_line,
  'Tc'  => :set_character_spacing,
  'Td'  => :move_text_position,
  'TD'  => :move_text_position_and_set_leading,
  'Tf'  => :set_text_font_and_size,
  'Tj'  => :show_text,
  'TJ'  => :show_text_with_positioning,
  'TL'  => :set_text_leading,
  'Tm'  => :set_text_matrix_and_text_line_matrix,
  'Tr'  => :set_text_rendering_mode,
  'Ts'  => :set_text_rise,
  'Tw'  => :set_word_spacing,
  'Tz'  => :set_horizontal_text_scaling,
  'v'   => :append_curved_segment_initial_point_replicated,
  'w'   => :set_line_width,
  'W'   => :set_clipping_path_with_nonzero,
  'W*'  => :set_clipping_path_with_even_odd,
  'y'   => :append_curved_segment_final_point_replicated,
  '\''  => :move_to_next_line_and_show_text,
  '"'   => :set_spacing_next_line_show_text,
}.freeze
|
256
|
+
# Symbolic name of this strategy. Always returns :pages, which is also the
# option key that #process checks before doing any work.
def self.to_sym
  :pages
end
|
259
|
+
################################################################################
|
260
|
+
# Begin processing the document
|
261
|
+
# Entry point for this strategy.
#
# Does nothing and returns false unless the :pages option is set. Otherwise
# fires begin_document (with the root object as its parameter), walks the
# page tree found under the root's :Pages entry, and finally fires
# end_document. The return value in that case is whatever the final
# callback invocation returns.
def process
  if options[:pages]
    callback(:begin_document, [root])
    page_tree = @ohash.object(root[:Pages])
    walk_pages(page_tree)
    callback(:end_document)
  else
    false
  end
end
|
268
|
+
private
|
269
|
+
################################################################################
|
270
|
+
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
271
|
+
# its content
|
272
|
+
# Recursively walks a node of the page tree.
#
# page - a dictionary whose :Type is either :Pages (an intermediate node
#        with :Kids) or :Page (a leaf). Any other type is silently ignored.
#
# For a :Pages node the begin/end_page_container callbacks wrap a recursive
# walk of every kid. For a :Page node the begin/end_page callbacks wrap the
# resource callbacks and the processing of the page's content stream(s).
# In both cases the node's own :Resources (if any) are pushed onto the
# resources stack for the duration of the walk so children can inherit them.
def walk_pages(page)
  case page[:Type]
  when :Pages
    callback(:begin_page_container, [page])
    container_res = @ohash.object(page[:Resources])
    resources.push(container_res) if container_res
    @ohash.object(page[:Kids]).each do |kid|
      walk_pages(@ohash.object(kid))
    end
    resources.pop if container_res
    callback(:end_page_container)
  when :Page
    callback(:begin_page, [page])
    page_res = @ohash.object(page[:Resources])
    resources.push(page_res) if page_res
    walk_resources(current_resources)

    # :Contents may be a single stream or an array of streams; normalise
    # to an array so both shapes are handled the same way below.
    resolved = @ohash.object(page[:Contents])
    contents = resolved.kind_of?(Array) ? resolved : [page[:Contents]]

    fonts = font_hash_from_resources(current_resources)

    if page.has_key?(:Contents) && page[:Contents]
      streams = contents.map { |content| @ohash.object(content) }
      content_stream(streams, fonts)
    end

    resources.pop if page_res
    callback(:end_page)
  end
end
|
305
|
+
################################################################################
|
306
|
+
# Retrieve the XObject for the supplied label and if it's a Form, walk it
|
307
|
+
# like a regular page content stream.
|
308
|
+
#
|
309
|
+
# Looks up the XObject registered under +label+ in the current resources
# and, if it is a Form XObject, processes its content stream between the
# begin_form_xobject / end_form_xobject callbacks.
#
# label - the resource name used by the invoke_xobject operator
#
# When the form carries its own :Resources dictionary it is pushed onto the
# resources stack (and reported through walk_resources) while the form is
# processed. Fonts are looked up in the merged current resources, so a form
# that relies on fonts inherited from the enclosing page (or that has no
# :Resources of its own) can still select them. A :Font entry in the form's
# own resources still takes precedence, because it is merged last.
#
# Does nothing when the label is unknown or the XObject is not a Form.
def walk_xobject_form(label)
  xobjects = @ohash.object(current_resources[:XObject]) || {}
  xobject  = @ohash.object(xobjects[label])

  if xobject && xobject.hash[:Subtype] == :Form
    callback(:begin_form_xobject)
    xobj_resources = @ohash.object(xobject.hash[:Resources])
    if xobj_resources
      resources.push xobj_resources
      walk_resources(xobj_resources)
    end
    # use the merged stack, mirroring how fonts are resolved for a page
    fonts = font_hash_from_resources(current_resources)
    content_stream(xobject, fonts)
    callback(:end_form_xobject)
    resources.pop if xobj_resources
  end
end
|
326
|
+
|
327
|
+
################################################################################
|
328
|
+
# Return a merged hash of all resources that are currently in scope (page tree, page and xobject).
|
329
|
+
#
|
330
|
+
# Flattens the resources stack into a single new Hash. Entries pushed later
# (deeper in the page tree) overwrite same-named keys from earlier entries.
# The merge is shallow and the stack itself is left untouched.
def current_resources
  resources.inject({}) { |merged, res| merged.merge!(res) }
end
|
337
|
+
################################################################################
|
338
|
+
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
339
|
+
# it contains
|
340
|
+
#
|
341
|
+
# Parses a content stream and fires the callback registered in OPERATORS
# for every operator it contains.
#
# instructions - a single item or an Array of items; PDF::Reader::Stream
#                items contribute their unfiltered data, everything else
#                is converted with to_s. All pieces are concatenated.
# fonts        - Hash of font label => font object, used to validate the
#                Tf operator and to convert shown text to UTF-8.
#
# Operands are accumulated until an operator token is read; they are then
# passed to the callback as its parameters and the accumulator is cleared.
#
# Raises MalformedPDFError when a font label is unknown, when text is shown
# before any font has been selected, or when the stream ends unexpectedly.
def content_stream(instructions, fonts = {})
  instructions = [instructions] unless instructions.kind_of?(Array)
  instructions = instructions.map { |ins|
    ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
  }.join
  buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
  parser       = Parser.new(buffer, @ohash)
  current_font = nil
  params       = []

  while (token = parser.parse_token(OPERATORS))
    if token.kind_of?(Token) and OPERATORS.has_key?(token)
      if OPERATORS[token] == :set_text_font_and_size
        current_font = params.first
        if fonts[current_font].nil?
          raise MalformedPDFError, "Unknown font #{current_font}"
        end
      end

      # handle special cases in response to certain operators
      if OPERATORS[token].to_s.include?("show_text")
        # convert any text to utf-8, but output the raw string if the user wants it
        if options[:raw_text]
          callback("#{OPERATORS[token]}_raw".to_sym, params)
        end
        font = fonts[current_font]
        # without this guard a missing Tf surfaced as NoMethodError on nil
        if font.nil?
          raise MalformedPDFError, "Text shown before a font was selected"
        end
        params = font.to_utf8(params)
      elsif token == "ID"
        # inline image data, first convert the current params into a more familiar hash
        map = {}
        params.each_slice(2) do |key, value|
          map[key] = value
        end
        params = [map, buffer.token]
      end

      callback(OPERATORS[token], params)

      if OPERATORS[token] == :invoke_xobject
        # clear before recursing: the nested stream fires its own callbacks
        xobject_label = params.first
        params.clear
        walk_xobject_form(xobject_label)
      else
        params.clear
      end
    else
      params << token
    end
  end
rescue EOFError
  raise MalformedPDFError, "End Of File while processing a content stream"
end
|
392
|
+
################################################################################
|
393
|
+
# Fires one resource_* callback for every entry found in a resources
# dictionary.
#
# resources - a Hash-like resources dictionary; anything that does not
#             respond to [] is ignored and nil is returned.
#
# The dictionary is first passed through resolve_references. The :ProcSet
# value is handed to the resource_procset callback as-is; the :XObject,
# :ExtGState, :ColorSpace and :Pattern dictionaries each produce one
# callback per entry with [name, object] as parameters; fonts are built via
# font_hash_from_resources and reported as [label, font].
def walk_resources(resources)
  return unless resources.respond_to?(:[])

  resources = resolve_references(resources)

  procset = resources[:ProcSet]
  callback(:resource_procset, procset) if procset

  # these categories are all reported the same way, in this fixed order
  [
    [:XObject,    :resource_xobject],
    [:ExtGState,  :resource_extgstate],
    [:ColorSpace, :resource_colorspace],
    [:Pattern,    :resource_pattern]
  ].each do |key, event|
    next unless resources[key]
    @ohash.object(resources[key]).each do |name, val|
      callback(event, [name, @ohash.object(val)])
    end
  end

  if resources[:Font]
    font_hash_from_resources(resources).each do |label, font|
      callback(:resource_font, [label, font])
    end
  end
end
|
439
|
+
################################################################################
|
440
|
+
# Convert any PDF::Reader::Reference objects into the real objects they refer to
|
441
|
+
# Recursively replaces every PDF::Reader::Reference reachable from +obj+
# with the object it points to (looked up through @ohash).
#
# - Stream:    its hash is resolved and assigned back; the same Stream
#              instance is returned.
# - Reference: the referenced object is fetched and resolved in turn.
# - Hash:      a new Hash with the same keys and resolved values.
# - Array:     a new Array of resolved items.
# - otherwise: the object is returned unchanged.
def resolve_references(obj)
  if obj.is_a?(PDF::Reader::Stream)
    obj.hash = resolve_references(obj.hash)
    obj
  elsif obj.is_a?(PDF::Reader::Reference)
    resolve_references(@ohash.object(obj))
  elsif obj.is_a?(Hash)
    obj.inject({}) do |resolved, (key, val)|
      resolved[key] = resolve_references(val)
      resolved
    end
  elsif obj.is_a?(Array)
    obj.map { |item| resolve_references(item) }
  else
    obj
  end
end
|
457
|
+
################################################################################
|
458
|
+
################################################################################
|
459
|
+
# Builds a Hash of font label => PDF::Reader::Font from the :Font entry of
# a resources dictionary.
#
# resources - a Hash-like resources dictionary. Anything that does not
#             respond to [] yields an empty Hash, as does a dictionary
#             without a :Font entry.
#
# Both the :Font dictionary and each font descriptor are resolved through
# @ohash before use.
def font_hash_from_resources(resources)
  return {} unless resources.respond_to?(:[])

  font_dict = @ohash.object(resources[:Font]) || {}
  font_dict.inject({}) do |fonts, (label, desc)|
    fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
    fonts
  end
end
|
469
|
+
# The stack of resource dictionaries currently in scope. Created lazily on
# first access; the same Array is returned on every subsequent call.
def resources
  @resources = [] unless @resources
  @resources
end
|
472
|
+
end
|
473
|
+
################################################################################
|
474
|
+
end
|
475
|
+
################################################################################
|
@@ -0,0 +1,225 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
################################################################################
|
28
|
+
# An internal PDF::Reader class that reads objects from the PDF file and converts
|
29
|
+
# them into usable Ruby objects (hashes, arrays, true, false, etc)
|
30
|
+
class Parser
|
31
|
+
################################################################################
|
32
|
+
# Create a new parser around a PDF::Reader::Buffer object
|
33
|
+
#
|
34
|
+
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
|
+
# ohash - a PDF::Reader::ObjectHash object that can return objects from the PDF file
|
36
|
+
# Stores the collaborators used by every other method of the parser.
#
# buffer - the token source that all reads go through
# ohash  - optional object lookup used to resolve indirect values such as
#          a stream's :Length; defaults to nil
def initialize(buffer, ohash = nil)
  @buffer, @ohash = buffer, ohash
end
|
40
|
+
################################################################################
|
41
|
+
# Reads the next token from the underlying buffer and converts it to an appropriate
|
42
|
+
# object
|
43
|
+
#
|
44
|
+
# operators - a hash of supported operators to read from the underlying buffer.
|
45
|
+
# Reads the next token from the buffer and converts it to a Ruby object.
#
# operators - a Hash whose keys are the operator strings that should be
#             recognised; a matching token is returned as a Token.
#
# Returns:
# - the token itself when it is a PDF::Reader::Reference or nil
# - a Symbol, Hash, Array or String for names, dictionaries, arrays and
#   (hex) strings, built by the matching helper method
# - true / false / nil for the "true", "false" and "null" keywords
# - a Token for structural keywords, closing delimiters and operators
# - a Float when the token contains a decimal point followed by a digit,
#   otherwise the result of to_i (which is 0 for non-numeric text)
def parse_token(operators = {})
  token = @buffer.token

  case token
  when PDF::Reader::Reference, nil then token
  when "/"     then pdf_name
  when "<<"    then dictionary
  when "["     then array
  when "("     then string
  when "<"     then hex_string
  when "true"  then true
  when "false" then false
  when "null"  then nil
  when "obj", "endobj", "stream", "endstream", ">>", "]", ">", ")"
    # structural markers are wrapped so callers can tell them from data
    Token.new(token)
  else
    if operators.has_key?(token) then Token.new(token)
    elsif token =~ /\d*\.\d/     then token.to_f
    else                              token.to_i
    end
  end
end
|
68
|
+
################################################################################
|
69
|
+
# Reads an entire PDF object from the buffer and returns it as a Ruby object.
|
70
|
+
# If the object is a content stream, returns both the stream and the dictionary
|
71
|
+
# that describes it
|
72
|
+
#
|
73
|
+
# id - the object ID to return
|
74
|
+
# gen - the object revision number to return
|
75
|
+
# Reads one "id gen obj ..." definition from the buffer.
#
# id  - the object ID expected at the current buffer position
# gen - the generation number expected after the ID
#
# The header (id, gen and the "obj" keyword) is verified through the Error
# assertion helpers. Returns the parsed object, or the result of #stream
# when the token following the object is "stream".
def object(id, gen)
  [id, gen].each { |expected| Error.assert_equal(parse_token, expected) }
  Error.str_assert(parse_token, "obj")

  body    = parse_token
  trailer = parse_token
  trailer == "stream" ? stream(body) : body
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
################################################################################
|
92
|
+
# reads a PDF dict from the buffer and converts it to a Ruby Hash.
|
93
|
+
# Reads key/value pairs up to the closing ">>" token and returns them as a
# Hash. The opening "<<" has already been consumed by the caller.
#
# Raises MalformedPDFError when a key is not a Symbol (i.e. not a PDF
# name). A ">>" token in value position is rejected through
# Error.str_assert_not.
def dictionary
  dict = {}

  key = parse_token
  until key.kind_of?(Token) && key == ">>"
    unless key.kind_of?(Symbol)
      raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name"
    end

    value = parse_token
    Error.str_assert_not(value, ">>") if value.kind_of?(Token)
    dict[key] = value

    key = parse_token
  end

  dict
end
|
108
|
+
################################################################################
|
109
|
+
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
110
|
+
# Reads the token following a "/" and returns it as a Symbol.
#
# Every "#xx" sequence (two hexadecimal digits) is replaced by the single
# character with that code. The replacement is done in one pass, so the
# output of one escape can never combine with the text after it to form a
# second escape (e.g. "#2334" decodes to "#34", not "4"). The token
# returned by the buffer is not modified.
def pdf_name
  tok = @buffer.token
  tok.gsub(/#([A-Fa-f0-9]{2})/) { $1.hex.chr }.to_sym
end
|
118
|
+
################################################################################
|
119
|
+
# reads a PDF array from the buffer and converts it to a Ruby Array.
|
120
|
+
# Collects parsed tokens into an Array until the closing "]" Token is read.
# The opening "[" has already been consumed by the caller. Only a Token
# equal to "]" ends the array; a plain String "]" is treated as data.
#
# NOTE(review): nothing here stops the loop if the "]" never arrives -
# confirm how the buffer behaves at end of input.
def array
  items = []

  until (item = parse_token).kind_of?(Token) && item == "]"
    items << item
  end

  items
end
|
131
|
+
################################################################################
|
132
|
+
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
133
|
+
# Reads tokens up to the closing ">" and decodes the hexadecimal digits
# they contain into a String, one character per pair of digits. The
# opening "<" has already been consumed by the caller.
#
# An odd number of digits is padded with a trailing "0" before decoding.
#
# Raises MalformedPDFError if the buffer runs out of tokens before the
# closing ">" is found.
def hex_string
  str = ""

  loop do
    token = @buffer.token
    break if token == ">"
    # a nil token used to blow up with a TypeError on the append below
    raise MalformedPDFError, "unterminated hex string" if token.nil?
    str << token
  end

  # add a missing digit if required, as required by the spec
  str << "0" unless str.size % 2 == 0
  str.scan(/../).map { |pair| pair.hex.chr }.join
end
|
146
|
+
################################################################################
|
147
|
+
# Reads a PDF String from the buffer and converts it to a Ruby String
|
148
|
+
# Reads a literal string token and returns it with escape sequences and
# line endings decoded. The opening "(" has already been consumed by the
# caller; an immediately following ")" yields the empty string.
#
# Decoding rules:
# - \n \r \t \b \f \( \) \\ produce the corresponding single character
# - a backslash followed by one to three octal digits (0-7) produces the
#   character with that code; values above 255 keep only the low 8 bits
#   instead of failing
# - a backslash followed by an end-of-line (LF, CR or CR LF) is a line
#   continuation and produces nothing
# - a backslash followed by any other character is dropped and the
#   character itself is kept
# - a lone trailing backslash is kept as-is
# - unescaped CR, CR LF and LF CR are each normalised to a single LF
def string
  str = @buffer.token
  return "" if str == ")"
  Error.assert_equal(parse_token, ")")

  ret = ""
  idx = 0

  while idx < str.size
    chr  = str[idx, 1]
    jump = 1

    if chr == "\\"
      jump = 2
      case str[idx + 1, 1]
      when ""   then jump = 1
      when "n"  then chr = "\n"
      when "r"  then chr = "\r"
      when "t"  then chr = "\t"
      when "b"  then chr = "\b"
      when "f"  then chr = "\f"
      when "("  then chr = "("
      when ")"  then chr = ")"
      when "\\" then chr = "\\"
      when "\n"
        chr = ""
      when "\r"
        # continuation written with CR or CR LF; swallow the whole EOL
        chr  = ""
        jump = 3 if str[idx + 2, 1] == "\n"
      else
        # greedy: take up to three leading octal digits
        octal = str[idx + 1, 3][/\A[0-7]{1,3}/]
        if octal
          jump = octal.size + 1
          chr  = (octal.oct & 0xFF).chr
        else
          # unknown escape: drop the backslash, keep what follows
          jump = 1
          chr  = ""
        end
      end
    elsif chr == "\r" && str[idx + 1, 1] == "\n"
      chr  = "\n"
      jump = 2
    elsif chr == "\n" && str[idx + 1, 1] == "\r"
      chr  = "\n"
      jump = 2
    elsif chr == "\r"
      chr = "\n"
    end

    ret << chr
    idx += jump
  end

  ret
end
|
205
|
+
################################################################################
|
206
|
+
# Reads the contents of a PDF stream and returns it as a PDF::Reader::Stream.
|
207
|
+
# Reads the raw data that belongs to a stream dictionary and wraps both in
# a PDF::Reader::Stream.
#
# dict - the stream's dictionary; it must contain a :Length key
#
# The length is resolved through @ohash when one was supplied; without an
# ohash the dictionary value is used directly, with nil treated as 0. After
# the data has been read, the "endstream" and "endobj" keywords are
# verified through Error.str_assert.
#
# Raises MalformedPDFError when the dictionary has no :Length key.
def stream(dict)
  unless dict.has_key?(:Length)
    raise MalformedPDFError, "PDF malformed, missing stream length"
  end

  length = @ohash ? @ohash.object(dict[:Length]) : (dict[:Length] || 0)
  data   = @buffer.read(length, :skip_eol => true)

  ["endstream", "endobj"].each do |keyword|
    Error.str_assert(parse_token, keyword)
  end

  PDF::Reader::Stream.new(dict, data)
end
|
221
|
+
################################################################################
|
222
|
+
end
|
223
|
+
################################################################################
|
224
|
+
end
|
225
|
+
################################################################################
|