pdf-reader 0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,289 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'stringio'
26
+
27
+ class PDF::Reader
28
+ ################################################################################
29
+ # Walks the PDF file and calls the appropriate callback methods when something of interest is
30
+ # found.
31
+ #
32
+ # The callback methods should exist on the receiver object passed into the constructor. Whenever
33
+ # some content is found that will trigger a callback, the receiver is checked to see if the callback
34
+ # is defined.
35
+ #
36
+ # If it is defined it will be called. If not, processing will continue.
37
+ #
38
+ # = Available Callbacks
39
+ # The following callbacks are available and should be methods defined on your receiver class. Only
40
+ # implement the ones you need - the rest will be ignored.
41
+ #
42
+ # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
43
+ # parameters, or where you don't need them, the *params argument can be left off. Some example callback
44
+ # method definitions are:
45
+ #
46
+ # def begin_document
47
+ # def end_page
48
+ # def show_text(string, *params)
49
+ # def fill_stroke(*params)
50
+ #
51
+ # You should be able to infer the basic command the callback is reporting based on the name. For
52
+ # further experimentation, define the callback with just a *params parameter, then print out the
53
+ # contents of the array using something like:
54
+ #
55
+ # puts params.inspect
56
+ #
57
+ # == Text Callbacks
58
+ # - end_text_object
59
+ # - move_to_start_of_next_line
60
+ # - set_character_spacing
61
+ # - move_text_position
62
+ # - move_text_position_and_set_leading
63
+ # - set_text_font_and_size
64
+ # - show_text
65
+ # - show_text_with_positioning
66
+ # - set_text_leading
67
+ # - set_text_matrix_and_text_line_matrix
68
+ # - set_text_rendering_mode
69
+ # - set_text_rise
70
+ # - set_word_spacing
71
+ # - set_horizontal_text_scaling
72
+ # - move_to_next_line_and_show_text
73
+ # - set_spacing_next_line_show_text
74
+ #
75
+ # == Graphics Callbacks
76
+ # - close_fill_stroke
77
+ # - fill_stroke
78
+ # - close_fill_stroke_with_even_odd
79
+ # - fill_stroke_with_even_odd
80
+ # - begin_marked_content_with_pl
81
+ # - begin_inline_image
82
+ # - begin_marked_content
83
+ # - begin_text_object
84
+ # - append_curved_segment
85
+ # - concatenate_matrix
86
+ # - set_stroke_color_space
87
+ # - set_nonstroke_color_space
88
+ # - set_line_dash
89
+ # - set_glyph_width
90
+ # - set_glyph_width_and_bounding_box
91
+ # - invoke_xobject
92
+ # - define_marked_content_with_pl
93
+ # - end_inline_image
94
+ # - end_marked_content
95
+ # - fill_path_with_nonzero
96
+ # - fill_path_with_nonzero
97
+ # - fill_path_with_even_odd
98
+ # - set_gray_for_stroking
99
+ # - set_gray_for_nonstroking
100
+ # - set_graphics_state_parameters
101
+ # - close_subpath
102
+ # - set_flatness_tolerance
103
+ # - begin_inline_image_data
104
+ # - set_line_join_style
105
+ # - set_line_cap_style
106
+ # - set_cmyk_color_for_stroking
107
+ # - set_cmyk_color_for_nonstroking
108
+ # - append_line
109
+ # - begin_new_subpath
110
+ # - set_miter_limit
111
+ # - define_marked_content_point
112
+ # - end_path
113
+ # - save_graphics_state
114
+ # - restore_graphics_state
115
+ # - append_rectangle
116
+ # - set_rgb_color_for_stroking
117
+ # - set_rgb_color_for_nonstroking
118
+ # - set_color_rendering_intent
119
+ # - close_and_stroke_path
120
+ # - stroke_path
121
+ # - set_color_for_stroking
122
+ # - set_color_for_nonstroking
123
+ # - set_color_for_stroking_and_special
124
+ # - set_color_for_nonstroking_and_special
125
+ # - paint_area_with_shading_pattern
126
+ # - append_curved_segment_initial_point_replicated
127
+ # - set_line_width
128
+ # - set_clipping_path_with_nonzero
129
+ # - set_clipping_path_with_even_odd
130
+ # - append_curved_segment_final_point_replicated
131
+ #
132
+ # == Misc Callbacks
133
+ # - begin_compatibility_section
134
+ # - end_compatibility_section
135
+ # - begin_document
136
+ # - end_document
137
+ # - begin_page_container
138
+ # - end_page_container
139
+ # - begin_page
140
+ # - end_page
141
+ class Content
142
# Maps every PDF content-stream operator to the name of the receiver
# callback used to report it. The table is frozen so the shared lookup
# data cannot be modified by accident at runtime.
OPERATORS = {
  'b'   => :close_fill_stroke,
  'B'   => :fill_stroke,
  'b*'  => :close_fill_stroke_with_even_odd,
  'B*'  => :fill_stroke_with_even_odd,
  'BDC' => :begin_marked_content_with_pl,
  'BI'  => :begin_inline_image,
  'BMC' => :begin_marked_content,
  'BT'  => :begin_text_object,
  'BX'  => :begin_compatibility_section,
  'c'   => :append_curved_segment,
  'cm'  => :concatenate_matrix,
  'CS'  => :set_stroke_color_space,
  'cs'  => :set_nonstroke_color_space,
  'd'   => :set_line_dash,
  'd0'  => :set_glyph_width,
  'd1'  => :set_glyph_width_and_bounding_box,
  'Do'  => :invoke_xobject,
  'DP'  => :define_marked_content_with_pl,
  'EI'  => :end_inline_image,
  'EMC' => :end_marked_content,
  'ET'  => :end_text_object,
  'EX'  => :end_compatibility_section,
  # 'f' and 'F' intentionally report the same callback.
  'f'   => :fill_path_with_nonzero,
  'F'   => :fill_path_with_nonzero,
  'f*'  => :fill_path_with_even_odd,
  'G'   => :set_gray_for_stroking,
  'g'   => :set_gray_for_nonstroking,
  'gs'  => :set_graphics_state_parameters,
  'h'   => :close_subpath,
  'i'   => :set_flatness_tolerance,
  'ID'  => :begin_inline_image_data,
  'j'   => :set_line_join_style,
  'J'   => :set_line_cap_style,
  'K'   => :set_cmyk_color_for_stroking,
  'k'   => :set_cmyk_color_for_nonstroking,
  'l'   => :append_line,
  'm'   => :begin_new_subpath,
  'M'   => :set_miter_limit,
  'MP'  => :define_marked_content_point,
  'n'   => :end_path,
  'q'   => :save_graphics_state,
  'Q'   => :restore_graphics_state,
  're'  => :append_rectangle,
  'RG'  => :set_rgb_color_for_stroking,
  'rg'  => :set_rgb_color_for_nonstroking,
  'ri'  => :set_color_rendering_intent,
  's'   => :close_and_stroke_path,
  'S'   => :stroke_path,
  'SC'  => :set_color_for_stroking,
  'sc'  => :set_color_for_nonstroking,
  'SCN' => :set_color_for_stroking_and_special,
  'scn' => :set_color_for_nonstroking_and_special,
  'sh'  => :paint_area_with_shading_pattern,
  'T*'  => :move_to_start_of_next_line,
  'Tc'  => :set_character_spacing,
  'Td'  => :move_text_position,
  'TD'  => :move_text_position_and_set_leading,
  'Tf'  => :set_text_font_and_size,
  'Tj'  => :show_text,
  'TJ'  => :show_text_with_positioning,
  'TL'  => :set_text_leading,
  'Tm'  => :set_text_matrix_and_text_line_matrix,
  'Tr'  => :set_text_rendering_mode,
  'Ts'  => :set_text_rise,
  'Tw'  => :set_word_spacing,
  'Tz'  => :set_horizontal_text_scaling,
  'v'   => :append_curved_segment_initial_point_replicated,
  'w'   => :set_line_width,
  'W'   => :set_clipping_path_with_nonzero,
  'W*'  => :set_clipping_path_with_even_odd,
  'y'   => :append_curved_segment_final_point_replicated,
  '\''  => :move_to_next_line_and_show_text,
  '"'   => :set_spacing_next_line_show_text
}.freeze
217
+ ################################################################################
218
+ # Create a new PDF::Reader::Content object to process the contents of PDF file
219
+ # - receiver - an object containing the required callback methods
220
+ # - xref - a PDF::Reader::Xref object that contains references to all the objects in a PDF file
221
# Remembers the two collaborators: receiver is the object the callbacks
# are sent to, xref is used to look up referenced objects.
def initialize(receiver, xref)
  @receiver, @xref = receiver, xref
end
225
+ ################################################################################
226
+ # Begin processing the document
227
# Reports :begin_document (with root as its argument), walks the page tree
# found under root['Pages'], then reports :end_document.
def document(root)
  callback(:begin_document, [root])

  page_tree = @xref.object(root['Pages'])
  walk_pages(page_tree)

  callback(:end_document)
end
232
+ ################################################################################
233
+ # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
234
+ # its content
235
# page is a dictionary whose 'Type' entry selects the behaviour:
# - "Pages": reports begin/end_page_container and recurses into each entry
#   of 'Kids' (each one resolved through xref first)
# - "Page":  reports begin/end_page and feeds every content stream to
#   content_stream
# Any other type is ignored.
def walk_pages(page)
  if page['Type'] == "Pages"
    callback(:begin_page_container, [page])
    page['Kids'].each { |child| walk_pages(@xref.object(child)) }
    callback(:end_page_container)
  elsif page['Type'] == "Page"
    callback(:begin_page, [page])
    @page = page
    @params = []

    # 'Contents' may hold a single stream reference or an array of them.
    # Array() wraps a lone reference instead of calling #to_a on it, which
    # plain objects do not implement.
    contents = page['Contents']
    if page.has_key?('Contents') && contents
      Array(contents).each do |cstream|
        content_stream(@xref.object(cstream))
      end
    end

    callback(:end_page)
  end
end
252
+ ################################################################################
253
+ # Reads a PDF content stream and calls all the appropriate callback methods for the operators
254
+ # it contains
255
# Tokens are accumulated in @params until an operator token (a Token that is
# a key of OPERATORS) is read; the matching callback is then invoked with the
# accumulated operands and the operand list is emptied for the next operator.
def content_stream(instructions)
  @buffer = Buffer.new(StringIO.new(instructions))
  @parser = Parser.new(@buffer, @xref)
  @params = [] if @params.nil?

  until @buffer.eof?
    loop do
      token = @parser.parse_token(OPERATORS)

      # Anything that is not a known operator is an operand: keep collecting.
      unless token.kind_of?(Token) && OPERATORS.has_key?(token)
        @params << token
        next
      end

      resolve_resources
      callback(OPERATORS[token], @params)
      @params.clear
      break
    end
  end
rescue EOFError
  # An EOFError raised while reading is swallowed and simply ends processing.
end
276
+ ################################################################################
277
# Hook called by content_stream right before an operator callback is
# dispatched. It is an unimplemented placeholder and does nothing.
def resolve_resources
  # FIXME TODO
  nil
end
280
+ ################################################################################
281
+ # calls the name callback method on the receiver class with params as the arguments
282
# name   - Symbol naming the callback method
# params - Array of arguments, splatted into the call
# Returns whatever the receiver's method returns, or nil when the receiver
# does not respond to name (the callback is then silently skipped).
def callback(name, params = [])
  return unless @receiver.respond_to?(name)

  @receiver.send(name, *params)
end
285
+ ################################################################################
286
+ end
287
+ ################################################################################
288
+ end
289
+ ################################################################################
@@ -0,0 +1,53 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that helps to verify various parts of the PDF file
29
+ # are valid
30
+ class Error
31
+ ################################################################################
32
# Raises MalformedPDFError unless lvalue equals rvalue.
#
# lvalue - the value read from the file
# rvalue - the value that was expected
# chars  - when given, lvalue must be a String and only its first chars
#          characters are compared
def self.str_assert(lvalue, rvalue, chars = nil)
  if chars
    unless lvalue.kind_of?(String)
      raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead"
    end

    lvalue = lvalue[0, chars]
  end

  if lvalue != rvalue
    raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead"
  end
end
37
+ ################################################################################
38
# Raises MalformedPDFError when lvalue equals rvalue, i.e. when a value that
# must NOT appear is found.
#
# lvalue - the value read from the file
# rvalue - the forbidden value
# chars  - when given, lvalue must be a String and only its first chars
#          characters are compared
def self.str_assert_not(lvalue, rvalue, chars = nil)
  if chars && !lvalue.kind_of?(String)
    raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead"
  end

  lvalue = lvalue[0, chars] if chars

  # The message states that the value was unexpected; the previous wording
  # ("expected 'X' but found X") described the opposite check.
  raise MalformedPDFError, "PDF malformed, unexpected '#{rvalue}' found" if lvalue == rvalue
end
43
+ ################################################################################
44
# Raises MalformedPDFError unless lvalue (the value found) equals rvalue
# (the value expected). Returns nil when they match.
def self.assert_equal(lvalue, rvalue)
  if lvalue != rvalue
    raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead"
  end
end
47
+ ################################################################################
48
+ end
49
+ ################################################################################
50
# Raised when the content of a PDF file does not match what was expected.
class MalformedPDFError < RuntimeError
end
51
# NOTE(review): presumably raised when a file uses a PDF feature the library
# does not handle - confirm against the call sites.
class UnsupportedFeatureError < RuntimeError
end
52
+ end
53
+ ################################################################################
@@ -0,0 +1,116 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'pathname'
26
+
27
+
28
+ class PDF::Reader
29
+ ################################################################################
30
+ class Explore
31
+ ################################################################################
32
# Opens the PDF file called name and parses it with this class as the
# receiver. Returns whatever PDF::Reader#parse returns.
#
# The file is opened in binary mode ('rb'): PDF files contain binary data,
# and text mode can translate line endings or apply a text encoding.
#
# NOTE(review): the file handle is intentionally not closed here; the
# explorer presumably keeps reading objects from it through xref after
# parse returns - confirm before switching to the block form of File.open.
def self.file(name)
  PDF::Reader.new.parse(File.open(name, 'rb'), self)
end
35
+ ################################################################################
36
# Keeps xref for later lookups and starts at the root path '/'.
# receiver is accepted but not used here.
def initialize(receiver, xref)
  @xref, @pwd = xref, '/'
end
40
+ ################################################################################
41
# Stores root as the top of the tree that ls navigates and returns this
# explorer itself.
def document(root)
  @root = root
  self
end
45
+ ################################################################################
46
# Prints a listing of obj:
# - Hash:  one "key: Class[: preview]" line per entry
# - Array: one "index: Class[: preview]" line per element
# - other: a single line produced by output_child
#
# For each hash key, a constant with the key's name (and the key as value)
# is defined on Explore if it is not defined yet. Keys that are not valid
# constant names (e.g. ones starting with a lowercase letter) are skipped,
# because const_defined?/const_set raise NameError for them.
def output_parent(obj)
  case obj
  when Hash
    obj.each do |key, value|
      print key.to_s
      output_child(value)
      print "\n"

      const_name = key.to_s
      if const_name =~ /\A[A-Z][A-Za-z0-9_]*\z/ && !Explore.const_defined?(const_name)
        Explore.const_set(const_name, key)
      end
    end
  when Array
    obj.each_with_index do |item, index|
      # output_child already starts with ": ", so only the index is printed.
      print index.to_s
      output_child(item)
      print "\n"
    end
  else
    output_child(obj)
    print "\n"
  end
end
60
+ ################################################################################
61
# Prints ": <class of obj>", followed by a short preview for simple values:
# - Float:  the number itself
# - String: the first 20 characters, with every newline replaced by a space
#           so the preview always stays on one line
def output_child(obj)
  print ": #{obj.class}"

  case obj
  when Float
    print ": #{obj}"
  when String
    print ": #{obj[0, 20].gsub(/\n/, ' ')}"
  end
end
71
+ ################################################################################
72
# Changes the current path.
#
# path - anything responding to to_s; a leading "/" makes it absolute,
#        otherwise it is taken relative to the current path
#
# Both forms are normalised with Pathname#cleanpath, so "..", "." and
# redundant or trailing slashes never end up in the stored path.
# Returns the new current path as a String.
def cd(path)
  path = path.to_s
  target = path[0, 1] == "/" ? path : @pwd + '/' + path

  @pwd = Pathname.new(target).cleanpath.to_s
end
81
+ ################################################################################
82
# Returns the current path, as last set by cd (initially '/').
def pwd
  @pwd
end
85
+ ################################################################################
86
# Lists the object at the current path, or at entry below the current path.
#
# entry - optional extra path component (converted with to_s, so Symbols
#         and Integers work as well as Strings)
#
# Walks from the root object down one path component at a time: Hash levels
# are indexed by key, Array levels by integer index, and any Reference met on
# the way is resolved through xref. If a component cannot be followed
# (unknown key, non-numeric or out-of-range index, or an attempt to descend
# into a value that is neither Hash nor Array) "invalid path at <component>"
# is printed and nil is returned.
#
# On success the object is printed with output_parent and a
# "<current path>: <class>" summary String is returned.
def ls(entry = nil)
  parts = @pwd.split('/')
  obj = @root

  parts.shift if parts[0] == ""
  parts.push(entry.to_s) if entry

  parts.each do |part|
    case obj
    when Hash
      unless obj.has_key?(part)
        puts "invalid path at #{part}"
        return
      end

      obj = obj[part]
    when Array
      index = part =~ /\A-?\d+\z/ ? part.to_i : nil

      if index.nil? || index >= obj.size || index < -obj.size
        puts "invalid path at #{part}"
        return
      end

      obj = obj[index]
    else
      # A leaf value has nothing below it to descend into.
      puts "invalid path at #{part}"
      return
    end

    obj = @xref.object(obj) if obj.kind_of?(Reference)
  end

  output_parent(obj)
  "#{@pwd}: #{obj.class}"
end
112
+ ################################################################################
113
+ end
114
+ ################################################################################
115
+ end
116
+ ################################################################################