pdf-reader 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,289 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'stringio'
26
+
27
+ class PDF::Reader
28
+ ################################################################################
29
+ # Walks the PDF file and calls the appropriate callback methods when something of interest is
30
+ # found.
31
+ #
32
+ # The callback methods should exist on the receiver object passed into the constructor. Whenever
33
+ # some content is found that will trigger a callback, the receiver is checked to see if the callback
34
+ # is defined.
35
+ #
36
+ # If it is defined it will be called. If not, processing will continue.
37
+ #
38
+ # = Available Callbacks
39
+ # The following callbacks are available and should be methods defined on your receiver class. Only
40
+ # implement the ones you need - the rest will be ignored.
41
+ #
42
+ # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
43
+ # parameters, or where you don't need them, the *params argument can be left off. Some example callback
44
+ # method definitions are:
45
+ #
46
+ # def begin_document
47
+ # def end_page
48
+ # def show_text(string, *params)
49
+ # def fill_stroke(*params)
50
+ #
51
+ # You should be able to infer the basic command the callback is reporting based on the name. For
52
+ # further experimentation, define the callback with just a *params parameter, then print out the
53
+ # contents of the array using something like:
54
+ #
55
+ # puts params.inspect
56
+ #
57
+ # == Text Callbacks
58
+ # - end_text_object
59
+ # - move_to_start_of_next_line
60
+ # - set_character_spacing
61
+ # - move_text_position
62
+ # - move_text_position_and_set_leading
63
+ # - set_text_font_and_size
64
+ # - show_text
65
+ # - show_text_with_positioning
66
+ # - set_text_leading
67
+ # - set_text_matrix_and_text_line_matrix
68
+ # - set_text_rendering_mode
69
+ # - set_text_rise
70
+ # - set_word_spacing
71
+ # - set_horizontal_text_scaling
72
+ # - move_to_next_line_and_show_text
73
+ # - set_spacing_next_line_show_text
74
+ #
75
+ # == Graphics Callbacks
76
+ # - close_fill_stroke
77
+ # - fill_stroke
78
+ # - close_fill_stroke_with_even_odd
79
+ # - fill_stroke_with_even_odd
80
+ # - begin_marked_content_with_pl
81
+ # - begin_inline_image
82
+ # - begin_marked_content
83
+ # - begin_text_object
84
+ # - append_curved_segment
85
+ # - concatenate_matrix
86
+ # - set_stroke_color_space
87
+ # - set_nonstroke_color_space
88
+ # - set_line_dash
89
+ # - set_glyph_width
90
+ # - set_glyph_width_and_bounding_box
91
+ # - invoke_xobject
92
+ # - define_marked_content_with_pl
93
+ # - end_inline_image
94
+ # - end_marked_content
95
+ # - fill_path_with_nonzero
96
+ # - fill_path_with_nonzero
97
+ # - fill_path_with_even_odd
98
+ # - set_gray_for_stroking
99
+ # - set_gray_for_nonstroking
100
+ # - set_graphics_state_parameters
101
+ # - close_subpath
102
+ # - set_flatness_tolerance
103
+ # - begin_inline_image_data
104
+ # - set_line_join_style
105
+ # - set_line_cap_style
106
+ # - set_cmyk_color_for_stroking
107
+ # - set_cmyk_color_for_nonstroking
108
+ # - append_line
109
+ # - begin_new_subpath
110
+ # - set_miter_limit
111
+ # - define_marked_content_point
112
+ # - end_path
113
+ # - save_graphics_state
114
+ # - restore_graphics_state
115
+ # - append_rectangle
116
+ # - set_rgb_color_for_stroking
117
+ # - set_rgb_color_for_nonstroking
118
+ # - set_color_rendering_intent
119
+ # - close_and_stroke_path
120
+ # - stroke_path
121
+ # - set_color_for_stroking
122
+ # - set_color_for_nonstroking
123
+ # - set_color_for_stroking_and_special
124
+ # - set_color_for_nonstroking_and_special
125
+ # - paint_area_with_shading_pattern
126
+ # - append_curved_segment_initial_point_replicated
127
+ # - set_line_width
128
+ # - set_clipping_path_with_nonzero
129
+ # - set_clipping_path_with_even_odd
130
+ # - append_curved_segment_final_point_replicated
131
+ #
132
+ # == Misc Callbacks
133
+ # - begin_compatibility_section
134
+ # - end_compatibility_section
135
+ # - begin_document
136
+ # - end_document
137
+ # - begin_page_container
138
+ # - end_page_container
139
+ # - begin_page
140
+ # - end_page
141
class Content
  # Maps each content stream operator to the name of the receiver callback
  # that is invoked when that operator is encountered.
  OPERATORS = {
    'b'   => :close_fill_stroke,
    'B'   => :fill_stroke,
    'b*'  => :close_fill_stroke_with_even_odd,
    'B*'  => :fill_stroke_with_even_odd,
    'BDC' => :begin_marked_content_with_pl,
    'BI'  => :begin_inline_image,
    'BMC' => :begin_marked_content,
    'BT'  => :begin_text_object,
    'BX'  => :begin_compatibility_section,
    'c'   => :append_curved_segment,
    'cm'  => :concatenate_matrix,
    'CS'  => :set_stroke_color_space,
    'cs'  => :set_nonstroke_color_space,
    'd'   => :set_line_dash,
    'd0'  => :set_glyph_width,
    'd1'  => :set_glyph_width_and_bounding_box,
    'Do'  => :invoke_xobject,
    'DP'  => :define_marked_content_with_pl,
    'EI'  => :end_inline_image,
    'EMC' => :end_marked_content,
    'ET'  => :end_text_object,
    'EX'  => :end_compatibility_section,
    'f'   => :fill_path_with_nonzero,
    'F'   => :fill_path_with_nonzero,
    'f*'  => :fill_path_with_even_odd,
    'G'   => :set_gray_for_stroking,
    'g'   => :set_gray_for_nonstroking,
    'gs'  => :set_graphics_state_parameters,
    'h'   => :close_subpath,
    'i'   => :set_flatness_tolerance,
    'ID'  => :begin_inline_image_data,
    'j'   => :set_line_join_style,
    'J'   => :set_line_cap_style,
    'K'   => :set_cmyk_color_for_stroking,
    'k'   => :set_cmyk_color_for_nonstroking,
    'l'   => :append_line,
    'm'   => :begin_new_subpath,
    'M'   => :set_miter_limit,
    'MP'  => :define_marked_content_point,
    'n'   => :end_path,
    'q'   => :save_graphics_state,
    'Q'   => :restore_graphics_state,
    're'  => :append_rectangle,
    'RG'  => :set_rgb_color_for_stroking,
    'rg'  => :set_rgb_color_for_nonstroking,
    'ri'  => :set_color_rendering_intent,
    's'   => :close_and_stroke_path,
    'S'   => :stroke_path,
    'SC'  => :set_color_for_stroking,
    'sc'  => :set_color_for_nonstroking,
    'SCN' => :set_color_for_stroking_and_special,
    'scn' => :set_color_for_nonstroking_and_special,
    'sh'  => :paint_area_with_shading_pattern,
    'T*'  => :move_to_start_of_next_line,
    'Tc'  => :set_character_spacing,
    'Td'  => :move_text_position,
    'TD'  => :move_text_position_and_set_leading,
    'Tf'  => :set_text_font_and_size,
    'Tj'  => :show_text,
    'TJ'  => :show_text_with_positioning,
    'TL'  => :set_text_leading,
    'Tm'  => :set_text_matrix_and_text_line_matrix,
    'Tr'  => :set_text_rendering_mode,
    'Ts'  => :set_text_rise,
    'Tw'  => :set_word_spacing,
    'Tz'  => :set_horizontal_text_scaling,
    'v'   => :append_curved_segment_initial_point_replicated,
    'w'   => :set_line_width,
    'W'   => :set_clipping_path_with_nonzero,
    'W*'  => :set_clipping_path_with_even_odd,
    'y'   => :append_curved_segment_final_point_replicated,
    '\''  => :move_to_next_line_and_show_text,
    '"'   => :set_spacing_next_line_show_text,
  }
  ################################################################################
  # Create a new PDF::Reader::Content object to process the contents of PDF file
  # - receiver - an object containing the required callback methods
  # - xref - a PDF::Reader::Xref object that contains references to all the objects in a PDF file
  def initialize(receiver, xref)
    @receiver = receiver
    @xref = xref
  end
  ################################################################################
  # Begin processing the document.
  # - root - the document's root dictionary; its 'Pages' entry is resolved
  #   through the xref and used as the starting point of the page walk.
  #
  # Fires begin_document (with root as its only argument) before the walk and
  # end_document after it.
  def document(root)
    callback(:begin_document, [root])
    walk_pages(@xref.object(root['Pages']))
    callback(:end_document)
  end
  ################################################################################
  # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
  # its content.
  # - page - a dictionary whose 'Type' is either "Pages" (a container that is
  #   descended into recursively via its 'Kids') or "Page" (a leaf whose
  #   'Contents' are parsed). Any other type is ignored.
  def walk_pages(page)
    if page['Type'] == "Pages"
      callback(:begin_page_container, [page])
      # Array() turns a missing 'Kids' entry into an empty list instead of
      # failing on nil.
      Array(page['Kids']).each { |child| walk_pages(@xref.object(child)) }
      callback(:end_page_container)
    elsif page['Type'] == "Page"
      callback(:begin_page, [page])
      @page = page
      # Operands never carry over from one page to the next.
      @params = []

      # 'Contents' may be absent, a single stream reference or an array of
      # them. Array() covers all three; calling to_a directly fails for a single
      # reference on Ruby versions where Object#to_a no longer exists.
      Array(page['Contents']).each do |cstream|
        content_stream(@xref.object(cstream))
      end

      callback(:end_page)
    end
  end
  ################################################################################
  # Reads a PDF content stream and calls all the appropriate callback methods for the operators
  # it contains.
  # - instructions - the raw content stream, as a string
  #
  # Tokens are collected in @params until an operator listed in OPERATORS is
  # read; the matching callback is then fired with the collected operands and
  # the list is emptied. Operands still pending when the stream ends stay in
  # @params, so they are seen by the next stream of the same page.
  def content_stream(instructions)
    @buffer = Buffer.new(StringIO.new(instructions))
    @parser = Parser.new(@buffer, @xref)
    @params ||= []

    until @buffer.eof?
      loop do
        token = @parser.parse_token(OPERATORS)

        if token.kind_of?(Token) && OPERATORS.has_key?(token)
          resolve_resources
          callback(OPERATORS[token], @params)
          @params.clear
          break
        end

        @params << token
      end
    end
  rescue EOFError
    # Running out of input in the middle of an instruction simply ends the
    # stream; this is deliberately not reported as an error.
  end
  ################################################################################
  # Placeholder invoked before every operator callback; currently does nothing.
  def resolve_resources
    # FIXME TODO
  end
  ################################################################################
  # calls the name callback method on the receiver class with params as the arguments
  # - name - the callback to invoke, as a symbol
  # - params - an array that is splatted into the callback's argument list
  #
  # Receivers that do not respond to the callback are silently skipped.
  def callback(name, params = [])
    @receiver.send(name, *params) if @receiver.respond_to?(name)
  end
  ################################################################################
end
287
+ ################################################################################
288
+ end
289
+ ################################################################################
@@ -0,0 +1,53 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+
25
+
26
+ class PDF::Reader
27
+ ################################################################################
28
+ # An internal PDF::Reader class that helps to verify various parts of the PDF file
29
+ # are valid
30
class Error
  ################################################################################
  # Raises MalformedPDFError unless lvalue equals rvalue.
  # - lvalue - the value found in the file
  # - rvalue - the value that was expected
  # - chars - when given, lvalue must be a String and only its first chars
  #   characters are compared against rvalue
  #
  # Returns nil when the assertion holds.
  def self.str_assert(lvalue, rvalue, chars = nil)
    lvalue = leading_chars(lvalue, chars)
    if lvalue != rvalue
      raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead"
    end
    nil
  end
  ################################################################################
  # Raises MalformedPDFError if lvalue equals rvalue; the inverse of str_assert.
  # - lvalue - the value found in the file
  # - rvalue - the value that must NOT be present
  # - chars - when given, lvalue must be a String and only its first chars
  #   characters are compared against rvalue
  #
  # Returns nil when the assertion holds.
  def self.str_assert_not(lvalue, rvalue, chars = nil)
    lvalue = leading_chars(lvalue, chars)
    if lvalue == rvalue
      # Worded differently from str_assert: here rvalue is the forbidden
      # value, so reporting it as "expected" would describe the opposite of
      # what actually went wrong.
      raise MalformedPDFError, "PDF malformed, did not expect to find '#{rvalue}'"
    end
    nil
  end
  ################################################################################
  # Raises MalformedPDFError unless lvalue equals rvalue. No type check or
  # truncation is applied. Returns nil when the assertion holds.
  def self.assert_equal(lvalue, rvalue)
    if lvalue != rvalue
      raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead"
    end
    nil
  end
  ################################################################################
  # Shared by str_assert and str_assert_not: returns lvalue untouched when no
  # length is requested, otherwise verifies it is a String and returns its
  # first chars characters.
  def self.leading_chars(lvalue, chars)
    return lvalue unless chars
    unless lvalue.kind_of?(String)
      raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead"
    end
    lvalue[0, chars]
  end
  private_class_method :leading_chars
  ################################################################################
end
49
+ ################################################################################
50
# Raised when part of a PDF file does not have the content that was expected.
class MalformedPDFError < RuntimeError
end

# NOTE(review): presumably raised when a file relies on a PDF feature that is
# not implemented; no raise site is visible here, confirm against callers.
class UnsupportedFeatureError < RuntimeError
end
52
+ end
53
+ ################################################################################
@@ -0,0 +1,116 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'pathname'
26
+
27
+
28
+ class PDF::Reader
29
+ ################################################################################
30
################################################################################
# Lets the object tree of a PDF file be browsed like a directory tree, using
# cd, pwd and ls.
class Explore
  ################################################################################
  # Parses the named file with this class as the receiver.
  # NOTE(review): presumably this returns the Explore instance (document
  # returns self) - confirm against PDF::Reader#parse.
  # NOTE(review): the File handle is never closed. It is left open on purpose
  # here because ls resolves references through the xref, which may still need
  # to read from it - confirm before changing this.
  def self.file(name)
    PDF::Reader.new.parse(File.open(name), self)
  end
  ################################################################################
  # - receiver - accepted for compatibility with the caller, not used
  # - xref - used by ls to resolve Reference objects
  def initialize(receiver, xref)
    @xref = xref
    @pwd = '/'
    @root = nil
  end
  ################################################################################
  # Remembers root as the top of the tree and returns self, so the explorer
  # itself is handed back to the caller.
  def document(root)
    @root = root
    self
  end
  ################################################################################
  # Prints one line per entry of a Hash or Array, or a single line for any
  # other object.
  #
  # As a side effect every Hash key that is a valid constant name is defined
  # as a constant on Explore whose value is the key itself (presumably so keys
  # can be typed without quotes in an interactive session).
  def output_parent(obj)
    case obj
    when Hash
      obj.each do |k, v|
        print "#{k}"; output_child(v); print "\n"
        define_key_constant(k)
      end
    when Array
      # output_child supplies the ": " separator itself
      obj.each_with_index { |o, i| print "#{i}"; output_child(o); print "\n" }
    else
      output_child(obj)
      print "\n"
    end
  end
  ################################################################################
  # Prints ": <class>" for obj, followed by the value itself for Floats and
  # by a preview of at most 20 characters for Strings.
  def output_child(obj)
    print ": #{obj.class}"

    case obj
    when Float
      print ": #{obj}"
    when String
      # flatten every newline so that each entry stays on a single line
      print ": #{obj[0, 20].tr("\n", ' ')}"
    end
  end
  ################################################################################
  # Changes the current location. A path starting with "/" is absolute, any
  # other path is relative to the current location. The result is normalised
  # ("." and ".." components are resolved) in both cases, and returned.
  def cd(path)
    path = path.to_s
    target = path[0, 1] == "/" ? path : File.join(@pwd, path)
    @pwd = Pathname.new(target).cleanpath.to_s
  end
  ################################################################################
  # Returns the current location.
  def pwd
    @pwd
  end
  ################################################################################
  # Prints the object at the current location, or at entry below it.
  #
  # Hashes are descended by key and Arrays by integer index; a Reference that
  # is reached along the way is resolved through the xref. Returns a
  # "<pwd>: <class>" summary string, or nil after printing a message when a
  # Hash along the path lacks the requested key.
  def ls(entry = nil)
    parts = @pwd.split('/')
    obj = @root

    # an absolute path yields an empty first component
    parts.shift if parts[0] == ""
    parts.push(entry) if entry

    parts.each do |part|
      case obj
      when Hash
        unless obj.has_key?(part)
          puts "invalid path at #{part}"
          return
        end
        obj = obj[part]

      when Array
        obj = obj[part.to_i]
      end

      obj = @xref.object(obj) if obj.kind_of?(Reference)
    end

    output_parent(obj)
    "#{@pwd}: #{obj.class}"
  end
  ################################################################################
  private
  ################################################################################
  # Defines key as a constant on Explore unless it is already defined.
  # const_defined? and const_set raise NameError for names that are not valid
  # constants (for example a key that starts with a lowercase letter), so such
  # keys are skipped.
  def define_key_constant(key)
    name = key.to_s
    return unless name =~ /\A[A-Z]\w*\z/
    Explore.const_set(name, key) unless Explore.const_defined?(name)
  end
  ################################################################################
end
114
+ ################################################################################
115
+ end
116
+ ################################################################################