fireinc-pdf-reader 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,185 @@
1
+ # coding: utf-8
2
+
3
module PDF
  class Reader

    # High level representation of a single PDF page. Ties together the various
    # low level classes in PDF::Reader and provides access to the various
    # components of the page (text, XObjects, fonts, etc) in convenient formats.
    #
    # If you require access to the raw PDF objects for this page, you can access
    # the Page dictionary via the page_object accessor. You will need to use the
    # objects accessor to help walk the page dictionary in any useful way.
    #
    class Page

      # lowlevel hash-like access to all objects in the underlying PDF
      attr_reader :objects

      # the raw PDF object that defines this page
      attr_reader :page_object

      # creates a new page wrapper.
      #
      # * objects - an object that responds to deref, page_references and
      #             trailer (an ObjectHash instance that wraps a PDF file)
      # * pagenum - an int specifying the page number to expose. 1 indexed.
      #
      # Raises ArgumentError when pagenum is not a number >= 1 or when it does
      # not resolve to a page dictionary.
      #
      def initialize(objects, pagenum)
        @objects, @pagenum = objects, pagenum

        # Validate before indexing: 0 or a negative page number would turn into
        # a negative array index and silently select a page counted from the
        # end of the document.
        unless pagenum.is_a?(::Numeric) && pagenum >= 1
          raise ArgumentError, "invalid page: #{pagenum}"
        end

        @page_object = objects.deref(objects.page_references[pagenum - 1])

        unless @page_object.is_a?(::Hash)
          raise ArgumentError, "invalid page: #{pagenum}"
        end
      end

      # return the number of this page within the full document
      #
      def number
        @pagenum
      end

      # return a friendly string representation of this page
      #
      def inspect
        "<PDF::Reader::Page page: #{@pagenum}>"
      end

      # Returns the attributes that accompany this page. Includes
      # attributes inherited from parents.
      #
      # Dictionaries are merged starting with the most distant ancestor and
      # finishing with the page itself, so values closer to the page win.
      #
      def attributes
        merged = {}
        page_with_ancestors.reverse_each do |dict|
          merged.merge!(@objects.deref(dict))
        end
        merged
      end

      # Returns the resources that accompany this page. Includes
      # resources inherited from parents. The result is memoized.
      #
      def resources
        @resources ||= @objects.deref(attributes[:Resources]) || {}
      end

      # Returns the XObjects that are available to this page. Returns an empty
      # hash when the page has none.
      #
      def xobjects
        # the XObject entry may itself be an indirect reference, so resolve it
        # before handing it to callers that index into it by label
        objects.deref(resources[:XObject]) || {}
      end

      # return a hash of fonts used on this page.
      #
      # The keys are the font labels used within the page content stream.
      #
      # The values are PDF::Reader::Font instances, each built from the
      # dereferenced font object.
      #
      def fonts
        raw_fonts = objects.deref(resources[:Font] || {})
        ::Hash[raw_fonts.map { |label, font|
          [label, PDF::Reader::Font.new(objects, objects.deref(font))]
        }]
      end

      # returns the text content of this page, as collected by a
      # PageTextReceiver that is walked over the page content stream.
      #
      def text
        receiver = PageTextReceiver.new
        walk(receiver)
        receiver.content
      end
      alias :to_s :text

      # processes the raw content stream for this page in sequential order and
      # passes callbacks to the receiver objects.
      #
      # This is mostly low level and you can probably ignore it unless you need
      # access to something like the raw encoded text. For an example of how
      # this can be used as a basis for higher level functionality, see the
      # text() method
      #
      # Many operators that generate callbacks will reference resources stored
      # in the page header - think images, fonts, etc. To facilitate these
      # operators, the first available callback is page=. If your receiver
      # accepts that callback it will be passed the current
      # PDF::Reader::Page object. Use the Page#resources method to grab any
      # required resources.
      #
      def walk(*receivers)
        callback(receivers, :page=, [self])
        content_stream(receivers, raw_content)
      end

      # returns the raw content stream for this page. The Contents entry may be
      # a single stream or an array of streams; the unfiltered data of each is
      # concatenated in order. Returns an empty string when there is no content.
      #
      def raw_content
        contents = objects.deref(@page_object[:Contents])
        [contents].flatten.compact.map { |obj|
          objects.deref(obj).unfiltered_data
        }.join
      end

      private

      # the dereferenced Root entry of the trailer, memoized.
      #
      def root
        @root ||= objects.deref(@objects.trailer[:Root])
      end

      # tokenises a content stream and fires a callback on the receivers for
      # each recognised operator. Tokens that are not operators are collected
      # and passed as the arguments of the next operator.
      #
      # Raises MalformedPDFError if the stream ends unexpectedly.
      #
      def content_stream(receivers, instructions)
        buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
        parser = Parser.new(buffer, @objects)
        params = []

        while (token = parser.parse_token(PagesStrategy::OPERATORS))
          if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
            callback(receivers, PagesStrategy::OPERATORS[token], params)
            params.clear
          else
            params << token
          end
        end
      rescue EOFError
        raise MalformedPDFError, "End Of File while processing a content stream"
      end

      # calls the name callback method on each receiver that responds to it,
      # with params as the arguments
      #
      def callback(receivers, name, params=[])
        receivers.each do |receiver|
          receiver.send(name, *params) if receiver.respond_to?(name)
        end
      end

      # returns an array of dictionaries, nearest first.
      #
      # With no argument (or one that dereferences to nil) the array starts
      # with the full page dictionary, followed by the inheritable entries of
      # each ancestor. With an ancestor as the argument, the array holds the
      # inheritable entries of that ancestor and everything above it.
      #
      def page_with_ancestors(obj = nil)
        start = objects.deref(obj)
        if start.nil?
          [@page_object] + inheritable_ancestors(@page_object[:Parent])
        else
          inheritable_ancestors(start)
        end
      end

      # walks up the Parent chain from origin, collecting the inheritable
      # entries of each node. Stops when a node has no Parent or the Parent
      # cannot be resolved, so a page without a Parent entry yields an empty
      # list instead of recursing forever.
      #
      def inheritable_ancestors(origin)
        node = objects.deref(origin)
        return [] if node.nil?

        [select_inheritable(node)] + inheritable_ancestors(node[:Parent])
      end

      # select the elements from a Pages dictionary that can be inherited by
      # child Page dictionaries.
      #
      def select_inheritable(obj)
        ::Hash[obj.select { |key, value|
          [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
        }]
      end

    end
  end
end
@@ -0,0 +1,278 @@
1
+ # coding: utf-8
2
+
3
+ require 'matrix'
4
+ require 'yaml'
5
+
6
module PDF
  class Reader

    # A receiver for PDF::Reader::Page#walk that tracks the graphics and text
    # state of a content stream and records the text that is shown, grouped by
    # the vertical position at which it was drawn.
    #
    class PageTextReceiver

      # the state every page starts with. Frozen so that it can never be
      # modified through the state stack; each page works on its own copy.
      DEFAULT_GRAPHICS_STATE = {
        :ctm            => Matrix.identity(3),
        :char_spacing   => 0,
        :word_spacing   => 0,
        :h_scaling      => 100,
        :text_leading   => 0,
        :text_font      => nil,
        :text_font_size => nil,
        :text_mode      => 0,
        :text_rise      => 0,
        :text_knockout  => 0
      }.freeze

      # starting a new page
      def page=(page)
        @page       = page
        @objects    = page.objects
        @fonts      = page.fonts
        @form_fonts = {}
        @content    = ::Hash.new
        # work on a copy: the setters below write into the top of the stack and
        # must not leak into the shared default or into other pages
        @stack      = [DEFAULT_GRAPHICS_STATE.dup]
        # defined up front so positioning operators that appear before the
        # first begin_text_object do not hit nil
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      # returns the recorded text. Runs that share a y co-ordinate are joined
      # into one line, and lines are emitted from the highest y to the lowest.
      #
      def content
        keys = @content.keys.sort.reverse
        keys.map { |key|
          @content[key]
        }.join("\n")
      end

      #####################################################
      # Graphics State Operators
      #####################################################

      def save_graphics_state
        @stack.push clone_state
      end

      # discards the current state. The base state is never popped, so an
      # unbalanced restore in a content stream leaves a usable state behind.
      #
      def restore_graphics_state
        @stack.pop if @stack.size > 1
      end

      #####################################################
      # Matrix Operators
      #####################################################

      # update the current transformation matrix.
      #
      # If the CTM is currently undefined, just store the new values.
      #
      # If there's an existing CTM, then multiply the existing matrix
      # with the new matrix to form the updated matrix.
      #
      def concatenate_matrix(a, b, c, d, e, f)
        transform = Matrix[
          [a, b, 0],
          [c, d, 0],
          [e, f, 1]
        ]
        if state[:ctm]
          state[:ctm] = transform * state[:ctm]
        else
          state[:ctm] = transform
        end
      end

      #####################################################
      # Text Object Operators
      #####################################################

      def begin_text_object
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      def end_text_object
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      #####################################################
      # Text State Operators
      #####################################################

      def set_character_spacing(char_spacing)
        state[:char_spacing] = char_spacing
      end

      # h_scaling is stored as given by the operator, i.e. as a percentage
      def set_horizontal_text_scaling(h_scaling)
        state[:h_scaling] = h_scaling
      end

      def set_text_font_and_size(label, size)
        state[:text_font]      = label
        state[:text_font_size] = size
      end

      def set_text_leading(leading)
        state[:text_leading] = leading
      end

      def set_text_rendering_mode(mode)
        state[:text_mode] = mode
      end

      def set_text_rise(rise)
        state[:text_rise] = rise
      end

      def set_word_spacing(word_spacing)
        state[:word_spacing] = word_spacing
      end

      #####################################################
      # Text Positioning Operators
      #####################################################

      def move_text_position(x, y) # Td
        temp_matrix = Matrix[
          [1, 0, 0],
          [0, 1, 0],
          [x, y, 1]
        ]
        @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
      end

      def move_text_position_and_set_leading(x, y) # TD
        set_text_leading(-1 * y)
        move_text_position(x, y)
      end

      def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
        @text_matrix = @text_line_matrix = Matrix[
          [a, b, 0],
          [c, d, 0],
          [e, f, 1]
        ]
      end

      # T* is defined as "0 -leading Td": the next line sits below the current
      # one, so the leading is subtracted. This also matches TD above, which
      # stores the negated y offset as the leading.
      #
      def move_to_start_of_next_line # T*
        move_text_position(0, -state[:text_leading])
      end

      #####################################################
      # Text Showing Operators
      #####################################################

      # record text that is drawn on the page, appended to whatever has
      # already been recorded at the same y co-ordinate
      def show_text(string) # Tj
        at = transform(Point.new(0,0))
        @content[at.y] ||= ""
        @content[at.y] << current_font.to_utf8(string)
      end

      # params is an array that mixes strings to show with numeric position
      # adjustments. Strings are recorded; a large adjustment is recorded as
      # a single space.
      #
      # NOTE(review): the threshold only looks at adjustments > 1000. In the
      # PDF spec a positive TJ adjustment moves the next glyph backwards, so
      # a word gap would normally be a large negative number - confirm the
      # intended sign before relying on this heuristic.
      #
      def show_text_with_positioning(params) # TJ
        params.each { |arg|
          case arg
          when String
            show_text(arg)
          when Numeric # Fixnum no longer exists on current rubies
            show_text(" ") if arg > 1000
          end
        }
      end

      def move_to_next_line_and_show_text(str) # '
        move_to_start_of_next_line
        show_text(str)
      end

      def set_spacing_next_line_show_text(aw, ac, string) # "
        set_word_spacing(aw)
        set_character_spacing(ac)
        move_to_next_line_and_show_text(string)
      end

      #####################################################
      # XObjects
      #####################################################

      # applies the matrix of the named XObject (if any) and, when it is a
      # Form, walks the form's own content stream with this receiver. The
      # graphics state and the active form fonts are put back afterwards.
      #
      def invoke_xobject(label)
        save_graphics_state
        # remember the fonts of an enclosing form so they are still available
        # once a nested form has finished
        outer_form_fonts = @form_fonts

        # the XObject dictionary may be handed over as an indirect reference
        xobjects = @objects.deref(@page.xobjects) || {}
        xobject  = @objects.deref(xobjects[label])

        matrix = xobject.hash[:Matrix]
        concatenate_matrix(*matrix) if matrix

        if xobject.hash[:Subtype] == :Form
          form = PDF::Reader::FormXObject.new(@page, xobject)
          @form_fonts = form.fonts
          form.walk(self)
        end
        @form_fonts = outer_form_fonts

        restore_graphics_state
      end

      private

      # transform x and y co-ordinates from the current text space to the
      # underlying device space.
      #
      def transform(point, z = 1)
        trm = text_rendering_matrix
        Point.new(
          (trm[0,0] * point.x) + (trm[1,0] * point.y) + (trm[2,0] * z),
          (trm[0,1] * point.x) + (trm[1,1] * point.y) + (trm[2,1] * z)
        )
      end

      # combines the text state (font size, horizontal scaling, rise) with
      # the text matrix and the CTM.
      #
      def text_rendering_matrix
        # h_scaling is a percentage, so 100 means "unscaled"
        h_scale = state[:h_scaling] / 100.0
        state_matrix = Matrix[
          [state[:text_font_size] * h_scale, 0, 0],
          [0, state[:text_font_size], 0],
          [0, state[:text_rise], 1]
        ]

        state_matrix * @text_matrix * ctm
      end

      # the graphics state currently in effect (top of the stack)
      def state
        @stack.last
      end

      # when save_graphics_state is called, we need to push a new copy of the
      # current state onto the stack. That way any modifications to the state
      # will be undone once restore_graphics_state is called.
      #
      # This returns a deep clone of the current state, ensuring changes are
      # kept separate from earlier states.
      #
      # The clone is made with a Marshal round-trip. The state only holds
      # values created by this class and the content stream parser, never
      # serialized input from outside, and unlike a YAML round-trip this does
      # not depend on the YAML loader accepting Symbol and Matrix objects.
      #
      def clone_state
        if @stack.empty?
          {}
        else
          Marshal.load(Marshal.dump(@stack.last))
        end
      end

      # return the current transformation matrix
      #
      def ctm
        state[:ctm]
      end

      # the font selected by the current state. Fonts of the form that is
      # being walked take precedence over the fonts of the page.
      #
      def current_font
        @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
      end

      # helper class for representing points on a cartesian plane. Used by
      # transform to carry an x/y pair.
      #
      class Point
        attr_reader :x, :y

        def initialize(x,y)
          @x, @y = x,y
        end

        # straight-line distance between this point and another
        def distance(point)
          Math.hypot(point.x - x, point.y - y)
        end
      end
    end
  end
end