fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,185 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+ class Reader
5
+
6
# High level representation of a single PDF page. Ties together the various
# low level classes in PDF::Reader and provides access to the various
# components of the page (text, images, fonts, etc) in convenient formats.
#
# If you require access to the raw PDF objects for this page, you can access
# the Page dictionary via the page_object accessor. You will need to use the
# objects accessor to help walk the page dictionary in any useful way.
#
class Page

  # keys of a Pages dictionary that select_inheritable copies down to the
  # child Page dictionaries.
  INHERITABLE_KEYS = [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].freeze

  # lowlevel hash-like access to all objects in the underlying PDF
  attr_reader :objects

  # the raw PDF object that defines this page
  attr_reader :page_object

  # creates a new page wrapper.
  #
  # * objects - an ObjectHash instance that wraps a PDF file
  # * pagenum - an int specifying the page number to expose. 1 indexed.
  #
  # Raises ArgumentError if pagenum is less than 1 or does not resolve to a
  # page dictionary.
  #
  def initialize(objects, pagenum)
    @objects, @pagenum = objects, pagenum

    # Page numbers are 1 indexed. Without this check page 0 (or a negative
    # number) would index page_references with a negative offset and
    # silently resolve to a page counted from the end of the document.
    raise ArgumentError, "invalid page: #{pagenum}" if pagenum < 1

    @page_object = objects.deref(objects.page_references[pagenum - 1])

    unless @page_object.is_a?(::Hash)
      raise ArgumentError, "invalid page: #{pagenum}"
    end
  end

  # return the number of this page within the full document
  #
  def number
    @pagenum
  end

  # return a friendly string representation of this page
  #
  def inspect
    "<PDF::Reader::Page page: #{@pagenum}>"
  end

  # Returns the attributes that accompany this page. Includes
  # attributes inherited from parents.
  #
  # Ancestors are merged starting with the most distant one, so a value
  # set closer to the page (or on the page itself) wins.
  #
  def attributes
    hash = {}
    page_with_ancestors.reverse.each do |obj|
      hash.merge!(@objects.deref(obj))
    end
    hash
  end

  # Returns the resources that accompany this page. Includes
  # resources inherited from parents. The result is memoised.
  #
  def resources
    @resources ||= @objects.deref(attributes[:Resources]) || {}
  end

  # Returns the XObjects that are available to this page.
  #
  # The XObject entry is dereferenced, in the same way the Font entry is
  # in fonts(), so callers get a dictionary they can index by label even
  # when the entry is stored as an indirect reference.
  #
  def xobjects
    objects.deref(resources[:XObject]) || {}
  end

  # return a hash of fonts used on this page.
  #
  # The keys are the font labels used within the page content stream.
  #
  # The values are PDF::Reader::Font instances that provide access
  # to most available metrics for each font.
  #
  def fonts
    raw_fonts = objects.deref(resources[:Font] || {})
    ::Hash[raw_fonts.map { |label, font|
      [label, PDF::Reader::Font.new(objects, objects.deref(font))]
    }]
  end

  # returns the plain text content of this page encoded as UTF-8. Any
  # characters that can't be translated will be returned as a ▯
  #
  def text
    receiver = PageTextReceiver.new
    walk(receiver)
    receiver.content
  end
  alias :to_s :text

  # processes the raw content stream for this page in sequential order and
  # passes callbacks to the receiver objects.
  #
  # This is mostly low level and you can probably ignore it unless you need
  # access to something like the raw encoded text. For an example of how
  # this can be used as a basis for higher level functionality, see the
  # text() method
  #
  # If someone was motivated enough, this method is intended to provide all
  # the data required to faithfully render the entire page. If you find
  # some required data isn't available it's a bug - let me know.
  #
  # Many operators that generate callbacks will reference resources stored
  # in the page header - think images, fonts, etc. To facilitate these
  # operators, the first available callback is page=. If your receiver
  # accepts that callback it will be passed the current
  # PDF::Reader::Page object. Use the Page#resources method to grab any
  # required resources.
  #
  def walk(*receivers)
    callback(receivers, :page=, [self])
    content_stream(receivers, raw_content)
  end

  # returns the raw content stream for this page. This is plumbing, nothing to
  # see here unless you're a PDF nerd like me.
  #
  # The Contents entry may be a single stream or an array of streams. The
  # pieces are joined with a space so that the last token of one stream can
  # never run into the first token of the next one.
  #
  def raw_content
    contents = objects.deref(@page_object[:Contents])
    [contents].flatten.compact.map { |obj|
      objects.deref(obj).unfiltered_data
    }.join(" ")
  end

  private

  # the document catalog, looked up via the trailer once and then cached
  #
  def root
    @root ||= objects.deref(@objects.trailer[:Root])
  end

  # tokenises instructions and fires one callback per operator.
  #
  # Tokens that are not operators are collected as parameters and handed to
  # the callback of the next operator that follows them.
  #
  # Raises MalformedPDFError if the stream ends unexpectedly.
  #
  def content_stream(receivers, instructions)
    buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
    parser = Parser.new(buffer, @objects)
    params = []

    while (token = parser.parse_token(PagesStrategy::OPERATORS))
      if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
        callback(receivers, PagesStrategy::OPERATORS[token], params)
        params.clear
      else
        params << token
      end
    end
  rescue EOFError
    raise MalformedPDFError, "End Of File while processing a content stream"
  end

  # calls the name callback method on each receiver with params as the
  # arguments. Receivers that don't respond to name are skipped.
  #
  def callback(receivers, name, params=[])
    receivers.each do |receiver|
      receiver.send(name, *params) if receiver.respond_to?(name)
    end
  end

  # Returns an array of dictionaries, nearest first.
  #
  # With no argument (or one that dereferences to nil) the list starts with
  # the full page dictionary followed by the inheritable entries of each
  # ancestor. With an argument the list starts at that object and contains
  # inheritable entries only.
  #
  # The chain is walked with a loop and stops at a missing parent or at a
  # dictionary that was already visited, so a page without a Parent entry
  # or a cyclic Parent chain can't recurse forever.
  #
  def page_with_ancestors(obj = nil)
    chain = []
    node  = objects.deref(obj)

    if node.nil?
      chain << @page_object
      node = objects.deref(@page_object[:Parent])
    end

    visited = []
    while node.is_a?(::Hash) && visited.none? { |seen| seen.equal?(node) }
      visited << node
      chain << select_inheritable(node)
      node = objects.deref(node[:Parent])
    end

    chain
  end

  # select the elements from a Pages dictionary that can be inherited by
  # child Page dictionaries.
  #
  def select_inheritable(obj)
    ::Hash[obj.select { |key, value|
      INHERITABLE_KEYS.include?(key)
    }]
  end

end
184
+ end
185
+ end
@@ -0,0 +1,278 @@
1
+ # coding: utf-8
2
+
3
+ require 'matrix'
4
+ require 'yaml'
5
+
6
+ module PDF
7
+ class Reader
8
# Receiver for Page#walk that collects the text shown on a page.
#
# It tracks the graphics state stack and the text matrices as the content
# stream callbacks arrive, and files each piece of shown text under the y
# co-ordinate it was drawn at. See the content method for how the collected
# text is assembled.
#
class PageTextReceiver

  # The graphics state every page starts with. Frozen because it is only
  # ever meant to be copied, never edited in place.
  DEFAULT_GRAPHICS_STATE = {
    :ctm            => Matrix.identity(3),
    :char_spacing   => 0,
    :word_spacing   => 0,
    :h_scaling      => 100,
    :text_leading   => 0,
    :text_font      => nil,
    :text_font_size => nil,
    :text_mode      => 0,
    :text_rise      => 0,
    :text_knockout  => 0
  }.freeze

  # starting a new page
  #
  # Resets all collected content and starts the state stack with a private
  # copy of DEFAULT_GRAPHICS_STATE, so that state changes made while
  # processing this page never alter the shared defaults.
  #
  def page=(page)
    @page = page
    @objects = page.objects
    @fonts = page.fonts
    @form_fonts = {}
    @content = ::Hash.new
    @stack = [DEFAULT_GRAPHICS_STATE.dup]
  end

  # returns the collected text as a single string. Text is grouped by the y
  # co-ordinate it was shown at, the groups are ordered from the largest y
  # to the smallest and joined with newlines.
  #
  def content
    keys = @content.keys.sort.reverse
    keys.map { |key|
      @content[key]
    }.join("\n")
  end

  #####################################################
  # Graphics State Operators
  #####################################################

  # push a copy of the current state onto the stack
  #
  def save_graphics_state
    @stack.push clone_state
  end

  # drop the current state and return to the one saved before it.
  #
  # The bottom state is never removed, so a content stream with more
  # restores than saves can't leave the receiver without a state.
  #
  def restore_graphics_state
    @stack.pop if @stack.size > 1
  end

  #####################################################
  # Matrix Operators
  #####################################################

  # update the current transformation matrix.
  #
  # If the CTM is currently undefined, just store the new values.
  #
  # If there's an existing CTM, then multiply the existing matrix
  # with the new matrix to form the updated matrix.
  #
  def concatenate_matrix(a, b, c, d, e, f)
    transform = Matrix[
      [a, b, 0],
      [c, d, 0],
      [e, f, 1]
    ]
    if state[:ctm]
      state[:ctm] = transform * state[:ctm]
    else
      state[:ctm] = transform
    end
  end

  #####################################################
  # Text Object Operators
  #####################################################

  # reset the text matrix and the text line matrix to the identity matrix
  #
  def begin_text_object
    @text_matrix      = Matrix.identity(3)
    @text_line_matrix = Matrix.identity(3)
  end

  # reset the text matrix and the text line matrix to the identity matrix
  #
  def end_text_object
    @text_matrix      = Matrix.identity(3)
    @text_line_matrix = Matrix.identity(3)
  end

  #####################################################
  # Text State Operators
  #####################################################

  def set_character_spacing(char_spacing)
    state[:char_spacing] = char_spacing
  end

  # h_scaling is stored as given, which is a percentage (the default is 100)
  #
  def set_horizontal_text_scaling(h_scaling)
    state[:h_scaling] = h_scaling
  end

  def set_text_font_and_size(label, size)
    state[:text_font]      = label
    state[:text_font_size] = size
  end

  def set_text_leading(leading)
    state[:text_leading] = leading
  end

  def set_text_rendering_mode(mode)
    state[:text_mode] = mode
  end

  def set_text_rise(rise)
    state[:text_rise] = rise
  end

  def set_word_spacing(word_spacing)
    state[:word_spacing] = word_spacing
  end

  #####################################################
  # Text Positioning Operators
  #####################################################

  # offset the start of the current line by (x, y) and make the result both
  # the text matrix and the text line matrix
  #
  def move_text_position(x, y) # Td
    temp_matrix = Matrix[
      [1, 0, 0],
      [0, 1, 0],
      [x, y, 1]
    ]
    @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
  end

  def move_text_position_and_set_leading(x, y) # TD
    set_text_leading(-1 * y)
    move_text_position(x, y)
  end

  def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
    @text_matrix = @text_line_matrix = Matrix[
      [a, b, 0],
      [c, d, 0],
      [e, f, 1]
    ]
  end

  # The leading is subtracted: the next line sits below the current one,
  # and TD stores the leading as the negated y offset, so the two
  # operators agree with each other.
  #
  def move_to_start_of_next_line # T*
    move_text_position(0, -state[:text_leading])
  end

  #####################################################
  # Text Showing Operators
  #####################################################

  # record text that is drawn on the page
  def show_text(string) # Tj
    at = transform(Point.new(0,0))
    @content[at.y] ||= ""
    @content[at.y] << current_font.to_utf8(string)
  end

  # params is an array that mixes strings and numbers. Strings are shown
  # as text. A number greater than 1000 is recorded as a single space, all
  # other numbers are ignored.
  #
  def show_text_with_positioning(params) # TJ
    params.each { |arg|
      case arg
      when String
        show_text(arg)
      when Integer, Float
        show_text(" ") if arg > 1000
      end
    }
  end

  def move_to_next_line_and_show_text(str) # '
    move_to_start_of_next_line
    show_text(str)
  end

  def set_spacing_next_line_show_text(aw, ac, string) # "
    set_word_spacing(aw)
    set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  #####################################################
  # XObjects
  #####################################################

  # Looks up the XObject stored under label on the current page. If it is a
  # Form XObject its content is walked with this receiver, using the fonts
  # of the form while doing so.
  #
  # The graphics state and the form fonts that were active before the call
  # are put back afterwards, so text that follows a nested form is still
  # decoded with the fonts of the enclosing form.
  #
  def invoke_xobject(label)
    save_graphics_state
    outer_form_fonts = @form_fonts
    xobject = @objects.deref(@page.xobjects[label])

    matrix = xobject.hash[:Matrix]
    concatenate_matrix(*matrix) if matrix

    if xobject.hash[:Subtype] == :Form
      form = PDF::Reader::FormXObject.new(@page, xobject)
      @form_fonts = form.fonts
      form.walk(self)
    end
    @form_fonts = outer_form_fonts

    restore_graphics_state
  end

  private

  # transform x and y co-ordinates from the current text space to the
  # underlying device space.
  #
  def transform(point, z = 1)
    trm = text_rendering_matrix
    Point.new(
      (trm[0,0] * point.x) + (trm[1,0] * point.y) + (trm[2,0] * z),
      (trm[0,1] * point.x) + (trm[1,1] * point.y) + (trm[2,1] * z)
    )
  end

  # combines the font size, horizontal scaling and text rise of the current
  # state with the text matrix and the CTM.
  #
  # :h_scaling is stored as a percentage, so it is divided by 100 to get
  # the factor that scales the x axis.
  #
  def text_rendering_matrix
    h_scale = state[:h_scaling] / 100.0
    state_matrix = Matrix[
      [state[:text_font_size] * h_scale, 0, 0],
      [0, state[:text_font_size], 0],
      [0, state[:text_rise], 1]
    ]

    state_matrix * @text_matrix * ctm
  end

  # the graphics state currently in effect (the top of the stack)
  #
  def state
    @stack.last
  end

  # when save_graphics_state is called, we need to push a new copy of the
  # current state onto the stack. That way any modifications to the state
  # will be undone once restore_graphics_state is called.
  #
  # This returns a deep clone of the current state, ensuring changes are
  # kept separate from earlier states.
  #
  # The state is round-tripped through Marshal to perform the deep clone.
  # Marshal copies plain Ruby objects such as the Matrix stored under :ctm
  # without needing to know about their classes in advance.
  #
  def clone_state
    if @stack.empty?
      {}
    else
      Marshal.load(Marshal.dump(@stack.last))
    end
  end

  # return the current transformation matrix
  #
  def ctm
    state[:ctm]
  end

  # the font selected by the current state. Fonts of the form that is
  # being walked take precedence over the fonts of the page.
  #
  def current_font
    @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
  end

  # private class for representing points on a cartesian plane. Used
  # to simplify the maths when working out where text is placed.
  #
  class Point
    attr_reader :x, :y

    def initialize(x,y)
      @x, @y = x,y
    end

    # straight line distance between this point and another one
    #
    def distance(point)
      Math.hypot(point.x - x, point.y - y)
    end
  end
end
277
+ end
278
+ end