pdf-reader 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. data/CHANGELOG +7 -1
  2. data/README.rdoc +1 -0
  3. data/Rakefile +23 -8
  4. data/lib/pdf-reader.rb +3 -1
  5. data/lib/pdf/hash.rb +5 -1
  6. data/lib/pdf/reader.rb +8 -1
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  10. data/lib/pdf/reader/afm/Courier.afm +342 -0
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  15. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  16. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  17. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  18. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  19. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  20. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  21. data/lib/pdf/reader/buffer.rb +14 -6
  22. data/lib/pdf/reader/cid_widths.rb +61 -0
  23. data/lib/pdf/reader/cmap.rb +8 -2
  24. data/lib/pdf/reader/encoding.rb +52 -27
  25. data/lib/pdf/reader/error.rb +16 -1
  26. data/lib/pdf/reader/filter.rb +2 -0
  27. data/lib/pdf/reader/filter/ascii85.rb +3 -1
  28. data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
  29. data/lib/pdf/reader/filter/depredict.rb +2 -0
  30. data/lib/pdf/reader/filter/flate.rb +3 -1
  31. data/lib/pdf/reader/filter/lzw.rb +1 -0
  32. data/lib/pdf/reader/filter/null.rb +1 -0
  33. data/lib/pdf/reader/filter/run_length.rb +2 -1
  34. data/lib/pdf/reader/font.rb +74 -18
  35. data/lib/pdf/reader/font_descriptor.rb +80 -0
  36. data/lib/pdf/reader/glyph_hash.rb +6 -0
  37. data/lib/pdf/reader/lzw.rb +1 -0
  38. data/lib/pdf/reader/object_cache.rb +1 -1
  39. data/lib/pdf/reader/object_hash.rb +1 -1
  40. data/lib/pdf/reader/page_layout.rb +125 -0
  41. data/lib/pdf/reader/page_state.rb +172 -69
  42. data/lib/pdf/reader/page_text_receiver.rb +50 -21
  43. data/lib/pdf/reader/pages_strategy.rb +17 -4
  44. data/lib/pdf/reader/parser.rb +25 -52
  45. data/lib/pdf/reader/print_receiver.rb +5 -0
  46. data/lib/pdf/reader/reference.rb +2 -0
  47. data/lib/pdf/reader/register_receiver.rb +1 -1
  48. data/lib/pdf/reader/standard_security_handler.rb +2 -0
  49. data/lib/pdf/reader/stream.rb +2 -0
  50. data/lib/pdf/reader/synchronized_cache.rb +32 -0
  51. data/lib/pdf/reader/text_receiver.rb +5 -4
  52. data/lib/pdf/reader/text_run.rb +80 -0
  53. data/lib/pdf/reader/token.rb +2 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +194 -0
  55. data/lib/pdf/reader/width_calculator.rb +11 -0
  56. data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
  57. data/lib/pdf/reader/width_calculator/composite.rb +27 -0
  58. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  59. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
  60. data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
  61. data/lib/pdf/reader/xref.rb +9 -2
  62. metadata +119 -13
@@ -1,22 +1,23 @@
1
1
  # coding: utf-8
2
2
 
3
- require 'matrix'
3
+ require 'pdf/reader/transformation_matrix'
4
4
 
5
- module PDF
6
- class Reader
5
+ class PDF::Reader
6
+ # encapsulates logic for tracking graphics state as the instructions for
7
+ # a single page are processed. Most of the public methods correspond
8
+ # directly to PDF operators.
7
9
  class PageState
8
10
 
9
11
  DEFAULT_GRAPHICS_STATE = {
10
- :ctm => Matrix.identity(3),
11
- :char_spacing => 0,
12
- :word_spacing => 0,
13
- :h_scaling => 100,
14
- :text_leading => 0,
15
- :text_font => nil,
12
+ :char_spacing => 0,
13
+ :word_spacing => 0,
14
+ :h_scaling => 1.0,
15
+ :text_leading => 0,
16
+ :text_font => nil,
16
17
  :text_font_size => nil,
17
- :text_mode => 0,
18
- :text_rise => 0,
19
- :text_knockout => 0
18
+ :text_mode => 0,
19
+ :text_rise => 0,
20
+ :text_knockout => 0
20
21
  }
21
22
 
22
23
  # starting a new page
@@ -28,16 +29,23 @@ module PDF
28
29
  @xobject_stack = [page.xobjects]
29
30
  @cs_stack = [page.color_spaces]
30
31
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
32
+ state[:ctm] = identity_matrix
31
33
  end
32
34
 
33
35
  #####################################################
34
36
  # Graphics State Operators
35
37
  #####################################################
36
38
 
39
+ # Clones the current graphics state and pushes it onto the top of the stack.
40
+ # Any changes that are subsequently made to the state can then be reversed
41
+ # by calling restore_graphics_state.
42
+ #
37
43
  def save_graphics_state
38
44
  @stack.push clone_state
39
45
  end
40
46
 
47
+ # Restore the state to the previous value on the stack.
48
+ #
41
49
  def restore_graphics_state
42
50
  @stack.pop
43
51
  end
@@ -54,16 +62,12 @@ module PDF
54
62
  # with the new matrix to form the updated matrix.
55
63
  #
56
64
  def concatenate_matrix(a, b, c, d, e, f)
57
- transform = Matrix[
58
- [a, b, 0],
59
- [c, d, 0],
60
- [e, f, 1]
61
- ]
62
65
  if state[:ctm]
63
- state[:ctm] = transform * state[:ctm]
66
+ state[:ctm].multiply!(a,b,c,d,e,f)
64
67
  else
65
- state[:ctm] = transform
68
+ state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f)
66
69
  end
70
+ @text_rendering_matrix = nil # invalidate cached value
67
71
  end
68
72
 
69
73
  #####################################################
@@ -71,13 +75,13 @@ module PDF
71
75
  #####################################################
72
76
 
73
77
  def begin_text_object
74
- @text_matrix = Matrix.identity(3)
75
- @text_line_matrix = Matrix.identity(3)
78
+ @text_matrix = identity_matrix
79
+ @text_line_matrix = identity_matrix
80
+ @font_size = nil
76
81
  end
77
82
 
78
83
  def end_text_object
79
- @text_matrix = Matrix.identity(3)
80
- @text_line_matrix = Matrix.identity(3)
84
+ # don't need to do anything
81
85
  end
82
86
 
83
87
  #####################################################
@@ -89,7 +93,7 @@ module PDF
89
93
  end
90
94
 
91
95
  def set_horizontal_text_scaling(h_scaling)
92
- state[:h_scaling] = h_scaling
96
+ state[:h_scaling] = h_scaling / 100.0
93
97
  end
94
98
 
95
99
  def set_text_font_and_size(label, size)
@@ -98,7 +102,7 @@ module PDF
98
102
  end
99
103
 
100
104
  def font_size
101
- state[:text_font_size] * @text_matrix[0,0]
105
+ @font_size ||= state[:text_font_size] * @text_matrix.a * ctm.a
102
106
  end
103
107
 
104
108
  def set_text_leading(leading)
@@ -122,12 +126,16 @@ module PDF
122
126
  #####################################################
123
127
 
124
128
  def move_text_position(x, y) # Td
125
- temp_matrix = Matrix[
126
- [1, 0, 0],
127
- [0, 1, 0],
128
- [x, y, 1]
129
- ]
130
- @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
129
+ temp = TransformationMatrix.new(1, 0,
130
+ 0, 1,
131
+ x, y)
132
+ @text_line_matrix = temp.multiply!(
133
+ @text_line_matrix.a, @text_line_matrix.b,
134
+ @text_line_matrix.c, @text_line_matrix.d,
135
+ @text_line_matrix.e, @text_line_matrix.f
136
+ )
137
+ @text_matrix = @text_line_matrix.dup
138
+ @font_size = @text_rendering_matrix = nil # invalidate cached value
131
139
  end
132
140
 
133
141
  def move_text_position_and_set_leading(x, y) # TD
@@ -136,11 +144,13 @@ module PDF
136
144
  end
137
145
 
138
146
  def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
139
- @text_matrix = @text_line_matrix = Matrix[
140
- [a, b, 0],
141
- [c, d, 0],
142
- [e, f, 1]
143
- ]
147
+ @text_matrix = TransformationMatrix.new(
148
+ a, b,
149
+ c, d,
150
+ e, f
151
+ )
152
+ @text_line_matrix = @text_matrix.dup
153
+ @font_size = @text_rendering_matrix = nil # invalidate cached value
144
154
  end
145
155
 
146
156
  def move_to_start_of_next_line # T*
@@ -197,22 +207,29 @@ module PDF
197
207
  # transform x and y co-ordinates from the current user space to the
198
208
  # underlying device space.
199
209
  #
200
- def ctm_transform(x, y, z = 1)
210
+ def ctm_transform(x, y)
201
211
  [
202
- (ctm[0,0] * x) + (ctm[1,0] * y) + (ctm[2,0] * z),
203
- (ctm[0,1] * x) + (ctm[1,1] * y) + (ctm[2,1] * z)
212
+ (ctm.a * x) + (ctm.c * y) + (ctm.e),
213
+ (ctm.b * x) + (ctm.d * y) + (ctm.f)
204
214
  ]
205
215
  end
206
216
 
207
217
  # transform x and y co-ordinates from the current text space to the
208
218
  # underlying device space.
209
219
  #
210
- def trm_transform(x, y, z = 1)
220
+ # transforming (0,0) is a really common case, so optimise for it to
221
+ # avoid unnecessary object allocations
222
+ #
223
+ def trm_transform(x, y)
211
224
  trm = text_rendering_matrix
212
- [
213
- (trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
214
- (trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
215
- ]
225
+ if x == 0 && y == 0
226
+ [trm.e, trm.f]
227
+ else
228
+ [
229
+ (trm.a * x) + (trm.c * y) + (trm.e),
230
+ (trm.b * x) + (trm.d * y) + (trm.f)
231
+ ]
232
+ end
216
233
  end
217
234
 
218
235
  def current_font
@@ -240,16 +257,105 @@ module PDF
240
257
  dict ? dict[label] : nil
241
258
  end
242
259
 
260
+ # when save_graphics_state is called, we need to push a new copy of the
261
+ # current state onto the stack. That way any modifications to the state
262
+ # will be undone once restore_graphics_state is called.
263
+ #
264
+ def stack_depth
265
+ @stack.size
266
+ end
267
+
268
+ # This returns a deep clone of the current state, ensuring changes are
269
+ # kept separate from earlier states.
270
+ #
271
+ # Marshal is used to round-trip the state through a string to easily
272
+ # perform the deep clone. Kinda hacky, but effective.
273
+ #
274
+ def clone_state
275
+ if @stack.empty?
276
+ {}
277
+ else
278
+ Marshal.load Marshal.dump(@stack.last)
279
+ end
280
+ end
281
+
282
+ # after each glyph is painted onto the page the text matrix must be
283
+ # modified. There's no defined operator for this, but depending on
284
+ # the use case some receivers may need to mutate the state with this
285
+ # while walking a page.
286
+ #
287
+ # NOTE: some of the variable names in this method are obscure because
288
+ # they mirror variable names from the PDF spec
289
+ #
290
+ # NOTE: see Section 9.4.4, PDF 32000-1:2008, pp 252
291
+ #
292
+ # Arguments:
293
+ #
294
+ # w0 - the glyph width in *text space*. This generally means the width
295
+ # in glyph space should be divided by 1000 before being passed to
296
+ # this function
297
+ # tj - any kerning that should be applied to the text matrix before the
298
+ # following glyph is painted. This is usually the numeric arguments
299
+ # in the array passed to a TJ operator
300
+ # word_boundary - a boolean indicating if a word boundary was just
301
+ # reached. Depending on the current state extra space
302
+ # may need to be added
303
+ #
304
+ def process_glyph_displacement(w0, tj, word_boundary)
305
+ fs = font_size # font size
306
+ tc = state[:char_spacing]
307
+ if word_boundary
308
+ tw = state[:word_spacing]
309
+ else
310
+ tw = 0
311
+ end
312
+ th = state[:h_scaling]
313
+ # optimise the common path to reduce Float allocations
314
+ if th == 1 && tj == 0 && tc == 0 && tw == 0
315
+ glyph_width = w0 * fs
316
+ tx = glyph_width
317
+ else
318
+ glyph_width = ((w0 - (tj/1000.0)) * fs) * th
319
+ tx = glyph_width + ((tc + tw) * th)
320
+ end
321
+ ty = 0
322
+
323
+ # TODO: I'm pretty sure that tx shouldn't need to be divided by
324
+ # ctm[0] here, but this gets my tests green and I'm out of
325
+ # ideas for now
326
+ # TODO: support ty > 0
327
+ if ctm.a == 1
328
+ @text_matrix.horizontal_displacement_multiply!(tx)
329
+ else
330
+ @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
331
+ end
332
+ @font_size = @text_rendering_matrix = nil # invalidate cached value
333
+ end
334
+
243
335
  private
244
336
 
337
+ # used for many and varied text positioning calculations. We potentially
338
+ # need to access the results of this method many times when working with
339
+ # text, so memoize it
340
+ #
245
341
  def text_rendering_matrix
246
- state_matrix = Matrix[
247
- [font_size * state[:h_scaling], 0, 0],
248
- [0, font_size, 0],
249
- [0, state[:text_rise], 1]
250
- ]
251
-
252
- state_matrix * @text_matrix * ctm
342
+ @text_rendering_matrix ||= begin
343
+ state_matrix = TransformationMatrix.new(
344
+ font_size * state[:h_scaling], 0,
345
+ 0, font_size,
346
+ 0, state[:text_rise]
347
+ )
348
+ state_matrix.multiply!(
349
+ @text_matrix.a, @text_matrix.b,
350
+ @text_matrix.c, @text_matrix.d,
351
+ @text_matrix.e, @text_matrix.f
352
+ )
353
+ state_matrix.multiply!(
354
+ ctm.a, ctm.b,
355
+ ctm.c, ctm.d,
356
+ ctm.e, ctm.f
357
+ )
358
+ end
253
359
  end
254
360
 
255
361
  # return the current transformation matrix
@@ -272,25 +378,22 @@ module PDF
272
378
  ::Hash[wrapped_fonts]
273
379
  end
274
380
 
275
- # when save_graphics_state is called, we need to push a new copy of the
276
- # current state onto the stack. That way any modifications to the state
277
- # will be undone once restore_graphics_state is called.
278
- #
279
- # This returns a deep clone of the current state, ensuring changes are
280
- # keep separate from earlier states.
281
- #
282
- # Marshal is used to round-trip the state through a string to easily
283
- # perform the deep clone. Kinda hacky, but effective.
284
- #
285
- def clone_state
286
- if @stack.empty?
287
- {}
288
- else
289
- Marshal.load Marshal.dump(@stack.last)
290
- end
381
+ #####################################################
382
+ # Low-level Matrix Operations
383
+ #####################################################
384
+
385
+ # This class uses 3x3 matrices to represent geometric transformations
386
+ # These matrices are represented by arrays with 9 elements
387
+ # The array [a,b,c,d,e,f,g,h,i] would represent a matrix like:
388
+ # a b c
389
+ # d e f
390
+ # g h i
391
+
392
+ def identity_matrix
393
+ TransformationMatrix.new(1, 0,
394
+ 0, 1,
395
+ 0, 0)
291
396
  end
292
397
 
293
398
  end
294
- end
295
399
  end
296
-
@@ -1,13 +1,22 @@
1
1
  # coding: utf-8
2
2
 
3
- require 'matrix'
4
3
  require 'forwardable'
4
+ require 'pdf/reader/page_layout'
5
5
 
6
6
  module PDF
7
7
  class Reader
8
+
9
+ # Builds a UTF-8 string of all the text on a single page by processing all
10
+ # the operators in a content stream.
11
+ #
8
12
  class PageTextReceiver
9
13
  extend Forwardable
10
14
 
15
+ SPACE = " "
16
+
17
+ attr_reader :state, :content, :options
18
+
19
+ ########## BEGIN FORWARDERS ##########
11
20
  # Graphics State Operators
12
21
  def_delegators :@state, :save_graphics_state, :restore_graphics_state
13
22
 
@@ -26,41 +35,32 @@ module PDF
26
35
  # Text Positioning Operators
27
36
  def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
28
37
  def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
38
+ ########## END FORWARDERS ##########
29
39
 
30
40
  # starting a new page
31
41
  def page=(page)
32
42
  @state = PageState.new(page)
33
- @content = {}
43
+ @content = []
44
+ @characters = []
45
+ @mediabox = page.attributes[:MediaBox]
34
46
  end
35
47
 
36
48
  def content
37
- keys = @content.keys.sort.reverse
38
- keys.map { |key|
39
- @content[key]
40
- }.join("\n")
49
+ PageLayout.new(@characters, @mediabox).to_s
41
50
  end
42
51
 
43
52
  #####################################################
44
53
  # Text Showing Operators
45
54
  #####################################################
46
-
47
55
  # record text that is drawn on the page
48
- def show_text(string) # Tj
49
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
50
- newx, newy = @state.trm_transform(0,0)
51
- @content[newy] ||= ""
52
- @content[newy] << @state.current_font.to_utf8(string)
56
+ def show_text(string) # Tj (AWAY)
57
+ internal_show_text(string)
53
58
  end
54
59
 
55
- def show_text_with_positioning(params) # TJ
56
- params.each { |arg|
57
- case arg
58
- when String
59
- show_text(arg)
60
- when Fixnum, Float
61
- show_text(" ") if arg > 1000
62
- end
63
- }
60
+ def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
61
+ params.each_slice(2).each do |string, kerning|
62
+ internal_show_text(string, kerning || 0)
63
+ end
64
64
  end
65
65
 
66
66
  def move_to_next_line_and_show_text(str) # '
@@ -86,6 +86,35 @@ module PDF
86
86
  end
87
87
  end
88
88
 
89
+ private
90
+
91
+ def internal_show_text(string, kerning = 0)
92
+ if @state.current_font.nil?
93
+ raise PDF::Reader::MalformedPDFError, "current font is invalid"
94
+ end
95
+ glyphs = @state.current_font.unpack(string)
96
+ glyphs.each_with_index do |glyph_code, index|
97
+ # paint the current glyph
98
+ newx, newy = @state.trm_transform(0,0)
99
+ utf8_chars = @state.current_font.to_utf8(glyph_code)
100
+
101
+ # apply the glyph displacement for the current glyph so the next
102
+ # glyph will appear in the correct position
103
+ glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
104
+ th = 1
105
+ if kerning != 0 && index == glyphs.size - 1
106
+ tj = kerning
107
+ else
108
+ tj = 0
109
+ end
110
+ scaled_glyph_width = glyph_width * @state.font_size * th
111
+ unless utf8_chars == SPACE
112
+ @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
113
+ end
114
+ @state.process_glyph_displacement(glyph_width, tj, utf8_chars == SPACE)
115
+ end
116
+ end
117
+
89
118
  end
90
119
  end
91
120
  end