pdf-reader 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. data/CHANGELOG +7 -1
  2. data/README.rdoc +1 -0
  3. data/Rakefile +23 -8
  4. data/lib/pdf-reader.rb +3 -1
  5. data/lib/pdf/hash.rb +5 -1
  6. data/lib/pdf/reader.rb +8 -1
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  10. data/lib/pdf/reader/afm/Courier.afm +342 -0
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  15. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  16. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  17. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  18. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  19. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  20. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  21. data/lib/pdf/reader/buffer.rb +14 -6
  22. data/lib/pdf/reader/cid_widths.rb +61 -0
  23. data/lib/pdf/reader/cmap.rb +8 -2
  24. data/lib/pdf/reader/encoding.rb +52 -27
  25. data/lib/pdf/reader/error.rb +16 -1
  26. data/lib/pdf/reader/filter.rb +2 -0
  27. data/lib/pdf/reader/filter/ascii85.rb +3 -1
  28. data/lib/pdf/reader/filter/ascii_hex.rb +3 -1
  29. data/lib/pdf/reader/filter/depredict.rb +2 -0
  30. data/lib/pdf/reader/filter/flate.rb +3 -1
  31. data/lib/pdf/reader/filter/lzw.rb +1 -0
  32. data/lib/pdf/reader/filter/null.rb +1 -0
  33. data/lib/pdf/reader/filter/run_length.rb +2 -1
  34. data/lib/pdf/reader/font.rb +74 -18
  35. data/lib/pdf/reader/font_descriptor.rb +80 -0
  36. data/lib/pdf/reader/glyph_hash.rb +6 -0
  37. data/lib/pdf/reader/lzw.rb +1 -0
  38. data/lib/pdf/reader/object_cache.rb +1 -1
  39. data/lib/pdf/reader/object_hash.rb +1 -1
  40. data/lib/pdf/reader/page_layout.rb +125 -0
  41. data/lib/pdf/reader/page_state.rb +172 -69
  42. data/lib/pdf/reader/page_text_receiver.rb +50 -21
  43. data/lib/pdf/reader/pages_strategy.rb +17 -4
  44. data/lib/pdf/reader/parser.rb +25 -52
  45. data/lib/pdf/reader/print_receiver.rb +5 -0
  46. data/lib/pdf/reader/reference.rb +2 -0
  47. data/lib/pdf/reader/register_receiver.rb +1 -1
  48. data/lib/pdf/reader/standard_security_handler.rb +2 -0
  49. data/lib/pdf/reader/stream.rb +2 -0
  50. data/lib/pdf/reader/synchronized_cache.rb +32 -0
  51. data/lib/pdf/reader/text_receiver.rb +5 -4
  52. data/lib/pdf/reader/text_run.rb +80 -0
  53. data/lib/pdf/reader/token.rb +2 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +194 -0
  55. data/lib/pdf/reader/width_calculator.rb +11 -0
  56. data/lib/pdf/reader/width_calculator/built_in.rb +50 -0
  57. data/lib/pdf/reader/width_calculator/composite.rb +27 -0
  58. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  59. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +32 -0
  60. data/lib/pdf/reader/width_calculator/type_zero.rb +24 -0
  61. data/lib/pdf/reader/xref.rb +9 -2
  62. metadata +119 -13
@@ -1,22 +1,23 @@
1
1
  # coding: utf-8
2
2
 
3
- require 'matrix'
3
+ require 'pdf/reader/transformation_matrix'
4
4
 
5
- module PDF
6
- class Reader
5
+ class PDF::Reader
6
+ # encapsulates logic for tracking graphics state as the instructions for
7
+ # a single page are processed. Most of the public methods correspond
8
+ # directly to PDF operators.
7
9
  class PageState
8
10
 
9
11
  DEFAULT_GRAPHICS_STATE = {
10
- :ctm => Matrix.identity(3),
11
- :char_spacing => 0,
12
- :word_spacing => 0,
13
- :h_scaling => 100,
14
- :text_leading => 0,
15
- :text_font => nil,
12
+ :char_spacing => 0,
13
+ :word_spacing => 0,
14
+ :h_scaling => 1.0,
15
+ :text_leading => 0,
16
+ :text_font => nil,
16
17
  :text_font_size => nil,
17
- :text_mode => 0,
18
- :text_rise => 0,
19
- :text_knockout => 0
18
+ :text_mode => 0,
19
+ :text_rise => 0,
20
+ :text_knockout => 0
20
21
  }
21
22
 
22
23
  # starting a new page
@@ -28,16 +29,23 @@ module PDF
28
29
  @xobject_stack = [page.xobjects]
29
30
  @cs_stack = [page.color_spaces]
30
31
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
32
+ state[:ctm] = identity_matrix
31
33
  end
32
34
 
33
35
  #####################################################
34
36
  # Graphics State Operators
35
37
  #####################################################
36
38
 
39
+ # Clones the current graphics state and pushes it onto the top of the stack.
40
+ # Any changes that are subsequently made to the state can then be reversed
41
+ # by calling restore_graphics_state.
42
+ #
37
43
  def save_graphics_state
38
44
  @stack.push clone_state
39
45
  end
40
46
 
47
+ # Restore the state to the previous value on the stack.
48
+ #
41
49
  def restore_graphics_state
42
50
  @stack.pop
43
51
  end
@@ -54,16 +62,12 @@ module PDF
54
62
  # with the new matrix to form the updated matrix.
55
63
  #
56
64
  def concatenate_matrix(a, b, c, d, e, f)
57
- transform = Matrix[
58
- [a, b, 0],
59
- [c, d, 0],
60
- [e, f, 1]
61
- ]
62
65
  if state[:ctm]
63
- state[:ctm] = transform * state[:ctm]
66
+ state[:ctm].multiply!(a,b,c,d,e,f)
64
67
  else
65
- state[:ctm] = transform
68
+ state[:ctm] = TransformationMatrix.new(a,b,c,d,e,f)
66
69
  end
70
+ @text_rendering_matrix = nil # invalidate cached value
67
71
  end
68
72
 
69
73
  #####################################################
@@ -71,13 +75,13 @@ module PDF
71
75
  #####################################################
72
76
 
73
77
  def begin_text_object
74
- @text_matrix = Matrix.identity(3)
75
- @text_line_matrix = Matrix.identity(3)
78
+ @text_matrix = identity_matrix
79
+ @text_line_matrix = identity_matrix
80
+ @font_size = nil
76
81
  end
77
82
 
78
83
  def end_text_object
79
- @text_matrix = Matrix.identity(3)
80
- @text_line_matrix = Matrix.identity(3)
84
+ # don't need to do anything
81
85
  end
82
86
 
83
87
  #####################################################
@@ -89,7 +93,7 @@ module PDF
89
93
  end
90
94
 
91
95
  def set_horizontal_text_scaling(h_scaling)
92
- state[:h_scaling] = h_scaling
96
+ state[:h_scaling] = h_scaling / 100.0
93
97
  end
94
98
 
95
99
  def set_text_font_and_size(label, size)
@@ -98,7 +102,7 @@ module PDF
98
102
  end
99
103
 
100
104
  def font_size
101
- state[:text_font_size] * @text_matrix[0,0]
105
+ @font_size ||= state[:text_font_size] * @text_matrix.a * ctm.a
102
106
  end
103
107
 
104
108
  def set_text_leading(leading)
@@ -122,12 +126,16 @@ module PDF
122
126
  #####################################################
123
127
 
124
128
  def move_text_position(x, y) # Td
125
- temp_matrix = Matrix[
126
- [1, 0, 0],
127
- [0, 1, 0],
128
- [x, y, 1]
129
- ]
130
- @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
129
+ temp = TransformationMatrix.new(1, 0,
130
+ 0, 1,
131
+ x, y)
132
+ @text_line_matrix = temp.multiply!(
133
+ @text_line_matrix.a, @text_line_matrix.b,
134
+ @text_line_matrix.c, @text_line_matrix.d,
135
+ @text_line_matrix.e, @text_line_matrix.f
136
+ )
137
+ @text_matrix = @text_line_matrix.dup
138
+ @font_size = @text_rendering_matrix = nil # invalidate cached value
131
139
  end
132
140
 
133
141
  def move_text_position_and_set_leading(x, y) # TD
@@ -136,11 +144,13 @@ module PDF
136
144
  end
137
145
 
138
146
  def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
139
- @text_matrix = @text_line_matrix = Matrix[
140
- [a, b, 0],
141
- [c, d, 0],
142
- [e, f, 1]
143
- ]
147
+ @text_matrix = TransformationMatrix.new(
148
+ a, b,
149
+ c, d,
150
+ e, f
151
+ )
152
+ @text_line_matrix = @text_matrix.dup
153
+ @font_size = @text_rendering_matrix = nil # invalidate cached value
144
154
  end
145
155
 
146
156
  def move_to_start_of_next_line # T*
@@ -197,22 +207,29 @@ module PDF
197
207
  # transform x and y co-ordinates from the current user space to the
198
208
  # underlying device space.
199
209
  #
200
- def ctm_transform(x, y, z = 1)
210
+ def ctm_transform(x, y)
201
211
  [
202
- (ctm[0,0] * x) + (ctm[1,0] * y) + (ctm[2,0] * z),
203
- (ctm[0,1] * x) + (ctm[1,1] * y) + (ctm[2,1] * z)
212
+ (ctm.a * x) + (ctm.c * y) + (ctm.e),
213
+ (ctm.b * x) + (ctm.d * y) + (ctm.f)
204
214
  ]
205
215
  end
206
216
 
207
217
  # transform x and y co-ordinates from the current text space to the
208
218
  # underlying device space.
209
219
  #
210
- def trm_transform(x, y, z = 1)
220
+ # transforming (0,0) is a really common case, so optimise for it to
221
+ # avoid unnecessary object allocations
222
+ #
223
+ def trm_transform(x, y)
211
224
  trm = text_rendering_matrix
212
- [
213
- (trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
214
- (trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
215
- ]
225
+ if x == 0 && y == 0
226
+ [trm.e, trm.f]
227
+ else
228
+ [
229
+ (trm.a * x) + (trm.c * y) + (trm.e),
230
+ (trm.b * x) + (trm.d * y) + (trm.f)
231
+ ]
232
+ end
216
233
  end
217
234
 
218
235
  def current_font
@@ -240,16 +257,105 @@ module PDF
240
257
  dict ? dict[label] : nil
241
258
  end
242
259
 
260
+ # when save_graphics_state is called, we need to push a new copy of the
261
+ # current state onto the stack. That way any modifications to the state
262
+ # will be undone once restore_graphics_state is called.
263
+ #
264
+ def stack_depth
265
+ @stack.size
266
+ end
267
+
268
+ # This returns a deep clone of the current state, ensuring changes are
269
+ # kept separate from earlier states.
270
+ #
271
+ # Marshal is used to round-trip the state through a string to easily
272
+ # perform the deep clone. Kinda hacky, but effective.
273
+ #
274
+ def clone_state
275
+ if @stack.empty?
276
+ {}
277
+ else
278
+ Marshal.load Marshal.dump(@stack.last)
279
+ end
280
+ end
281
+
282
+ # after each glyph is painted onto the page the text matrix must be
283
+ # modified. There's no defined operator for this, but depending on
284
+ # the use case some receivers may need to mutate the state with this
285
+ # while walking a page.
286
+ #
287
+ # NOTE: some of the variable names in this method are obscure because
288
+ # they mirror variable names from the PDF spec
289
+ #
290
+ # NOTE: see Section 9.4.4, PDF 32000-1:2008, pp 252
291
+ #
292
+ # Arguments:
293
+ #
294
+ # w0 - the glyph width in *text space*. This generally means the width
295
+ # in glyph space should be divided by 1000 before being passed to
296
+ # this function
297
+ # tj - any kerning that should be applied to the text matrix before the
298
+ # following glyph is painted. This is usually the numeric arguments
299
+ # in the array passed to a TJ operator
300
+ # word_boundary - a boolean indicating if a word boundary was just
301
+ # reached. Depending on the current state extra space
302
+ # may need to be added
303
+ #
304
+ def process_glyph_displacement(w0, tj, word_boundary)
305
+ fs = font_size # font size
306
+ tc = state[:char_spacing]
307
+ if word_boundary
308
+ tw = state[:word_spacing]
309
+ else
310
+ tw = 0
311
+ end
312
+ th = state[:h_scaling]
313
+ # optimise the common path to reduce Float allocations
314
+ if th == 1 && tj == 0 && tc == 0 && tw == 0
315
+ glyph_width = w0 * fs
316
+ tx = glyph_width
317
+ else
318
+ glyph_width = ((w0 - (tj/1000.0)) * fs) * th
319
+ tx = glyph_width + ((tc + tw) * th)
320
+ end
321
+ ty = 0
322
+
323
+ # TODO: I'm pretty sure that tx shouldn't need to be divided by
324
+ # ctm[0] here, but this gets my tests green and I'm out of
325
+ # ideas for now
326
+ # TODO: support ty > 0
327
+ if ctm.a == 1
328
+ @text_matrix.horizontal_displacement_multiply!(tx)
329
+ else
330
+ @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
331
+ end
332
+ @font_size = @text_rendering_matrix = nil # invalidate cached value
333
+ end
334
+
243
335
  private
244
336
 
337
+ # used for many and varied text positioning calculations. We potentially
338
+ # need to access the results of this method many times when working with
339
+ # text, so memoize it
340
+ #
245
341
  def text_rendering_matrix
246
- state_matrix = Matrix[
247
- [font_size * state[:h_scaling], 0, 0],
248
- [0, font_size, 0],
249
- [0, state[:text_rise], 1]
250
- ]
251
-
252
- state_matrix * @text_matrix * ctm
342
+ @text_rendering_matrix ||= begin
343
+ state_matrix = TransformationMatrix.new(
344
+ font_size * state[:h_scaling], 0,
345
+ 0, font_size,
346
+ 0, state[:text_rise]
347
+ )
348
+ state_matrix.multiply!(
349
+ @text_matrix.a, @text_matrix.b,
350
+ @text_matrix.c, @text_matrix.d,
351
+ @text_matrix.e, @text_matrix.f
352
+ )
353
+ state_matrix.multiply!(
354
+ ctm.a, ctm.b,
355
+ ctm.c, ctm.d,
356
+ ctm.e, ctm.f
357
+ )
358
+ end
253
359
  end
254
360
 
255
361
  # return the current transformation matrix
@@ -272,25 +378,22 @@ module PDF
272
378
  ::Hash[wrapped_fonts]
273
379
  end
274
380
 
275
- # when save_graphics_state is called, we need to push a new copy of the
276
- # current state onto the stack. That way any modifications to the state
277
- # will be undone once restore_graphics_state is called.
278
- #
279
- # This returns a deep clone of the current state, ensuring changes are
280
- # keep separate from earlier states.
281
- #
282
- # Marshal is used to round-trip the state through a string to easily
283
- # perform the deep clone. Kinda hacky, but effective.
284
- #
285
- def clone_state
286
- if @stack.empty?
287
- {}
288
- else
289
- Marshal.load Marshal.dump(@stack.last)
290
- end
381
+ #####################################################
382
+ # Low-level Matrix Operations
383
+ #####################################################
384
+
385
+ # This class uses 3x3 matrices to represent geometric transformations
386
+ # These matrices are represented by arrays with 9 elements
387
+ # The array [a,b,c,d,e,f,g,h,i] would represent a matrix like:
388
+ # a b c
389
+ # d e f
390
+ # g h i
391
+
392
+ def identity_matrix
393
+ TransformationMatrix.new(1, 0,
394
+ 0, 1,
395
+ 0, 0)
291
396
  end
292
397
 
293
398
  end
294
- end
295
399
  end
296
-
@@ -1,13 +1,22 @@
1
1
  # coding: utf-8
2
2
 
3
- require 'matrix'
4
3
  require 'forwardable'
4
+ require 'pdf/reader/page_layout'
5
5
 
6
6
  module PDF
7
7
  class Reader
8
+
9
+ # Builds a UTF-8 string of all the text on a single page by processing all
10
+ # the operators in a content stream.
11
+ #
8
12
  class PageTextReceiver
9
13
  extend Forwardable
10
14
 
15
+ SPACE = " "
16
+
17
+ attr_reader :state, :content, :options
18
+
19
+ ########## BEGIN FORWARDERS ##########
11
20
  # Graphics State Operators
12
21
  def_delegators :@state, :save_graphics_state, :restore_graphics_state
13
22
 
@@ -26,41 +35,32 @@ module PDF
26
35
  # Text Positioning Operators
27
36
  def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
28
37
  def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
38
+ ########## END FORWARDERS ##########
29
39
 
30
40
  # starting a new page
31
41
  def page=(page)
32
42
  @state = PageState.new(page)
33
- @content = {}
43
+ @content = []
44
+ @characters = []
45
+ @mediabox = page.attributes[:MediaBox]
34
46
  end
35
47
 
36
48
  def content
37
- keys = @content.keys.sort.reverse
38
- keys.map { |key|
39
- @content[key]
40
- }.join("\n")
49
+ PageLayout.new(@characters, @mediabox).to_s
41
50
  end
42
51
 
43
52
  #####################################################
44
53
  # Text Showing Operators
45
54
  #####################################################
46
-
47
55
  # record text that is drawn on the page
48
- def show_text(string) # Tj
49
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
50
- newx, newy = @state.trm_transform(0,0)
51
- @content[newy] ||= ""
52
- @content[newy] << @state.current_font.to_utf8(string)
56
+ def show_text(string) # Tj (AWAY)
57
+ internal_show_text(string)
53
58
  end
54
59
 
55
- def show_text_with_positioning(params) # TJ
56
- params.each { |arg|
57
- case arg
58
- when String
59
- show_text(arg)
60
- when Fixnum, Float
61
- show_text(" ") if arg > 1000
62
- end
63
- }
60
+ def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
61
+ params.each_slice(2).each do |string, kerning|
62
+ internal_show_text(string, kerning || 0)
63
+ end
64
64
  end
65
65
 
66
66
  def move_to_next_line_and_show_text(str) # '
@@ -86,6 +86,35 @@ module PDF
86
86
  end
87
87
  end
88
88
 
89
+ private
90
+
91
+ def internal_show_text(string, kerning = 0)
92
+ if @state.current_font.nil?
93
+ raise PDF::Reader::MalformedPDFError, "current font is invalid"
94
+ end
95
+ glyphs = @state.current_font.unpack(string)
96
+ glyphs.each_with_index do |glyph_code, index|
97
+ # paint the current glyph
98
+ newx, newy = @state.trm_transform(0,0)
99
+ utf8_chars = @state.current_font.to_utf8(glyph_code)
100
+
101
+ # apply the glyph displacement for the current glyph so the next
102
+ # glyph will appear in the correct position
103
+ glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
104
+ th = 1
105
+ if kerning != 0 && index == glyphs.size - 1
106
+ tj = kerning
107
+ else
108
+ tj = 0
109
+ end
110
+ scaled_glyph_width = glyph_width * @state.font_size * th
111
+ unless utf8_chars == SPACE
112
+ @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
113
+ end
114
+ @state.process_glyph_displacement(glyph_width, tj, utf8_chars == SPACE)
115
+ end
116
+ end
117
+
89
118
  end
90
119
  end
91
120
  end