pdf-reader 2.7.0 → 2.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +20 -0
  3. data/Rakefile +1 -1
  4. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  5. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  6. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  7. data/lib/pdf/reader/buffer.rb +36 -34
  8. data/lib/pdf/reader/cmap.rb +64 -51
  9. data/lib/pdf/reader/error.rb +8 -0
  10. data/lib/pdf/reader/filter/ascii85.rb +1 -1
  11. data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
  12. data/lib/pdf/reader/filter/depredict.rb +1 -1
  13. data/lib/pdf/reader/filter/flate.rb +3 -3
  14. data/lib/pdf/reader/filter/lzw.rb +1 -1
  15. data/lib/pdf/reader/filter/null.rb +1 -2
  16. data/lib/pdf/reader/filter/run_length.rb +1 -1
  17. data/lib/pdf/reader/filter.rb +10 -11
  18. data/lib/pdf/reader/font.rb +71 -16
  19. data/lib/pdf/reader/font_descriptor.rb +18 -17
  20. data/lib/pdf/reader/form_xobject.rb +14 -5
  21. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  22. data/lib/pdf/reader/null_security_handler.rb +0 -4
  23. data/lib/pdf/reader/object_hash.rb +251 -44
  24. data/lib/pdf/reader/page.rb +51 -22
  25. data/lib/pdf/reader/page_layout.rb +14 -28
  26. data/lib/pdf/reader/page_state.rb +1 -1
  27. data/lib/pdf/reader/page_text_receiver.rb +52 -10
  28. data/lib/pdf/reader/parser.rb +22 -7
  29. data/lib/pdf/reader/point.rb +1 -1
  30. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  31. data/lib/pdf/reader/rectangle.rb +20 -2
  32. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
  33. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  34. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
  35. data/lib/pdf/reader/stream.rb +2 -2
  36. data/lib/pdf/reader/text_run.rb +13 -6
  37. data/lib/pdf/reader/type_check.rb +52 -0
  38. data/lib/pdf/reader/validating_receiver.rb +262 -0
  39. data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
  40. data/lib/pdf/reader/xref.rb +20 -3
  41. data/lib/pdf/reader.rb +32 -11
  42. data/rbi/pdf-reader.rbi +408 -174
  43. metadata +16 -9
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -48,7 +48,11 @@ class PDF::Reader
48
48
  @trailer = @xref.trailer
49
49
  @cache = opts[:cache] || PDF::Reader::ObjectCache.new
50
50
  @sec_handler = NullSecurityHandler.new
51
- @sec_handler = build_security_handler(opts)
51
+ @sec_handler = SecurityHandlerFactory.build(
52
+ deref(trailer[:Encrypt]),
53
+ deref(trailer[:ID]),
54
+ opts[:password]
55
+ )
52
56
  end
53
57
 
54
58
  # returns the type of object a ref points to
@@ -92,6 +96,218 @@ class PDF::Reader
92
96
  end
93
97
  alias :deref :object
94
98
 
99
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
100
+ # object in the PDF and return it. Otherwise return key untouched.
101
+ #
102
+ # Guaranteed to only return an Array or nil. If the dereference results in
103
+ # any other type then a MalformedPDFError exception will raise. Useful when
104
+ # expecting an Array and no other type will do.
105
+ def deref_array(key)
106
+ obj = deref(key)
107
+
108
+ return obj if obj.nil?
109
+
110
+ obj.tap { |obj|
111
+ raise MalformedPDFError, "expected object to be an Array or nil" if !obj.is_a?(Array)
112
+ }
113
+ end
114
+
115
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
116
+ # object in the PDF and return it. Otherwise return key untouched.
117
+ #
118
+ # Guaranteed to only return an Array of Numerics or nil. If the dereference results in
119
+ # any other type then a MalformedPDFError exception will raise. Useful when
120
+ # expecting an Array and no other type will do.
121
+ #
122
+ # Some effort to cast array elements to a number is made for any non-numeric elements.
123
+ def deref_array_of_numbers(key)
124
+ arr = deref(key)
125
+
126
+ return arr if arr.nil?
127
+
128
+ raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)
129
+
130
+ arr.map { |item|
131
+ if item.is_a?(Numeric)
132
+ item
133
+ elsif item.respond_to?(:to_f)
134
+ item.to_f
135
+ elsif item.respond_to?(:to_i)
136
+ item.to_i
137
+ else
138
+ raise MalformedPDFError, "expected object to be a number"
139
+ end
140
+ }
141
+ end
142
+
143
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
144
+ # object in the PDF and return it. Otherwise return key untouched.
145
+ #
146
+ # Guaranteed to only return a Hash or nil. If the dereference results in
147
+ # any other type then a MalformedPDFError exception will raise. Useful when
148
+ # expecting a Hash and no other type will do.
149
+ def deref_hash(key)
150
+ obj = deref(key)
151
+
152
+ return obj if obj.nil?
153
+
154
+ obj.tap { |obj|
155
+ raise MalformedPDFError, "expected object to be a Hash or nil" if !obj.is_a?(Hash)
156
+ }
157
+ end
158
+
159
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
160
+ # object in the PDF and return it. Otherwise return key untouched.
161
+ #
162
+ # Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
163
+ # any other type then a MalformedPDFError exception will raise. Useful when
164
+ # expecting a Name and no other type will do.
165
+ #
166
+ # Some effort to cast to a symbol is made when the reference points to a non-symbol.
167
+ def deref_name(key)
168
+ obj = deref(key)
169
+
170
+ return obj if obj.nil?
171
+
172
+ if !obj.is_a?(Symbol)
173
+ if obj.respond_to?(:to_sym)
174
+ obj = obj.to_sym
175
+ else
176
+ raise MalformedPDFError, "expected object to be a Name"
177
+ end
178
+ end
179
+
180
+ obj
181
+ end
182
+
183
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
184
+ # object in the PDF and return it. Otherwise return key untouched.
185
+ #
186
+ # Guaranteed to only return an Integer or nil. If the dereference results in
187
+ # any other type then a MalformedPDFError exception will raise. Useful when
188
+ # expecting an Integer and no other type will do.
189
+ #
190
+ # Some effort to cast to an int is made when the reference points to a non-integer.
191
+ def deref_integer(key)
192
+ obj = deref(key)
193
+
194
+ return obj if obj.nil?
195
+
196
+ if !obj.is_a?(Integer)
197
+ if obj.respond_to?(:to_i)
198
+ obj = obj.to_i
199
+ else
200
+ raise MalformedPDFError, "expected object to be an Integer"
201
+ end
202
+ end
203
+
204
+ obj
205
+ end
206
+
207
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
208
+ # object in the PDF and return it. Otherwise return key untouched.
209
+ #
210
+ # Guaranteed to only return a Numeric or nil. If the dereference results in
211
+ # any other type then a MalformedPDFError exception will raise. Useful when
212
+ # expecting a number and no other type will do.
213
+ #
214
+ # Some effort to cast to a number is made when the reference points to a non-number.
215
+ def deref_number(key)
216
+ obj = deref(key)
217
+
218
+ return obj if obj.nil?
219
+
220
+ if !obj.is_a?(Numeric)
221
+ if obj.respond_to?(:to_f)
222
+ obj = obj.to_f
223
+ elsif obj.respond_to?(:to_i)
224
+ obj.to_i
225
+ else
226
+ raise MalformedPDFError, "expected object to be a number"
227
+ end
228
+ end
229
+
230
+ obj
231
+ end
232
+
233
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
234
+ # object in the PDF and return it. Otherwise return key untouched.
235
+ #
236
+ # Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
237
+ # any other type then a MalformedPDFError exception will raise. Useful when
238
+ # expecting a stream and no other type will do.
239
+ def deref_stream(key)
240
+ obj = deref(key)
241
+
242
+ return obj if obj.nil?
243
+
244
+ obj.tap { |obj|
245
+ if !obj.is_a?(PDF::Reader::Stream)
246
+ raise MalformedPDFError, "expected object to be an Array or nil"
247
+ end
248
+ }
249
+ end
250
+
251
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
252
+ # object in the PDF and return it. Otherwise return key untouched.
253
+ #
254
+ # Guaranteed to only return a String or nil. If the dereference results in
255
+ # any other type then a MalformedPDFError exception will raise. Useful when
256
+ # expecting a string and no other type will do.
257
+ #
258
+ # Some effort to cast to a string is made when the reference points to a non-string.
259
+ def deref_string(key)
260
+ obj = deref(key)
261
+
262
+ return obj if obj.nil?
263
+
264
+ if !obj.is_a?(String)
265
+ if obj.respond_to?(:to_s)
266
+ obj = obj.to_s
267
+ else
268
+ raise MalformedPDFError, "expected object to be a string"
269
+ end
270
+ end
271
+
272
+ obj
273
+ end
274
+
275
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
276
+ # object in the PDF and return it. Otherwise return key untouched.
277
+ #
278
+ # Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
279
+ # any other type then a MalformedPDFError exception will raise. Useful when
280
+ # expecting a Name or Array and no other type will do.
281
+ def deref_name_or_array(key)
282
+ obj = deref(key)
283
+
284
+ return obj if obj.nil?
285
+
286
+ obj.tap { |obj|
287
+ if !obj.is_a?(Symbol) && !obj.is_a?(Array)
288
+ raise MalformedPDFError, "expected object to be an Array or Name"
289
+ end
290
+ }
291
+ end
292
+
293
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
294
+ # object in the PDF and return it. Otherwise return key untouched.
295
+ #
296
+ # Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
297
+ # any other type then a MalformedPDFError exception will raise. Useful when
298
+ # expecting a stream or Array and no other type will do.
299
+ def deref_stream_or_array(key)
300
+ obj = deref(key)
301
+
302
+ return obj if obj.nil?
303
+
304
+ obj.tap { |obj|
305
+ if !obj.is_a?(PDF::Reader::Stream) && !obj.is_a?(Array)
306
+ raise MalformedPDFError, "expected object to be an Array or Stream"
307
+ end
308
+ }
309
+ end
310
+
95
311
  # Recursively dereferences the object referred to by +key+. If +key+ is not
96
312
  # a PDF::Reader::Reference, the key is returned unchanged.
97
313
  #
@@ -99,6 +315,22 @@ class PDF::Reader
99
315
  deref_internal!(key, {})
100
316
  end
101
317
 
318
+ def deref_array!(key)
319
+ deref!(key).tap { |obj|
320
+ if !obj.nil? && !obj.is_a?(Array)
321
+ raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
322
+ end
323
+ }
324
+ end
325
+
326
+ def deref_hash!(key)
327
+ deref!(key).tap { |obj|
328
+ if !obj.nil? && !obj.is_a?(Hash)
329
+ raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
330
+ end
331
+ }
332
+ end
333
+
102
334
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
103
335
  # object.
104
336
  #
@@ -233,7 +465,10 @@ class PDF::Reader
233
465
  #
234
466
  def page_references
235
467
  root = fetch(trailer[:Root])
236
- @page_references ||= get_page_objects(root[:Pages]).flatten
468
+ @page_references ||= begin
469
+ pages_root = deref_hash(root[:Pages]) || {}
470
+ get_page_objects(pages_root)
471
+ end
237
472
  end
238
473
 
239
474
  def encrypted?
@@ -299,36 +534,6 @@ class PDF::Reader
299
534
  end
300
535
  end
301
536
 
302
- def build_security_handler(opts = {})
303
- encrypt = deref(trailer[:Encrypt])
304
- if NullSecurityHandler.supports?(encrypt)
305
- NullSecurityHandler.new
306
- elsif StandardSecurityHandler.supports?(encrypt)
307
- encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
308
- StandardSecurityHandler.new(
309
- key_length: (encrypt[:Length] || 40).to_i,
310
- revision: encrypt[:R],
311
- owner_key: encrypt[:O],
312
- user_key: encrypt[:U],
313
- permissions: encrypt[:P].to_i,
314
- encrypted_metadata: encmeta,
315
- file_id: (deref(trailer[:ID]) || []).first,
316
- password: opts[:password],
317
- cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
318
- )
319
- elsif StandardSecurityHandlerV5.supports?(encrypt)
320
- StandardSecurityHandlerV5.new(
321
- O: encrypt[:O],
322
- U: encrypt[:U],
323
- OE: encrypt[:OE],
324
- UE: encrypt[:UE],
325
- password: opts[:password]
326
- )
327
- else
328
- UnimplementedSecurityHandler.new
329
- end
330
- end
331
-
332
537
  def decrypt(ref, obj)
333
538
  case obj
334
539
  when PDF::Reader::Stream then
@@ -362,19 +567,21 @@ class PDF::Reader
362
567
  @object_stream ||= {}
363
568
  end
364
569
 
365
- # returns a nested array of object references for all pages in this object store.
570
+ # returns an array of object references for all pages in this object store. The ordering of
571
+ # the Array is significant and matches the page ordering of the document
366
572
  #
367
- def get_page_objects(ref)
368
- obj = deref(ref)
369
-
370
- unless obj.kind_of?(::Hash)
371
- raise MalformedPDFError, "Dereferenced page object must be a dict"
372
- end
373
-
374
- if obj[:Type] == :Page
375
- ref
376
- elsif obj[:Kids]
377
- deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
573
+ def get_page_objects(obj)
574
+ derefed_obj = deref_hash(obj)
575
+
576
+ if derefed_obj[:Type] == :Page
577
+ [obj]
578
+ elsif derefed_obj[:Kids]
579
+ kids = deref_array(derefed_obj[:Kids]) || []
580
+ kids.map { |kid|
581
+ get_page_objects(kid)
582
+ }.flatten
583
+ else
584
+ raise MalformedPDFError, "Expected Page or Pages object"
378
585
  end
379
586
  end
380
587
 
@@ -14,7 +14,7 @@ module PDF
14
14
  # objects accessor to help walk the page dictionary in any useful way.
15
15
  #
16
16
  class Page
17
- include ResourceMethods
17
+ extend Forwardable
18
18
 
19
19
  # lowlevel hash-like access to all objects in the underlying PDF
20
20
  attr_reader :objects
@@ -27,6 +27,15 @@ module PDF
27
27
  # operations
28
28
  attr_reader :cache
29
29
 
30
+ def_delegators :resources, :color_spaces
31
+ def_delegators :resources, :fonts
32
+ def_delegators :resources, :graphic_states
33
+ def_delegators :resources, :patterns
34
+ def_delegators :resources, :procedure_sets
35
+ def_delegators :resources, :properties
36
+ def_delegators :resources, :shadings
37
+ def_delegators :resources, :xobjects
38
+
30
39
  # creates a new page wrapper.
31
40
  #
32
41
  # * objects - an ObjectHash instance that wraps a PDF file
@@ -34,7 +43,7 @@ module PDF
34
43
  #
35
44
  def initialize(objects, pagenum, options = {})
36
45
  @objects, @pagenum = objects, pagenum
37
- @page_object = objects.deref(objects.page_references[pagenum - 1])
46
+ @page_object = objects.deref_hash(objects.page_references[pagenum - 1])
38
47
  @cache = options[:cache] || {}
39
48
 
40
49
  unless @page_object.is_a?(::Hash)
@@ -60,7 +69,7 @@ module PDF
60
69
  def attributes
61
70
  @attributes ||= {}.tap { |hash|
62
71
  page_with_ancestors.reverse.each do |obj|
63
- hash.merge!(@objects.deref(obj))
72
+ hash.merge!(@objects.deref_hash(obj) || {})
64
73
  end
65
74
  }
66
75
  # This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
@@ -101,13 +110,24 @@ module PDF
101
110
  # returns the plain text content of this page encoded as UTF-8. Any
102
111
  # characters that can't be translated will be returned as a ▯
103
112
  #
104
- def text
113
+ def text(opts = {})
105
114
  receiver = PageTextReceiver.new
106
115
  walk(receiver)
107
- receiver.content
116
+ runs = receiver.runs(opts)
117
+
118
+ # rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
119
+ mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
120
+
121
+ PageLayout.new(runs, mediabox).to_s
108
122
  end
109
123
  alias :to_s :text
110
124
 
125
+ def runs(opts = {})
126
+ receiver = PageTextReceiver.new
127
+ walk(receiver)
128
+ receiver.runs(opts)
129
+ end
130
+
111
131
  # processes the raw content stream for this page in sequential order and
112
132
  # passes callbacks to the receiver objects.
113
133
  #
@@ -132,6 +152,9 @@ module PDF
132
152
  # the program in the correct order and calls out to your implementation.
133
153
  #
134
154
  def walk(*receivers)
155
+ receivers = receivers.map { |receiver|
156
+ ValidatingReceiver.new(receiver)
157
+ }
135
158
  callback(receivers, :page=, [self])
136
159
  content_stream(receivers, raw_content)
137
160
  end
@@ -140,10 +163,10 @@ module PDF
140
163
  # see here unless you're a PDF nerd like me.
141
164
  #
142
165
  def raw_content
143
- contents = objects.deref(@page_object[:Contents])
166
+ contents = objects.deref_stream_or_array(@page_object[:Contents])
144
167
  [contents].flatten.compact.map { |obj|
145
- objects.deref(obj)
146
- }.map { |obj|
168
+ objects.deref_stream(obj)
169
+ }.compact.map { |obj|
147
170
  obj.unfiltered_data
148
171
  }.join(" ")
149
172
  end
@@ -174,17 +197,22 @@ module PDF
174
197
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
175
198
  #
176
199
  def rectangles
177
- mediabox = objects.deref!(attributes[:MediaBox])
178
- cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
179
- bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
180
- trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
181
- artbox = objects.deref!(attributes[:ArtBox]) || cropbox
182
-
183
- mediarect = Rectangle.new(*mediabox)
184
- croprect = Rectangle.new(*cropbox)
185
- bleedrect = Rectangle.new(*bleedbox)
186
- trimrect = Rectangle.new(*trimbox)
187
- artrect = Rectangle.new(*artbox)
200
+ # attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
201
+ mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
202
+ cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
203
+ bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
204
+ trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
205
+ artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
206
+
207
+ begin
208
+ mediarect = Rectangle.from_array(mediabox)
209
+ croprect = Rectangle.from_array(cropbox)
210
+ bleedrect = Rectangle.from_array(bleedbox)
211
+ trimrect = Rectangle.from_array(trimbox)
212
+ artrect = Rectangle.from_array(artbox)
213
+ rescue ArgumentError => e
214
+ raise MalformedPDFError, e.message
215
+ end
188
216
 
189
217
  if rotate > 0
190
218
  mediarect.apply_rotation(rotate)
@@ -206,14 +234,14 @@ module PDF
206
234
  private
207
235
 
208
236
  def root
209
- @root ||= objects.deref(@objects.trailer[:Root])
237
+ @root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
210
238
  end
211
239
 
212
240
  # Returns the resources that accompany this page. Includes
213
241
  # resources inherited from parents.
214
242
  #
215
243
  def resources
216
- @resources ||= @objects.deref(attributes[:Resources]) || {}
244
+ @resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
217
245
  end
218
246
 
219
247
  def content_stream(receivers, instructions)
@@ -249,7 +277,8 @@ module PDF
249
277
  if origin.nil?
250
278
  []
251
279
  else
252
- obj = objects.deref(origin)
280
+ obj = objects.deref_hash(origin)
281
+ PDF::Reader::Error.validate_not_nil_as_malformed(obj, "parent")
253
282
  [ select_inheritable(obj) ] + ancestors(obj[:Parent])
254
283
  end
255
284
  end
@@ -21,10 +21,8 @@ class PDF::Reader
21
21
  # PDF::Reader::Rectangle at some point
22
22
  PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
23
 
24
- runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
25
- runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
- @mediabox = mediabox
27
- @runs = merge_runs(runs)
24
+ @mediabox = process_mediabox(mediabox)
25
+ @runs = runs
28
26
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
29
27
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
30
28
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
@@ -51,13 +49,11 @@ class PDF::Reader
51
49
  private
52
50
 
53
51
  def page_width
54
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
- (@mediabox[2].to_f - @mediabox[0].to_f).abs
52
+ @mediabox.width
56
53
  end
57
54
 
58
55
  def page_height
59
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
- (@mediabox[3].to_f - @mediabox[1].to_f).abs
56
+ @mediabox.height
61
57
  end
62
58
 
63
59
  # given an array of strings, return a new array with empty rows from the
@@ -109,30 +105,20 @@ class PDF::Reader
109
105
  end
110
106
  end
111
107
 
112
- # take a collection of TextRun objects and merge any that are in close
113
- # proximity
114
- def merge_runs(runs)
115
- runs.group_by { |char|
116
- char.y.to_i
117
- }.map { |y, chars|
118
- group_chars_into_runs(chars.sort)
119
- }.flatten.sort
108
+ def local_string_insert(haystack, needle, index)
109
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
120
110
  end
121
111
 
122
- def group_chars_into_runs(chars)
123
- chars.each_with_object([]) do |char, runs|
124
- if runs.empty?
125
- runs << char
126
- elsif runs.last.mergable?(char)
127
- runs[-1] = runs.last + char
128
- else
129
- runs << char
130
- end
112
+ def process_mediabox(mediabox)
113
+ if mediabox.is_a?(Array)
114
+ msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
115
+ " please use a Rectangle instead"
116
+ $stderr.puts msg
117
+ PDF::Reader::Rectangle.from_array(mediabox)
118
+ else
119
+ mediabox
131
120
  end
132
121
  end
133
122
 
134
- def local_string_insert(haystack, needle, index)
135
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
136
- end
137
123
  end
138
124
  end
@@ -384,7 +384,7 @@ class PDF::Reader
384
384
  #
385
385
  def build_fonts(raw_fonts)
386
386
  wrapped_fonts = raw_fonts.map { |label, font|
387
- [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
387
+ [label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
388
388
  }
389
389
 
390
390
  ::Hash[wrapped_fonts]
@@ -47,9 +47,32 @@ module PDF
47
47
  @characters = []
48
48
  end
49
49
 
50
+ def runs(opts = {})
51
+ runs = @characters
52
+
53
+ if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
54
+ runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
55
+ end
56
+
57
+ if opts.fetch(:skip_zero_width, true)
58
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
59
+ end
60
+
61
+ if opts.fetch(:skip_overlapping, true)
62
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
+ end
64
+
65
+ if opts.fetch(:merge, true)
66
+ runs = merge_runs(runs)
67
+ end
68
+
69
+ runs
70
+ end
71
+
72
+ # deprecated
50
73
  def content
51
- mediabox = @page.rectangles[:MediaBox].to_a
52
- PageLayout.new(@characters, mediabox).to_s
74
+ mediabox = @page.rectangles[:MediaBox]
75
+ PageLayout.new(runs, mediabox).to_s
53
76
  end
54
77
 
55
78
  #####################################################
@@ -64,8 +87,10 @@ module PDF
64
87
  params.each do |arg|
65
88
  if arg.is_a?(String)
66
89
  internal_show_text(arg)
67
- else
90
+ elsif arg.is_a?(Numeric)
68
91
  @state.process_glyph_displacement(0, arg, false)
92
+ else
93
+ # skip it
69
94
  end
70
95
  end
71
96
  end
@@ -96,6 +121,7 @@ module PDF
96
121
  private
97
122
 
98
123
  def internal_show_text(string)
124
+ PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
99
125
  if @state.current_font.nil?
100
126
  raise PDF::Reader::MalformedPDFError, "current font is invalid"
101
127
  end
@@ -109,7 +135,7 @@ module PDF
109
135
 
110
136
  # apply the glyph displacement for the current glyph so the next
111
137
  # glyph will appear in the correct position
112
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
138
+ glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
113
139
  th = 1
114
140
  scaled_glyph_width = glyph_width * @state.font_size * th
115
141
  unless utf8_chars == SPACE
@@ -119,12 +145,6 @@ module PDF
119
145
  end
120
146
  end
121
147
 
122
- # TODO: revist this. It rotates the co-ordinates to the right direction, but I don't
123
- # think it sets the correct x,y values. We get away with it because we don't
124
- # return the text with co-ordinates, only the full text arranged in a string.
125
- #
126
- # We should provide an API for extracting the text with positioning data and spec
127
- # that. I suspect the co-ords might be wrong for rotated pages
128
148
  def apply_rotation(x, y)
129
149
  if @page.rotate == 90
130
150
  tmp = x
@@ -141,6 +161,28 @@ module PDF
141
161
  return x, y
142
162
  end
143
163
 
164
+ # take a collection of TextRun objects and merge any that are in close
165
+ # proximity
166
+ def merge_runs(runs)
167
+ runs.group_by { |char|
168
+ char.y.to_i
169
+ }.map { |y, chars|
170
+ group_chars_into_runs(chars.sort)
171
+ }.flatten.sort
172
+ end
173
+
174
+ def group_chars_into_runs(chars)
175
+ chars.each_with_object([]) do |char, runs|
176
+ if runs.empty?
177
+ runs << char
178
+ elsif runs.last.mergable?(char)
179
+ runs[-1] = runs.last + char
180
+ else
181
+ runs << char
182
+ end
183
+ end
184
+ end
185
+
144
186
  end
145
187
  end
146
188
  end