pdf-reader 2.7.0 → 2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +20 -0
  3. data/Rakefile +1 -1
  4. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  5. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  6. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  7. data/lib/pdf/reader/buffer.rb +36 -34
  8. data/lib/pdf/reader/cmap.rb +64 -51
  9. data/lib/pdf/reader/error.rb +8 -0
  10. data/lib/pdf/reader/filter/ascii85.rb +1 -1
  11. data/lib/pdf/reader/filter/ascii_hex.rb +1 -1
  12. data/lib/pdf/reader/filter/depredict.rb +1 -1
  13. data/lib/pdf/reader/filter/flate.rb +3 -3
  14. data/lib/pdf/reader/filter/lzw.rb +1 -1
  15. data/lib/pdf/reader/filter/null.rb +1 -2
  16. data/lib/pdf/reader/filter/run_length.rb +1 -1
  17. data/lib/pdf/reader/filter.rb +10 -11
  18. data/lib/pdf/reader/font.rb +71 -16
  19. data/lib/pdf/reader/font_descriptor.rb +18 -17
  20. data/lib/pdf/reader/form_xobject.rb +14 -5
  21. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  22. data/lib/pdf/reader/null_security_handler.rb +0 -4
  23. data/lib/pdf/reader/object_hash.rb +251 -44
  24. data/lib/pdf/reader/page.rb +51 -22
  25. data/lib/pdf/reader/page_layout.rb +14 -28
  26. data/lib/pdf/reader/page_state.rb +1 -1
  27. data/lib/pdf/reader/page_text_receiver.rb +52 -10
  28. data/lib/pdf/reader/parser.rb +22 -7
  29. data/lib/pdf/reader/point.rb +1 -1
  30. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  31. data/lib/pdf/reader/rectangle.rb +20 -2
  32. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +15 -13
  33. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  34. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -95
  35. data/lib/pdf/reader/stream.rb +2 -2
  36. data/lib/pdf/reader/text_run.rb +13 -6
  37. data/lib/pdf/reader/type_check.rb +52 -0
  38. data/lib/pdf/reader/validating_receiver.rb +262 -0
  39. data/lib/pdf/reader/width_calculator/true_type.rb +1 -1
  40. data/lib/pdf/reader/xref.rb +20 -3
  41. data/lib/pdf/reader.rb +32 -11
  42. data/rbi/pdf-reader.rbi +408 -174
  43. metadata +16 -9
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -92
@@ -48,7 +48,11 @@ class PDF::Reader
48
48
  @trailer = @xref.trailer
49
49
  @cache = opts[:cache] || PDF::Reader::ObjectCache.new
50
50
  @sec_handler = NullSecurityHandler.new
51
- @sec_handler = build_security_handler(opts)
51
+ @sec_handler = SecurityHandlerFactory.build(
52
+ deref(trailer[:Encrypt]),
53
+ deref(trailer[:ID]),
54
+ opts[:password]
55
+ )
52
56
  end
53
57
 
54
58
  # returns the type of object a ref points to
@@ -92,6 +96,218 @@ class PDF::Reader
92
96
  end
93
97
  alias :deref :object
94
98
 
99
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
100
+ # object in the PDF and return it. Otherwise return key untouched.
101
+ #
102
+ # Guaranteed to only return an Array or nil. If the dereference results in
103
+ # any other type then a MalformedPDFError exception will raise. Useful when
104
+ # expecting an Array and no other type will do.
105
+ def deref_array(key)
106
+ obj = deref(key)
107
+
108
+ return obj if obj.nil?
109
+
110
+ obj.tap { |obj|
111
+ raise MalformedPDFError, "expected object to be an Array or nil" if !obj.is_a?(Array)
112
+ }
113
+ end
114
+
115
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
116
+ # object in the PDF and return it. Otherwise return key untouched.
117
+ #
118
+ # Guaranteed to only return an Array of Numerics or nil. If the dereference results in
119
+ # any other type then a MalformedPDFError exception will raise. Useful when
120
+ # expecting an Array and no other type will do.
121
+ #
122
+ # Some effort to cast array elements to a number is made for any non-numeric elements.
123
+ def deref_array_of_numbers(key)
124
+ arr = deref(key)
125
+
126
+ return arr if arr.nil?
127
+
128
+ raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)
129
+
130
+ arr.map { |item|
131
+ if item.is_a?(Numeric)
132
+ item
133
+ elsif item.respond_to?(:to_f)
134
+ item.to_f
135
+ elsif item.respond_to?(:to_i)
136
+ item.to_i
137
+ else
138
+ raise MalformedPDFError, "expected object to be a number"
139
+ end
140
+ }
141
+ end
142
+
143
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
144
+ # object in the PDF and return it. Otherwise return key untouched.
145
+ #
146
+ # Guaranteed to only return a Hash or nil. If the dereference results in
147
+ # any other type then a MalformedPDFError exception will raise. Useful when
148
+ # expecting a Hash and no other type will do.
149
+ def deref_hash(key)
150
+ obj = deref(key)
151
+
152
+ return obj if obj.nil?
153
+
154
+ obj.tap { |obj|
155
+ raise MalformedPDFError, "expected object to be a Hash or nil" if !obj.is_a?(Hash)
156
+ }
157
+ end
158
+
159
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
160
+ # object in the PDF and return it. Otherwise return key untouched.
161
+ #
162
+ # Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
163
+ # any other type then a MalformedPDFError exception will raise. Useful when
164
+ # expecting a Name and no other type will do.
165
+ #
166
+ # Some effort to cast to a symbol is made when the reference points to a non-symbol.
167
+ def deref_name(key)
168
+ obj = deref(key)
169
+
170
+ return obj if obj.nil?
171
+
172
+ if !obj.is_a?(Symbol)
173
+ if obj.respond_to?(:to_sym)
174
+ obj = obj.to_sym
175
+ else
176
+ raise MalformedPDFError, "expected object to be a Name"
177
+ end
178
+ end
179
+
180
+ obj
181
+ end
182
+
183
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
184
+ # object in the PDF and return it. Otherwise return key untouched.
185
+ #
186
+ # Guaranteed to only return an Integer or nil. If the dereference results in
187
+ # any other type then a MalformedPDFError exception will raise. Useful when
188
+ # expecting an Integer and no other type will do.
189
+ #
190
+ # Some effort to cast to an int is made when the reference points to a non-integer.
191
+ def deref_integer(key)
192
+ obj = deref(key)
193
+
194
+ return obj if obj.nil?
195
+
196
+ if !obj.is_a?(Integer)
197
+ if obj.respond_to?(:to_i)
198
+ obj = obj.to_i
199
+ else
200
+ raise MalformedPDFError, "expected object to be an Integer"
201
+ end
202
+ end
203
+
204
+ obj
205
+ end
206
+
207
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
208
+ # object in the PDF and return it. Otherwise return key untouched.
209
+ #
210
+ # Guaranteed to only return a Numeric or nil. If the dereference results in
211
+ # any other type then a MalformedPDFError exception will raise. Useful when
212
+ # expecting an Array and no other type will do.
213
+ #
214
+ # Some effort to cast to a number is made when the reference points to a non-number.
215
+ def deref_number(key)
216
+ obj = deref(key)
217
+
218
+ return obj if obj.nil?
219
+
220
+ if !obj.is_a?(Numeric)
221
+ if obj.respond_to?(:to_f)
222
+ obj = obj.to_f
223
+ elsif obj.respond_to?(:to_i)
224
+ obj.to_i
225
+ else
226
+ raise MalformedPDFError, "expected object to be a number"
227
+ end
228
+ end
229
+
230
+ obj
231
+ end
232
+
233
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
234
+ # object in the PDF and return it. Otherwise return key untouched.
235
+ #
236
+ # Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
237
+ # any other type then a MalformedPDFError exception will raise. Useful when
238
+ # expecting a stream and no other type will do.
239
+ def deref_stream(key)
240
+ obj = deref(key)
241
+
242
+ return obj if obj.nil?
243
+
244
+ obj.tap { |obj|
245
+ if !obj.is_a?(PDF::Reader::Stream)
246
+ raise MalformedPDFError, "expected object to be an Array or nil"
247
+ end
248
+ }
249
+ end
250
+
251
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
252
+ # object in the PDF and return it. Otherwise return key untouched.
253
+ #
254
+ # Guaranteed to only return a String or nil. If the dereference results in
255
+ # any other type then a MalformedPDFError exception will raise. Useful when
256
+ # expecting a string and no other type will do.
257
+ #
258
+ # Some effort to cast to a string is made when the reference points to a non-string.
259
+ def deref_string(key)
260
+ obj = deref(key)
261
+
262
+ return obj if obj.nil?
263
+
264
+ if !obj.is_a?(String)
265
+ if obj.respond_to?(:to_s)
266
+ obj = obj.to_s
267
+ else
268
+ raise MalformedPDFError, "expected object to be a string"
269
+ end
270
+ end
271
+
272
+ obj
273
+ end
274
+
275
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
276
+ # object in the PDF and return it. Otherwise return key untouched.
277
+ #
278
+ # Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
279
+ # any other type then a MalformedPDFError exception will raise. Useful when
280
+ # expecting a Name or Array and no other type will do.
281
+ def deref_name_or_array(key)
282
+ obj = deref(key)
283
+
284
+ return obj if obj.nil?
285
+
286
+ obj.tap { |obj|
287
+ if !obj.is_a?(Symbol) && !obj.is_a?(Array)
288
+ raise MalformedPDFError, "expected object to be an Array or Name"
289
+ end
290
+ }
291
+ end
292
+
293
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
294
+ # object in the PDF and return it. Otherwise return key untouched.
295
+ #
296
+ # Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
297
+ # any other type then a MalformedPDFError exception will raise. Useful when
298
+ # expecting a stream or Array and no other type will do.
299
+ def deref_stream_or_array(key)
300
+ obj = deref(key)
301
+
302
+ return obj if obj.nil?
303
+
304
+ obj.tap { |obj|
305
+ if !obj.is_a?(PDF::Reader::Stream) && !obj.is_a?(Array)
306
+ raise MalformedPDFError, "expected object to be an Array or Stream"
307
+ end
308
+ }
309
+ end
310
+
95
311
  # Recursively dereferences the object referred to by +key+. If +key+ is not
96
312
  # a PDF::Reader::Reference, the key is returned unchanged.
97
313
  #
@@ -99,6 +315,22 @@ class PDF::Reader
99
315
  deref_internal!(key, {})
100
316
  end
101
317
 
318
+ def deref_array!(key)
319
+ deref!(key).tap { |obj|
320
+ if !obj.nil? && !obj.is_a?(Array)
321
+ raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
322
+ end
323
+ }
324
+ end
325
+
326
+ def deref_hash!(key)
327
+ deref!(key).tap { |obj|
328
+ if !obj.nil? && !obj.is_a?(Hash)
329
+ raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
330
+ end
331
+ }
332
+ end
333
+
102
334
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
103
335
  # object.
104
336
  #
@@ -233,7 +465,10 @@ class PDF::Reader
233
465
  #
234
466
  def page_references
235
467
  root = fetch(trailer[:Root])
236
- @page_references ||= get_page_objects(root[:Pages]).flatten
468
+ @page_references ||= begin
469
+ pages_root = deref_hash(root[:Pages]) || {}
470
+ get_page_objects(pages_root)
471
+ end
237
472
  end
238
473
 
239
474
  def encrypted?
@@ -299,36 +534,6 @@ class PDF::Reader
299
534
  end
300
535
  end
301
536
 
302
- def build_security_handler(opts = {})
303
- encrypt = deref(trailer[:Encrypt])
304
- if NullSecurityHandler.supports?(encrypt)
305
- NullSecurityHandler.new
306
- elsif StandardSecurityHandler.supports?(encrypt)
307
- encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
308
- StandardSecurityHandler.new(
309
- key_length: (encrypt[:Length] || 40).to_i,
310
- revision: encrypt[:R],
311
- owner_key: encrypt[:O],
312
- user_key: encrypt[:U],
313
- permissions: encrypt[:P].to_i,
314
- encrypted_metadata: encmeta,
315
- file_id: (deref(trailer[:ID]) || []).first,
316
- password: opts[:password],
317
- cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
318
- )
319
- elsif StandardSecurityHandlerV5.supports?(encrypt)
320
- StandardSecurityHandlerV5.new(
321
- O: encrypt[:O],
322
- U: encrypt[:U],
323
- OE: encrypt[:OE],
324
- UE: encrypt[:UE],
325
- password: opts[:password]
326
- )
327
- else
328
- UnimplementedSecurityHandler.new
329
- end
330
- end
331
-
332
537
  def decrypt(ref, obj)
333
538
  case obj
334
539
  when PDF::Reader::Stream then
@@ -362,19 +567,21 @@ class PDF::Reader
362
567
  @object_stream ||= {}
363
568
  end
364
569
 
365
- # returns a nested array of object references for all pages in this object store.
570
+ # returns an array of object references for all pages in this object store. The ordering of
571
+ # the Array is significant and matches the page ordering of the document
366
572
  #
367
- def get_page_objects(ref)
368
- obj = deref(ref)
369
-
370
- unless obj.kind_of?(::Hash)
371
- raise MalformedPDFError, "Dereferenced page object must be a dict"
372
- end
373
-
374
- if obj[:Type] == :Page
375
- ref
376
- elsif obj[:Kids]
377
- deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
573
+ def get_page_objects(obj)
574
+ derefed_obj = deref_hash(obj)
575
+
576
+ if derefed_obj[:Type] == :Page
577
+ [obj]
578
+ elsif derefed_obj[:Kids]
579
+ kids = deref_array(derefed_obj[:Kids]) || []
580
+ kids.map { |kid|
581
+ get_page_objects(kid)
582
+ }.flatten
583
+ else
584
+ raise MalformedPDFError, "Expected Page or Pages object"
378
585
  end
379
586
  end
380
587
 
@@ -14,7 +14,7 @@ module PDF
14
14
  # objects accessor to help walk the page dictionary in any useful way.
15
15
  #
16
16
  class Page
17
- include ResourceMethods
17
+ extend Forwardable
18
18
 
19
19
  # lowlevel hash-like access to all objects in the underlying PDF
20
20
  attr_reader :objects
@@ -27,6 +27,15 @@ module PDF
27
27
  # operations
28
28
  attr_reader :cache
29
29
 
30
+ def_delegators :resources, :color_spaces
31
+ def_delegators :resources, :fonts
32
+ def_delegators :resources, :graphic_states
33
+ def_delegators :resources, :patterns
34
+ def_delegators :resources, :procedure_sets
35
+ def_delegators :resources, :properties
36
+ def_delegators :resources, :shadings
37
+ def_delegators :resources, :xobjects
38
+
30
39
  # creates a new page wrapper.
31
40
  #
32
41
  # * objects - an ObjectHash instance that wraps a PDF file
@@ -34,7 +43,7 @@ module PDF
34
43
  #
35
44
  def initialize(objects, pagenum, options = {})
36
45
  @objects, @pagenum = objects, pagenum
37
- @page_object = objects.deref(objects.page_references[pagenum - 1])
46
+ @page_object = objects.deref_hash(objects.page_references[pagenum - 1])
38
47
  @cache = options[:cache] || {}
39
48
 
40
49
  unless @page_object.is_a?(::Hash)
@@ -60,7 +69,7 @@ module PDF
60
69
  def attributes
61
70
  @attributes ||= {}.tap { |hash|
62
71
  page_with_ancestors.reverse.each do |obj|
63
- hash.merge!(@objects.deref(obj))
72
+ hash.merge!(@objects.deref_hash(obj) || {})
64
73
  end
65
74
  }
66
75
  # This shouldn't be necessary, but some non compliant PDFs leave MediaBox
@@ -101,13 +110,24 @@ module PDF
101
110
  # returns the plain text content of this page encoded as UTF-8. Any
102
111
  # characters that can't be translated will be returned as a ▯
103
112
  #
104
- def text
113
+ def text(opts = {})
105
114
  receiver = PageTextReceiver.new
106
115
  walk(receiver)
107
- receiver.content
116
+ runs = receiver.runs(opts)
117
+
118
+ # rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
119
+ mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
120
+
121
+ PageLayout.new(runs, mediabox).to_s
108
122
  end
109
123
  alias :to_s :text
110
124
 
125
+ def runs(opts = {})
126
+ receiver = PageTextReceiver.new
127
+ walk(receiver)
128
+ receiver.runs(opts)
129
+ end
130
+
111
131
  # processes the raw content stream for this page in sequential order and
112
132
  # passes callbacks to the receiver objects.
113
133
  #
@@ -132,6 +152,9 @@ module PDF
132
152
  # the program in the correct order and calls out to your implementation.
133
153
  #
134
154
  def walk(*receivers)
155
+ receivers = receivers.map { |receiver|
156
+ ValidatingReceiver.new(receiver)
157
+ }
135
158
  callback(receivers, :page=, [self])
136
159
  content_stream(receivers, raw_content)
137
160
  end
@@ -140,10 +163,10 @@ module PDF
140
163
  # see here unless you're a PDF nerd like me.
141
164
  #
142
165
  def raw_content
143
- contents = objects.deref(@page_object[:Contents])
166
+ contents = objects.deref_stream_or_array(@page_object[:Contents])
144
167
  [contents].flatten.compact.map { |obj|
145
- objects.deref(obj)
146
- }.map { |obj|
168
+ objects.deref_stream(obj)
169
+ }.compact.map { |obj|
147
170
  obj.unfiltered_data
148
171
  }.join(" ")
149
172
  end
@@ -174,17 +197,22 @@ module PDF
174
197
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
175
198
  #
176
199
  def rectangles
177
- mediabox = objects.deref!(attributes[:MediaBox])
178
- cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
179
- bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
180
- trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
181
- artbox = objects.deref!(attributes[:ArtBox]) || cropbox
182
-
183
- mediarect = Rectangle.new(*mediabox)
184
- croprect = Rectangle.new(*cropbox)
185
- bleedrect = Rectangle.new(*bleedbox)
186
- trimrect = Rectangle.new(*trimbox)
187
- artrect = Rectangle.new(*artbox)
200
+ # attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
201
+ mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
202
+ cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
203
+ bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
204
+ trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
205
+ artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
206
+
207
+ begin
208
+ mediarect = Rectangle.from_array(mediabox)
209
+ croprect = Rectangle.from_array(cropbox)
210
+ bleedrect = Rectangle.from_array(bleedbox)
211
+ trimrect = Rectangle.from_array(trimbox)
212
+ artrect = Rectangle.from_array(artbox)
213
+ rescue ArgumentError => e
214
+ raise MalformedPDFError, e.message
215
+ end
188
216
 
189
217
  if rotate > 0
190
218
  mediarect.apply_rotation(rotate)
@@ -206,14 +234,14 @@ module PDF
206
234
  private
207
235
 
208
236
  def root
209
- @root ||= objects.deref(@objects.trailer[:Root])
237
+ @root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
210
238
  end
211
239
 
212
240
  # Returns the resources that accompany this page. Includes
213
241
  # resources inherited from parents.
214
242
  #
215
243
  def resources
216
- @resources ||= @objects.deref(attributes[:Resources]) || {}
244
+ @resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
217
245
  end
218
246
 
219
247
  def content_stream(receivers, instructions)
@@ -249,7 +277,8 @@ module PDF
249
277
  if origin.nil?
250
278
  []
251
279
  else
252
- obj = objects.deref(origin)
280
+ obj = objects.deref_hash(origin)
281
+ PDF::Reader::Error.validate_not_nil_as_malformed(obj, "parent")
253
282
  [ select_inheritable(obj) ] + ancestors(obj[:Parent])
254
283
  end
255
284
  end
@@ -21,10 +21,8 @@ class PDF::Reader
21
21
  # PDF::Reader::Rectangle at some point
22
22
  PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
23
 
24
- runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
25
- runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
- @mediabox = mediabox
27
- @runs = merge_runs(runs)
24
+ @mediabox = process_mediabox(mediabox)
25
+ @runs = runs
28
26
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
29
27
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
30
28
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
@@ -51,13 +49,11 @@ class PDF::Reader
51
49
  private
52
50
 
53
51
  def page_width
54
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
- (@mediabox[2].to_f - @mediabox[0].to_f).abs
52
+ @mediabox.width
56
53
  end
57
54
 
58
55
  def page_height
59
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
- (@mediabox[3].to_f - @mediabox[1].to_f).abs
56
+ @mediabox.height
61
57
  end
62
58
 
63
59
  # given an array of strings, return a new array with empty rows from the
@@ -109,30 +105,20 @@ class PDF::Reader
109
105
  end
110
106
  end
111
107
 
112
- # take a collection of TextRun objects and merge any that are in close
113
- # proximity
114
- def merge_runs(runs)
115
- runs.group_by { |char|
116
- char.y.to_i
117
- }.map { |y, chars|
118
- group_chars_into_runs(chars.sort)
119
- }.flatten.sort
108
+ def local_string_insert(haystack, needle, index)
109
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
120
110
  end
121
111
 
122
- def group_chars_into_runs(chars)
123
- chars.each_with_object([]) do |char, runs|
124
- if runs.empty?
125
- runs << char
126
- elsif runs.last.mergable?(char)
127
- runs[-1] = runs.last + char
128
- else
129
- runs << char
130
- end
112
+ def process_mediabox(mediabox)
113
+ if mediabox.is_a?(Array)
114
+ msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
115
+ " please use a Rectangle instead"
116
+ $stderr.puts msg
117
+ PDF::Reader::Rectangle.from_array(mediabox)
118
+ else
119
+ mediabox
131
120
  end
132
121
  end
133
122
 
134
- def local_string_insert(haystack, needle, index)
135
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
136
- end
137
123
  end
138
124
  end
@@ -384,7 +384,7 @@ class PDF::Reader
384
384
  #
385
385
  def build_fonts(raw_fonts)
386
386
  wrapped_fonts = raw_fonts.map { |label, font|
387
- [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
387
+ [label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
388
388
  }
389
389
 
390
390
  ::Hash[wrapped_fonts]
@@ -47,9 +47,32 @@ module PDF
47
47
  @characters = []
48
48
  end
49
49
 
50
+ def runs(opts = {})
51
+ runs = @characters
52
+
53
+ if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
54
+ runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
55
+ end
56
+
57
+ if opts.fetch(:skip_zero_width, true)
58
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
59
+ end
60
+
61
+ if opts.fetch(:skip_overlapping, true)
62
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
+ end
64
+
65
+ if opts.fetch(:merge, true)
66
+ runs = merge_runs(runs)
67
+ end
68
+
69
+ runs
70
+ end
71
+
72
+ # deprecated
50
73
  def content
51
- mediabox = @page.rectangles[:MediaBox].to_a
52
- PageLayout.new(@characters, mediabox).to_s
74
+ mediabox = @page.rectangles[:MediaBox]
75
+ PageLayout.new(runs, mediabox).to_s
53
76
  end
54
77
 
55
78
  #####################################################
@@ -64,8 +87,10 @@ module PDF
64
87
  params.each do |arg|
65
88
  if arg.is_a?(String)
66
89
  internal_show_text(arg)
67
- else
90
+ elsif arg.is_a?(Numeric)
68
91
  @state.process_glyph_displacement(0, arg, false)
92
+ else
93
+ # skip it
69
94
  end
70
95
  end
71
96
  end
@@ -96,6 +121,7 @@ module PDF
96
121
  private
97
122
 
98
123
  def internal_show_text(string)
124
+ PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
99
125
  if @state.current_font.nil?
100
126
  raise PDF::Reader::MalformedPDFError, "current font is invalid"
101
127
  end
@@ -109,7 +135,7 @@ module PDF
109
135
 
110
136
  # apply to glyph displacement for the current glyph so the next
111
137
  # glyph will appear in the correct position
112
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
138
+ glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
113
139
  th = 1
114
140
  scaled_glyph_width = glyph_width * @state.font_size * th
115
141
  unless utf8_chars == SPACE
@@ -119,12 +145,6 @@ module PDF
119
145
  end
120
146
  end
121
147
 
122
- # TODO: revist this. It rotates the co-ordinates to the right direction, but I don't
123
- # think it sets the correct x,y values. We get away with it because we don't
124
- # return the text with co-ordinates, only the full text arranged in a string.
125
- #
126
- # We should provide an API for extracting the text with positioning data and spec
127
- # that. I suspect the co-ords might be wrong for rotated pages
128
148
  def apply_rotation(x, y)
129
149
  if @page.rotate == 90
130
150
  tmp = x
@@ -141,6 +161,28 @@ module PDF
141
161
  return x, y
142
162
  end
143
163
 
164
+ # take a collection of TextRun objects and merge any that are in close
165
+ # proximity
166
+ def merge_runs(runs)
167
+ runs.group_by { |char|
168
+ char.y.to_i
169
+ }.map { |y, chars|
170
+ group_chars_into_runs(chars.sort)
171
+ }.flatten.sort
172
+ end
173
+
174
+ def group_chars_into_runs(chars)
175
+ chars.each_with_object([]) do |char, runs|
176
+ if runs.empty?
177
+ runs << char
178
+ elsif runs.last.mergable?(char)
179
+ runs[-1] = runs.last + char
180
+ else
181
+ runs << char
182
+ end
183
+ end
184
+ end
185
+
144
186
  end
145
187
  end
146
188
  end