pdf-reader 2.2.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,6 +1,9 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
5
+ require 'tempfile'
6
+
4
7
  class PDF::Reader
5
8
  # Provides low level access to the objects in a PDF file via a hash-like
6
9
  # object.
@@ -47,7 +50,11 @@ class PDF::Reader
47
50
  @trailer = @xref.trailer
48
51
  @cache = opts[:cache] || PDF::Reader::ObjectCache.new
49
52
  @sec_handler = NullSecurityHandler.new
50
- @sec_handler = build_security_handler(opts)
53
+ @sec_handler = SecurityHandlerFactory.build(
54
+ deref(trailer[:Encrypt]),
55
+ deref(trailer[:ID]),
56
+ opts[:password]
57
+ )
51
58
  end
52
59
 
53
60
  # returns the type of object a ref points to
@@ -78,16 +85,7 @@ class PDF::Reader
78
85
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
86
  end
80
87
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
88
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
89
  rescue InvalidObjectError
92
90
  return default
93
91
  end
@@ -100,6 +98,218 @@ class PDF::Reader
100
98
  end
101
99
  alias :deref :object
102
100
 
101
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return an Array or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting an Array and no other type will do.
def deref_array(key)
  obj = deref(key)

  # nil is an acceptable result, it means the object was absent
  return obj if obj.nil? || obj.is_a?(Array)

  raise MalformedPDFError, "expected object to be an Array or nil"
end
116
+
117
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Returns nil when the dereferenced value is nil, otherwise a new Array in
# which every Numeric element is kept as-is and every other element is
# converted with to_f (preferred) or to_i. A MalformedPDFError is raised when
# the dereferenced value is not an Array, or when an element offers neither
# conversion.
def deref_array_of_numbers(key)
  list = deref(key)
  return list if list.nil?

  unless list.is_a?(Array)
    raise MalformedPDFError, "expected object to be an Array"
  end

  list.map do |element|
    next element if element.is_a?(Numeric)
    next element.to_f if element.respond_to?(:to_f)
    next element.to_i if element.respond_to?(:to_i)

    raise MalformedPDFError, "expected object to be a number"
  end
end
144
+
145
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a Hash or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a Hash and no other type will do.
def deref_hash(key)
  obj = deref(key)

  # nil is an acceptable result, it means the object was absent
  return obj if obj.nil? || obj.is_a?(Hash)

  raise MalformedPDFError, "expected object to be a Hash or nil"
end
160
+
161
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Returns a PDF name (Symbol) or nil. A value that is not already a Symbol is
# converted with to_sym when it supports that; otherwise a MalformedPDFError
# exception will raise. Useful when expecting a Name and no other type will do.
def deref_name(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(Symbol)

  unless obj.respond_to?(:to_sym)
    raise MalformedPDFError, "expected object to be a Name"
  end

  # best-effort cast, e.g. a String that should have been written as a Name
  obj.to_sym
end
184
+
185
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Returns an Integer or nil. A value that is not already an Integer is
# converted with to_i when it supports that; otherwise a MalformedPDFError
# exception will raise. Useful when expecting an Integer and no other type
# will do.
def deref_integer(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(Integer)

  unless obj.respond_to?(:to_i)
    raise MalformedPDFError, "expected object to be an Integer"
  end

  # best-effort cast, e.g. a Float or a numeric String
  obj.to_i
end
208
+
209
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Returns a number or nil. A value that is not already a Numeric is converted
# with to_f (preferred) or to_i when it supports one of them; otherwise a
# MalformedPDFError exception will raise. Useful when expecting a number and
# no other type will do.
def deref_number(key)
  obj = deref(key)

  return obj if obj.nil? || obj.is_a?(Numeric)

  if obj.respond_to?(:to_f)
    obj.to_f
  elsif obj.respond_to?(:to_i)
    # the converted value is the result of the method; evaluating to_i and
    # then returning the untouched object would leak a non-number to callers
    obj.to_i
  else
    raise MalformedPDFError, "expected object to be a number"
  end
end
234
+
235
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a stream and no other type will do.
def deref_stream(key)
  obj = deref(key)

  # nil is an acceptable result, it means the object was absent
  return obj if obj.nil? || obj.is_a?(PDF::Reader::Stream)

  raise MalformedPDFError, "expected object to be a Stream or nil"
end
252
+
253
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Returns a String or nil. A value that is not already a String is converted
# with to_s when it supports that; otherwise a MalformedPDFError exception
# will raise. Useful when expecting a string and no other type will do.
def deref_string(key)
  value = deref(key)
  return value if value.nil? || value.is_a?(String)

  unless value.respond_to?(:to_s)
    raise MalformedPDFError, "expected object to be a string"
  end

  value.to_s
end
276
+
277
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a Name or Array and no other type will do.
def deref_name_or_array(key)
  obj = deref(key)

  # nil is an acceptable result, it means the object was absent
  return obj if obj.nil? || obj.is_a?(Symbol) || obj.is_a?(Array)

  raise MalformedPDFError, "expected object to be an Array or Name"
end
294
+
295
# If key is a PDF::Reader::Reference object, lookup the corresponding
# object in the PDF and return it. Otherwise return key untouched.
#
# Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
# any other type then a MalformedPDFError exception will raise. Useful when
# expecting a stream or Array and no other type will do.
def deref_stream_or_array(key)
  obj = deref(key)

  # nil is an acceptable result, it means the object was absent
  return obj if obj.nil? || obj.is_a?(PDF::Reader::Stream) || obj.is_a?(Array)

  raise MalformedPDFError, "expected object to be an Array or Stream"
end
312
+
103
313
  # Recursively dereferences the object referred to by +key+. If +key+ is not
104
314
  # a PDF::Reader::Reference, the key is returned unchanged.
105
315
  #
@@ -107,6 +317,22 @@ class PDF::Reader
107
317
  deref_internal!(key, {})
108
318
  end
109
319
 
320
# Resolve +key+ with deref! and return the result, which must be an Array
# or nil. Any other type raises a MalformedPDFError whose message includes
# the inspected value.
def deref_array!(key)
  resolved = deref!(key)

  unless resolved.nil? || resolved.is_a?(Array)
    raise MalformedPDFError, "expected object (#{resolved.inspect}) to be an Array or nil"
  end

  resolved
end
327
+
328
# Resolve +key+ with deref! and return the result, which must be a Hash
# or nil. Any other type raises a MalformedPDFError whose message includes
# the inspected value.
def deref_hash!(key)
  resolved = deref!(key)

  unless resolved.nil? || resolved.is_a?(Hash)
    raise MalformedPDFError, "expected object (#{resolved.inspect}) to be a Hash or nil"
  end

  resolved
end
335
+
110
336
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
111
337
  # object.
112
338
  #
@@ -241,7 +467,10 @@ class PDF::Reader
241
467
  #
242
468
  def page_references
243
469
  root = fetch(trailer[:Root])
244
- @page_references ||= get_page_objects(root[:Pages]).flatten
470
+ @page_references ||= begin
471
+ pages_root = deref_hash(root[:Pages]) || {}
472
+ get_page_objects(pages_root)
473
+ end
245
474
  end
246
475
 
247
476
  def encrypted?
@@ -254,6 +483,28 @@ class PDF::Reader
254
483
 
255
484
  private
256
485
 
486
# Parse a traditional (non object-stream) object from the PDF. The xref entry
# for +key+ must be an Integer, which is passed to new_buffer as the position
# to start parsing from; the parsed object is run through decrypt before
# being returned.
#
# Returns nil when the xref entry for +key+ is not an Integer.
#
def fetch_object(key)
  return nil unless xref[key].is_a?(Integer)

  buffer = new_buffer(xref[key])
  parsed = Parser.new(buffer, self).object(key.id, key.gen)
  decrypt(key, parsed)
end
495
+
496
# Parse an object that's embedded in an object stream in the PDF. The xref
# entry for +key+ must be a PDF::Reader::Reference pointing at the container
# stream; the resulting ObjectStream is memoised in object_streams per
# container, and the object is looked up in it by key.id.
#
# Returns nil when the xref entry for +key+ is not a Reference. Raises
# MalformedPDFError when the container stream dereferences to nil.
#
def fetch_object_stream(key)
  return nil unless xref[key].is_a?(PDF::Reader::Reference)

  container_ref = xref[key]
  container = deref_stream(container_ref)
  raise MalformedPDFError, "Object Stream cannot be nil" if container.nil?

  object_streams[container_ref] ||= PDF::Reader::ObjectStream.new(container)
  object_streams[container_ref][key.id]
end
507
+
257
508
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
509
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
510
  # doesn't need to be part of the public API.
@@ -287,44 +538,18 @@ class PDF::Reader
287
538
  end
288
539
  end
289
540
 
290
- def build_security_handler(opts = {})
291
- encrypt = deref(trailer[:Encrypt])
292
- if NullSecurityHandler.supports?(encrypt)
293
- NullSecurityHandler.new
294
- elsif StandardSecurityHandler.supports?(encrypt)
295
- encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
296
- StandardSecurityHandler.new(
297
- key_length: (encrypt[:Length] || 40).to_i,
298
- revision: encrypt[:R],
299
- owner_key: encrypt[:O],
300
- user_key: encrypt[:U],
301
- permissions: encrypt[:P].to_i,
302
- encrypted_metadata: encmeta,
303
- file_id: (deref(trailer[:ID]) || []).first,
304
- password: opts[:password],
305
- cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
306
- )
307
- elsif StandardSecurityHandlerV5.supports?(encrypt)
308
- StandardSecurityHandlerV5.new(
309
- O: encrypt[:O],
310
- U: encrypt[:U],
311
- OE: encrypt[:OE],
312
- UE: encrypt[:UE],
313
- password: opts[:password]
314
- )
315
- else
316
- UnimplementedSecurityHandler.new
317
- end
318
- end
319
-
320
541
  def decrypt(ref, obj)
321
542
  case obj
322
543
  when PDF::Reader::Stream then
323
- obj.data = sec_handler.decrypt(obj.data, ref)
544
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
545
+ # Therefore we shouldn't try to decrypt it.
546
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
324
547
  obj
325
548
  when Hash then
326
- arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
327
- Hash[*arr]
549
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }
550
+ arr.each_with_object({}) { |(k,v), accum|
551
+ accum[k] = v
552
+ }
328
553
  when Array then
329
554
  obj.collect { |item| decrypt(ref, item) }
330
555
  when String
@@ -343,39 +568,43 @@ class PDF::Reader
343
568
  end
344
569
 
345
570
  def object_streams
346
- @object_stream ||= {}
571
+ @object_streams ||= {}
347
572
  end
348
573
 
349
- # returns a nested array of object references for all pages in this object store.
574
+ # returns an array of object references for all pages in this object store. The ordering of
575
+ # the Array is significant and matches the page ordering of the document
350
576
  #
351
- def get_page_objects(ref)
352
- obj = deref(ref)
353
-
354
- unless obj.kind_of?(::Hash)
355
- raise MalformedPDFError, "Dereferenced page object must be a dict"
356
- end
357
-
358
- if obj[:Type] == :Page
359
- ref
360
- elsif obj[:Kids]
361
- deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
577
+ def get_page_objects(obj)
578
+ derefed_obj = deref_hash(obj)
579
+
580
+ if derefed_obj.nil?
581
+ raise MalformedPDFError, "Expected Page or Pages object, got nil"
582
+ elsif derefed_obj[:Type] == :Page
583
+ [obj]
584
+ elsif derefed_obj[:Kids]
585
+ kids = deref_array(derefed_obj[:Kids]) || []
586
+ kids.map { |kid|
587
+ get_page_objects(kid)
588
+ }.flatten
589
+ else
590
+ raise MalformedPDFError, "Expected Page or Pages object"
362
591
  end
363
592
  end
364
593
 
365
594
  def read_version
366
595
  @io.seek(0)
367
- _m, version = *@io.read(10).match(/PDF-(\d.\d)/)
596
+ _m, version = *@io.read(10).to_s.match(/PDF-(\d.\d)/)
368
597
  @io.seek(0)
369
598
  version.to_f
370
599
  end
371
600
 
372
601
  def extract_io_from(input)
373
- if input.respond_to?(:seek) && input.respond_to?(:read)
602
+ if input.is_a?(IO) || input.is_a?(StringIO) || input.is_a?(Tempfile)
374
603
  input
375
604
  elsif File.file?(input.to_s)
376
- StringIO.new read_as_binary(input)
605
+ StringIO.new read_as_binary(input.to_s)
377
606
  else
378
- raise ArgumentError, "input must be an IO-like object or a filename"
607
+ raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
379
608
  end
380
609
  end
381
610
 
@@ -383,7 +612,7 @@ class PDF::Reader
383
612
  if File.respond_to?(:binread)
384
613
  File.binread(input.to_s)
385
614
  else
386
- File.open(input.to_s,"rb") { |f| f.read }
615
+ File.open(input.to_s,"rb") { |f| f.read } || ""
387
616
  end
388
617
  end
389
618
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -23,7 +24,7 @@ class PDF::Reader
23
24
  end
24
25
 
25
26
  def size
26
- @dict[:N]
27
+ TypeCheck.cast_to_int!(@dict[:N])
27
28
  end
28
29
 
29
30
  private
@@ -39,7 +40,7 @@ class PDF::Reader
39
40
  end
40
41
 
41
42
  def first
42
- @dict[:First]
43
+ TypeCheck.cast_to_int!(@dict[:First])
43
44
  end
44
45
 
45
46
  def buffer
@@ -0,0 +1,72 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ # typed: strict
4
+
5
class PDF::Reader
  # Removes duplicates from a collection of TextRun objects. This can be helpful when a PDF
  # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
  class OverlappingRunsFilter

    # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
    # have identical characters) then one will be discarded
    OVERLAPPING_THRESHOLD = 0.5

    # Sweep across the runs in order of x position. Each run contributes two events (its x
    # and its endx); a run whose start event intersects an already-open run with the same
    # text is marked redundant. Returns runs with the redundant ones removed.
    def self.exclude_redundant_runs(runs)
      open_runs = []
      redundant = []

      events = runs.flat_map { |run|
        [EventPoint.new(run.x, run), EventPoint.new(run.endx, run)]
      }
      events.sort! { |a, b| a.x <=> b.x }

      events.each do |event|
        current = event.run

        if event.start?
          redundant << current if detect_intersection(open_runs, event)
          open_runs.push(current)
        else
          open_runs.delete(current)
        end
      end

      runs - redundant
    end

    # Returns true when any run in sweep_line_status has the same text as the event's run,
    # spans the event's x position, and overlaps the event's run by at least
    # OVERLAPPING_THRESHOLD. Returns false otherwise.
    def self.detect_intersection(sweep_line_status, event_point)
      candidate = event_point.run
      position = event_point.x

      sweep_line_status.any? { |open_text_run|
        open_text_run.text == candidate.text &&
          position >= open_text_run.x &&
          position <= open_text_run.endx &&
          open_text_run.intersection_area_percent(candidate) >= OVERLAPPING_THRESHOLD
      }
    end
  end

  # Utility class used to avoid modifying the underlying TextRun objects while we're
  # looking for duplicates
  class EventPoint

    attr_reader :x, :run

    def initialize(x, run)
      @x = x
      @run = run
    end

    # An event is a start event when its position equals the x of its run
    def start?
      @x == @run.x
    end
  end

end