pdf-reader 2.2.0 → 2.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,6 +1,9 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
5
+ require 'tempfile'
6
+
4
7
  class PDF::Reader
5
8
  # Provides low level access to the objects in a PDF file via a hash-like
6
9
  # object.
@@ -47,7 +50,11 @@ class PDF::Reader
47
50
  @trailer = @xref.trailer
48
51
  @cache = opts[:cache] || PDF::Reader::ObjectCache.new
49
52
  @sec_handler = NullSecurityHandler.new
50
- @sec_handler = build_security_handler(opts)
53
+ @sec_handler = SecurityHandlerFactory.build(
54
+ deref(trailer[:Encrypt]),
55
+ deref(trailer[:ID]),
56
+ opts[:password]
57
+ )
51
58
  end
52
59
 
53
60
  # returns the type of object a ref points to
@@ -78,16 +85,7 @@ class PDF::Reader
78
85
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
86
  end
80
87
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
88
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
89
  rescue InvalidObjectError
92
90
  return default
93
91
  end
@@ -100,6 +98,218 @@ class PDF::Reader
100
98
  end
101
99
  alias :deref :object
102
100
 
101
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
102
+ # object in the PDF and return it. Otherwise return key untouched.
103
+ #
104
+ # Guaranteed to only return an Array or nil. If the dereference results in
105
+ # any other type then a MalformedPDFError exception will raise. Useful when
106
+ # expecting an Array and no other type will do.
107
+ def deref_array(key)
108
+ obj = deref(key)
109
+
110
+ return obj if obj.nil?
111
+
112
+ obj.tap { |obj|
113
+ raise MalformedPDFError, "expected object to be an Array or nil" if !obj.is_a?(Array)
114
+ }
115
+ end
116
+
117
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
118
+ # object in the PDF and return it. Otherwise return key untouched.
119
+ #
120
+ # Guaranteed to only return an Array of Numerics or nil. If the dereference results in
121
+ # any other type then a MalformedPDFError exception will raise. Useful when
122
+ # expecting an Array and no other type will do.
123
+ #
124
+ # Some effort to cast array elements to a number is made for any non-numeric elements.
125
+ def deref_array_of_numbers(key)
126
+ arr = deref(key)
127
+
128
+ return arr if arr.nil?
129
+
130
+ raise MalformedPDFError, "expected object to be an Array" unless arr.is_a?(Array)
131
+
132
+ arr.map { |item|
133
+ if item.is_a?(Numeric)
134
+ item
135
+ elsif item.respond_to?(:to_f)
136
+ item.to_f
137
+ elsif item.respond_to?(:to_i)
138
+ item.to_i
139
+ else
140
+ raise MalformedPDFError, "expected object to be a number"
141
+ end
142
+ }
143
+ end
144
+
145
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
146
+ # object in the PDF and return it. Otherwise return key untouched.
147
+ #
148
+ # Guaranteed to only return a Hash or nil. If the dereference results in
149
+ # any other type then a MalformedPDFError exception will raise. Useful when
150
+ # expecting a Hash and no other type will do.
151
+ def deref_hash(key)
152
+ obj = deref(key)
153
+
154
+ return obj if obj.nil?
155
+
156
+ obj.tap { |obj|
157
+ raise MalformedPDFError, "expected object to be a Hash or nil" if !obj.is_a?(Hash)
158
+ }
159
+ end
160
+
161
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
162
+ # object in the PDF and return it. Otherwise return key untouched.
163
+ #
164
+ # Guaranteed to only return a PDF name (Symbol) or nil. If the dereference results in
165
+ # any other type then a MalformedPDFError exception will raise. Useful when
166
+ # expecting a Name and no other type will do.
167
+ #
168
+ # Some effort to cast to a symbol is made when the reference points to a non-symbol.
169
+ def deref_name(key)
170
+ obj = deref(key)
171
+
172
+ return obj if obj.nil?
173
+
174
+ if !obj.is_a?(Symbol)
175
+ if obj.respond_to?(:to_sym)
176
+ obj = obj.to_sym
177
+ else
178
+ raise MalformedPDFError, "expected object to be a Name"
179
+ end
180
+ end
181
+
182
+ obj
183
+ end
184
+
185
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
186
+ # object in the PDF and return it. Otherwise return key untouched.
187
+ #
188
+ # Guaranteed to only return an Integer or nil. If the dereference results in
189
+ # any other type then a MalformedPDFError exception will raise. Useful when
190
+ # expecting an Integer and no other type will do.
191
+ #
192
+ # Some effort to cast to an int is made when the reference points to a non-integer.
193
+ def deref_integer(key)
194
+ obj = deref(key)
195
+
196
+ return obj if obj.nil?
197
+
198
+ if !obj.is_a?(Integer)
199
+ if obj.respond_to?(:to_i)
200
+ obj = obj.to_i
201
+ else
202
+ raise MalformedPDFError, "expected object to be an Integer"
203
+ end
204
+ end
205
+
206
+ obj
207
+ end
208
+
209
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
210
+ # object in the PDF and return it. Otherwise return key untouched.
211
+ #
212
+ # Guaranteed to only return a Numeric or nil. If the dereference results in
213
+ # any other type then a MalformedPDFError exception will raise. Useful when
214
+ # expecting a number and no other type will do.
215
+ #
216
+ # Some effort to cast to a number is made when the reference points to a non-number.
217
+ def deref_number(key)
218
+ obj = deref(key)
219
+
220
+ return obj if obj.nil?
221
+
222
+ if !obj.is_a?(Numeric)
223
+ if obj.respond_to?(:to_f)
224
+ obj = obj.to_f
225
+ elsif obj.respond_to?(:to_i)
226
+ obj.to_i
227
+ else
228
+ raise MalformedPDFError, "expected object to be a number"
229
+ end
230
+ end
231
+
232
+ obj
233
+ end
234
+
235
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
236
+ # object in the PDF and return it. Otherwise return key untouched.
237
+ #
238
+ # Guaranteed to only return a PDF::Reader::Stream or nil. If the dereference results in
239
+ # any other type then a MalformedPDFError exception will raise. Useful when
240
+ # expecting a stream and no other type will do.
241
+ def deref_stream(key)
242
+ obj = deref(key)
243
+
244
+ return obj if obj.nil?
245
+
246
+ obj.tap { |obj|
247
+ if !obj.is_a?(PDF::Reader::Stream)
248
+ raise MalformedPDFError, "expected object to be a Stream or nil"
249
+ end
250
+ }
251
+ end
252
+
253
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
254
+ # object in the PDF and return it. Otherwise return key untouched.
255
+ #
256
+ # Guaranteed to only return a String or nil. If the dereference results in
257
+ # any other type then a MalformedPDFError exception will raise. Useful when
258
+ # expecting a string and no other type will do.
259
+ #
260
+ # Some effort to cast to a string is made when the reference points to a non-string.
261
+ def deref_string(key)
262
+ obj = deref(key)
263
+
264
+ return obj if obj.nil?
265
+
266
+ if !obj.is_a?(String)
267
+ if obj.respond_to?(:to_s)
268
+ obj = obj.to_s
269
+ else
270
+ raise MalformedPDFError, "expected object to be a string"
271
+ end
272
+ end
273
+
274
+ obj
275
+ end
276
+
277
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
278
+ # object in the PDF and return it. Otherwise return key untouched.
279
+ #
280
+ # Guaranteed to only return a PDF Name (symbol), Array or nil. If the dereference results in
281
+ # any other type then a MalformedPDFError exception will raise. Useful when
282
+ # expecting a Name or Array and no other type will do.
283
+ def deref_name_or_array(key)
284
+ obj = deref(key)
285
+
286
+ return obj if obj.nil?
287
+
288
+ obj.tap { |obj|
289
+ if !obj.is_a?(Symbol) && !obj.is_a?(Array)
290
+ raise MalformedPDFError, "expected object to be an Array or Name"
291
+ end
292
+ }
293
+ end
294
+
295
+ # If key is a PDF::Reader::Reference object, lookup the corresponding
296
+ # object in the PDF and return it. Otherwise return key untouched.
297
+ #
298
+ # Guaranteed to only return a PDF::Reader::Stream, Array or nil. If the dereference results in
299
+ # any other type then a MalformedPDFError exception will raise. Useful when
300
+ # expecting a stream or Array and no other type will do.
301
+ def deref_stream_or_array(key)
302
+ obj = deref(key)
303
+
304
+ return obj if obj.nil?
305
+
306
+ obj.tap { |obj|
307
+ if !obj.is_a?(PDF::Reader::Stream) && !obj.is_a?(Array)
308
+ raise MalformedPDFError, "expected object to be an Array or Stream"
309
+ end
310
+ }
311
+ end
312
+
103
313
  # Recursively dereferences the object referred to by +key+. If +key+ is not
104
314
  # a PDF::Reader::Reference, the key is returned unchanged.
105
315
  #
@@ -107,6 +317,22 @@ class PDF::Reader
107
317
  deref_internal!(key, {})
108
318
  end
109
319
 
320
+ def deref_array!(key)
321
+ deref!(key).tap { |obj|
322
+ if !obj.nil? && !obj.is_a?(Array)
323
+ raise MalformedPDFError, "expected object (#{obj.inspect}) to be an Array or nil"
324
+ end
325
+ }
326
+ end
327
+
328
+ def deref_hash!(key)
329
+ deref!(key).tap { |obj|
330
+ if !obj.nil? && !obj.is_a?(Hash)
331
+ raise MalformedPDFError, "expected object (#{obj.inspect}) to be a Hash or nil"
332
+ end
333
+ }
334
+ end
335
+
110
336
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
111
337
  # object.
112
338
  #
@@ -241,7 +467,10 @@ class PDF::Reader
241
467
  #
242
468
  def page_references
243
469
  root = fetch(trailer[:Root])
244
- @page_references ||= get_page_objects(root[:Pages]).flatten
470
+ @page_references ||= begin
471
+ pages_root = deref_hash(root[:Pages]) || {}
472
+ get_page_objects(pages_root)
473
+ end
245
474
  end
246
475
 
247
476
  def encrypted?
@@ -254,6 +483,28 @@ class PDF::Reader
254
483
 
255
484
  private
256
485
 
486
+ # parse a traditional object from the PDF, starting from the byte offset indicated
487
+ # in the xref table
488
+ #
489
+ def fetch_object(key)
490
+ if xref[key].is_a?(Integer)
491
+ buf = new_buffer(xref[key])
492
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
493
+ end
494
+ end
495
+
496
+ # parse an object that's embedded in an object stream in the PDF
497
+ #
498
+ def fetch_object_stream(key)
499
+ if xref[key].is_a?(PDF::Reader::Reference)
500
+ container_key = xref[key]
501
+ stream = deref_stream(container_key)
502
+ raise MalformedPDFError, "Object Stream cannot be nil" if stream.nil?
503
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(stream)
504
+ object_streams[container_key][key.id]
505
+ end
506
+ end
507
+
257
508
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
509
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
510
  # doesn't need to be part of the public API.
@@ -287,44 +538,18 @@ class PDF::Reader
287
538
  end
288
539
  end
289
540
 
290
- def build_security_handler(opts = {})
291
- encrypt = deref(trailer[:Encrypt])
292
- if NullSecurityHandler.supports?(encrypt)
293
- NullSecurityHandler.new
294
- elsif StandardSecurityHandler.supports?(encrypt)
295
- encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
296
- StandardSecurityHandler.new(
297
- key_length: (encrypt[:Length] || 40).to_i,
298
- revision: encrypt[:R],
299
- owner_key: encrypt[:O],
300
- user_key: encrypt[:U],
301
- permissions: encrypt[:P].to_i,
302
- encrypted_metadata: encmeta,
303
- file_id: (deref(trailer[:ID]) || []).first,
304
- password: opts[:password],
305
- cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
306
- )
307
- elsif StandardSecurityHandlerV5.supports?(encrypt)
308
- StandardSecurityHandlerV5.new(
309
- O: encrypt[:O],
310
- U: encrypt[:U],
311
- OE: encrypt[:OE],
312
- UE: encrypt[:UE],
313
- password: opts[:password]
314
- )
315
- else
316
- UnimplementedSecurityHandler.new
317
- end
318
- end
319
-
320
541
  def decrypt(ref, obj)
321
542
  case obj
322
543
  when PDF::Reader::Stream then
323
- obj.data = sec_handler.decrypt(obj.data, ref)
544
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
545
+ # Therefore we shouldn't try to decrypt it.
546
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
324
547
  obj
325
548
  when Hash then
326
- arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
327
- Hash[*arr]
549
+ arr = obj.map { |key,val| [key, decrypt(ref, val)] }
550
+ arr.each_with_object({}) { |(k,v), accum|
551
+ accum[k] = v
552
+ }
328
553
  when Array then
329
554
  obj.collect { |item| decrypt(ref, item) }
330
555
  when String
@@ -343,39 +568,43 @@ class PDF::Reader
343
568
  end
344
569
 
345
570
  def object_streams
346
- @object_stream ||= {}
571
+ @object_streams ||= {}
347
572
  end
348
573
 
349
- # returns a nested array of object references for all pages in this object store.
574
+ # returns an array of object references for all pages in this object store. The ordering of
575
+ # the Array is significant and matches the page ordering of the document
350
576
  #
351
- def get_page_objects(ref)
352
- obj = deref(ref)
353
-
354
- unless obj.kind_of?(::Hash)
355
- raise MalformedPDFError, "Dereferenced page object must be a dict"
356
- end
357
-
358
- if obj[:Type] == :Page
359
- ref
360
- elsif obj[:Kids]
361
- deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
577
+ def get_page_objects(obj)
578
+ derefed_obj = deref_hash(obj)
579
+
580
+ if derefed_obj.nil?
581
+ raise MalformedPDFError, "Expected Page or Pages object, got nil"
582
+ elsif derefed_obj[:Type] == :Page
583
+ [obj]
584
+ elsif derefed_obj[:Kids]
585
+ kids = deref_array(derefed_obj[:Kids]) || []
586
+ kids.map { |kid|
587
+ get_page_objects(kid)
588
+ }.flatten
589
+ else
590
+ raise MalformedPDFError, "Expected Page or Pages object"
362
591
  end
363
592
  end
364
593
 
365
594
  def read_version
366
595
  @io.seek(0)
367
- _m, version = *@io.read(10).match(/PDF-(\d.\d)/)
596
+ _m, version = *@io.read(10).to_s.match(/PDF-(\d.\d)/)
368
597
  @io.seek(0)
369
598
  version.to_f
370
599
  end
371
600
 
372
601
  def extract_io_from(input)
373
- if input.respond_to?(:seek) && input.respond_to?(:read)
602
+ if input.is_a?(IO) || input.is_a?(StringIO) || input.is_a?(Tempfile)
374
603
  input
375
604
  elsif File.file?(input.to_s)
376
- StringIO.new read_as_binary(input)
605
+ StringIO.new read_as_binary(input.to_s)
377
606
  else
378
- raise ArgumentError, "input must be an IO-like object or a filename"
607
+ raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
379
608
  end
380
609
  end
381
610
 
@@ -383,7 +612,7 @@ class PDF::Reader
383
612
  if File.respond_to?(:binread)
384
613
  File.binread(input.to_s)
385
614
  else
386
- File.open(input.to_s,"rb") { |f| f.read }
615
+ File.open(input.to_s,"rb") { |f| f.read } || ""
387
616
  end
388
617
  end
389
618
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -23,7 +24,7 @@ class PDF::Reader
23
24
  end
24
25
 
25
26
  def size
26
- @dict[:N]
27
+ TypeCheck.cast_to_int!(@dict[:N])
27
28
  end
28
29
 
29
30
  private
@@ -39,7 +40,7 @@ class PDF::Reader
39
40
  end
40
41
 
41
42
  def first
42
- @dict[:First]
43
+ TypeCheck.cast_to_int!(@dict[:First])
43
44
  end
44
45
 
45
46
  def buffer
@@ -0,0 +1,72 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ # typed: strict
4
+
5
+ class PDF::Reader
6
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
7
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
8
+ class OverlappingRunsFilter
9
+
10
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
11
+ # have identical characters) then one will be discarded
12
+ OVERLAPPING_THRESHOLD = 0.5
13
+
14
+ def self.exclude_redundant_runs(runs)
15
+ sweep_line_status = Array.new
16
+ event_point_schedule = Array.new
17
+ to_exclude = []
18
+
19
+ runs.each do |run|
20
+ event_point_schedule << EventPoint.new(run.x, run)
21
+ event_point_schedule << EventPoint.new(run.endx, run)
22
+ end
23
+
24
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
25
+
26
+ event_point_schedule.each do |event_point|
27
+ run = event_point.run
28
+
29
+ if event_point.start?
30
+ if detect_intersection(sweep_line_status, event_point)
31
+ to_exclude << run
32
+ end
33
+ sweep_line_status.push(run)
34
+ else
35
+ sweep_line_status.delete(run)
36
+ end
37
+ end
38
+ runs - to_exclude
39
+ end
40
+
41
+ def self.detect_intersection(sweep_line_status, event_point)
42
+ sweep_line_status.each do |open_text_run|
43
+ if open_text_run.text == event_point.run.text &&
44
+ event_point.x >= open_text_run.x &&
45
+ event_point.x <= open_text_run.endx &&
46
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
47
+ return true
48
+ end
49
+ end
50
+ return false
51
+ end
52
+ end
53
+
54
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
55
+ # looking for duplicates
56
+ class EventPoint
57
+
58
+ attr_reader :x
59
+
60
+ attr_reader :run
61
+
62
+ def initialize(x, run)
63
+ @x = x
64
+ @run = run
65
+ end
66
+
67
+ def start?
68
+ @x == @run.x
69
+ end
70
+ end
71
+
72
+ end