pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
 
@@ -17,11 +18,12 @@ module PDF
17
18
  #
18
19
  class LZW # :nodoc:
19
20
 
21
+ # Wraps an LZW encoded string
20
22
  class BitStream # :nodoc:
21
23
 
22
24
  def initialize(data, bits_in_chunk)
23
25
  @data = data
24
- @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
26
+ @data.force_encoding("BINARY")
25
27
  @bits_in_chunk = bits_in_chunk
26
28
  @current_pos = 0
27
29
  @bits_left_in_byte = 8
@@ -81,9 +83,10 @@ module PDF
81
83
  #
82
84
  def self.decode(data)
83
85
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
84
- result = ''
86
+ result = "".dup
85
87
  until (code = stream.read) == CODE_EOD
86
88
  if code == CODE_CLEAR_TABLE
89
+ stream.set_bits_in_chunk(9)
87
90
  string_table = StringTable.new
88
91
  code = stream.read
89
92
  break if code == CODE_EOD
@@ -114,11 +117,10 @@ module PDF
114
117
  result
115
118
  end
116
119
 
117
- private
118
-
119
120
  def self.create_new_string(string_table,some_code, other_code)
120
121
  string_table[some_code] + string_table[other_code][0].chr
121
122
  end
123
+ private_class_method :create_new_string
122
124
 
123
125
  end
124
126
  end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+
6
+ # A null object security handler. Used when a PDF is unencrypted.
7
+ class NullSecurityHandler
8
+
9
+ def self.supports?(encrypt)
10
+ encrypt.nil?
11
+ end
12
+
13
+ def decrypt(buf, _ref)
14
+ buf
15
+ end
16
+ end
17
+ end
@@ -1,10 +1,13 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'hashery/lru_hash'
2
5
 
3
6
  class PDF::Reader
4
7
 
5
8
  # A Hash-like object for caching commonly used objects from a PDF file.
6
9
  #
7
- # This is an internal class used by PDF::Reader::ObjectHash
10
+ # This is an internal class, no promises about a stable API.
8
11
  #
9
12
  class ObjectCache # nodoc
10
13
 
@@ -13,53 +16,67 @@ class PDF::Reader
13
16
  # avoid lots of repetitive (and expensive) tokenising
14
17
  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
15
18
 
16
- def initialize
19
+ attr_reader :hits, :misses
20
+
21
+ def initialize(lru_size = 1000)
17
22
  @objects = {}
23
+ @lru_cache = Hashery::LRUHash.new(lru_size.to_i)
24
+ @hits = 0
25
+ @misses = 0
18
26
  end
19
27
 
20
28
  def [](key)
21
- @objects[key]
29
+ update_stats(key)
30
+ @objects[key] || @lru_cache[key]
22
31
  end
23
32
 
24
33
  def []=(key, value)
25
- @objects[key] = value if cacheable?(value)
34
+ if cacheable?(value)
35
+ @objects[key] = value
36
+ else
37
+ @lru_cache[key] = value
38
+ end
26
39
  end
27
40
 
28
41
  def fetch(key, local_default = nil)
29
- @objects.fetch(key, local_default)
42
+ update_stats(key)
43
+ @objects[key] || @lru_cache.fetch(key, local_default)
30
44
  end
31
45
 
32
46
  def each(&block)
33
47
  @objects.each(&block)
48
+ @lru_cache.each(&block)
34
49
  end
35
50
  alias :each_pair :each
36
51
 
37
52
  def each_key(&block)
38
53
  @objects.each_key(&block)
54
+ @lru_cache.each_key(&block)
39
55
  end
40
56
 
41
57
  def each_value(&block)
42
58
  @objects.each_value(&block)
59
+ @lru_cache.each_value(&block)
43
60
  end
44
61
 
45
62
  def size
46
- @objects.size
63
+ @objects.size + @lru_cache.size
47
64
  end
48
65
  alias :length :size
49
66
 
50
67
  def empty?
51
- @objects.empty?
68
+ @objects.empty? && @lru_cache.empty?
52
69
  end
53
70
 
54
- def has_key?(key)
55
- @objects.has_key?(key)
71
+ def include?(key)
72
+ @objects.include?(key) || @lru_cache.include?(key)
56
73
  end
57
- alias :include? :has_key?
58
- alias :key? :has_key?
59
- alias :member? :has_key?
74
+ alias :has_key? :include?
75
+ alias :key? :include?
76
+ alias :member? :include?
60
77
 
61
78
  def has_value?(value)
62
- @objects.has_value?(value)
79
+ @objects.has_value?(value) || @lru_cache.has_value?(value)
63
80
  end
64
81
 
65
82
  def to_s
@@ -67,19 +84,26 @@ class PDF::Reader
67
84
  end
68
85
 
69
86
  def keys
70
- @objects.keys
87
+ @objects.keys + @lru_cache.keys
71
88
  end
72
89
 
73
90
  def values
74
- @objects.values
91
+ @objects.values + @lru_cache.values
75
92
  end
76
93
 
77
94
  private
78
95
 
96
+ def update_stats(key)
97
+ if has_key?(key)
98
+ @hits += 1
99
+ else
100
+ @misses += 1
101
+ end
102
+ end
103
+
79
104
  def cacheable?(obj)
80
105
  obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
81
106
  end
82
107
 
83
-
84
108
  end
85
109
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # Provides low level access to the objects in a PDF file via a hash-like
@@ -41,10 +42,11 @@ class PDF::Reader
41
42
  #
42
43
  def initialize(input, opts = {})
43
44
  @io = extract_io_from(input)
44
- @pdf_version = read_version
45
45
  @xref = PDF::Reader::XRef.new(@io)
46
+ @pdf_version = read_version
46
47
  @trailer = @xref.trailer
47
- @cache = PDF::Reader::ObjectCache.new
48
+ @cache = opts[:cache] || PDF::Reader::ObjectCache.new
49
+ @sec_handler = NullSecurityHandler.new
48
50
  @sec_handler = build_security_handler(opts)
49
51
  end
50
52
 
@@ -76,16 +78,7 @@ class PDF::Reader
76
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
77
79
  end
78
80
 
79
- if @cache.has_key?(key)
80
- @cache[key]
81
- elsif xref[key].is_a?(Fixnum)
82
- buf = new_buffer(xref[key])
83
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
84
- elsif xref[key].is_a?(PDF::Reader::Reference)
85
- container_key = xref[key]
86
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
87
- @cache[key] = object_streams[container_key][key.id]
88
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
89
82
  rescue InvalidObjectError
90
83
  return default
91
84
  end
@@ -102,21 +95,7 @@ class PDF::Reader
102
95
  # a PDF::Reader::Reference, the key is returned unchanged.
103
96
  #
104
97
  def deref!(key)
105
- case object = deref(key)
106
- when Hash
107
- {}.tap { |hash|
108
- object.each do |k, value|
109
- hash[k] = deref!(value)
110
- end
111
- }
112
- when PDF::Reader::Stream
113
- object.hash = deref!(object.hash)
114
- object
115
- when Array
116
- object.map { |value| deref!(value) }
117
- else
118
- object
119
- end
98
+ deref_internal!(key, {})
120
99
  end
121
100
 
122
101
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
@@ -266,24 +245,95 @@ class PDF::Reader
266
245
 
267
246
  private
268
247
 
269
- def build_security_handler(opts = {})
270
- return nil if trailer[:Encrypt].nil?
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse a object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
268
+ # Private implementation of deref!, which exists to ensure the `seen` argument
269
+ # isn't publicly available. It's used to avoid endless loops in the recursion, and
270
+ # doesn't need to be part of the public API.
271
+ #
272
+ def deref_internal!(key, seen)
273
+ seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
274
+
275
+ return seen[seen_key] if seen.key?(seen_key)
276
+
277
+ case object = deref(key)
278
+ when Hash
279
+ seen[seen_key] ||= {}
280
+ object.each do |k, value|
281
+ seen[seen_key][k] = deref_internal!(value, seen)
282
+ end
283
+ seen[seen_key]
284
+ when PDF::Reader::Stream
285
+ seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
286
+ object.hash.each do |k,value|
287
+ seen[seen_key].hash[k] = deref_internal!(value, seen)
288
+ end
289
+ seen[seen_key]
290
+ when Array
291
+ seen[seen_key] ||= []
292
+ object.each do |value|
293
+ seen[seen_key] << deref_internal!(value, seen)
294
+ end
295
+ seen[seen_key]
296
+ else
297
+ object
298
+ end
299
+ end
271
300
 
272
- enc = deref(trailer[:Encrypt])
273
- case enc[:Filter]
274
- when :Standard
275
- StandardSecurityHandler.new(enc, deref(trailer[:ID]), opts[:password])
301
+ def build_security_handler(opts = {})
302
+ encrypt = deref(trailer[:Encrypt])
303
+ if NullSecurityHandler.supports?(encrypt)
304
+ NullSecurityHandler.new
305
+ elsif StandardSecurityHandler.supports?(encrypt)
306
+ encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
307
+ StandardSecurityHandler.new(
308
+ key_length: (encrypt[:Length] || 40).to_i,
309
+ revision: encrypt[:R],
310
+ owner_key: encrypt[:O],
311
+ user_key: encrypt[:U],
312
+ permissions: encrypt[:P].to_i,
313
+ encrypted_metadata: encmeta,
314
+ file_id: (deref(trailer[:ID]) || []).first,
315
+ password: opts[:password],
316
+ cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
317
+ )
318
+ elsif StandardSecurityHandlerV5.supports?(encrypt)
319
+ StandardSecurityHandlerV5.new(
320
+ O: encrypt[:O],
321
+ U: encrypt[:U],
322
+ OE: encrypt[:OE],
323
+ UE: encrypt[:UE],
324
+ password: opts[:password]
325
+ )
276
326
  else
277
- raise PDF::Reader::EncryptedPDFError, "Unsupported encryption method (#{enc[:Filter]})"
327
+ UnimplementedSecurityHandler.new
278
328
  end
279
329
  end
280
330
 
281
331
  def decrypt(ref, obj)
282
- return obj unless sec_handler?
283
-
284
332
  case obj
285
333
  when PDF::Reader::Stream then
286
- obj.data = sec_handler.decrypt(obj.data, ref)
334
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
335
+ # Therefore we shouldn't try to decrypt it.
336
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
287
337
  obj
288
338
  when Hash then
289
339
  arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
@@ -312,18 +362,22 @@ class PDF::Reader
312
362
  # returns a nested array of object references for all pages in this object store.
313
363
  #
314
364
  def get_page_objects(ref)
315
- obj = fetch(ref)
365
+ obj = deref(ref)
366
+
367
+ unless obj.kind_of?(::Hash)
368
+ raise MalformedPDFError, "Dereferenced page object must be a dict"
369
+ end
316
370
 
317
371
  if obj[:Type] == :Page
318
372
  ref
319
- elsif obj[:Type] == :Pages
373
+ elsif obj[:Kids]
320
374
  deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
321
375
  end
322
376
  end
323
377
 
324
378
  def read_version
325
379
  @io.seek(0)
326
- m, version = *@io.read(10).match(/PDF-(\d.\d)/)
380
+ _m, version = *@io.read(10).match(/PDF-(\d.\d)/)
327
381
  @io.seek(0)
328
382
  version.to_f
329
383
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ # Small util class for detecting the orientation of a single PDF page. Accounts
6
+ # for any page rotation that is in place.
7
+ #
8
+ # OrientationDetector.new(:MediaBox => [0,0,612,792]).orientation
9
+ # => "portrait"
10
+ #
11
+ class OrientationDetector
12
+ def initialize(attributes)
13
+ @attributes = attributes
14
+ end
15
+
16
+ def orientation
17
+ @orientation ||= detect_orientation
18
+ end
19
+
20
+ private
21
+
22
+ def detect_orientation
23
+ llx,lly,urx,ury = @attributes[:MediaBox]
24
+ rotation = @attributes[:Rotate].to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
27
+ if width > height
28
+ (rotation % 180).zero? ? 'landscape' : 'portrait'
29
+ else
30
+ (rotation % 180).zero? ? 'portrait' : 'landscape'
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
26
+
27
+ if event_point.start?
28
+ if detect_intersection(sweep_line_status, event_point)
29
+ to_exclude << run
30
+ end
31
+ sweep_line_status.push(run)
32
+ else
33
+ sweep_line_status.delete(run)
34
+ end
35
+ end
36
+ runs - to_exclude
37
+ end
38
+
39
+ def self.detect_intersection(sweep_line_status, event_point)
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
+ return true
45
+ end
46
+ end
47
+ return false
48
+ end
49
+ end
50
+
51
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
52
+ # looking for duplicates
53
+ class EventPoint
54
+ attr_reader :x, :run
55
+
56
+ def initialize x, run
57
+ @x, @run = x, run
58
+ end
59
+
60
+ def start?
61
+ @x == @run.x
62
+ end
63
+ end
64
+
65
+ end