pdf-reader 1.1.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
 
@@ -17,11 +18,12 @@ module PDF
17
18
  #
18
19
  class LZW # :nodoc:
19
20
 
21
+ # Wraps an LZW encoded string
20
22
  class BitStream # :nodoc:
21
23
 
22
24
  def initialize(data, bits_in_chunk)
23
25
  @data = data
24
- @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
26
+ @data.force_encoding("BINARY")
25
27
  @bits_in_chunk = bits_in_chunk
26
28
  @current_pos = 0
27
29
  @bits_left_in_byte = 8
@@ -81,9 +83,10 @@ module PDF
81
83
  #
82
84
  def self.decode(data)
83
85
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
84
- result = ''
86
+ result = "".dup
85
87
  until (code = stream.read) == CODE_EOD
86
88
  if code == CODE_CLEAR_TABLE
89
+ stream.set_bits_in_chunk(9)
87
90
  string_table = StringTable.new
88
91
  code = stream.read
89
92
  break if code == CODE_EOD
@@ -114,11 +117,10 @@ module PDF
114
117
  result
115
118
  end
116
119
 
117
- private
118
-
119
120
  def self.create_new_string(string_table,some_code, other_code)
120
121
  string_table[some_code] + string_table[other_code][0].chr
121
122
  end
123
+ private_class_method :create_new_string
122
124
 
123
125
  end
124
126
  end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+
6
+ # A null object security handler. Used when a PDF is unencrypted.
7
+ class NullSecurityHandler
8
+
9
+ def self.supports?(encrypt)
10
+ encrypt.nil?
11
+ end
12
+
13
+ def decrypt(buf, _ref)
14
+ buf
15
+ end
16
+ end
17
+ end
@@ -1,10 +1,13 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'hashery/lru_hash'
2
5
 
3
6
  class PDF::Reader
4
7
 
5
8
  # A Hash-like object for caching commonly used objects from a PDF file.
6
9
  #
7
- # This is an internal class used by PDF::Reader::ObjectHash
10
+ # This is an internal class, no promises about a stable API.
8
11
  #
9
12
  class ObjectCache # nodoc
10
13
 
@@ -13,53 +16,67 @@ class PDF::Reader
13
16
  # avoid lots of repetitive (and expensive) tokenising
14
17
  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
15
18
 
16
- def initialize
19
+ attr_reader :hits, :misses
20
+
21
+ def initialize(lru_size = 1000)
17
22
  @objects = {}
23
+ @lru_cache = Hashery::LRUHash.new(lru_size.to_i)
24
+ @hits = 0
25
+ @misses = 0
18
26
  end
19
27
 
20
28
  def [](key)
21
- @objects[key]
29
+ update_stats(key)
30
+ @objects[key] || @lru_cache[key]
22
31
  end
23
32
 
24
33
  def []=(key, value)
25
- @objects[key] = value if cacheable?(value)
34
+ if cacheable?(value)
35
+ @objects[key] = value
36
+ else
37
+ @lru_cache[key] = value
38
+ end
26
39
  end
27
40
 
28
41
  def fetch(key, local_default = nil)
29
- @objects.fetch(key, local_default)
42
+ update_stats(key)
43
+ @objects[key] || @lru_cache.fetch(key, local_default)
30
44
  end
31
45
 
32
46
  def each(&block)
33
47
  @objects.each(&block)
48
+ @lru_cache.each(&block)
34
49
  end
35
50
  alias :each_pair :each
36
51
 
37
52
  def each_key(&block)
38
53
  @objects.each_key(&block)
54
+ @lru_cache.each_key(&block)
39
55
  end
40
56
 
41
57
  def each_value(&block)
42
58
  @objects.each_value(&block)
59
+ @lru_cache.each_value(&block)
43
60
  end
44
61
 
45
62
  def size
46
- @objects.size
63
+ @objects.size + @lru_cache.size
47
64
  end
48
65
  alias :length :size
49
66
 
50
67
  def empty?
51
- @objects.empty?
68
+ @objects.empty? && @lru_cache.empty?
52
69
  end
53
70
 
54
- def has_key?(key)
55
- @objects.has_key?(key)
71
+ def include?(key)
72
+ @objects.include?(key) || @lru_cache.include?(key)
56
73
  end
57
- alias :include? :has_key?
58
- alias :key? :has_key?
59
- alias :member? :has_key?
74
+ alias :has_key? :include?
75
+ alias :key? :include?
76
+ alias :member? :include?
60
77
 
61
78
  def has_value?(value)
62
- @objects.has_value?(value)
79
+ @objects.has_value?(value) || @lru_cache.has_value?(value)
63
80
  end
64
81
 
65
82
  def to_s
@@ -67,19 +84,26 @@ class PDF::Reader
67
84
  end
68
85
 
69
86
  def keys
70
- @objects.keys
87
+ @objects.keys + @lru_cache.keys
71
88
  end
72
89
 
73
90
  def values
74
- @objects.values
91
+ @objects.values + @lru_cache.values
75
92
  end
76
93
 
77
94
  private
78
95
 
96
+ def update_stats(key)
97
+ if has_key?(key)
98
+ @hits += 1
99
+ else
100
+ @misses += 1
101
+ end
102
+ end
103
+
79
104
  def cacheable?(obj)
80
105
  obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
81
106
  end
82
107
 
83
-
84
108
  end
85
109
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # Provides low level access to the objects in a PDF file via a hash-like
@@ -41,10 +42,11 @@ class PDF::Reader
41
42
  #
42
43
  def initialize(input, opts = {})
43
44
  @io = extract_io_from(input)
44
- @pdf_version = read_version
45
45
  @xref = PDF::Reader::XRef.new(@io)
46
+ @pdf_version = read_version
46
47
  @trailer = @xref.trailer
47
- @cache = PDF::Reader::ObjectCache.new
48
+ @cache = opts[:cache] || PDF::Reader::ObjectCache.new
49
+ @sec_handler = NullSecurityHandler.new
48
50
  @sec_handler = build_security_handler(opts)
49
51
  end
50
52
 
@@ -76,16 +78,7 @@ class PDF::Reader
76
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
77
79
  end
78
80
 
79
- if @cache.has_key?(key)
80
- @cache[key]
81
- elsif xref[key].is_a?(Fixnum)
82
- buf = new_buffer(xref[key])
83
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
84
- elsif xref[key].is_a?(PDF::Reader::Reference)
85
- container_key = xref[key]
86
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
87
- @cache[key] = object_streams[container_key][key.id]
88
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
89
82
  rescue InvalidObjectError
90
83
  return default
91
84
  end
@@ -102,21 +95,7 @@ class PDF::Reader
102
95
  # a PDF::Reader::Reference, the key is returned unchanged.
103
96
  #
104
97
  def deref!(key)
105
- case object = deref(key)
106
- when Hash
107
- {}.tap { |hash|
108
- object.each do |k, value|
109
- hash[k] = deref!(value)
110
- end
111
- }
112
- when PDF::Reader::Stream
113
- object.hash = deref!(object.hash)
114
- object
115
- when Array
116
- object.map { |value| deref!(value) }
117
- else
118
- object
119
- end
98
+ deref_internal!(key, {})
120
99
  end
121
100
 
122
101
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
@@ -266,24 +245,95 @@ class PDF::Reader
266
245
 
267
246
  private
268
247
 
269
- def build_security_handler(opts = {})
270
- return nil if trailer[:Encrypt].nil?
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse an object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
268
+ # Private implementation of deref!, which exists to ensure the `seen` argument
269
+ # isn't publicly available. It's used to avoid endless loops in the recursion, and
270
+ # doesn't need to be part of the public API.
271
+ #
272
+ def deref_internal!(key, seen)
273
+ seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
274
+
275
+ return seen[seen_key] if seen.key?(seen_key)
276
+
277
+ case object = deref(key)
278
+ when Hash
279
+ seen[seen_key] ||= {}
280
+ object.each do |k, value|
281
+ seen[seen_key][k] = deref_internal!(value, seen)
282
+ end
283
+ seen[seen_key]
284
+ when PDF::Reader::Stream
285
+ seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
286
+ object.hash.each do |k,value|
287
+ seen[seen_key].hash[k] = deref_internal!(value, seen)
288
+ end
289
+ seen[seen_key]
290
+ when Array
291
+ seen[seen_key] ||= []
292
+ object.each do |value|
293
+ seen[seen_key] << deref_internal!(value, seen)
294
+ end
295
+ seen[seen_key]
296
+ else
297
+ object
298
+ end
299
+ end
271
300
 
272
- enc = deref(trailer[:Encrypt])
273
- case enc[:Filter]
274
- when :Standard
275
- StandardSecurityHandler.new(enc, deref(trailer[:ID]), opts[:password])
301
+ def build_security_handler(opts = {})
302
+ encrypt = deref(trailer[:Encrypt])
303
+ if NullSecurityHandler.supports?(encrypt)
304
+ NullSecurityHandler.new
305
+ elsif StandardSecurityHandler.supports?(encrypt)
306
+ encmeta = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
307
+ StandardSecurityHandler.new(
308
+ key_length: (encrypt[:Length] || 40).to_i,
309
+ revision: encrypt[:R],
310
+ owner_key: encrypt[:O],
311
+ user_key: encrypt[:U],
312
+ permissions: encrypt[:P].to_i,
313
+ encrypted_metadata: encmeta,
314
+ file_id: (deref(trailer[:ID]) || []).first,
315
+ password: opts[:password],
316
+ cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
317
+ )
318
+ elsif StandardSecurityHandlerV5.supports?(encrypt)
319
+ StandardSecurityHandlerV5.new(
320
+ O: encrypt[:O],
321
+ U: encrypt[:U],
322
+ OE: encrypt[:OE],
323
+ UE: encrypt[:UE],
324
+ password: opts[:password]
325
+ )
276
326
  else
277
- raise PDF::Reader::EncryptedPDFError, "Unsupported encryption method (#{enc[:Filter]})"
327
+ UnimplementedSecurityHandler.new
278
328
  end
279
329
  end
280
330
 
281
331
  def decrypt(ref, obj)
282
- return obj unless sec_handler?
283
-
284
332
  case obj
285
333
  when PDF::Reader::Stream then
286
- obj.data = sec_handler.decrypt(obj.data, ref)
334
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
335
+ # Therefore we shouldn't try to decrypt it.
336
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
287
337
  obj
288
338
  when Hash then
289
339
  arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
@@ -312,18 +362,22 @@ class PDF::Reader
312
362
  # returns a nested array of object references for all pages in this object store.
313
363
  #
314
364
  def get_page_objects(ref)
315
- obj = fetch(ref)
365
+ obj = deref(ref)
366
+
367
+ unless obj.kind_of?(::Hash)
368
+ raise MalformedPDFError, "Dereferenced page object must be a dict"
369
+ end
316
370
 
317
371
  if obj[:Type] == :Page
318
372
  ref
319
- elsif obj[:Type] == :Pages
373
+ elsif obj[:Kids]
320
374
  deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
321
375
  end
322
376
  end
323
377
 
324
378
  def read_version
325
379
  @io.seek(0)
326
- m, version = *@io.read(10).match(/PDF-(\d.\d)/)
380
+ _m, version = *@io.read(10).match(/PDF-(\d.\d)/)
327
381
  @io.seek(0)
328
382
  version.to_f
329
383
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -0,0 +1,34 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ # Small util class for detecting the orientation of a single PDF page. Accounts
6
+ # for any page rotation that is in place.
7
+ #
8
+ # OrientationDetector.new(:MediaBox => [0,0,612,792]).orientation
9
+ # => "portrait"
10
+ #
11
+ class OrientationDetector
12
+ def initialize(attributes)
13
+ @attributes = attributes
14
+ end
15
+
16
+ def orientation
17
+ @orientation ||= detect_orientation
18
+ end
19
+
20
+ private
21
+
22
+ def detect_orientation
23
+ llx,lly,urx,ury = @attributes[:MediaBox]
24
+ rotation = @attributes[:Rotate].to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
27
+ if width > height
28
+ (rotation % 180).zero? ? 'landscape' : 'portrait'
29
+ else
30
+ (rotation % 180).zero? ? 'portrait' : 'landscape'
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
26
+
27
+ if event_point.start?
28
+ if detect_intersection(sweep_line_status, event_point)
29
+ to_exclude << run
30
+ end
31
+ sweep_line_status.push(run)
32
+ else
33
+ sweep_line_status.delete(run)
34
+ end
35
+ end
36
+ runs - to_exclude
37
+ end
38
+
39
+ def self.detect_intersection(sweep_line_status, event_point)
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
+ return true
45
+ end
46
+ end
47
+ return false
48
+ end
49
+ end
50
+
51
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
52
+ # looking for duplicates
53
+ class EventPoint
54
+ attr_reader :x, :run
55
+
56
+ def initialize x, run
57
+ @x, @run = x, run
58
+ end
59
+
60
+ def start?
61
+ @x == @run.x
62
+ end
63
+ end
64
+
65
+ end