pdf-reader 2.1.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +28 -1
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf-reader.rb +1 -0
  7. data/lib/pdf/reader.rb +2 -2
  8. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  11. data/lib/pdf/reader/afm/Courier.afm +342 -342
  12. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  14. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  15. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  16. data/lib/pdf/reader/afm/MustRead.html +19 -0
  17. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  18. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  19. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  20. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  21. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  22. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  23. data/lib/pdf/reader/buffer.rb +12 -11
  24. data/lib/pdf/reader/cid_widths.rb +2 -0
  25. data/lib/pdf/reader/cmap.rb +22 -12
  26. data/lib/pdf/reader/encoding.rb +12 -9
  27. data/lib/pdf/reader/error.rb +1 -0
  28. data/lib/pdf/reader/filter.rb +1 -0
  29. data/lib/pdf/reader/filter/ascii85.rb +1 -0
  30. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  31. data/lib/pdf/reader/filter/depredict.rb +1 -0
  32. data/lib/pdf/reader/filter/flate.rb +6 -4
  33. data/lib/pdf/reader/filter/lzw.rb +2 -0
  34. data/lib/pdf/reader/filter/null.rb +2 -0
  35. data/lib/pdf/reader/filter/run_length.rb +3 -1
  36. data/lib/pdf/reader/font.rb +11 -2
  37. data/lib/pdf/reader/font_descriptor.rb +1 -0
  38. data/lib/pdf/reader/form_xobject.rb +1 -0
  39. data/lib/pdf/reader/glyph_hash.rb +1 -0
  40. data/lib/pdf/reader/lzw.rb +2 -1
  41. data/lib/pdf/reader/null_security_handler.rb +1 -0
  42. data/lib/pdf/reader/object_cache.rb +1 -0
  43. data/lib/pdf/reader/object_hash.rb +22 -10
  44. data/lib/pdf/reader/object_stream.rb +1 -0
  45. data/lib/pdf/reader/orientation_detector.rb +5 -4
  46. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  47. data/lib/pdf/reader/page.rb +29 -0
  48. data/lib/pdf/reader/page_layout.rb +10 -5
  49. data/lib/pdf/reader/page_state.rb +10 -1
  50. data/lib/pdf/reader/page_text_receiver.rb +5 -1
  51. data/lib/pdf/reader/pages_strategy.rb +1 -0
  52. data/lib/pdf/reader/parser.rb +5 -4
  53. data/lib/pdf/reader/print_receiver.rb +1 -0
  54. data/lib/pdf/reader/reference.rb +1 -0
  55. data/lib/pdf/reader/register_receiver.rb +1 -0
  56. data/lib/pdf/reader/resource_methods.rb +1 -0
  57. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  58. data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
  59. data/lib/pdf/reader/stream.rb +1 -0
  60. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  61. data/lib/pdf/reader/text_run.rb +25 -0
  62. data/lib/pdf/reader/token.rb +1 -0
  63. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  64. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  65. data/lib/pdf/reader/width_calculator.rb +1 -0
  66. data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
  67. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  68. data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
  69. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  70. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  71. data/lib/pdf/reader/xref.rb +11 -5
  72. metadata +17 -13
  73. data/lib/pdf/hash.rb +0 -19
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
 
4
6
  require 'forwardable'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -95,25 +96,34 @@ class PDF::Reader
95
96
  Parser.new(buffer)
96
97
  end
97
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
98
107
  def str_to_int(str)
99
108
  return nil if str.nil? || str.size == 0
100
- unpacked_string = if str.size == 1 # UTF-8
109
+ unpacked_string = if str.bytesize == 1 # UTF-8
101
110
  str.unpack("C*")
102
111
  else # UTF-16
103
112
  str.unpack("n*")
104
113
  end
105
- if unpacked_string.size == 1
106
- unpacked_string
107
- elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
108
- # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
109
- # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
110
- # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
111
- [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
112
- else
113
- # it is a bad idea to just return the first 16 bits, as this doesn't allow
114
- # for ligatures for example fi (U+0066 U+0069)
115
- unpacked_string
114
+ result = []
115
+ while unpacked_string.any? do
116
+ if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
117
+ # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
+ # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
119
+ # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
+ points = [unpacked_string.shift, unpacked_string.shift]
121
+ result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
122
+ else
123
+ result << unpacked_string.shift
124
+ end
116
125
  end
126
+ result
117
127
  end
118
128
 
119
129
  def process_bfchar_instructions(instructions)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -39,20 +40,22 @@ class PDF::Reader
39
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
40
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
41
42
 
42
- if enc.kind_of?(Hash)
43
- self.differences = enc[:Differences] if enc[:Differences]
44
- enc = enc[:Encoding] || enc[:BaseEncoding]
45
- elsif enc != nil
46
- enc = enc.to_sym
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
45
+ elsif enc && enc.respond_to?(:to_sym)
46
+ enc.to_sym
47
47
  else
48
- enc = nil
48
+ :StandardEncoding
49
49
  end
50
50
 
51
- @enc_name = enc
52
- @unpack = get_unpack(enc)
53
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
54
53
 
55
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
56
59
  end
57
60
 
58
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ascii85'
4
5
 
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader
4
6
  module Filter # :nodoc:
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module Filter # :nodoc:
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
 
4
5
  require 'zlib'
@@ -7,6 +8,8 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Flate (zlib) stream filter
9
10
  class Flate
11
+ ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
+
10
13
  def initialize(options = {})
11
14
  @options = options
12
15
  end
@@ -16,16 +19,15 @@ class PDF::Reader
16
19
  def filter(data)
17
20
  deflated = nil
18
21
  begin
19
- deflated = Zlib::Inflate.new.inflate(data)
22
+ deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
20
23
  rescue Zlib::DataError => e
21
24
  # by default, Ruby's Zlib assumes the data it's inflating
22
- # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
23
- # If that fails, then use an undocumented 'feature' to attempt to inflate
25
+ # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
26
+ # fails, then use a lightly-documented 'feature' to attempt to inflate
24
27
  # the data as a raw RFC1951 stream.
25
28
  #
26
29
  # See
27
30
  # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
28
- # - http://www.gzip.org/zlib/zlib_faq.html#faq38
29
31
  deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
30
32
  end
31
33
  Depredict.new(@options).filter(deflated)
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader
4
6
  module Filter # :nodoc:
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader
4
6
  module Filter # :nodoc:
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader # :nodoc:
4
6
  module Filter # :nodoc:
@@ -12,7 +14,7 @@ class PDF::Reader # :nodoc:
12
14
  # Decode the specified data with the RunLengthDecode compression algorithm
13
15
  def filter(data)
14
16
  pos = 0
15
- out = ""
17
+ out = "".dup
16
18
 
17
19
  while pos < data.length
18
20
  length = data.getbyte(pos)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -96,7 +97,13 @@ class PDF::Reader
96
97
  elsif @subtype == :Type3
97
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
98
99
  elsif @subtype == :TrueType
99
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fallback to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
100
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
101
108
  PDF::Reader::WidthCalculator::Composite.new(self)
102
109
  else
@@ -124,7 +131,9 @@ class PDF::Reader
124
131
  if obj[:ToUnicode]
125
132
  # ToUnicode is optional for Type1 and Type3
126
133
  stream = @ohash.object(obj[:ToUnicode])
127
- @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
134
+ if stream.is_a?(PDF::Reader::Stream)
135
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
136
+ end
128
137
  end
129
138
  end
130
139
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ttfunk'
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'digest/md5'
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
 
@@ -82,7 +83,7 @@ module PDF
82
83
  #
83
84
  def self.decode(data)
84
85
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
85
- result = ''
86
+ result = "".dup
86
87
  until (code = stream.read) == CODE_EOD
87
88
  if code == CODE_CLEAR_TABLE
88
89
  stream.set_bits_in_chunk(9)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'hashery/lru_hash'
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # Provides low level access to the objects in a PDF file via a hash-like
@@ -77,16 +78,7 @@ class PDF::Reader
77
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
78
79
  end
79
80
 
80
- if @cache.has_key?(key)
81
- @cache[key]
82
- elsif xref[key].is_a?(Integer)
83
- buf = new_buffer(xref[key])
84
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
85
- elsif xref[key].is_a?(PDF::Reader::Reference)
86
- container_key = xref[key]
87
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
88
- @cache[key] = object_streams[container_key][key.id]
89
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
90
82
  rescue InvalidObjectError
91
83
  return default
92
84
  end
@@ -253,6 +245,26 @@ class PDF::Reader
253
245
 
254
246
  private
255
247
 
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse a object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
256
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
257
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
258
270
  # doesn't need to be part of the public API.
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # Small util class for detecting the orientation of a single PDF page. Accounts
@@ -21,12 +22,12 @@ class PDF::Reader
21
22
  def detect_orientation
22
23
  llx,lly,urx,ury = @attributes[:MediaBox]
23
24
  rotation = @attributes[:Rotate].to_i
24
- width = urx.to_i - llx.to_i
25
- height = ury.to_i - lly.to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
26
27
  if width > height
27
- [0,180].include?(rotation) ? 'landscape' : 'portrait'
28
+ (rotation % 180).zero? ? 'landscape' : 'portrait'
28
29
  else
29
- [0,180].include?(rotation) ? 'portrait' : 'landscape'
30
+ (rotation % 180).zero? ? 'portrait' : 'landscape'
30
31
  end
31
32
  end
32
33
  end
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
26
+
27
+ if event_point.start?
28
+ if detect_intersection(sweep_line_status, event_point)
29
+ to_exclude << run
30
+ end
31
+ sweep_line_status.push(run)
32
+ else
33
+ sweep_line_status.delete(run)
34
+ end
35
+ end
36
+ runs - to_exclude
37
+ end
38
+
39
+ def self.detect_intersection(sweep_line_status, event_point)
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
+ return true
45
+ end
46
+ end
47
+ return false
48
+ end
49
+ end
50
+
51
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
52
+ # looking for duplicates
53
+ class EventPoint
54
+ attr_reader :x, :run
55
+
56
+ def initialize x, run
57
+ @x, @run = x, run
58
+ end
59
+
60
+ def start?
61
+ @x == @run.x
62
+ end
63
+ end
64
+
65
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
  class Reader
@@ -123,6 +124,34 @@ module PDF
123
124
  }.join(" ")
124
125
  end
125
126
 
127
+ # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
128
+ #
129
+ def rotate
130
+ value = attributes[:Rotate].to_i
131
+ case value
132
+ when 0, 90, 180, 270
133
+ value
134
+ else
135
+ 0
136
+ end
137
+ end
138
+
139
+ # returns the "boxes" that define the page object.
140
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
+ #
142
+ def boxes
143
+ mediabox = attributes[:MediaBox]
144
+ cropbox = attributes[:Cropbox] || mediabox
145
+
146
+ {
147
+ MediaBox: objects.deref!(mediabox),
148
+ CropBox: objects.deref!(cropbox),
149
+ BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
+ TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
+ ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
152
+ }
153
+ end
154
+
126
155
  private
127
156
 
128
157
  def root