pdf-reader 2.1.0 → 2.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +28 -1
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf-reader.rb +1 -0
  7. data/lib/pdf/reader.rb +2 -2
  8. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  11. data/lib/pdf/reader/afm/Courier.afm +342 -342
  12. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  14. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  15. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  16. data/lib/pdf/reader/afm/MustRead.html +19 -0
  17. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  18. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  19. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  20. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  21. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  22. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  23. data/lib/pdf/reader/buffer.rb +12 -11
  24. data/lib/pdf/reader/cid_widths.rb +2 -0
  25. data/lib/pdf/reader/cmap.rb +22 -12
  26. data/lib/pdf/reader/encoding.rb +12 -9
  27. data/lib/pdf/reader/error.rb +1 -0
  28. data/lib/pdf/reader/filter.rb +1 -0
  29. data/lib/pdf/reader/filter/ascii85.rb +1 -0
  30. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  31. data/lib/pdf/reader/filter/depredict.rb +1 -0
  32. data/lib/pdf/reader/filter/flate.rb +6 -4
  33. data/lib/pdf/reader/filter/lzw.rb +2 -0
  34. data/lib/pdf/reader/filter/null.rb +2 -0
  35. data/lib/pdf/reader/filter/run_length.rb +3 -1
  36. data/lib/pdf/reader/font.rb +11 -2
  37. data/lib/pdf/reader/font_descriptor.rb +1 -0
  38. data/lib/pdf/reader/form_xobject.rb +1 -0
  39. data/lib/pdf/reader/glyph_hash.rb +1 -0
  40. data/lib/pdf/reader/lzw.rb +2 -1
  41. data/lib/pdf/reader/null_security_handler.rb +1 -0
  42. data/lib/pdf/reader/object_cache.rb +1 -0
  43. data/lib/pdf/reader/object_hash.rb +22 -10
  44. data/lib/pdf/reader/object_stream.rb +1 -0
  45. data/lib/pdf/reader/orientation_detector.rb +5 -4
  46. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  47. data/lib/pdf/reader/page.rb +29 -0
  48. data/lib/pdf/reader/page_layout.rb +10 -5
  49. data/lib/pdf/reader/page_state.rb +10 -1
  50. data/lib/pdf/reader/page_text_receiver.rb +5 -1
  51. data/lib/pdf/reader/pages_strategy.rb +1 -0
  52. data/lib/pdf/reader/parser.rb +5 -4
  53. data/lib/pdf/reader/print_receiver.rb +1 -0
  54. data/lib/pdf/reader/reference.rb +1 -0
  55. data/lib/pdf/reader/register_receiver.rb +1 -0
  56. data/lib/pdf/reader/resource_methods.rb +1 -0
  57. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  58. data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
  59. data/lib/pdf/reader/stream.rb +1 -0
  60. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  61. data/lib/pdf/reader/text_run.rb +25 -0
  62. data/lib/pdf/reader/token.rb +1 -0
  63. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  64. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  65. data/lib/pdf/reader/width_calculator.rb +1 -0
  66. data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
  67. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  68. data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
  69. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  70. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  71. data/lib/pdf/reader/xref.rb +11 -5
  72. metadata +17 -13
  73. data/lib/pdf/hash.rb +0 -19
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
 
4
6
  require 'forwardable'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -95,25 +96,34 @@ class PDF::Reader
95
96
  Parser.new(buffer)
96
97
  end
97
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
98
107
  def str_to_int(str)
99
108
  return nil if str.nil? || str.size == 0
100
- unpacked_string = if str.size == 1 # UTF-8
109
+ unpacked_string = if str.bytesize == 1 # UTF-8
101
110
  str.unpack("C*")
102
111
  else # UTF-16
103
112
  str.unpack("n*")
104
113
  end
105
- if unpacked_string.size == 1
106
- unpacked_string
107
- elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
108
- # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
109
- # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
110
- # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
111
- [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
112
- else
113
- # it is a bad idea to just return the first 16 bits, as this doesn't allow
114
- # for ligatures for example fi (U+0066 U+0069)
115
- unpacked_string
114
+ result = []
115
+ while unpacked_string.any? do
116
+ if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
117
+ # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
+ # let's convert it to a UTF-32 codepoint. (the high surrogate is between 0xD800-0xDBFF, the
119
+ # low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
+ points = [unpacked_string.shift, unpacked_string.shift]
121
+ result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
122
+ else
123
+ result << unpacked_string.shift
124
+ end
116
125
  end
126
+ result
117
127
  end
118
128
 
119
129
  def process_bfchar_instructions(instructions)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -39,20 +40,22 @@ class PDF::Reader
39
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
40
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
41
42
 
42
- if enc.kind_of?(Hash)
43
- self.differences = enc[:Differences] if enc[:Differences]
44
- enc = enc[:Encoding] || enc[:BaseEncoding]
45
- elsif enc != nil
46
- enc = enc.to_sym
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
45
+ elsif enc && enc.respond_to?(:to_sym)
46
+ enc.to_sym
47
47
  else
48
- enc = nil
48
+ :StandardEncoding
49
49
  end
50
50
 
51
- @enc_name = enc
52
- @unpack = get_unpack(enc)
53
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
54
53
 
55
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
56
59
  end
57
60
 
58
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ascii85'
4
5
 
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader
4
6
  module Filter # :nodoc:
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module Filter # :nodoc:
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
 
4
5
  require 'zlib'
@@ -7,6 +8,8 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Flate (zlib) stream filter
9
10
  class Flate
11
+ ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
+
10
13
  def initialize(options = {})
11
14
  @options = options
12
15
  end
@@ -16,16 +19,15 @@ class PDF::Reader
16
19
  def filter(data)
17
20
  deflated = nil
18
21
  begin
19
- deflated = Zlib::Inflate.new.inflate(data)
22
+ deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
20
23
  rescue Zlib::DataError => e
21
24
  # by default, Ruby's Zlib assumes the data it's inflating
22
- # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
23
- # If that fails, then use an undocumented 'feature' to attempt to inflate
25
+ # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
26
+ # fails, then use a lightly-documented 'feature' to attempt to inflate
24
27
  # the data as a raw RFC1951 stream.
25
28
  #
26
29
  # See
27
30
  # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
28
- # - http://www.gzip.org/zlib/zlib_faq.html#faq38
29
31
  deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
30
32
  end
31
33
  Depredict.new(@options).filter(deflated)
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader
4
6
  module Filter # :nodoc:
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader
4
6
  module Filter # :nodoc:
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  #
3
5
  class PDF::Reader # :nodoc:
4
6
  module Filter # :nodoc:
@@ -12,7 +14,7 @@ class PDF::Reader # :nodoc:
12
14
  # Decode the specified data with the RunLengthDecode compression algorithm
13
15
  def filter(data)
14
16
  pos = 0
15
- out = ""
17
+ out = "".dup
16
18
 
17
19
  while pos < data.length
18
20
  length = data.getbyte(pos)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -96,7 +97,13 @@ class PDF::Reader
96
97
  elsif @subtype == :Type3
97
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
98
99
  elsif @subtype == :TrueType
99
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fall back to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
100
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
101
108
  PDF::Reader::WidthCalculator::Composite.new(self)
102
109
  else
@@ -124,7 +131,9 @@ class PDF::Reader
124
131
  if obj[:ToUnicode]
125
132
  # ToUnicode is optional for Type1 and Type3
126
133
  stream = @ohash.object(obj[:ToUnicode])
127
- @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
134
+ if stream.is_a?(PDF::Reader::Stream)
135
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
136
+ end
128
137
  end
129
138
  end
130
139
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'ttfunk'
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'digest/md5'
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
 
@@ -82,7 +83,7 @@ module PDF
82
83
  #
83
84
  def self.decode(data)
84
85
  stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
85
- result = ''
86
+ result = "".dup
86
87
  until (code = stream.read) == CODE_EOD
87
88
  if code == CODE_CLEAR_TABLE
88
89
  stream.set_bits_in_chunk(9)
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'hashery/lru_hash'
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # Provides low level access to the objects in a PDF file via a hash-like
@@ -77,16 +78,7 @@ class PDF::Reader
77
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
78
79
  end
79
80
 
80
- if @cache.has_key?(key)
81
- @cache[key]
82
- elsif xref[key].is_a?(Integer)
83
- buf = new_buffer(xref[key])
84
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
85
- elsif xref[key].is_a?(PDF::Reader::Reference)
86
- container_key = xref[key]
87
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
88
- @cache[key] = object_streams[container_key][key.id]
89
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
90
82
  rescue InvalidObjectError
91
83
  return default
92
84
  end
@@ -253,6 +245,26 @@ class PDF::Reader
253
245
 
254
246
  private
255
247
 
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse an object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
256
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
257
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
258
270
  # doesn't need to be part of the public API.
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # Small util class for detecting the orientation of a single PDF page. Accounts
@@ -21,12 +22,12 @@ class PDF::Reader
21
22
  def detect_orientation
22
23
  llx,lly,urx,ury = @attributes[:MediaBox]
23
24
  rotation = @attributes[:Rotate].to_i
24
- width = urx.to_i - llx.to_i
25
- height = ury.to_i - lly.to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
26
27
  if width > height
27
- [0,180].include?(rotation) ? 'landscape' : 'portrait'
28
+ (rotation % 180).zero? ? 'landscape' : 'portrait'
28
29
  else
29
- [0,180].include?(rotation) ? 'portrait' : 'landscape'
30
+ (rotation % 180).zero? ? 'portrait' : 'landscape'
30
31
  end
31
32
  end
32
33
  end
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
26
+
27
+ if event_point.start?
28
+ if detect_intersection(sweep_line_status, event_point)
29
+ to_exclude << run
30
+ end
31
+ sweep_line_status.push(run)
32
+ else
33
+ sweep_line_status.delete(run)
34
+ end
35
+ end
36
+ runs - to_exclude
37
+ end
38
+
39
+ def self.detect_intersection(sweep_line_status, event_point)
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
+ return true
45
+ end
46
+ end
47
+ return false
48
+ end
49
+ end
50
+
51
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
52
+ # looking for duplicates
53
+ class EventPoint
54
+ attr_reader :x, :run
55
+
56
+ def initialize x, run
57
+ @x, @run = x, run
58
+ end
59
+
60
+ def start?
61
+ @x == @run.x
62
+ end
63
+ end
64
+
65
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
  class Reader
@@ -123,6 +124,34 @@ module PDF
123
124
  }.join(" ")
124
125
  end
125
126
 
127
+ # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
128
+ #
129
+ def rotate
130
+ value = attributes[:Rotate].to_i
131
+ case value
132
+ when 0, 90, 180, 270
133
+ value
134
+ else
135
+ 0
136
+ end
137
+ end
138
+
139
+ # returns the "boxes" that define the page object.
140
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
+ #
142
+ def boxes
143
+ mediabox = attributes[:MediaBox]
144
+ cropbox = attributes[:Cropbox] || mediabox
145
+
146
+ {
147
+ MediaBox: objects.deref!(mediabox),
148
+ CropBox: objects.deref!(cropbox),
149
+ BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
+ TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
+ ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
152
+ }
153
+ end
154
+
126
155
  private
127
156
 
128
157
  def root