pdf-reader 2.2.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +30 -0
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf/reader.rb +1 -2
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier.afm +342 -342
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  15. data/lib/pdf/reader/afm/MustRead.html +19 -0
  16. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  17. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  18. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  19. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  20. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  21. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  22. data/lib/pdf/reader/buffer.rb +1 -1
  23. data/lib/pdf/reader/cmap.rb +8 -0
  24. data/lib/pdf/reader/encoding.rb +11 -9
  25. data/lib/pdf/reader/filter/flate.rb +28 -16
  26. data/lib/pdf/reader/font.rb +10 -2
  27. data/lib/pdf/reader/object_hash.rb +24 -11
  28. data/lib/pdf/reader/orientation_detector.rb +2 -2
  29. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  30. data/lib/pdf/reader/page.rb +28 -0
  31. data/lib/pdf/reader/page_layout.rb +10 -5
  32. data/lib/pdf/reader/page_state.rb +7 -5
  33. data/lib/pdf/reader/page_text_receiver.rb +22 -1
  34. data/lib/pdf/reader/text_run.rb +24 -0
  35. data/lib/pdf/reader/width_calculator/built_in.rb +24 -16
  36. data/lib/pdf/reader/xref.rb +7 -4
  37. metadata +22 -17
  38. data/lib/pdf/hash.rb +0 -20
@@ -96,6 +96,14 @@ class PDF::Reader
96
96
  Parser.new(buffer)
97
97
  end
98
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
99
107
  def str_to_int(str)
100
108
  return nil if str.nil? || str.size == 0
101
109
  unpacked_string = if str.bytesize == 1 # UTF-8
@@ -40,20 +40,22 @@ class PDF::Reader
40
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
42
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
46
- elsif enc != nil
47
- enc = enc.to_sym
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
45
+ elsif enc && enc.respond_to?(:to_sym)
46
+ enc.to_sym
48
47
  else
49
- enc = nil
48
+ :StandardEncoding
50
49
  end
51
50
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
55
53
 
56
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
57
59
  end
58
60
 
59
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -8,6 +8,9 @@ class PDF::Reader
8
8
  module Filter # :nodoc:
9
9
  # implementation of the Flate (zlib) stream filter
10
10
  class Flate
11
+ ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
+ ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
13
+
11
14
  def initialize(options = {})
12
15
  @options = options
13
16
  end
@@ -15,25 +18,34 @@ class PDF::Reader
15
18
  ################################################################################
16
19
  # Decode the specified data with the Zlib compression algorithm
17
20
  def filter(data)
18
- deflated = nil
21
+ deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
22
+
23
+ if deflated.nil?
24
+ raise MalformedPDFError,
25
+ "Error while inflating a compressed stream (no suitable inflation algorithm found)"
26
+ end
27
+ Depredict.new(@options).filter(deflated)
28
+ end
29
+
30
+ private
31
+
32
+ def zlib_inflate(data)
19
33
  begin
20
- deflated = Zlib::Inflate.new.inflate(data)
21
- rescue Zlib::DataError => e
34
+ return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
35
+ rescue Zlib::DataError
22
36
  # by default, Ruby's Zlib assumes the data it's inflating
23
- # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
24
- # If that fails, then use an undocumented 'feature' to attempt to inflate
25
- # the data as a raw RFC1951 stream.
26
- #
27
- # See
28
- # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
29
- # - http://www.gzip.org/zlib/zlib_faq.html#faq38
30
- deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
37
+ # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
38
+ # fails, swallow the exception and attempt to inflate the data as a raw
39
+ # RFC1951 stream.
31
40
  end
32
- Depredict.new(@options).filter(deflated)
33
- rescue Exception => e
34
- # Oops, there was a problem inflating the stream
35
- raise MalformedPDFError,
36
- "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
41
+
42
+ begin
43
+ return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
44
+ rescue StandardError
45
+ # swallow this one too, so we can try some other fallback options
46
+ end
47
+
48
+ nil
37
49
  end
38
50
  end
39
51
  end
@@ -97,7 +97,13 @@ class PDF::Reader
97
97
  elsif @subtype == :Type3
98
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
99
99
  elsif @subtype == :TrueType
100
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fallback to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
101
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
102
108
  PDF::Reader::WidthCalculator::Composite.new(self)
103
109
  else
@@ -125,7 +131,9 @@ class PDF::Reader
125
131
  if obj[:ToUnicode]
126
132
  # ToUnicode is optional for Type1 and Type3
127
133
  stream = @ohash.object(obj[:ToUnicode])
128
- @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
134
+ if stream.is_a?(PDF::Reader::Stream)
135
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
136
+ end
129
137
  end
130
138
  end
131
139
 
@@ -78,16 +78,7 @@ class PDF::Reader
78
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
79
  end
80
80
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
82
  rescue InvalidObjectError
92
83
  return default
93
84
  end
@@ -254,6 +245,26 @@ class PDF::Reader
254
245
 
255
246
  private
256
247
 
248
# Parse a traditional (non-stream) object from the PDF.
#
# The xref entry for +key+ is expected to be an Integer byte offset. The
# object is parsed from a buffer positioned at that offset and then passed
# through decrypt. Returns nil when the xref entry is not an Integer, so
# the caller can fall back to another lookup strategy.
#
def fetch_object(key)
  offset = xref[key]
  return nil unless offset.is_a?(Integer)

  parser = Parser.new(new_buffer(offset), self)
  decrypt(key, parser.object(key.id, key.gen))
end
257
+
258
# Parse an object that's embedded in an object stream in the PDF.
#
# The xref entry for +key+ is expected to be a PDF::Reader::Reference that
# points at the containing object stream. Each container is wrapped in an
# ObjectStream once and memoised in object_streams. Returns nil when the
# xref entry is not a Reference.
#
def fetch_object_stream(key)
  container_key = xref[key]
  return nil unless container_key.is_a?(PDF::Reader::Reference)

  container = (object_streams[container_key] ||=
                 PDF::Reader::ObjectStream.new(object(container_key)))
  container[key.id]
end
267
+
257
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
270
  # doesn't need to be part of the public API.
@@ -320,7 +331,9 @@ class PDF::Reader
320
331
  def decrypt(ref, obj)
321
332
  case obj
322
333
  when PDF::Reader::Stream then
323
- obj.data = sec_handler.decrypt(obj.data, ref)
334
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
335
+ # Therefore we shouldn't try to decrypt it.
336
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
324
337
  obj
325
338
  when Hash then
326
339
  arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
@@ -22,8 +22,8 @@ class PDF::Reader
22
22
  def detect_orientation
23
23
  llx,lly,urx,ury = @attributes[:MediaBox]
24
24
  rotation = @attributes[:Rotate].to_i
25
- width = urx.to_i - llx.to_i
26
- height = ury.to_i - lly.to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
27
27
  if width > height
28
28
  (rotation % 180).zero? ? 'landscape' : 'portrait'
29
29
  else
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
# Remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
#
# The runs are swept from left to right: every run contributes an event at its left
# edge (x) and one at its right edge (endx). While a run is "open" (between its two
# events) any newly starting run with the same text that overlaps it sufficiently is
# treated as redundant.
class OverlappingRunsFilter

  # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
  # have identical characters) then one will be discarded
  OVERLAPPING_THRESHOLD = 0.5

  # Returns a new array containing the given runs minus those judged to be redundant
  # copies of a run that started earlier (further left). The input is not modified.
  def self.exclude_redundant_runs(runs)
    sweep_line_status = []
    to_exclude = []

    event_point_schedule = runs.flat_map do |run|
      [EventPoint.new(run.x, run), EventPoint.new(run.endx, run)]
    end
    event_point_schedule.sort! { |a, b| a.x <=> b.x }

    event_point_schedule.each do |event_point|
      run = event_point.run

      if event_point.start?
        to_exclude << run if detect_intersection(sweep_line_status, event_point)
        # the run stays open even when excluded, so later copies are still compared to it
        sweep_line_status.push(run)
      else
        sweep_line_status.delete(run)
      end
    end

    runs - to_exclude
  end

  # Returns true when the run that starts at event_point duplicates one of the currently
  # open runs: same text, starting within the open run's horizontal extent, and covering
  # at least OVERLAPPING_THRESHOLD of the open run's area.
  def self.detect_intersection(sweep_line_status, event_point)
    candidate = event_point.run

    sweep_line_status.any? do |open_text_run|
      # only identical characters can be a fake-bold duplicate; distinct glyphs that
      # happen to overlap must both be kept
      open_text_run.text == candidate.text &&
        event_point.x >= open_text_run.x &&
        event_point.x <= open_text_run.endx &&
        open_text_run.intersection_area_percent(candidate) >= OVERLAPPING_THRESHOLD
    end
  end
end
50
+
51
# Utility class used to avoid modifying the underlying TextRun objects while we're
# looking for duplicates. It pairs a run with the x coordinate of one of its edges.
class EventPoint
  attr_reader :x, :run

  def initialize(x, run)
    @x = x
    @run = run
  end

  # true when this point sits on the run's left edge (its x equals the run's x).
  # Note that for a run whose x and endx are equal, both of its points report true.
  def start?
    x == run.x
  end
end
64
+
65
+ end
@@ -124,6 +124,34 @@ module PDF
124
124
  }.join(" ")
125
125
  end
126
126
 
127
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
#
# The :Rotate attribute is coerced with to_i (so a missing value is 0) and
# normalised into the range 0...360, which means equivalent angles such as
# -90, 360 or 450 map to 270, 0 and 90 respectively. Values that aren't a
# multiple of 90 are treated as no rotation.
#
def rotate
  value = attributes[:Rotate].to_i % 360
  (value % 90).zero? ? value : 0
end
138
+
139
# returns the "boxes" that define the page object.
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
#
# The result is a Hash with the keys :MediaBox, :CropBox, :BleedBox, :TrimBox
# and :ArtBox. A missing CropBox falls back to the MediaBox; missing Bleed,
# Trim and Art boxes fall back to the (possibly defaulted) CropBox. Every
# value is passed through objects.deref! before being returned.
#
def boxes
  mediabox = attributes[:MediaBox]
  # PDF names are case sensitive: the key is :CropBox, matching the key returned below
  cropbox = attributes[:CropBox] || mediabox

  {
    MediaBox: objects.deref!(mediabox),
    CropBox: objects.deref!(cropbox),
    BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
    TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
    ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
  }
end
154
+
127
155
  private
128
156
 
129
157
  def root
@@ -1,6 +1,8 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
3
 
4
+ require 'pdf/reader/overlapping_runs_filter'
5
+
4
6
  class PDF::Reader
5
7
 
6
8
  # Takes a collection of TextRun objects and renders them into a single
@@ -15,22 +17,25 @@ class PDF::Reader
15
17
  def initialize(runs, mediabox)
16
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
17
19
 
18
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
19
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
22
- @page_width = mediabox[2] - mediabox[0]
23
- @page_height = mediabox[3] - mediabox[1]
24
- @x_offset = @runs.map(&:x).sort.first
24
+ @page_width = (mediabox[2] - mediabox[0]).abs
25
+ @page_height = (mediabox[3] - mediabox[1]).abs
26
+ @x_offset = @runs.map(&:x).sort.first || 0
27
+ lowest_y = @runs.map(&:y).sort.first || 0
28
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
25
29
  end
26
30
 
27
31
  def to_s
28
32
  return "" if @runs.empty?
33
+ return "" if row_count == 0
29
34
 
30
35
  page = row_count.times.map { |i| " " * col_count }
31
36
  @runs.each do |run|
32
37
  x_pos = ((run.x - @x_offset) / col_multiplier).round
33
- y_pos = row_count - (run.y / row_multiplier).round
38
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
34
39
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
35
40
  local_string_insert(page[y_pos-1], run.text, x_pos)
36
41
  end
@@ -30,7 +30,7 @@ class PDF::Reader
30
30
  @xobject_stack = [page.xobjects]
31
31
  @cs_stack = [page.color_spaces]
32
32
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- state[:ctm] = identity_matrix
33
+ state[:ctm] = identity_matrix
34
34
  end
35
35
 
36
36
  #####################################################
@@ -322,11 +322,13 @@ class PDF::Reader
322
322
  th = state[:h_scaling]
323
323
  # optimise the common path to reduce Float allocations
324
324
  if th == 1 && tj == 0 && tc == 0 && tw == 0
325
- glyph_width = w0 * fs
326
- tx = glyph_width
325
+ tx = w0 * fs
326
+ elsif tj != 0
327
+ # don't apply spacing to TJ displacement
328
+ tx = (w0 - (tj/1000.0)) * fs * th
327
329
  else
328
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
329
- tx = glyph_width + ((tc + tw) * th)
330
+ # apply horizontal scaling to spacing values but not font size
331
+ tx = ((w0 * fs) + tc + tw) * th
330
332
  end
331
333
 
332
334
  # TODO: I'm pretty sure that tx shouldn't need to be divided by
@@ -41,13 +41,17 @@ module PDF
41
41
  # starting a new page
42
42
  def page=(page)
43
43
  @state = PageState.new(page)
44
+ @page = page
44
45
  @content = []
45
46
  @characters = []
46
47
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
49
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
50
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
47
51
  end
48
52
 
49
53
  def content
50
- PageLayout.new(@characters, @mediabox).to_s
54
+ PageLayout.new(@characters, @device_mediabox).to_s
51
55
  end
52
56
 
53
57
  #####################################################
@@ -101,6 +105,8 @@ module PDF
101
105
  glyphs.each_with_index do |glyph_code, index|
102
106
  # paint the current glyph
103
107
  newx, newy = @state.trm_transform(0,0)
108
+ newx, newy = apply_rotation(newx, newy)
109
+
104
110
  utf8_chars = @state.current_font.to_utf8(glyph_code)
105
111
 
106
112
  # apply the glyph displacement for the current glyph so the next
@@ -115,6 +121,21 @@ module PDF
115
121
  end
116
122
  end
117
123
 
124
# Rotate a device space coordinate to compensate for the page's rotation.
#
# Takes an x and y coordinate and returns a two element array of [x, y].
# A 90 degree page maps (x, y) to (y, -x). The 180 and 270 degree cases
# are that same quarter turn applied two and three times, giving (-x, -y)
# and (-y, x). Any other rotation returns the coordinate unchanged.
#
def apply_rotation(x, y)
  case @page.rotate
  when 90  then [y, -x]
  when 180 then [-x, -y]
  when 270 then [-y, x]
  else [x, y]
  end
end
138
+
118
139
  end
119
140
  end
120
141
  end
@@ -38,6 +38,10 @@ class PDF::Reader
38
38
  @endx ||= x + width
39
39
  end
40
40
 
41
# The y coordinate of the run's upper edge: y plus the font size.
# The value is memoised in @endy after the first call.
def endy
  return @endy if @endy

  @endy = y + font_size
end
44
+
41
45
  def mean_character_width
42
46
  @width / character_count
43
47
  end
@@ -60,8 +64,28 @@ class PDF::Reader
60
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
61
65
  end
62
66
 
67
# true when the bounding box of this run and the bounding box of other_run
# overlap or touch. Boxes span x..endx horizontally and y..endy vertically;
# the comparisons are inclusive, so boxes that only share an edge count.
def intersect?(other_run)
  return false unless x <= other_run.endx && endx >= other_run.x

  endy >= other_run.y && y <= other_run.endy
end
71
+
72
# Return how much of this text run is overlapped by another run, as a
# fraction of this run's own area: 0 means no overlap, 1.0 means this run
# is entirely covered.
#
# Returns 0 when the runs don't intersect, and also when this run has no
# area (the ratio would otherwise be a division by zero that yields NaN).
def intersection_area_percent(other_run)
  return 0 unless intersect?(other_run)

  own_area = area
  return 0 if own_area.zero?

  overlap_width = [endx, other_run.endx].min - [x, other_run.x].max
  overlap_height = [endy, other_run.endy].min - [y, other_run.y].max

  (overlap_width * overlap_height).to_f / own_area
end
82
+
63
83
  private
64
84
 
85
# The area of the run's bounding box: its horizontal extent (endx - x)
# multiplied by its vertical extent (endy - y).
def area
  run_width = endx - x
  run_height = endy - y
  run_width * run_height
end
88
+
65
89
  def mergable_range
66
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
67
91
  end