pdf-reader 2.2.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +30 -0
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf/reader.rb +1 -2
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier.afm +342 -342
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  15. data/lib/pdf/reader/afm/MustRead.html +19 -0
  16. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  17. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  18. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  19. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  20. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  21. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  22. data/lib/pdf/reader/buffer.rb +1 -1
  23. data/lib/pdf/reader/cmap.rb +8 -0
  24. data/lib/pdf/reader/encoding.rb +11 -9
  25. data/lib/pdf/reader/filter/flate.rb +28 -16
  26. data/lib/pdf/reader/font.rb +10 -2
  27. data/lib/pdf/reader/object_hash.rb +24 -11
  28. data/lib/pdf/reader/orientation_detector.rb +2 -2
  29. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  30. data/lib/pdf/reader/page.rb +28 -0
  31. data/lib/pdf/reader/page_layout.rb +10 -5
  32. data/lib/pdf/reader/page_state.rb +7 -5
  33. data/lib/pdf/reader/page_text_receiver.rb +22 -1
  34. data/lib/pdf/reader/text_run.rb +24 -0
  35. data/lib/pdf/reader/width_calculator/built_in.rb +24 -16
  36. data/lib/pdf/reader/xref.rb +7 -4
  37. metadata +22 -17
  38. data/lib/pdf/hash.rb +0 -20
@@ -96,6 +96,14 @@ class PDF::Reader
96
96
  Parser.new(buffer)
97
97
  end
98
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
99
107
  def str_to_int(str)
100
108
  return nil if str.nil? || str.size == 0
101
109
  unpacked_string = if str.bytesize == 1 # UTF-8
@@ -40,20 +40,22 @@ class PDF::Reader
40
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
42
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
46
- elsif enc != nil
47
- enc = enc.to_sym
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
45
+ elsif enc && enc.respond_to?(:to_sym)
46
+ enc.to_sym
48
47
  else
49
- enc = nil
48
+ :StandardEncoding
50
49
  end
51
50
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
55
53
 
56
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
57
59
  end
58
60
 
59
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -8,6 +8,9 @@ class PDF::Reader
8
8
  module Filter # :nodoc:
9
9
  # implementation of the Flate (zlib) stream filter
10
10
  class Flate
11
+ ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
+ ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
13
+
11
14
  def initialize(options = {})
12
15
  @options = options
13
16
  end
@@ -15,25 +18,34 @@ class PDF::Reader
15
18
  ################################################################################
16
19
  # Decode the specified data with the Zlib compression algorithm
17
20
  def filter(data)
18
- deflated = nil
21
+ deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
22
+
23
+ if deflated.nil?
24
+ raise MalformedPDFError,
25
+ "Error while inflating a compressed stream (no suitable inflation algorithm found)"
26
+ end
27
+ Depredict.new(@options).filter(deflated)
28
+ end
29
+
30
+ private
31
+
32
+ def zlib_inflate(data)
19
33
  begin
20
- deflated = Zlib::Inflate.new.inflate(data)
21
- rescue Zlib::DataError => e
34
+ return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
35
+ rescue Zlib::DataError
22
36
  # by default, Ruby's Zlib assumes the data it's inflating
23
- # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
24
- # If that fails, then use an undocumented 'feature' to attempt to inflate
25
- # the data as a raw RFC1951 stream.
26
- #
27
- # See
28
- # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
29
- # - http://www.gzip.org/zlib/zlib_faq.html#faq38
30
- deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
37
+ # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
38
+ # fails, swallow the exception and attempt to inflate the data as a raw
39
+ # RFC1951 stream.
31
40
  end
32
- Depredict.new(@options).filter(deflated)
33
- rescue Exception => e
34
- # Oops, there was a problem inflating the stream
35
- raise MalformedPDFError,
36
- "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
41
+
42
+ begin
43
+ return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
44
+ rescue StandardError
45
+ # swallow this one too, so we can try some other fallback options
46
+ end
47
+
48
+ nil
37
49
  end
38
50
  end
39
51
  end
@@ -97,7 +97,13 @@ class PDF::Reader
97
97
  elsif @subtype == :Type3
98
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
99
99
  elsif @subtype == :TrueType
100
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fall back to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
101
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
102
108
  PDF::Reader::WidthCalculator::Composite.new(self)
103
109
  else
@@ -125,7 +131,9 @@ class PDF::Reader
125
131
  if obj[:ToUnicode]
126
132
  # ToUnicode is optional for Type1 and Type3
127
133
  stream = @ohash.object(obj[:ToUnicode])
128
- @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
134
+ if stream.is_a?(PDF::Reader::Stream)
135
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
136
+ end
129
137
  end
130
138
  end
131
139
 
@@ -78,16 +78,7 @@ class PDF::Reader
78
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
79
  end
80
80
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
82
  rescue InvalidObjectError
92
83
  return default
93
84
  end
@@ -254,6 +245,26 @@ class PDF::Reader
254
245
 
255
246
  private
256
247
 
248
# Parse a traditional object from the PDF, starting from the byte offset indicated
# in the xref table.
#
# Returns the parsed object after it has been passed through decrypt, or nil when
# the xref entry for +key+ is not an Integer.
#
def fetch_object(key)
  offset = xref[key]
  return nil unless offset.is_a?(Integer)

  parser = Parser.new(new_buffer(offset), self)
  decrypt(key, parser.object(key.id, key.gen))
end
257
+
258
# Parse an object that's embedded in an object stream in the PDF.
#
# Only applies when the xref entry for +key+ is a PDF::Reader::Reference, which is
# then used as the key of the containing object stream. The wrapped container is
# memoised in object_streams. Returns nil for any other kind of xref entry.
#
def fetch_object_stream(key)
  container_key = xref[key]
  return nil unless container_key.is_a?(PDF::Reader::Reference)

  object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
  object_streams[container_key][key.id]
end
267
+
257
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
270
  # doesn't need to be part of the public API.
@@ -320,7 +331,9 @@ class PDF::Reader
320
331
  def decrypt(ref, obj)
321
332
  case obj
322
333
  when PDF::Reader::Stream then
323
- obj.data = sec_handler.decrypt(obj.data, ref)
334
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
335
+ # Therefore we shouldn't try to decrypt it.
336
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
324
337
  obj
325
338
  when Hash then
326
339
  arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
@@ -22,8 +22,8 @@ class PDF::Reader
22
22
  def detect_orientation
23
23
  llx,lly,urx,ury = @attributes[:MediaBox]
24
24
  rotation = @attributes[:Rotate].to_i
25
- width = urx.to_i - llx.to_i
26
- height = ury.to_i - lly.to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
27
27
  if width > height
28
28
  (rotation % 180).zero? ? 'landscape' : 'portrait'
29
29
  else
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
26
+
27
+ if event_point.start?
28
+ if detect_intersection(sweep_line_status, event_point)
29
+ to_exclude << run
30
+ end
31
+ sweep_line_status.push(run)
32
+ else
33
+ sweep_line_status.delete(run)
34
+ end
35
+ end
36
+ runs - to_exclude
37
+ end
38
+
39
+ def self.detect_intersection(sweep_line_status, event_point)
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
+ return true
45
+ end
46
+ end
47
+ return false
48
+ end
49
+ end
50
+
51
# Utility class used to avoid modifying the underlying TextRun objects while we're
# looking for duplicates. It pairs a position on the x axis with the run it belongs to.
class EventPoint
  attr_reader :x, :run

  def initialize(x, run)
    @x = x
    @run = run
  end

  # true when this point sits at the x position of its own run
  def start?
    x == run.x
  end
end
64
+
65
+ end
@@ -124,6 +124,34 @@ module PDF
124
124
  }.join(" ")
125
125
  end
126
126
 
127
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
#
# The :Rotate attribute is normalised into the 0...360 range first, so equivalent
# angles such as -90 or 450 are reported as 270 and 90. A missing value, or one
# that isn't a multiple of 90, is reported as 0.
#
def rotate
  value = attributes[:Rotate].to_i % 360
  (value % 90).zero? ? value : 0
end
138
+
139
# returns the "boxes" that define the page object.
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
#
# The result is a Hash keyed by :MediaBox, :CropBox, :BleedBox, :TrimBox and :ArtBox.
# CropBox falls back to MediaBox when the page doesn't declare one, and the remaining
# three fall back to CropBox. Every value is passed through objects.deref!.
#
def boxes
  mediabox = attributes[:MediaBox]
  # PDF names are case sensitive; the entry is spelt CropBox
  cropbox = attributes[:CropBox] || mediabox

  {
    MediaBox: objects.deref!(mediabox),
    CropBox: objects.deref!(cropbox),
    BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
    TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
    ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
  }
end
154
+
127
155
  private
128
156
 
129
157
  def root
@@ -1,6 +1,8 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
3
 
4
+ require 'pdf/reader/overlapping_runs_filter'
5
+
4
6
  class PDF::Reader
5
7
 
6
8
  # Takes a collection of TextRun objects and renders them into a single
@@ -15,22 +17,25 @@ class PDF::Reader
15
17
  def initialize(runs, mediabox)
16
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
17
19
 
18
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
19
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
22
- @page_width = mediabox[2] - mediabox[0]
23
- @page_height = mediabox[3] - mediabox[1]
24
- @x_offset = @runs.map(&:x).sort.first
24
+ @page_width = (mediabox[2] - mediabox[0]).abs
25
+ @page_height = (mediabox[3] - mediabox[1]).abs
26
+ @x_offset = @runs.map(&:x).sort.first || 0
27
+ lowest_y = @runs.map(&:y).sort.first || 0
28
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
25
29
  end
26
30
 
27
31
  def to_s
28
32
  return "" if @runs.empty?
33
+ return "" if row_count == 0
29
34
 
30
35
  page = row_count.times.map { |i| " " * col_count }
31
36
  @runs.each do |run|
32
37
  x_pos = ((run.x - @x_offset) / col_multiplier).round
33
- y_pos = row_count - (run.y / row_multiplier).round
38
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
34
39
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
35
40
  local_string_insert(page[y_pos-1], run.text, x_pos)
36
41
  end
@@ -30,7 +30,7 @@ class PDF::Reader
30
30
  @xobject_stack = [page.xobjects]
31
31
  @cs_stack = [page.color_spaces]
32
32
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- state[:ctm] = identity_matrix
33
+ state[:ctm] = identity_matrix
34
34
  end
35
35
 
36
36
  #####################################################
@@ -322,11 +322,13 @@ class PDF::Reader
322
322
  th = state[:h_scaling]
323
323
  # optimise the common path to reduce Float allocations
324
324
  if th == 1 && tj == 0 && tc == 0 && tw == 0
325
- glyph_width = w0 * fs
326
- tx = glyph_width
325
+ tx = w0 * fs
326
+ elsif tj != 0
327
+ # don't apply spacing to TJ displacement
328
+ tx = (w0 - (tj/1000.0)) * fs * th
327
329
  else
328
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
329
- tx = glyph_width + ((tc + tw) * th)
330
+ # apply horizontal scaling to spacing values but not font size
331
+ tx = ((w0 * fs) + tc + tw) * th
330
332
  end
331
333
 
332
334
  # TODO: I'm pretty sure that tx shouldn't need to be divided by
@@ -41,13 +41,17 @@ module PDF
41
41
  # starting a new page
42
42
  def page=(page)
43
43
  @state = PageState.new(page)
44
+ @page = page
44
45
  @content = []
45
46
  @characters = []
46
47
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
49
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
50
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
47
51
  end
48
52
 
49
53
  def content
50
- PageLayout.new(@characters, @mediabox).to_s
54
+ PageLayout.new(@characters, @device_mediabox).to_s
51
55
  end
52
56
 
53
57
  #####################################################
@@ -101,6 +105,8 @@ module PDF
101
105
  glyphs.each_with_index do |glyph_code, index|
102
106
  # paint the current glyph
103
107
  newx, newy = @state.trm_transform(0,0)
108
+ newx, newy = apply_rotation(newx, newy)
109
+
104
110
  utf8_chars = @state.current_font.to_utf8(glyph_code)
105
111
 
106
112
  # apply to glyph displacement for the current glyph so the next
@@ -115,6 +121,21 @@ module PDF
115
121
  end
116
122
  end
117
123
 
124
# Rotate the point (x, y) about the origin to account for the page's rotation, and
# return the new coordinates as a two element array.
#
# A rotation of 90 maps (x, y) to (y, -x). 180 and 270 are that same quarter turn
# applied two and three times, giving (-x, -y) and (-y, x). Any other value returns
# the point unchanged.
def apply_rotation(x, y)
  case @page.rotate
  when 90  then [y, -x]
  when 180 then [-x, -y]
  when 270 then [-y, x]
  else          [x, y]
  end
end
138
+
118
139
  end
119
140
  end
120
141
  end
@@ -38,6 +38,10 @@ class PDF::Reader
38
38
  @endx ||= x + width
39
39
  end
40
40
 
41
# The y position plus the font size. The value is computed on first use and
# memoised in @endy.
def endy
  return @endy if @endy

  @endy = y + font_size
end
44
+
41
45
  def mean_character_width
42
46
  @width / character_count
43
47
  end
@@ -60,8 +64,28 @@ class PDF::Reader
60
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
61
65
  end
62
66
 
67
# true when the rectangle spanned by (x, y) and (endx, endy) shares at least one
# point with the same rectangle of other_run. Rectangles that merely touch count.
def intersect?(other_run)
  # no overlap on the horizontal axis means no overlap at all
  return false unless x <= other_run.endx && endx >= other_run.x

  endy >= other_run.y && y <= other_run.endy
end
71
+
72
# Return how much of this text run is overlapped by another run, as a fraction of
# this run's own area (0.5 means half of it is covered), not as a value out of 100.
#
# Returns 0 when the two runs don't intersect, and also when this run has no area
# at all (zero width or zero height): dividing by that area would otherwise produce
# NaN rather than a number callers can compare against a threshold.
def intersection_area_percent(other_run)
  return 0 unless intersect?(other_run)

  own_area = area
  return 0 if own_area.zero?

  dx = [endx, other_run.endx].min - [x, other_run.x].max
  dy = [endy, other_run.endy].min - [y, other_run.y].max

  (dx * dy).to_f / own_area
end
82
+
63
83
  private
64
84
 
85
# The area of the rectangle spanned by (x, y) and (endx, endy).
def area
  horizontal_extent = endx - x
  vertical_extent = endy - y
  horizontal_extent * vertical_extent
end
88
+
65
89
  def mergable_range
66
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
67
91
  end