pdf-reader 2.2.0 → 2.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +26 -0
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf/reader.rb +1 -2
  7. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  8. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier.afm +342 -342
  11. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  12. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  14. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  15. data/lib/pdf/reader/afm/MustRead.html +19 -0
  16. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  17. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  18. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  19. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  20. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  21. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  22. data/lib/pdf/reader/buffer.rb +1 -1
  23. data/lib/pdf/reader/cmap.rb +21 -12
  24. data/lib/pdf/reader/encoding.rb +11 -9
  25. data/lib/pdf/reader/filter/flate.rb +27 -15
  26. data/lib/pdf/reader/font.rb +10 -2
  27. data/lib/pdf/reader/object_hash.rb +21 -10
  28. data/lib/pdf/reader/orientation_detector.rb +4 -4
  29. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  30. data/lib/pdf/reader/page.rb +28 -0
  31. data/lib/pdf/reader/page_layout.rb +9 -5
  32. data/lib/pdf/reader/page_state.rb +9 -1
  33. data/lib/pdf/reader/page_text_receiver.rb +4 -1
  34. data/lib/pdf/reader/text_run.rb +24 -0
  35. data/lib/pdf/reader/width_calculator/built_in.rb +17 -1
  36. data/lib/pdf/reader/xref.rb +7 -4
  37. metadata +22 -18
  38. data/lib/pdf/hash.rb +0 -20
@@ -96,25 +96,34 @@ class PDF::Reader
96
96
  Parser.new(buffer)
97
97
  end
98
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
99
107
  def str_to_int(str)
100
108
  return nil if str.nil? || str.size == 0
101
- unpacked_string = if str.size == 1 # UTF-8
109
+ unpacked_string = if str.bytesize == 1 # UTF-8
102
110
  str.unpack("C*")
103
111
  else # UTF-16
104
112
  str.unpack("n*")
105
113
  end
106
- if unpacked_string.size == 1
107
- unpacked_string
108
- elsif unpacked_string.size == 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
109
- # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
110
- # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
111
- # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
112
- [(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
113
- else
114
- # it is a bad idea to just return the first 16 bits, as this doesn't allow
115
- # for ligatures for example fi (U+0066 U+0069)
116
- unpacked_string
114
+ result = []
115
+ while unpacked_string.any? do
116
+ if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
117
+ # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
+ # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
119
+ # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
+ points = [unpacked_string.shift, unpacked_string.shift]
121
+ result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
122
+ else
123
+ result << unpacked_string.shift
124
+ end
117
125
  end
126
+ result
118
127
  end
119
128
 
120
129
  def process_bfchar_instructions(instructions)
@@ -40,20 +40,22 @@ class PDF::Reader
40
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
42
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
46
- elsif enc != nil
47
- enc = enc.to_sym
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
45
+ elsif enc && enc.respond_to?(:to_sym)
46
+ enc.to_sym
48
47
  else
49
- enc = nil
48
+ :StandardEncoding
50
49
  end
51
50
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
55
53
 
56
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
57
59
  end
58
60
 
59
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -8,6 +8,9 @@ class PDF::Reader
8
8
  module Filter # :nodoc:
9
9
  # implementation of the Flate (zlib) stream filter
10
10
  class Flate
11
+ ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
+ ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
13
+
11
14
  def initialize(options = {})
12
15
  @options = options
13
16
  end
@@ -15,25 +18,34 @@ class PDF::Reader
15
18
  ################################################################################
16
19
  # Decode the specified data with the Zlib compression algorithm
17
20
  def filter(data)
18
- deflated = nil
21
+ deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
22
+
23
+ if deflated.nil?
24
+ raise MalformedPDFError,
25
+ "Error while inflating a compressed stream (no suitable inflation algorithm found)"
26
+ end
27
+ Depredict.new(@options).filter(deflated)
28
+ end
29
+
30
+ private
31
+
32
+ def zlib_inflate(data)
19
33
  begin
20
- deflated = Zlib::Inflate.new.inflate(data)
34
+ return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
21
35
  rescue Zlib::DataError => e
22
36
  # by default, Ruby's Zlib assumes the data it's inflating
23
- # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
24
- # If that fails, then use an undocumented 'feature' to attempt to inflate
25
- # the data as a raw RFC1951 stream.
26
- #
27
- # See
28
- # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
29
- # - http://www.gzip.org/zlib/zlib_faq.html#faq38
30
- deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
37
+ # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
38
+ # fails, swallow the exception and attempt to inflate the data as a raw
39
+ # RFC1951 stream.
31
40
  end
32
- Depredict.new(@options).filter(deflated)
33
- rescue Exception => e
34
- # Oops, there was a problem inflating the stream
35
- raise MalformedPDFError,
36
- "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
41
+
42
+ begin
43
+ return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
44
+ rescue StandardError => e
45
+ # swallow this one too, so we can try some other fallback options
46
+ end
47
+
48
+ nil
37
49
  end
38
50
  end
39
51
  end
@@ -97,7 +97,13 @@ class PDF::Reader
97
97
  elsif @subtype == :Type3
98
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
99
99
  elsif @subtype == :TrueType
100
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fallback to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
101
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
102
108
  PDF::Reader::WidthCalculator::Composite.new(self)
103
109
  else
@@ -125,7 +131,9 @@ class PDF::Reader
125
131
  if obj[:ToUnicode]
126
132
  # ToUnicode is optional for Type1 and Type3
127
133
  stream = @ohash.object(obj[:ToUnicode])
128
- @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
134
+ if stream.is_a?(PDF::Reader::Stream)
135
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
136
+ end
129
137
  end
130
138
  end
131
139
 
@@ -78,16 +78,7 @@ class PDF::Reader
78
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
79
  end
80
80
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
82
  rescue InvalidObjectError
92
83
  return default
93
84
  end
@@ -254,6 +245,26 @@ class PDF::Reader
254
245
 
255
246
  private
256
247
 
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse a object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
257
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
270
  # doesn't need to be part of the public API.
@@ -22,12 +22,12 @@ class PDF::Reader
22
22
  def detect_orientation
23
23
  llx,lly,urx,ury = @attributes[:MediaBox]
24
24
  rotation = @attributes[:Rotate].to_i
25
- width = urx.to_i - llx.to_i
26
- height = ury.to_i - lly.to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
27
27
  if width > height
28
- [0,180].include?(rotation) ? 'landscape' : 'portrait'
28
+ (rotation % 180).zero? ? 'landscape' : 'portrait'
29
29
  else
30
- [0,180].include?(rotation) ? 'portrait' : 'landscape'
30
+ (rotation % 180).zero? ? 'portrait' : 'landscape'
31
31
  end
32
32
  end
33
33
  end
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
26
+
27
+ if event_point.start?
28
+ if detect_intersection(sweep_line_status, event_point)
29
+ to_exclude << run
30
+ end
31
+ sweep_line_status.push(run)
32
+ else
33
+ sweep_line_status.delete(run)
34
+ end
35
+ end
36
+ runs - to_exclude
37
+ end
38
+
39
+ def self.detect_intersection(sweep_line_status, event_point)
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
44
+ return true
45
+ end
46
+ end
47
+ return false
48
+ end
49
+ end
50
+
51
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
52
+ # looking for duplicates
53
+ class EventPoint
54
+ attr_reader :x, :run
55
+
56
+ def initialize x, run
57
+ @x, @run = x, run
58
+ end
59
+
60
+ def start?
61
+ @x == @run.x
62
+ end
63
+ end
64
+
65
+ end
@@ -124,6 +124,34 @@ module PDF
124
124
  }.join(" ")
125
125
  end
126
126
 
127
+ # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
128
+ #
129
+ def rotate
130
+ value = attributes[:Rotate].to_i
131
+ case value
132
+ when 0, 90, 180, 270
133
+ value
134
+ else
135
+ 0
136
+ end
137
+ end
138
+
139
+ # returns the "boxes" that define the page object.
140
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
+ #
142
+ def boxes
143
+ mediabox = attributes[:MediaBox]
144
+ cropbox = attributes[:Cropbox] || mediabox
145
+
146
+ {
147
+ MediaBox: objects.deref!(mediabox),
148
+ CropBox: objects.deref!(cropbox),
149
+ BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
+ TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
+ ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
152
+ }
153
+ end
154
+
127
155
  private
128
156
 
129
157
  def root
@@ -1,6 +1,8 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
3
 
4
+ require 'pdf/reader/overlapping_runs_filter'
5
+
4
6
  class PDF::Reader
5
7
 
6
8
  # Takes a collection of TextRun objects and renders them into a single
@@ -15,13 +17,15 @@ class PDF::Reader
15
17
  def initialize(runs, mediabox)
16
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
17
19
 
18
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
19
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
22
- @page_width = mediabox[2] - mediabox[0]
23
- @page_height = mediabox[3] - mediabox[1]
24
- @x_offset = @runs.map(&:x).sort.first
24
+ @page_width = (mediabox[2] - mediabox[0]).abs
25
+ @page_height = (mediabox[3] - mediabox[1]).abs
26
+ @x_offset = @runs.map(&:x).sort.first || 0
27
+ lowest_y = @runs.map(&:y).sort.first || 0
28
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
25
29
  end
26
30
 
27
31
  def to_s
@@ -30,7 +34,7 @@ class PDF::Reader
30
34
  page = row_count.times.map { |i| " " * col_count }
31
35
  @runs.each do |run|
32
36
  x_pos = ((run.x - @x_offset) / col_multiplier).round
33
- y_pos = row_count - (run.y / row_multiplier).round
37
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
34
38
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
35
39
  local_string_insert(page[y_pos-1], run.text, x_pos)
36
40
  end
@@ -30,7 +30,15 @@ class PDF::Reader
30
30
  @xobject_stack = [page.xobjects]
31
31
  @cs_stack = [page.color_spaces]
32
32
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- state[:ctm] = identity_matrix
33
+ if page.rotate == 0
34
+ state[:ctm] = identity_matrix
35
+ else
36
+ rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
37
+ rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
38
+ state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
39
+ rotate_sin * -1, rotate_cos,
40
+ 0, 0)
41
+ end
34
42
  end
35
43
 
36
44
  #####################################################
@@ -44,10 +44,13 @@ module PDF
44
44
  @content = []
45
45
  @characters = []
46
46
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
48
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
49
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
47
50
  end
48
51
 
49
52
  def content
50
- PageLayout.new(@characters, @mediabox).to_s
53
+ PageLayout.new(@characters, @device_mediabox).to_s
51
54
  end
52
55
 
53
56
  #####################################################
@@ -38,6 +38,10 @@ class PDF::Reader
38
38
  @endx ||= x + width
39
39
  end
40
40
 
41
+ def endy
42
+ @endy ||= y + font_size
43
+ end
44
+
41
45
  def mean_character_width
42
46
  @width / character_count
43
47
  end
@@ -60,8 +64,28 @@ class PDF::Reader
60
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
61
65
  end
62
66
 
67
+ def intersect?(other_run)
68
+ x <= other_run.endx && endx >= other_run.x &&
69
+ endy >= other_run.y && y <= other_run.endy
70
+ end
71
+
72
+ # return what percentage of this text run is overlapped by another run
73
+ def intersection_area_percent(other_run)
74
+ return 0 unless intersect?(other_run)
75
+
76
+ dx = [endx, other_run.endx].min - [x, other_run.x].max
77
+ dy = [endy, other_run.endy].min - [y, other_run.y].max
78
+ intersection_area = dx*dy
79
+
80
+ intersection_area.to_f / area
81
+ end
82
+
63
83
  private
64
84
 
85
+ def area
86
+ (endx - x) * (endy - y)
87
+ end
88
+
65
89
  def mergable_range
66
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
67
91
  end
@@ -12,11 +12,20 @@ class PDF::Reader
12
12
  # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
13
13
  class BuiltIn
14
14
 
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
15
23
  def initialize(font)
16
24
  @font = font
17
25
  @@all_metrics ||= PDF::Reader::SynchronizedCache.new
18
26
 
19
- metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{font.basefont}.afm")
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
20
29
 
21
30
  if File.file?(metrics_path)
22
31
  @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
@@ -54,6 +63,13 @@ class PDF::Reader
54
63
  @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
55
64
  end
56
65
 
66
+ def extract_basefont(font_name)
67
+ if BUILTINS.include?(font_name)
68
+ font_name
69
+ else
70
+ "Times-Roman"
71
+ end
72
+ end
57
73
  end
58
74
  end
59
75
  end