pdf-reader 2.2.1 → 2.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1d87a1e4cc6989cb579c5c720ebe8277ab8099a2a6d7044a5c6f843cfabe2a7
4
- data.tar.gz: '02693bdcc7d21572494ffa3f7e4a7e7ecaa558601951590f6c348c97c892ead3'
3
+ metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
4
+ data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
5
5
  SHA512:
6
- metadata.gz: ae3845f040bff4089ba8e4b2df1e22c10ddea1019475e4525b89fdf3889ffd904f98c72162cfe451ff7cfbe2f697e9462d5b2efe4a4144fdfa34568343c51f2c
7
- data.tar.gz: 26755a0cc78cd490e7013f548ed8b46999629995109986f4ee474fff430fd77913888e6172512630118a2b99ef14546a3e94a38681ff66ac9b80482f7504351b
6
+ metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
7
+ data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ v2.3.0 (7th November 2019)
2
+ - Text extraction now makes an effort to skip duplicate characters that overlap, a
3
+ common approach used for a fake "bold" effect. This will make text extraction a bit
4
+ slower - if that turns out to be an issue I'll look into further optimisations or
5
+ provide a toggle to turn it off
6
+ - Several small bug fixes
7
+
1
8
  v2.2.1 (27th July 2019)
2
9
  - Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
3
10
 
@@ -55,7 +55,7 @@ class PDF::Reader
55
55
  #
56
56
  # Params:
57
57
  #
58
- # io - an IO stream or string with the raw data to tokenise
58
+ # io - an IO stream (usually a StringIO) with the raw data to tokenise
59
59
  #
60
60
  # options:
61
61
  #
@@ -96,6 +96,14 @@ class PDF::Reader
96
96
  Parser.new(buffer)
97
97
  end
98
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
99
107
  def str_to_int(str)
100
108
  return nil if str.nil? || str.size == 0
101
109
  unpacked_string = if str.bytesize == 1 # UTF-8
@@ -40,20 +40,22 @@ class PDF::Reader
40
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
42
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
46
45
  elsif enc != nil
47
- enc = enc.to_sym
46
+ enc.to_sym
48
47
  else
49
- enc = nil
48
+ nil
50
49
  end
51
50
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
55
53
 
56
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
57
59
  end
58
60
 
59
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -78,16 +78,7 @@ class PDF::Reader
78
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
79
  end
80
80
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
82
  rescue InvalidObjectError
92
83
  return default
93
84
  end
@@ -254,6 +245,26 @@ class PDF::Reader
254
245
 
255
246
  private
256
247
 
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse an object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
257
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
270
  # doesn't need to be part of the public API.
@@ -0,0 +1,66 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ while not event_point_schedule.empty? do
25
+ event_point = event_point_schedule.shift
26
+ break unless event_point
27
+
28
+ if event_point.start? then
29
+ if detect_intersection(sweep_line_status, event_point)
30
+ to_exclude << event_point.run
31
+ end
32
+ sweep_line_status.push event_point
33
+ else
34
+ sweep_line_status.delete event_point
35
+ end
36
+ end
37
+ runs - to_exclude
38
+ end
39
+
40
+ def self.detect_intersection(sweep_line_status, event_point)
41
+ sweep_line_status.each do |point_in_sls|
42
+ if event_point.x >= point_in_sls.run.x &&
43
+ event_point.x <= point_in_sls.run.endx &&
44
+ point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
45
+ return true
46
+ end
47
+ end
48
+ return false
49
+ end
50
+ end
51
+
52
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
53
+ # looking for duplicates
54
+ class EventPoint
55
+ attr_reader :x, :run
56
+
57
+ def initialize x, run
58
+ @x, @run = x, run
59
+ end
60
+
61
+ def start?
62
+ @x == @run.x
63
+ end
64
+ end
65
+
66
+ end
@@ -1,6 +1,8 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
3
 
4
+ require 'pdf/reader/overlapping_runs_filter'
5
+
4
6
  class PDF::Reader
5
7
 
6
8
  # Takes a collection of TextRun objects and renders them into a single
@@ -15,7 +17,7 @@ class PDF::Reader
15
17
  def initialize(runs, mediabox)
16
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
17
19
 
18
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
19
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
@@ -38,6 +38,10 @@ class PDF::Reader
38
38
  @endx ||= x + width
39
39
  end
40
40
 
41
+ def endy
42
+ @endy ||= y + font_size
43
+ end
44
+
41
45
  def mean_character_width
42
46
  @width / character_count
43
47
  end
@@ -60,8 +64,28 @@ class PDF::Reader
60
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
61
65
  end
62
66
 
67
+ def intersect?(other_run)
68
+ x <= other_run.endx && endx >= other_run.x &&
69
+ endy >= other_run.y && y <= other_run.endy
70
+ end
71
+
72
+ # return what percentage of this text run is overlapped by another run
73
+ def intersection_area_percent(other_run)
74
+ return 0 unless intersect?(other_run)
75
+
76
+ dx = [endx, other_run.endx].min - [x, other_run.x].max
77
+ dy = [endy, other_run.endy].min - [y, other_run.y].max
78
+ intersection_area = dx*dy
79
+
80
+ intersection_area.to_f / area
81
+ end
82
+
63
83
  private
64
84
 
85
+ def area
86
+ (endx - x) * (endy - y)
87
+ end
88
+
65
89
  def mergable_range
66
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
67
91
  end
@@ -230,18 +230,21 @@ class PDF::Reader
230
230
  # should always be 0, but all sorts of crazy junk is prefixed to PDF files
231
231
  # in the real world.
232
232
  #
233
- # Checks up to 50 chars into the file, returns nil if no PDF data detected.
233
+ # Checks up to 1024 chars into the file,
234
+ # returns nil if no PDF data detected.
235
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
236
+ # header appear somewhere within the first 1024 bytes of the file
234
237
  #
235
238
  def calc_junk_offset(io)
236
239
  io.rewind
237
240
  offset = io.pos
238
- until (c = io.readchar) == '%' || c == 37 || offset > 50
241
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
239
242
  offset += 1
240
243
  end
241
244
  io.rewind
242
- offset < 50 ? offset : nil
245
+ offset < 1024 ? offset : nil
243
246
  rescue EOFError
244
- return nil
247
+ nil
245
248
  end
246
249
  end
247
250
  ################################################################################
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-27 00:00:00.000000000 Z
11
+ date: 2019-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -246,6 +246,7 @@ files:
246
246
  - lib/pdf/reader/object_hash.rb
247
247
  - lib/pdf/reader/object_stream.rb
248
248
  - lib/pdf/reader/orientation_detector.rb
249
+ - lib/pdf/reader/overlapping_runs_filter.rb
249
250
  - lib/pdf/reader/page.rb
250
251
  - lib/pdf/reader/page_layout.rb
251
252
  - lib/pdf/reader/page_state.rb
@@ -295,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
295
296
  - !ruby/object:Gem::Version
296
297
  version: '0'
297
298
  requirements: []
298
- rubygems_version: 3.0.1
299
+ rubygems_version: 3.0.3
299
300
  signing_key:
300
301
  specification_version: 4
301
302
  summary: A library for accessing the content of PDF files