pdf-reader 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1d87a1e4cc6989cb579c5c720ebe8277ab8099a2a6d7044a5c6f843cfabe2a7
4
- data.tar.gz: '02693bdcc7d21572494ffa3f7e4a7e7ecaa558601951590f6c348c97c892ead3'
3
+ metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
4
+ data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
5
5
  SHA512:
6
- metadata.gz: ae3845f040bff4089ba8e4b2df1e22c10ddea1019475e4525b89fdf3889ffd904f98c72162cfe451ff7cfbe2f697e9462d5b2efe4a4144fdfa34568343c51f2c
7
- data.tar.gz: 26755a0cc78cd490e7013f548ed8b46999629995109986f4ee474fff430fd77913888e6172512630118a2b99ef14546a3e94a38681ff66ac9b80482f7504351b
6
+ metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
7
+ data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33
data/CHANGELOG CHANGED
@@ -1,3 +1,10 @@
1
+ v2.3.0 (7th November 2019)
2
+ - Text extraction now makes an effort to skip duplicate characters that overlap, a
3
+ common approach used for a fake "bold" effect. This will make text extraction a bit
4
+ slower - if that turns out to be an issue I'll look into further optimisations or
5
+ provide a toggle to turn it off
6
+ - Several small bug fixes
7
+
1
8
  v2.2.1 (27th July 2019)
2
9
  - Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
3
10
 
@@ -55,7 +55,7 @@ class PDF::Reader
55
55
  #
56
56
  # Params:
57
57
  #
58
- # io - an IO stream or string with the raw data to tokenise
58
+ # io - an IO stream (usually a StringIO) with the raw data to tokenise
59
59
  #
60
60
  # options:
61
61
  #
@@ -96,6 +96,14 @@ class PDF::Reader
96
96
  Parser.new(buffer)
97
97
  end
98
98
 
99
+ # The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
100
+ # theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
101
+ #
102
+ # str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
103
+ #
104
+ # However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
105
+ # exception when we try converting broken UTF-16 to UTF-8
106
+ #
99
107
  def str_to_int(str)
100
108
  return nil if str.nil? || str.size == 0
101
109
  unpacked_string = if str.bytesize == 1 # UTF-8
@@ -40,20 +40,22 @@ class PDF::Reader
40
40
  @mapping = default_mapping # maps from character codes to Unicode codepoints
41
41
  @string_cache = {} # maps from character codes to UTF-8 strings.
42
42
 
43
- if enc.kind_of?(Hash)
44
- self.differences = enc[:Differences] if enc[:Differences]
45
- enc = enc[:Encoding] || enc[:BaseEncoding]
43
+ @enc_name = if enc.kind_of?(Hash)
44
+ enc[:Encoding] || enc[:BaseEncoding]
46
45
  elsif enc != nil
47
- enc = enc.to_sym
46
+ enc.to_sym
48
47
  else
49
- enc = nil
48
+ nil
50
49
  end
51
50
 
52
- @enc_name = enc
53
- @unpack = get_unpack(enc)
54
- @map_file = get_mapping_file(enc)
51
+ @unpack = get_unpack(@enc_name)
52
+ @map_file = get_mapping_file(@enc_name)
55
53
 
56
54
  load_mapping(@map_file) if @map_file
55
+
56
+ if enc.is_a?(Hash) && enc[:Differences]
57
+ self.differences = enc[:Differences]
58
+ end
57
59
  end
58
60
 
59
61
  # set the differences table for this encoding. should be an array in the following format:
@@ -78,16 +78,7 @@ class PDF::Reader
78
78
  key = PDF::Reader::Reference.new(key.to_i, 0)
79
79
  end
80
80
 
81
- if @cache.has_key?(key)
82
- @cache[key]
83
- elsif xref[key].is_a?(Integer)
84
- buf = new_buffer(xref[key])
85
- @cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
86
- elsif xref[key].is_a?(PDF::Reader::Reference)
87
- container_key = xref[key]
88
- object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
89
- @cache[key] = object_streams[container_key][key.id]
90
- end
81
+ @cache[key] ||= fetch_object(key) || fetch_object_stream(key)
91
82
  rescue InvalidObjectError
92
83
  return default
93
84
  end
@@ -254,6 +245,26 @@ class PDF::Reader
254
245
 
255
246
  private
256
247
 
248
+ # parse a traditional object from the PDF, starting from the byte offset indicated
249
+ # in the xref table
250
+ #
251
+ def fetch_object(key)
252
+ if xref[key].is_a?(Integer)
253
+ buf = new_buffer(xref[key])
254
+ decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
255
+ end
256
+ end
257
+
258
+ # parse an object that's embedded in an object stream in the PDF
259
+ #
260
+ def fetch_object_stream(key)
261
+ if xref[key].is_a?(PDF::Reader::Reference)
262
+ container_key = xref[key]
263
+ object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
264
+ object_streams[container_key][key.id]
265
+ end
266
+ end
267
+
257
268
  # Private implementation of deref!, which exists to ensure the `seen` argument
258
269
  # isn't publicly available. It's used to avoid endless loops in the recursion, and
259
270
  # doesn't need to be part of the public API.
@@ -0,0 +1,66 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
5
+ # uses slightly offset overlapping characters to achieve a fake 'bold' effect.
6
+ class OverlappingRunsFilter
7
+
8
+ # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
9
+ # have identical characters) then one will be discarded
10
+ OVERLAPPING_THRESHOLD = 0.5
11
+
12
+ def self.exclude_redundant_runs(runs)
13
+ sweep_line_status = Array.new
14
+ event_point_schedule = Array.new
15
+ to_exclude = []
16
+
17
+ runs.each do |run|
18
+ event_point_schedule << EventPoint.new(run.x, run)
19
+ event_point_schedule << EventPoint.new(run.endx, run)
20
+ end
21
+
22
+ event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
+
24
+ while not event_point_schedule.empty? do
25
+ event_point = event_point_schedule.shift
26
+ break unless event_point
27
+
28
+ if event_point.start? then
29
+ if detect_intersection(sweep_line_status, event_point)
30
+ to_exclude << event_point.run
31
+ end
32
+ sweep_line_status.push event_point
33
+ else
34
+ sweep_line_status.delete event_point
35
+ end
36
+ end
37
+ runs - to_exclude
38
+ end
39
+
40
+ def self.detect_intersection(sweep_line_status, event_point)
41
+ sweep_line_status.each do |point_in_sls|
42
+ if event_point.x >= point_in_sls.run.x &&
43
+ event_point.x <= point_in_sls.run.endx &&
44
+ point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
45
+ return true
46
+ end
47
+ end
48
+ return false
49
+ end
50
+ end
51
+
52
+ # Utility class used to avoid modifying the underlying TextRun objects while we're
53
+ # looking for duplicates
54
+ class EventPoint
55
+ attr_reader :x, :run
56
+
57
+ def initialize x, run
58
+ @x, @run = x, run
59
+ end
60
+
61
+ def start?
62
+ @x == @run.x
63
+ end
64
+ end
65
+
66
+ end
@@ -1,6 +1,8 @@
1
1
  # coding: utf-8
2
2
  # frozen_string_literal: true
3
3
 
4
+ require 'pdf/reader/overlapping_runs_filter'
5
+
4
6
  class PDF::Reader
5
7
 
6
8
  # Takes a collection of TextRun objects and renders them into a single
@@ -15,7 +17,7 @@ class PDF::Reader
15
17
  def initialize(runs, mediabox)
16
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
17
19
 
18
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
19
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
@@ -38,6 +38,10 @@ class PDF::Reader
38
38
  @endx ||= x + width
39
39
  end
40
40
 
41
+ def endy
42
+ @endy ||= y + font_size
43
+ end
44
+
41
45
  def mean_character_width
42
46
  @width / character_count
43
47
  end
@@ -60,8 +64,28 @@ class PDF::Reader
60
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
61
65
  end
62
66
 
67
+ def intersect?(other_run)
68
+ x <= other_run.endx && endx >= other_run.x &&
69
+ endy >= other_run.y && y <= other_run.endy
70
+ end
71
+
72
+ # return what percentage of this text run is overlapped by another run
73
+ def intersection_area_percent(other_run)
74
+ return 0 unless intersect?(other_run)
75
+
76
+ dx = [endx, other_run.endx].min - [x, other_run.x].max
77
+ dy = [endy, other_run.endy].min - [y, other_run.y].max
78
+ intersection_area = dx*dy
79
+
80
+ intersection_area.to_f / area
81
+ end
82
+
63
83
  private
64
84
 
85
+ def area
86
+ (endx - x) * (endy - y)
87
+ end
88
+
65
89
  def mergable_range
66
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
67
91
  end
@@ -230,18 +230,21 @@ class PDF::Reader
230
230
  # should always be 0, but all sorts of crazy junk is prefixed to PDF files
231
231
  # in the real world.
232
232
  #
233
- # Checks up to 50 chars into the file, returns nil if no PDF data detected.
233
+ # Checks up to 1024 chars into the file,
234
+ # returns nil if no PDF data detected.
235
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
236
+ # header appear somewhere within the first 1024 bytes of the file
234
237
  #
235
238
  def calc_junk_offset(io)
236
239
  io.rewind
237
240
  offset = io.pos
238
- until (c = io.readchar) == '%' || c == 37 || offset > 50
241
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
239
242
  offset += 1
240
243
  end
241
244
  io.rewind
242
- offset < 50 ? offset : nil
245
+ offset < 1024 ? offset : nil
243
246
  rescue EOFError
244
- return nil
247
+ nil
245
248
  end
246
249
  end
247
250
  ################################################################################
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
4
+ version: 2.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-27 00:00:00.000000000 Z
11
+ date: 2019-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -246,6 +246,7 @@ files:
246
246
  - lib/pdf/reader/object_hash.rb
247
247
  - lib/pdf/reader/object_stream.rb
248
248
  - lib/pdf/reader/orientation_detector.rb
249
+ - lib/pdf/reader/overlapping_runs_filter.rb
249
250
  - lib/pdf/reader/page.rb
250
251
  - lib/pdf/reader/page_layout.rb
251
252
  - lib/pdf/reader/page_state.rb
@@ -295,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
295
296
  - !ruby/object:Gem::Version
296
297
  version: '0'
297
298
  requirements: []
298
- rubygems_version: 3.0.1
299
+ rubygems_version: 3.0.3
299
300
  signing_key:
300
301
  specification_version: 4
301
302
  summary: A library for accessing the content of PDF files