pdf-reader 2.2.1 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +7 -0
- data/lib/pdf/reader/buffer.rb +1 -1
- data/lib/pdf/reader/cmap.rb +8 -0
- data/lib/pdf/reader/encoding.rb +10 -8
- data/lib/pdf/reader/object_hash.rb +21 -10
- data/lib/pdf/reader/overlapping_runs_filter.rb +66 -0
- data/lib/pdf/reader/page_layout.rb +3 -1
- data/lib/pdf/reader/text_run.rb +24 -0
- data/lib/pdf/reader/xref.rb +7 -4
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
|
4
|
+
data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
|
7
|
+
data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v2.3.0 (7th November 2019)
|
2
|
+
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
3
|
+
common approach used for a fake "bold" effect. This will make text extraction a bit
|
4
|
+
slower - if that turns out to be an issue I'll look into further optimisations or
|
5
|
+
provide a toggle to turn it off
|
6
|
+
- Several small bug fixes
|
7
|
+
|
1
8
|
v2.2.1 (27th July 2019)
|
2
9
|
- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
|
3
10
|
|
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -96,6 +96,14 @@ class PDF::Reader
|
|
96
96
|
Parser.new(buffer)
|
97
97
|
end
|
98
98
|
|
99
|
+
# The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
|
100
|
+
# theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
|
101
|
+
#
|
102
|
+
# str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
|
103
|
+
#
|
104
|
+
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
105
|
+
# exception when we try converting broken UTF-16 to UTF-8
|
106
|
+
#
|
99
107
|
def str_to_int(str)
|
100
108
|
return nil if str.nil? || str.size == 0
|
101
109
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -40,20 +40,22 @@ class PDF::Reader
|
|
40
40
|
@mapping = default_mapping # maps from character codes to Unicode codepoints
|
41
41
|
@string_cache = {} # maps from character codes to UTF-8 strings.
|
42
42
|
|
43
|
-
if enc.kind_of?(Hash)
|
44
|
-
|
45
|
-
enc = enc[:Encoding] || enc[:BaseEncoding]
|
43
|
+
@enc_name = if enc.kind_of?(Hash)
|
44
|
+
enc[:Encoding] || enc[:BaseEncoding]
|
46
45
|
elsif enc != nil
|
47
|
-
enc
|
46
|
+
enc.to_sym
|
48
47
|
else
|
49
|
-
|
48
|
+
nil
|
50
49
|
end
|
51
50
|
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@map_file = get_mapping_file(enc)
|
51
|
+
@unpack = get_unpack(@enc_name)
|
52
|
+
@map_file = get_mapping_file(@enc_name)
|
55
53
|
|
56
54
|
load_mapping(@map_file) if @map_file
|
55
|
+
|
56
|
+
if enc.is_a?(Hash) && enc[:Differences]
|
57
|
+
self.differences = enc[:Differences]
|
58
|
+
end
|
57
59
|
end
|
58
60
|
|
59
61
|
# set the differences table for this encoding. should be an array in the following format:
|
data/lib/pdf/reader/object_hash.rb
CHANGED
@@ -78,16 +78,7 @@ class PDF::Reader
|
|
78
78
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
79
79
|
end
|
80
80
|
|
81
|
-
|
82
|
-
@cache[key]
|
83
|
-
elsif xref[key].is_a?(Integer)
|
84
|
-
buf = new_buffer(xref[key])
|
85
|
-
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
86
|
-
elsif xref[key].is_a?(PDF::Reader::Reference)
|
87
|
-
container_key = xref[key]
|
88
|
-
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
89
|
-
@cache[key] = object_streams[container_key][key.id]
|
90
|
-
end
|
81
|
+
@cache[key] ||= fetch_object(key) || fetch_object_stream(key)
|
91
82
|
rescue InvalidObjectError
|
92
83
|
return default
|
93
84
|
end
|
@@ -254,6 +245,26 @@ class PDF::Reader
|
|
254
245
|
|
255
246
|
private
|
256
247
|
|
248
|
+
# parse a traditional object from the PDF, starting from the byte offset indicated
|
249
|
+
# in the xref table
|
250
|
+
#
|
251
|
+
def fetch_object(key)
|
252
|
+
if xref[key].is_a?(Integer)
|
253
|
+
buf = new_buffer(xref[key])
|
254
|
+
decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
# parse an object that's embedded in an object stream in the PDF
|
259
|
+
#
|
260
|
+
def fetch_object_stream(key)
|
261
|
+
if xref[key].is_a?(PDF::Reader::Reference)
|
262
|
+
container_key = xref[key]
|
263
|
+
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
264
|
+
object_streams[container_key][key.id]
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
257
268
|
# Private implementation of deref!, which exists to ensure the `seen` argument
|
258
269
|
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
259
270
|
# doesn't need to be part of the public API.
|
data/lib/pdf/reader/overlapping_runs_filter.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
5
|
+
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
|
6
|
+
class OverlappingRunsFilter
|
7
|
+
|
8
|
+
# This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
|
9
|
+
# have identical characters) then one will be discarded
|
10
|
+
OVERLAPPING_THRESHOLD = 0.5
|
11
|
+
|
12
|
+
def self.exclude_redundant_runs(runs)
|
13
|
+
sweep_line_status = Array.new
|
14
|
+
event_point_schedule = Array.new
|
15
|
+
to_exclude = []
|
16
|
+
|
17
|
+
runs.each do |run|
|
18
|
+
event_point_schedule << EventPoint.new(run.x, run)
|
19
|
+
event_point_schedule << EventPoint.new(run.endx, run)
|
20
|
+
end
|
21
|
+
|
22
|
+
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
23
|
+
|
24
|
+
while not event_point_schedule.empty? do
|
25
|
+
event_point = event_point_schedule.shift
|
26
|
+
break unless event_point
|
27
|
+
|
28
|
+
if event_point.start? then
|
29
|
+
if detect_intersection(sweep_line_status, event_point)
|
30
|
+
to_exclude << event_point.run
|
31
|
+
end
|
32
|
+
sweep_line_status.push event_point
|
33
|
+
else
|
34
|
+
sweep_line_status.delete event_point
|
35
|
+
end
|
36
|
+
end
|
37
|
+
runs - to_exclude
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.detect_intersection(sweep_line_status, event_point)
|
41
|
+
sweep_line_status.each do |point_in_sls|
|
42
|
+
if event_point.x >= point_in_sls.run.x &&
|
43
|
+
event_point.x <= point_in_sls.run.endx &&
|
44
|
+
point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
45
|
+
return true
|
46
|
+
end
|
47
|
+
end
|
48
|
+
return false
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
53
|
+
# looking for duplicates
|
54
|
+
class EventPoint
|
55
|
+
attr_reader :x, :run
|
56
|
+
|
57
|
+
def initialize x, run
|
58
|
+
@x, @run = x, run
|
59
|
+
end
|
60
|
+
|
61
|
+
def start?
|
62
|
+
@x == @run.x
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
data/lib/pdf/reader/page_layout.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
|
+
require 'pdf/reader/overlapping_runs_filter'
|
5
|
+
|
4
6
|
class PDF::Reader
|
5
7
|
|
6
8
|
# Takes a collection of TextRun objects and renders them into a single
|
@@ -15,7 +17,7 @@ class PDF::Reader
|
|
15
17
|
def initialize(runs, mediabox)
|
16
18
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
17
19
|
|
18
|
-
@runs = merge_runs(runs)
|
20
|
+
@runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
|
19
21
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
20
22
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
21
23
|
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
|
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -38,6 +38,10 @@ class PDF::Reader
|
|
38
38
|
@endx ||= x + width
|
39
39
|
end
|
40
40
|
|
41
|
+
def endy
|
42
|
+
@endy ||= y + font_size
|
43
|
+
end
|
44
|
+
|
41
45
|
def mean_character_width
|
42
46
|
@width / character_count
|
43
47
|
end
|
@@ -60,8 +64,28 @@ class PDF::Reader
|
|
60
64
|
"#{text} w:#{width} f:#{font_size} @#{x},#{y}"
|
61
65
|
end
|
62
66
|
|
67
|
+
def intersect?(other_run)
|
68
|
+
x <= other_run.endx && endx >= other_run.x &&
|
69
|
+
endy >= other_run.y && y <= other_run.endy
|
70
|
+
end
|
71
|
+
|
72
|
+
# return what percentage of this text run is overlapped by another run
|
73
|
+
def intersection_area_percent(other_run)
|
74
|
+
return 0 unless intersect?(other_run)
|
75
|
+
|
76
|
+
dx = [endx, other_run.endx].min - [x, other_run.x].max
|
77
|
+
dy = [endy, other_run.endy].min - [y, other_run.y].max
|
78
|
+
intersection_area = dx*dy
|
79
|
+
|
80
|
+
intersection_area.to_f / area
|
81
|
+
end
|
82
|
+
|
63
83
|
private
|
64
84
|
|
85
|
+
def area
|
86
|
+
(endx - x) * (endy - y)
|
87
|
+
end
|
88
|
+
|
65
89
|
def mergable_range
|
66
90
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
67
91
|
end
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -230,18 +230,21 @@ class PDF::Reader
|
|
230
230
|
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
|
231
231
|
# in the real world.
|
232
232
|
#
|
233
|
-
# Checks up to
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
234
237
|
#
|
235
238
|
def calc_junk_offset(io)
|
236
239
|
io.rewind
|
237
240
|
offset = io.pos
|
238
|
-
until (c = io.readchar) == '%' || c == 37 || offset >
|
241
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
239
242
|
offset += 1
|
240
243
|
end
|
241
244
|
io.rewind
|
242
|
-
offset <
|
245
|
+
offset < 1024 ? offset : nil
|
243
246
|
rescue EOFError
|
244
|
-
|
247
|
+
nil
|
245
248
|
end
|
246
249
|
end
|
247
250
|
################################################################################
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.1
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07
|
11
|
+
date: 2019-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -246,6 +246,7 @@ files:
|
|
246
246
|
- lib/pdf/reader/object_hash.rb
|
247
247
|
- lib/pdf/reader/object_stream.rb
|
248
248
|
- lib/pdf/reader/orientation_detector.rb
|
249
|
+
- lib/pdf/reader/overlapping_runs_filter.rb
|
249
250
|
- lib/pdf/reader/page.rb
|
250
251
|
- lib/pdf/reader/page_layout.rb
|
251
252
|
- lib/pdf/reader/page_state.rb
|
@@ -295,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
295
296
|
- !ruby/object:Gem::Version
|
296
297
|
version: '0'
|
297
298
|
requirements: []
|
298
|
-
rubygems_version: 3.0.
|
299
|
+
rubygems_version: 3.0.3
|
299
300
|
signing_key:
|
300
301
|
specification_version: 4
|
301
302
|
summary: A library for accessing the content of PDF files
|