pdf-reader 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +7 -0
- data/lib/pdf/reader/buffer.rb +1 -1
- data/lib/pdf/reader/cmap.rb +8 -0
- data/lib/pdf/reader/encoding.rb +10 -8
- data/lib/pdf/reader/object_hash.rb +21 -10
- data/lib/pdf/reader/overlapping_runs_filter.rb +66 -0
- data/lib/pdf/reader/page_layout.rb +3 -1
- data/lib/pdf/reader/text_run.rb +24 -0
- data/lib/pdf/reader/xref.rb +7 -4
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
|
4
|
+
data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
|
7
|
+
data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
v2.3.0 (7th November 2019)
|
2
|
+
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
3
|
+
common approach used for a fake "bold" effect. This will make text extraction a bit
|
4
|
+
slower - if that turns out to be an issue I'll look into further optimisations or
|
5
|
+
provide a toggle to turn it off
|
6
|
+
- Several small bug fixes
|
7
|
+
|
1
8
|
v2.2.1 (27th July 2019)
|
2
9
|
- Improve utf8 text extraction from CMaps that contain surrogate pair ligatures
|
3
10
|
|
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -96,6 +96,14 @@ class PDF::Reader
|
|
96
96
|
Parser.new(buffer)
|
97
97
|
end
|
98
98
|
|
99
|
+
# The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
|
100
|
+
# theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
|
101
|
+
#
|
102
|
+
# str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
|
103
|
+
#
|
104
|
+
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
105
|
+
# exception when we try converting broken UTF-16 to UTF-8
|
106
|
+
#
|
99
107
|
def str_to_int(str)
|
100
108
|
return nil if str.nil? || str.size == 0
|
101
109
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -40,20 +40,22 @@ class PDF::Reader
|
|
40
40
|
@mapping = default_mapping # maps from character codes to Unicode codepoints
|
41
41
|
@string_cache = {} # maps from character codes to UTF-8 strings.
|
42
42
|
|
43
|
-
if enc.kind_of?(Hash)
|
44
|
-
|
45
|
-
enc = enc[:Encoding] || enc[:BaseEncoding]
|
43
|
+
@enc_name = if enc.kind_of?(Hash)
|
44
|
+
enc[:Encoding] || enc[:BaseEncoding]
|
46
45
|
elsif enc != nil
|
47
|
-
enc
|
46
|
+
enc.to_sym
|
48
47
|
else
|
49
|
-
|
48
|
+
nil
|
50
49
|
end
|
51
50
|
|
52
|
-
@
|
53
|
-
@
|
54
|
-
@map_file = get_mapping_file(enc)
|
51
|
+
@unpack = get_unpack(@enc_name)
|
52
|
+
@map_file = get_mapping_file(@enc_name)
|
55
53
|
|
56
54
|
load_mapping(@map_file) if @map_file
|
55
|
+
|
56
|
+
if enc.is_a?(Hash) && enc[:Differences]
|
57
|
+
self.differences = enc[:Differences]
|
58
|
+
end
|
57
59
|
end
|
58
60
|
|
59
61
|
# set the differences table for this encoding. should be an array in the following format:
|
data/lib/pdf/reader/object_hash.rb
CHANGED
@@ -78,16 +78,7 @@ class PDF::Reader
|
|
78
78
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
79
79
|
end
|
80
80
|
|
81
|
-
|
82
|
-
@cache[key]
|
83
|
-
elsif xref[key].is_a?(Integer)
|
84
|
-
buf = new_buffer(xref[key])
|
85
|
-
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
86
|
-
elsif xref[key].is_a?(PDF::Reader::Reference)
|
87
|
-
container_key = xref[key]
|
88
|
-
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
89
|
-
@cache[key] = object_streams[container_key][key.id]
|
90
|
-
end
|
81
|
+
@cache[key] ||= fetch_object(key) || fetch_object_stream(key)
|
91
82
|
rescue InvalidObjectError
|
92
83
|
return default
|
93
84
|
end
|
@@ -254,6 +245,26 @@ class PDF::Reader
|
|
254
245
|
|
255
246
|
private
|
256
247
|
|
248
|
+
# parse a traditional object from the PDF, starting from the byte offset indicated
|
249
|
+
# in the xref table
|
250
|
+
#
|
251
|
+
def fetch_object(key)
|
252
|
+
if xref[key].is_a?(Integer)
|
253
|
+
buf = new_buffer(xref[key])
|
254
|
+
decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
# parse an object that's embedded in an object stream in the PDF
|
259
|
+
#
|
260
|
+
def fetch_object_stream(key)
|
261
|
+
if xref[key].is_a?(PDF::Reader::Reference)
|
262
|
+
container_key = xref[key]
|
263
|
+
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
264
|
+
object_streams[container_key][key.id]
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
257
268
|
# Private implementation of deref!, which exists to ensure the `seen` argument
|
258
269
|
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
259
270
|
# doesn't need to be part of the public API.
|
data/lib/pdf/reader/overlapping_runs_filter.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
5
|
+
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
|
6
|
+
class OverlappingRunsFilter
|
7
|
+
|
8
|
+
# This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
|
9
|
+
# have identical characters) then one will be discarded
|
10
|
+
OVERLAPPING_THRESHOLD = 0.5
|
11
|
+
|
12
|
+
def self.exclude_redundant_runs(runs)
|
13
|
+
sweep_line_status = Array.new
|
14
|
+
event_point_schedule = Array.new
|
15
|
+
to_exclude = []
|
16
|
+
|
17
|
+
runs.each do |run|
|
18
|
+
event_point_schedule << EventPoint.new(run.x, run)
|
19
|
+
event_point_schedule << EventPoint.new(run.endx, run)
|
20
|
+
end
|
21
|
+
|
22
|
+
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
23
|
+
|
24
|
+
while not event_point_schedule.empty? do
|
25
|
+
event_point = event_point_schedule.shift
|
26
|
+
break unless event_point
|
27
|
+
|
28
|
+
if event_point.start? then
|
29
|
+
if detect_intersection(sweep_line_status, event_point)
|
30
|
+
to_exclude << event_point.run
|
31
|
+
end
|
32
|
+
sweep_line_status.push event_point
|
33
|
+
else
|
34
|
+
sweep_line_status.delete event_point
|
35
|
+
end
|
36
|
+
end
|
37
|
+
runs - to_exclude
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.detect_intersection(sweep_line_status, event_point)
|
41
|
+
sweep_line_status.each do |point_in_sls|
|
42
|
+
if event_point.x >= point_in_sls.run.x &&
|
43
|
+
event_point.x <= point_in_sls.run.endx &&
|
44
|
+
point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
45
|
+
return true
|
46
|
+
end
|
47
|
+
end
|
48
|
+
return false
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
53
|
+
# looking for duplicates
|
54
|
+
class EventPoint
|
55
|
+
attr_reader :x, :run
|
56
|
+
|
57
|
+
def initialize x, run
|
58
|
+
@x, @run = x, run
|
59
|
+
end
|
60
|
+
|
61
|
+
def start?
|
62
|
+
@x == @run.x
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
data/lib/pdf/reader/page_layout.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
|
+
require 'pdf/reader/overlapping_runs_filter'
|
5
|
+
|
4
6
|
class PDF::Reader
|
5
7
|
|
6
8
|
# Takes a collection of TextRun objects and renders them into a single
|
@@ -15,7 +17,7 @@ class PDF::Reader
|
|
15
17
|
def initialize(runs, mediabox)
|
16
18
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
17
19
|
|
18
|
-
@runs = merge_runs(runs)
|
20
|
+
@runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
|
19
21
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
20
22
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
21
23
|
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
|
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -38,6 +38,10 @@ class PDF::Reader
|
|
38
38
|
@endx ||= x + width
|
39
39
|
end
|
40
40
|
|
41
|
+
def endy
|
42
|
+
@endy ||= y + font_size
|
43
|
+
end
|
44
|
+
|
41
45
|
def mean_character_width
|
42
46
|
@width / character_count
|
43
47
|
end
|
@@ -60,8 +64,28 @@ class PDF::Reader
|
|
60
64
|
"#{text} w:#{width} f:#{font_size} @#{x},#{y}"
|
61
65
|
end
|
62
66
|
|
67
|
+
def intersect?(other_run)
|
68
|
+
x <= other_run.endx && endx >= other_run.x &&
|
69
|
+
endy >= other_run.y && y <= other_run.endy
|
70
|
+
end
|
71
|
+
|
72
|
+
# return what percentage of this text run is overlapped by another run
|
73
|
+
def intersection_area_percent(other_run)
|
74
|
+
return 0 unless intersect?(other_run)
|
75
|
+
|
76
|
+
dx = [endx, other_run.endx].min - [x, other_run.x].max
|
77
|
+
dy = [endy, other_run.endy].min - [y, other_run.y].max
|
78
|
+
intersection_area = dx*dy
|
79
|
+
|
80
|
+
intersection_area.to_f / area
|
81
|
+
end
|
82
|
+
|
63
83
|
private
|
64
84
|
|
85
|
+
def area
|
86
|
+
(endx - x) * (endy - y)
|
87
|
+
end
|
88
|
+
|
65
89
|
def mergable_range
|
66
90
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
67
91
|
end
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -230,18 +230,21 @@ class PDF::Reader
|
|
230
230
|
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
|
231
231
|
# in the real world.
|
232
232
|
#
|
233
|
-
# Checks up to
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
234
237
|
#
|
235
238
|
def calc_junk_offset(io)
|
236
239
|
io.rewind
|
237
240
|
offset = io.pos
|
238
|
-
until (c = io.readchar) == '%' || c == 37 || offset >
|
241
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
239
242
|
offset += 1
|
240
243
|
end
|
241
244
|
io.rewind
|
242
|
-
offset <
|
245
|
+
offset < 1024 ? offset : nil
|
243
246
|
rescue EOFError
|
244
|
-
|
247
|
+
nil
|
245
248
|
end
|
246
249
|
end
|
247
250
|
################################################################################
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.1
|
4
|
+
version: 2.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07
|
11
|
+
date: 2019-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -246,6 +246,7 @@ files:
|
|
246
246
|
- lib/pdf/reader/object_hash.rb
|
247
247
|
- lib/pdf/reader/object_stream.rb
|
248
248
|
- lib/pdf/reader/orientation_detector.rb
|
249
|
+
- lib/pdf/reader/overlapping_runs_filter.rb
|
249
250
|
- lib/pdf/reader/page.rb
|
250
251
|
- lib/pdf/reader/page_layout.rb
|
251
252
|
- lib/pdf/reader/page_state.rb
|
@@ -295,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
295
296
|
- !ruby/object:Gem::Version
|
296
297
|
version: '0'
|
297
298
|
requirements: []
|
298
|
-
rubygems_version: 3.0.
|
299
|
+
rubygems_version: 3.0.3
|
299
300
|
signing_key:
|
300
301
|
specification_version: 4
|
301
302
|
summary: A library for accessing the content of PDF files
|