pdf-reader 2.3.0 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +9 -0
- data/README.md +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader.rb +1 -2
- data/lib/pdf/reader/encoding.rb +2 -2
- data/lib/pdf/reader/font.rb +7 -1
- data/lib/pdf/reader/overlapping_runs_filter.rb +10 -11
- data/lib/pdf/reader/width_calculator/built_in.rb +17 -1
- metadata +8 -5
- data/lib/pdf/hash.rb +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1acf8110733b6aff447e40353cc7d847e6edbb9bad016beab35bebe191bc91a
|
4
|
+
data.tar.gz: 1e50a894289d9c4f8df83bcf55e1f0bfe9e7c3088365705bea9b253a34a9254e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 473d030dd4e12e6aba2e037fe7b73410e99f14112ddd7823483056281cdd89e6865efb1f02866760038f892b7d60f153db8c88643d73035ee354e43d1cf047c5
|
7
|
+
data.tar.gz: 30d842ae8260a8a2005b484c9550ae04a77e23b17e36b324e36e6d6faa4a7e0f83ebdab7cc69b504e143740ebcb9fad341e5ca8c0b0b6339fa826634f94996e2
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
v2.4.0 (21st November 2019)
|
2
|
+
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
3
|
+
thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
|
4
|
+
for now. See https://github.com/yob/pdf-reader/pull/308 for details.
|
5
|
+
- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
|
6
|
+
- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
|
7
|
+
is still using it.
|
8
|
+
- Several small bug fixes
|
9
|
+
|
1
10
|
v2.3.0 (7th November 2019)
|
2
11
|
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
3
12
|
common approach used for a fake "bold" effect. This will make text extraction a bit
|
data/README.md
CHANGED
data/bin/pdf_callbacks
CHANGED
data/bin/pdf_text
CHANGED
data/lib/pdf/reader.rb
CHANGED
@@ -180,7 +180,7 @@ module PDF
|
|
180
180
|
(1..self.page_count).map do |num|
|
181
181
|
begin
|
182
182
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
183
|
-
rescue InvalidPageError
|
183
|
+
rescue InvalidPageError
|
184
184
|
raise MalformedPDFError, "Missing data for page: #{num}"
|
185
185
|
end
|
186
186
|
end
|
@@ -301,4 +301,3 @@ require 'pdf/reader/token'
|
|
301
301
|
require 'pdf/reader/xref'
|
302
302
|
require 'pdf/reader/orientation_detector'
|
303
303
|
require 'pdf/reader/page'
|
304
|
-
require 'pdf/hash'
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -42,10 +42,10 @@ class PDF::Reader
|
|
42
42
|
|
43
43
|
@enc_name = if enc.kind_of?(Hash)
|
44
44
|
enc[:Encoding] || enc[:BaseEncoding]
|
45
|
-
elsif enc
|
45
|
+
elsif enc && enc.respond_to?(:to_sym)
|
46
46
|
enc.to_sym
|
47
47
|
else
|
48
|
-
|
48
|
+
:StandardEncoding
|
49
49
|
end
|
50
50
|
|
51
51
|
@unpack = get_unpack(@enc_name)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -97,7 +97,13 @@ class PDF::Reader
|
|
97
97
|
elsif @subtype == :Type3
|
98
98
|
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
99
99
|
elsif @subtype == :TrueType
|
100
|
-
|
100
|
+
if @font_descriptor
|
101
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
102
|
+
else
|
103
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
104
|
+
# local system and fall back to a substitute. For now, we go straight to a substitute
|
105
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
106
|
+
end
|
101
107
|
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
102
108
|
PDF::Reader::WidthCalculator::Composite.new(self)
|
103
109
|
else
|
@@ -21,27 +21,26 @@ class PDF::Reader
|
|
21
21
|
|
22
22
|
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
break unless event_point
|
24
|
+
event_point_schedule.each do |event_point|
|
25
|
+
run = event_point.run
|
27
26
|
|
28
|
-
if event_point.start?
|
27
|
+
if event_point.start?
|
29
28
|
if detect_intersection(sweep_line_status, event_point)
|
30
|
-
to_exclude <<
|
29
|
+
to_exclude << run
|
31
30
|
end
|
32
|
-
sweep_line_status.push
|
31
|
+
sweep_line_status.push(run)
|
33
32
|
else
|
34
|
-
sweep_line_status.delete
|
33
|
+
sweep_line_status.delete(run)
|
35
34
|
end
|
36
35
|
end
|
37
36
|
runs - to_exclude
|
38
37
|
end
|
39
38
|
|
40
39
|
def self.detect_intersection(sweep_line_status, event_point)
|
41
|
-
sweep_line_status.each do |
|
42
|
-
if event_point.x >=
|
43
|
-
event_point.x <=
|
44
|
-
|
40
|
+
sweep_line_status.each do |open_text_run|
|
41
|
+
if event_point.x >= open_text_run.x &&
|
42
|
+
event_point.x <= open_text_run.endx &&
|
43
|
+
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
45
44
|
return true
|
46
45
|
end
|
47
46
|
end
|
@@ -12,11 +12,20 @@ class PDF::Reader
|
|
12
12
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
13
13
|
class BuiltIn
|
14
14
|
|
15
|
+
BUILTINS = [
|
16
|
+
:Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
|
17
|
+
:Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
|
18
|
+
:Symbol,
|
19
|
+
:"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
|
20
|
+
:ZapfDingbats
|
21
|
+
]
|
22
|
+
|
15
23
|
def initialize(font)
|
16
24
|
@font = font
|
17
25
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
18
26
|
|
19
|
-
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
20
29
|
|
21
30
|
if File.file?(metrics_path)
|
22
31
|
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
@@ -54,6 +63,13 @@ class PDF::Reader
|
|
54
63
|
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
55
64
|
end
|
56
65
|
|
66
|
+
def extract_basefont(font_name)
|
67
|
+
if BUILTINS.include?(font_name)
|
68
|
+
font_name
|
69
|
+
else
|
70
|
+
"Times-Roman"
|
71
|
+
end
|
72
|
+
end
|
57
73
|
end
|
58
74
|
end
|
59
75
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.3.0
|
4
|
+
version: 2.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -199,7 +199,6 @@ files:
|
|
199
199
|
- examples/text.rb
|
200
200
|
- examples/version.rb
|
201
201
|
- lib/pdf-reader.rb
|
202
|
-
- lib/pdf/hash.rb
|
203
202
|
- lib/pdf/reader.rb
|
204
203
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
205
204
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
@@ -272,10 +271,14 @@ files:
|
|
272
271
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
273
272
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
274
273
|
- lib/pdf/reader/xref.rb
|
275
|
-
homepage:
|
274
|
+
homepage: https://github.com/yob/pdf-reader
|
276
275
|
licenses:
|
277
276
|
- MIT
|
278
|
-
metadata:
|
277
|
+
metadata:
|
278
|
+
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
279
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.0/CHANGELOG
|
280
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.0
|
281
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.0
|
279
282
|
post_install_message:
|
280
283
|
rdoc_options:
|
281
284
|
- "--title"
|
data/lib/pdf/hash.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
# frozen_string_literal: true
|
3
|
-
|
4
|
-
module PDF
|
5
|
-
# This class is deprecated, please stop using it.
|
6
|
-
class Hash < ::PDF::Reader::ObjectHash # :nodoc:
|
7
|
-
def initialize(input)
|
8
|
-
warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
|
9
|
-
super
|
10
|
-
end
|
11
|
-
|
12
|
-
def version
|
13
|
-
warn <<-EOS
|
14
|
-
DEPRECATION NOTICE: PDF::Hash#version has been deprecated,
|
15
|
-
use PDF::Reader::ObjectHash#pdf_version instead
|
16
|
-
EOS
|
17
|
-
pdf_version
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|