pdf-reader 2.3.0 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +9 -0
- data/README.md +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader.rb +1 -2
- data/lib/pdf/reader/encoding.rb +2 -2
- data/lib/pdf/reader/font.rb +7 -1
- data/lib/pdf/reader/overlapping_runs_filter.rb +10 -11
- data/lib/pdf/reader/width_calculator/built_in.rb +17 -1
- metadata +8 -5
- data/lib/pdf/hash.rb +0 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1acf8110733b6aff447e40353cc7d847e6edbb9bad016beab35bebe191bc91a
|
4
|
+
data.tar.gz: 1e50a894289d9c4f8df83bcf55e1f0bfe9e7c3088365705bea9b253a34a9254e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 473d030dd4e12e6aba2e037fe7b73410e99f14112ddd7823483056281cdd89e6865efb1f02866760038f892b7d60f153db8c88643d73035ee354e43d1cf047c5
|
7
|
+
data.tar.gz: 30d842ae8260a8a2005b484c9550ae04a77e23b17e36b324e36e6d6faa4a7e0f83ebdab7cc69b504e143740ebcb9fad341e5ca8c0b0b6339fa826634f94996e2
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
v2.4.0 (21st November 2019)
|
2
|
+
- Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
|
3
|
+
thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
|
4
|
+
for now. See https://github.com/yob/pdf-reader/pull/308 for details.
|
5
|
+
- Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
|
6
|
+
- Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
|
7
|
+
is still using it.
|
8
|
+
- Several small bug fixes
|
9
|
+
|
1
10
|
v2.3.0 (7th November 2019)
|
2
11
|
- Text extraction now makes an effort to skip duplicate characters that overlap, a
|
3
12
|
common approach used for a fake "bold" effect. This will make text extraction a bit
|
data/README.md
CHANGED
data/bin/pdf_callbacks
CHANGED
data/bin/pdf_text
CHANGED
data/lib/pdf/reader.rb
CHANGED
@@ -180,7 +180,7 @@ module PDF
|
|
180
180
|
(1..self.page_count).map do |num|
|
181
181
|
begin
|
182
182
|
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
183
|
-
rescue InvalidPageError
|
183
|
+
rescue InvalidPageError
|
184
184
|
raise MalformedPDFError, "Missing data for page: #{num}"
|
185
185
|
end
|
186
186
|
end
|
@@ -301,4 +301,3 @@ require 'pdf/reader/token'
|
|
301
301
|
require 'pdf/reader/xref'
|
302
302
|
require 'pdf/reader/orientation_detector'
|
303
303
|
require 'pdf/reader/page'
|
304
|
-
require 'pdf/hash'
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -42,10 +42,10 @@ class PDF::Reader
|
|
42
42
|
|
43
43
|
@enc_name = if enc.kind_of?(Hash)
|
44
44
|
enc[:Encoding] || enc[:BaseEncoding]
|
45
|
-
elsif enc
|
45
|
+
elsif enc && enc.respond_to?(:to_sym)
|
46
46
|
enc.to_sym
|
47
47
|
else
|
48
|
-
|
48
|
+
:StandardEncoding
|
49
49
|
end
|
50
50
|
|
51
51
|
@unpack = get_unpack(@enc_name)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -97,7 +97,13 @@ class PDF::Reader
|
|
97
97
|
elsif @subtype == :Type3
|
98
98
|
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
99
99
|
elsif @subtype == :TrueType
|
100
|
-
|
100
|
+
if @font_descriptor
|
101
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
102
|
+
else
|
103
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
104
|
+
# local system and fallback to a substitute. For now, we go straight to a substitute
|
105
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
106
|
+
end
|
101
107
|
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
102
108
|
PDF::Reader::WidthCalculator::Composite.new(self)
|
103
109
|
else
|
@@ -21,27 +21,26 @@ class PDF::Reader
|
|
21
21
|
|
22
22
|
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
break unless event_point
|
24
|
+
event_point_schedule.each do |event_point|
|
25
|
+
run = event_point.run
|
27
26
|
|
28
|
-
if event_point.start?
|
27
|
+
if event_point.start?
|
29
28
|
if detect_intersection(sweep_line_status, event_point)
|
30
|
-
to_exclude <<
|
29
|
+
to_exclude << run
|
31
30
|
end
|
32
|
-
sweep_line_status.push
|
31
|
+
sweep_line_status.push(run)
|
33
32
|
else
|
34
|
-
sweep_line_status.delete
|
33
|
+
sweep_line_status.delete(run)
|
35
34
|
end
|
36
35
|
end
|
37
36
|
runs - to_exclude
|
38
37
|
end
|
39
38
|
|
40
39
|
def self.detect_intersection(sweep_line_status, event_point)
|
41
|
-
sweep_line_status.each do |
|
42
|
-
if event_point.x >=
|
43
|
-
event_point.x <=
|
44
|
-
|
40
|
+
sweep_line_status.each do |open_text_run|
|
41
|
+
if event_point.x >= open_text_run.x &&
|
42
|
+
event_point.x <= open_text_run.endx &&
|
43
|
+
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
45
44
|
return true
|
46
45
|
end
|
47
46
|
end
|
@@ -12,11 +12,20 @@ class PDF::Reader
|
|
12
12
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
13
13
|
class BuiltIn
|
14
14
|
|
15
|
+
BUILTINS = [
|
16
|
+
:Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
|
17
|
+
:Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
|
18
|
+
:Symbol,
|
19
|
+
:"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
|
20
|
+
:ZapfDingbats
|
21
|
+
]
|
22
|
+
|
15
23
|
def initialize(font)
|
16
24
|
@font = font
|
17
25
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
18
26
|
|
19
|
-
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
20
29
|
|
21
30
|
if File.file?(metrics_path)
|
22
31
|
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
@@ -54,6 +63,13 @@ class PDF::Reader
|
|
54
63
|
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
55
64
|
end
|
56
65
|
|
66
|
+
def extract_basefont(font_name)
|
67
|
+
if BUILTINS.include?(font_name)
|
68
|
+
font_name
|
69
|
+
else
|
70
|
+
"Times-Roman"
|
71
|
+
end
|
72
|
+
end
|
57
73
|
end
|
58
74
|
end
|
59
75
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-11-
|
11
|
+
date: 2019-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -199,7 +199,6 @@ files:
|
|
199
199
|
- examples/text.rb
|
200
200
|
- examples/version.rb
|
201
201
|
- lib/pdf-reader.rb
|
202
|
-
- lib/pdf/hash.rb
|
203
202
|
- lib/pdf/reader.rb
|
204
203
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
205
204
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
@@ -272,10 +271,14 @@ files:
|
|
272
271
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
273
272
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
274
273
|
- lib/pdf/reader/xref.rb
|
275
|
-
homepage:
|
274
|
+
homepage: https://github.com/yob/pdf-reader
|
276
275
|
licenses:
|
277
276
|
- MIT
|
278
|
-
metadata:
|
277
|
+
metadata:
|
278
|
+
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
279
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.0/CHANGELOG
|
280
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.0
|
281
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.0
|
279
282
|
post_install_message:
|
280
283
|
rdoc_options:
|
281
284
|
- "--title"
|
data/lib/pdf/hash.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
# frozen_string_literal: true
|
3
|
-
|
4
|
-
module PDF
|
5
|
-
# This class is deprecated, please stop using it.
|
6
|
-
class Hash < ::PDF::Reader::ObjectHash # :nodoc:
|
7
|
-
def initialize(input)
|
8
|
-
warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
|
9
|
-
super
|
10
|
-
end
|
11
|
-
|
12
|
-
def version
|
13
|
-
warn <<-EOS
|
14
|
-
DEPRECATION NOTICE: PDF::Hash#version has been deprecated,
|
15
|
-
use PDF::Reader::ObjectHash#pdf_version instead
|
16
|
-
EOS
|
17
|
-
pdf_version
|
18
|
-
end
|
19
|
-
end
|
20
|
-
end
|