pdf-reader 2.3.0 → 2.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
4
- data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
3
+ metadata.gz: c1acf8110733b6aff447e40353cc7d847e6edbb9bad016beab35bebe191bc91a
4
+ data.tar.gz: 1e50a894289d9c4f8df83bcf55e1f0bfe9e7c3088365705bea9b253a34a9254e
5
5
  SHA512:
6
- metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
7
- data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33
6
+ metadata.gz: 473d030dd4e12e6aba2e037fe7b73410e99f14112ddd7823483056281cdd89e6865efb1f02866760038f892b7d60f153db8c88643d73035ee354e43d1cf047c5
7
+ data.tar.gz: 30d842ae8260a8a2005b484c9550ae04a77e23b17e36b324e36e6d6faa4a7e0f83ebdab7cc69b504e143740ebcb9fad341e5ca8c0b0b6339fa826634f94996e2
data/CHANGELOG CHANGED
@@ -1,3 +1,12 @@
1
+ v2.4.0 (21st November 2019)
2
+ - Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
3
+ thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
4
+ for now. See https://github.com/yob/pdf-reader/pull/308 for details.
5
+ - Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
6
+ - Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
7
+ is still using it.
8
+ - Several small bug fixes
9
+
1
10
  v2.3.0 (7th November 2019)
2
11
  - Text extraction now makes an effort to skip duplicate characters that overlap, a
3
12
  common approach used for a fake "bold" effect. This will make text extraction a bit
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Release Notes
1
+ # pdf-reader
2
2
 
3
3
  The PDF::Reader library implements a PDF parser conforming as much as possible
4
4
  to the PDF specification from Adobe.
@@ -9,7 +9,7 @@ require 'pdf/reader'
9
9
  receiver = PDF::Reader::PrintReceiver.new
10
10
 
11
11
  if ARGV.empty?
12
- browser = PDF::Reader.new($stdin)
12
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
13
13
  else
14
14
  browser = PDF::Reader.new(ARGV[0])
15
15
  end
@@ -4,7 +4,7 @@ require 'rubygems'
4
4
  require 'pdf/reader'
5
5
 
6
6
  if ARGV.empty?
7
- browser = PDF::Reader.new($stdin)
7
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
8
8
  else
9
9
  browser = PDF::Reader.new(ARGV[0])
10
10
  end
@@ -180,7 +180,7 @@ module PDF
180
180
  (1..self.page_count).map do |num|
181
181
  begin
182
182
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
183
- rescue InvalidPageError => ex
183
+ rescue InvalidPageError
184
184
  raise MalformedPDFError, "Missing data for page: #{num}"
185
185
  end
186
186
  end
@@ -301,4 +301,3 @@ require 'pdf/reader/token'
301
301
  require 'pdf/reader/xref'
302
302
  require 'pdf/reader/orientation_detector'
303
303
  require 'pdf/reader/page'
304
- require 'pdf/hash'
@@ -42,10 +42,10 @@ class PDF::Reader
42
42
 
43
43
  @enc_name = if enc.kind_of?(Hash)
44
44
  enc[:Encoding] || enc[:BaseEncoding]
45
- elsif enc != nil
45
+ elsif enc && enc.respond_to?(:to_sym)
46
46
  enc.to_sym
47
47
  else
48
- nil
48
+ :StandardEncoding
49
49
  end
50
50
 
51
51
  @unpack = get_unpack(@enc_name)
@@ -97,7 +97,13 @@ class PDF::Reader
97
97
  elsif @subtype == :Type3
98
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
99
99
  elsif @subtype == :TrueType
100
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fall back to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
101
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
102
108
  PDF::Reader::WidthCalculator::Composite.new(self)
103
109
  else
@@ -21,27 +21,26 @@ class PDF::Reader
21
21
 
22
22
  event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
23
 
24
- while not event_point_schedule.empty? do
25
- event_point = event_point_schedule.shift
26
- break unless event_point
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
27
26
 
28
- if event_point.start? then
27
+ if event_point.start?
29
28
  if detect_intersection(sweep_line_status, event_point)
30
- to_exclude << event_point.run
29
+ to_exclude << run
31
30
  end
32
- sweep_line_status.push event_point
31
+ sweep_line_status.push(run)
33
32
  else
34
- sweep_line_status.delete event_point
33
+ sweep_line_status.delete(run)
35
34
  end
36
35
  end
37
36
  runs - to_exclude
38
37
  end
39
38
 
40
39
  def self.detect_intersection(sweep_line_status, event_point)
41
- sweep_line_status.each do |point_in_sls|
42
- if event_point.x >= point_in_sls.run.x &&
43
- event_point.x <= point_in_sls.run.endx &&
44
- point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
45
44
  return true
46
45
  end
47
46
  end
@@ -12,11 +12,20 @@ class PDF::Reader
12
12
  # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
13
13
  class BuiltIn
14
14
 
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
15
23
  def initialize(font)
16
24
  @font = font
17
25
  @@all_metrics ||= PDF::Reader::SynchronizedCache.new
18
26
 
19
- metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{font.basefont}.afm")
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
20
29
 
21
30
  if File.file?(metrics_path)
22
31
  @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
@@ -54,6 +63,13 @@ class PDF::Reader
54
63
  @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
55
64
  end
56
65
 
66
+ def extract_basefont(font_name)
67
+ if BUILTINS.include?(font_name)
68
+ font_name
69
+ else
70
+ "Times-Roman"
71
+ end
72
+ end
57
73
  end
58
74
  end
59
75
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.0
4
+ version: 2.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-07 00:00:00.000000000 Z
11
+ date: 2019-11-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -199,7 +199,6 @@ files:
199
199
  - examples/text.rb
200
200
  - examples/version.rb
201
201
  - lib/pdf-reader.rb
202
- - lib/pdf/hash.rb
203
202
  - lib/pdf/reader.rb
204
203
  - lib/pdf/reader/afm/Courier-Bold.afm
205
204
  - lib/pdf/reader/afm/Courier-BoldOblique.afm
@@ -272,10 +271,14 @@ files:
272
271
  - lib/pdf/reader/width_calculator/type_one_or_three.rb
273
272
  - lib/pdf/reader/width_calculator/type_zero.rb
274
273
  - lib/pdf/reader/xref.rb
275
- homepage: http://github.com/yob/pdf-reader
274
+ homepage: https://github.com/yob/pdf-reader
276
275
  licenses:
277
276
  - MIT
278
- metadata: {}
277
+ metadata:
278
+ bug_tracker_uri: https://github.com/yob/pdf-reader/issues
279
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.0/CHANGELOG
280
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.0
281
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.0
279
282
  post_install_message:
280
283
  rdoc_options:
281
284
  - "--title"
@@ -1,20 +0,0 @@
1
- # coding: utf-8
2
- # frozen_string_literal: true
3
-
4
- module PDF
5
- # This class is deprecated, please stop using it.
6
- class Hash < ::PDF::Reader::ObjectHash # :nodoc:
7
- def initialize(input)
8
- warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
9
- super
10
- end
11
-
12
- def version
13
- warn <<-EOS
14
- DEPRECATION NOTICE: PDF::Hash#version has been deprecated,
15
- use PDF::Reader::ObjectHash#pdf_version instead
16
- EOS
17
- pdf_version
18
- end
19
- end
20
- end