pdf-reader 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 419ef1c2770f8cff11f2ee6453f70cec80562eddb7912ddd618013c5c013bcad
4
- data.tar.gz: 71a7a814472b527b7a03e24d4923893962a8c0a1748e0d9007eb5cd7c8bbf7b3
3
+ metadata.gz: c1acf8110733b6aff447e40353cc7d847e6edbb9bad016beab35bebe191bc91a
4
+ data.tar.gz: 1e50a894289d9c4f8df83bcf55e1f0bfe9e7c3088365705bea9b253a34a9254e
5
5
  SHA512:
6
- metadata.gz: 4a5d4e76a74a766ceae3960587efce9aa63600c1b78b16175e9b41b58435d1c766871c2b288e79edec1499444aa12c786937eda70634bf301cd05ad8f2373063
7
- data.tar.gz: 7e7bf8f2bb43822a64f89ca46bf0369a1e34b0e60078483ad1d4cf774ef6c6122f689b765f41d02a2120eafeba30fd47bb2cbd1b0dd5c56e7cf556648b3f4e33
6
+ metadata.gz: 473d030dd4e12e6aba2e037fe7b73410e99f14112ddd7823483056281cdd89e6865efb1f02866760038f892b7d60f153db8c88643d73035ee354e43d1cf047c5
7
+ data.tar.gz: 30d842ae8260a8a2005b484c9550ae04a77e23b17e36b324e36e6d6faa4a7e0f83ebdab7cc69b504e143740ebcb9fad341e5ca8c0b0b6339fa826634f94996e2
data/CHANGELOG CHANGED
@@ -1,3 +1,12 @@
1
+ v2.4.0 (21st November 2019)
2
+ - Optimise overlapping characters code introduced in 2.3.0. Text extraction of pages with
3
+ thousands of characters is still slower than it was in 2.2.1, but it might be tolerable
4
+ for now. See https://github.com/yob/pdf-reader/pull/308 for details.
5
+ - Implement very basic font substitution for Type1 and TrueType fonts that aren't embedded
6
+ - Remove PDF::Hash class. It's been deprecated since 2010, and it's hard to believe anyone
7
+ is still using it.
8
+ - Several small bug fixes
9
+
1
10
  v2.3.0 (7th November 2019)
2
11
  - Text extraction now makes an effort to skip duplicate characters that overlap, a
3
12
  common approach used for a fake "bold" effect. This will make text extraction a bit
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Release Notes
1
+ # pdf-reader
2
2
 
3
3
  The PDF::Reader library implements a PDF parser conforming as much as possible
4
4
  to the PDF specification from Adobe.
@@ -9,7 +9,7 @@ require 'pdf/reader'
9
9
  receiver = PDF::Reader::PrintReceiver.new
10
10
 
11
11
  if ARGV.empty?
12
- browser = PDF::Reader.new($stdin)
12
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
13
13
  else
14
14
  browser = PDF::Reader.new(ARGV[0])
15
15
  end
@@ -4,7 +4,7 @@ require 'rubygems'
4
4
  require 'pdf/reader'
5
5
 
6
6
  if ARGV.empty?
7
- browser = PDF::Reader.new($stdin)
7
+ browser = PDF::Reader.new(StringIO.new(ARGF.read))
8
8
  else
9
9
  browser = PDF::Reader.new(ARGV[0])
10
10
  end
@@ -180,7 +180,7 @@ module PDF
180
180
  (1..self.page_count).map do |num|
181
181
  begin
182
182
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
183
- rescue InvalidPageError => ex
183
+ rescue InvalidPageError
184
184
  raise MalformedPDFError, "Missing data for page: #{num}"
185
185
  end
186
186
  end
@@ -301,4 +301,3 @@ require 'pdf/reader/token'
301
301
  require 'pdf/reader/xref'
302
302
  require 'pdf/reader/orientation_detector'
303
303
  require 'pdf/reader/page'
304
- require 'pdf/hash'
@@ -42,10 +42,10 @@ class PDF::Reader
42
42
 
43
43
  @enc_name = if enc.kind_of?(Hash)
44
44
  enc[:Encoding] || enc[:BaseEncoding]
45
- elsif enc != nil
45
+ elsif enc && enc.respond_to?(:to_sym)
46
46
  enc.to_sym
47
47
  else
48
- nil
48
+ :StandardEncoding
49
49
  end
50
50
 
51
51
  @unpack = get_unpack(@enc_name)
@@ -97,7 +97,13 @@ class PDF::Reader
97
97
  elsif @subtype == :Type3
98
98
  PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
99
99
  elsif @subtype == :TrueType
100
- PDF::Reader::WidthCalculator::TrueType.new(self)
100
+ if @font_descriptor
101
+ PDF::Reader::WidthCalculator::TrueType.new(self)
102
+ else
103
+ # A TrueType font that isn't embedded. Most readers look for a version on the
104
+ # local system and fall back to a substitute. For now, we go straight to a substitute
105
+ PDF::Reader::WidthCalculator::BuiltIn.new(self)
106
+ end
101
107
  elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
102
108
  PDF::Reader::WidthCalculator::Composite.new(self)
103
109
  else
@@ -21,27 +21,26 @@ class PDF::Reader
21
21
 
22
22
  event_point_schedule.sort! { |a,b| a.x <=> b.x }
23
23
 
24
- while not event_point_schedule.empty? do
25
- event_point = event_point_schedule.shift
26
- break unless event_point
24
+ event_point_schedule.each do |event_point|
25
+ run = event_point.run
27
26
 
28
- if event_point.start? then
27
+ if event_point.start?
29
28
  if detect_intersection(sweep_line_status, event_point)
30
- to_exclude << event_point.run
29
+ to_exclude << run
31
30
  end
32
- sweep_line_status.push event_point
31
+ sweep_line_status.push(run)
33
32
  else
34
- sweep_line_status.delete event_point
33
+ sweep_line_status.delete(run)
35
34
  end
36
35
  end
37
36
  runs - to_exclude
38
37
  end
39
38
 
40
39
  def self.detect_intersection(sweep_line_status, event_point)
41
- sweep_line_status.each do |point_in_sls|
42
- if event_point.x >= point_in_sls.run.x &&
43
- event_point.x <= point_in_sls.run.endx &&
44
- point_in_sls.run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
40
+ sweep_line_status.each do |open_text_run|
41
+ if event_point.x >= open_text_run.x &&
42
+ event_point.x <= open_text_run.endx &&
43
+ open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
45
44
  return true
46
45
  end
47
46
  end
@@ -12,11 +12,20 @@ class PDF::Reader
12
12
  # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
13
13
  class BuiltIn
14
14
 
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
15
23
  def initialize(font)
16
24
  @font = font
17
25
  @@all_metrics ||= PDF::Reader::SynchronizedCache.new
18
26
 
19
- metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{font.basefont}.afm")
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
20
29
 
21
30
  if File.file?(metrics_path)
22
31
  @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
@@ -54,6 +63,13 @@ class PDF::Reader
54
63
  @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
55
64
  end
56
65
 
66
+ def extract_basefont(font_name)
67
+ if BUILTINS.include?(font_name)
68
+ font_name
69
+ else
70
+ "Times-Roman"
71
+ end
72
+ end
57
73
  end
58
74
  end
59
75
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.3.0
4
+ version: 2.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-07 00:00:00.000000000 Z
11
+ date: 2019-11-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -199,7 +199,6 @@ files:
199
199
  - examples/text.rb
200
200
  - examples/version.rb
201
201
  - lib/pdf-reader.rb
202
- - lib/pdf/hash.rb
203
202
  - lib/pdf/reader.rb
204
203
  - lib/pdf/reader/afm/Courier-Bold.afm
205
204
  - lib/pdf/reader/afm/Courier-BoldOblique.afm
@@ -272,10 +271,14 @@ files:
272
271
  - lib/pdf/reader/width_calculator/type_one_or_three.rb
273
272
  - lib/pdf/reader/width_calculator/type_zero.rb
274
273
  - lib/pdf/reader/xref.rb
275
- homepage: http://github.com/yob/pdf-reader
274
+ homepage: https://github.com/yob/pdf-reader
276
275
  licenses:
277
276
  - MIT
278
- metadata: {}
277
+ metadata:
278
+ bug_tracker_uri: https://github.com/yob/pdf-reader/issues
279
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.0/CHANGELOG
280
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.0
281
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.0
279
282
  post_install_message:
280
283
  rdoc_options:
281
284
  - "--title"
@@ -1,20 +0,0 @@
1
- # coding: utf-8
2
- # frozen_string_literal: true
3
-
4
- module PDF
5
- # This class is deprecated, please stop using it.
6
- class Hash < ::PDF::Reader::ObjectHash # :nodoc:
7
- def initialize(input)
8
- warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
9
- super
10
- end
11
-
12
- def version
13
- warn <<-EOS
14
- DEPRECATION NOTICE: PDF::Hash#version has been deprecated,
15
- use PDF::Reader::ObjectHash#pdf_version instead
16
- EOS
17
- pdf_version
18
- end
19
- end
20
- end