pdf-reader 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 55bfae4c5211a0f3ac70845500183e237d3d5f9cc81d548a27f4b8c5fd5acfc9
4
- data.tar.gz: 9e9000474695100c4874afd9abf5f6290ea0b59ffa773c3d03a4a3fc3d2a0a4c
3
+ metadata.gz: 2c84983c18d983798ff5f2ede514b540ee55a788229501976474b7341bf57fba
4
+ data.tar.gz: 79b8f092e72a194110062cf7d7e9425c0a6531e145009c9b7c10c2c072b3d1d5
5
5
  SHA512:
6
- metadata.gz: 2dc96f064c3b233bd499a5a8140bb6f61fd1bdadb2582ac9bc569adff911e63aaadc16188d6624e0ed766481d453912bf5e0057add84262d753681cc40f51776
7
- data.tar.gz: 64dfac4dd2b73a5302be95c47e74e7ca8b94fc6daeaea30c7a33cca6b4be79b68facc571d209f44a59de2632f6241166a290fe0f2835498b2b7d611906b05a31
6
+ metadata.gz: '09c97a875bb46389172ed48ae8b2779ba3a8e032852b6a9943f187de13c23649e2398a5374358c62b64cf9e13bbf7f819bb5072d9aaa6882b9b94e96d23f5c13'
7
+ data.tar.gz: ed92250acee85f4e355785dd043f7774a5883550fe82b01b3cd9e10011f93a1fcdd500108b0e1f4e2af562bddd833c03ca601078b3eba8ee2e9990fd5e76305a
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
+ v2.11.0 (26th October 2022)
2
+ - Various bug fixes
3
+ - Expanded sorbet type annotations
4
+
1
5
  v2.10.0 (12th May 2022)
2
6
  - Various bug fixes
3
7
  - Expanded sorbet type annotations
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 28
17
+ cane.max_violations = 33
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -1,5 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -52,7 +52,9 @@ class PDF::Reader
52
52
 
53
53
  # this is the form 10 20 123 where all index between 10 and 20 have width 123
54
54
  def parse_second_form(first, final, width)
55
- raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}" unless first < final
55
+ if first > final
56
+ raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
57
+ end
56
58
 
57
59
  (first..final).inject({}) { |accum, index|
58
60
  accum[index] = width
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -76,9 +76,9 @@ class PDF::Reader
76
76
  diff.each do |val|
77
77
  if val.kind_of?(Numeric)
78
78
  byte = val.to_i
79
- else
79
+ elsif codepoint = glyphlist.name_to_unicode(val)
80
80
  @differences[byte] = val
81
- @mapping[byte] = glyphlist.name_to_unicode(val)
81
+ @mapping[byte] = codepoint
82
82
  byte += 1
83
83
  end
84
84
  end
@@ -167,7 +167,7 @@ class PDF::Reader
167
167
  end
168
168
 
169
169
  def convert_to_utf8(str)
170
- ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
170
+ ret = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }.pack("U*")
171
171
  ret.force_encoding("UTF-8")
172
172
  ret
173
173
  end
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -33,10 +33,18 @@ class PDF::Reader
33
33
  #
34
34
  class GlyphHash # :nodoc:
35
35
  def initialize
36
+ @@by_codepoint_cache ||= nil
37
+ @@by_name_cache ||= nil
38
+
36
39
  # only parse the glyph list once, and cache the results (for performance)
37
- adobe = @@cache ||= load_adobe_glyph_mapping
38
- @by_name = adobe.first
39
- @by_codepoint = adobe.last
40
+ if @@by_codepoint_cache != nil && @@by_name_cache != nil
41
+ @by_name = @@by_name_cache
42
+ @by_codepoint = @@by_codepoint_cache
43
+ else
44
+ by_name, by_codepoint = load_adobe_glyph_mapping
45
+ @by_name = @@by_name_cache ||= by_name
46
+ @by_codepoint = @@by_codepoint_cache ||= by_codepoint
47
+ end
40
48
  end
41
49
 
42
50
  # attempt to convert a PDF Name to a unicode codepoint. Returns nil
@@ -127,7 +135,7 @@ class PDF::Reader
127
135
  end
128
136
  end
129
137
 
130
- [keyed_by_name.freeze, keyed_by_codepoint.freeze]
138
+ return keyed_by_name.freeze, keyed_by_codepoint.freeze
131
139
  end
132
140
 
133
141
  end
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class NoTextFilter
8
+
9
+ def self.exclude_empty_strings(runs)
10
+ runs.reject { |run| run.text.to_s.size == 0 }
11
+ end
12
+ end
13
+ end
14
+
@@ -2,6 +2,8 @@
2
2
  # typed: true
3
3
  # frozen_string_literal: true
4
4
 
5
+ require 'tempfile'
6
+
5
7
  class PDF::Reader
6
8
  # Provides low level access to the objects in a PDF file via a hash-like
7
9
  # object.
@@ -566,7 +568,7 @@ class PDF::Reader
566
568
  end
567
569
 
568
570
  def object_streams
569
- @object_stream ||= {}
571
+ @object_streams ||= {}
570
572
  end
571
573
 
572
574
  # returns an array of object references for all pages in this object store. The ordering of
@@ -591,18 +593,18 @@ class PDF::Reader
591
593
 
592
594
  def read_version
593
595
  @io.seek(0)
594
- _m, version = *@io.read(10).match(/PDF-(\d.\d)/)
596
+ _m, version = *@io.read(10).to_s.match(/PDF-(\d.\d)/)
595
597
  @io.seek(0)
596
598
  version.to_f
597
599
  end
598
600
 
599
601
  def extract_io_from(input)
600
- if input.respond_to?(:seek) && input.respond_to?(:read)
602
+ if input.is_a?(IO) || input.is_a?(StringIO) || input.is_a?(Tempfile)
601
603
  input
602
604
  elsif File.file?(input.to_s)
603
- StringIO.new read_as_binary(input)
605
+ StringIO.new read_as_binary(input.to_s)
604
606
  else
605
- raise ArgumentError, "input must be an IO-like object or a filename"
607
+ raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
606
608
  end
607
609
  end
608
610
 
@@ -610,7 +612,7 @@ class PDF::Reader
610
612
  if File.respond_to?(:binread)
611
613
  File.binread(input.to_s)
612
614
  else
613
- File.open(input.to_s,"rb") { |f| f.read }
615
+ File.open(input.to_s,"rb") { |f| f.read } || ""
614
616
  end
615
617
  end
616
618
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -1,6 +1,6 @@
1
- # typed: true
2
1
  # coding: utf-8
3
2
  # frozen_string_literal: true
3
+ # typed: strict
4
4
 
5
5
  class PDF::Reader
6
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  module PDF
@@ -43,10 +43,10 @@ module PDF
43
43
  #
44
44
  def initialize(objects, pagenum, options = {})
45
45
  @objects, @pagenum = objects, pagenum
46
- @page_object = objects.deref_hash(objects.page_references[pagenum - 1])
46
+ @page_object = objects.deref_hash(objects.page_references[pagenum - 1]) || {}
47
47
  @cache = options[:cache] || {}
48
48
 
49
- unless @page_object.is_a?(::Hash)
49
+ if @page_object.empty?
50
50
  raise InvalidPageError, "Invalid page: #{pagenum}"
51
51
  end
52
52
  end
@@ -250,8 +250,8 @@ module PDF
250
250
  params = []
251
251
 
252
252
  while (token = parser.parse_token(PagesStrategy::OPERATORS))
253
- if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
254
- callback(receivers, PagesStrategy::OPERATORS[token], params)
253
+ if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
254
+ callback(receivers, method_name, params)
255
255
  params.clear
256
256
  else
257
257
  params << token
@@ -263,9 +263,26 @@ module PDF
263
263
 
264
264
  # calls the name callback method on each receiver object with params as the arguments
265
265
  #
266
+ # The silly style here is because sorbet won't let me use splat arguments
267
+ #
266
268
  def callback(receivers, name, params=[])
267
269
  receivers.each do |receiver|
268
- receiver.send(name, *params) if receiver.respond_to?(name)
270
+ if receiver.respond_to?(name)
271
+ case params.size
272
+ when 0 then receiver.send(name)
273
+ when 1 then receiver.send(name, params[0])
274
+ when 2 then receiver.send(name, params[0], params[1])
275
+ when 3 then receiver.send(name, params[0], params[1], params[2])
276
+ when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
277
+ when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
278
+ when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
279
+ when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
280
+ when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
281
+ when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
282
+ else
283
+ receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
284
+ end
285
+ end
269
286
  end
270
287
  end
271
288
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  require 'pdf/reader/overlapping_runs_filter'
@@ -62,6 +62,8 @@ module PDF
62
62
  runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
63
  end
64
64
 
65
+ runs = NoTextFilter.exclude_empty_strings(runs)
66
+
65
67
  if opts.fetch(:merge, true)
66
68
  runs = merge_runs(runs)
67
69
  end
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -31,7 +31,8 @@ class PDF::Reader
31
31
  ################################################################################
32
32
  # An internal PDF::Reader class that represents an indirect reference to a PDF Object
33
33
  class Reference
34
- attr_reader :id, :gen
34
+ attr_reader :id
35
+ attr_reader :gen
35
36
  ################################################################################
36
37
  # Create a new Reference to an object with the specified id and revision number
37
38
  def initialize(id, gen)
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  module PDF
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -7,7 +7,10 @@ class PDF::Reader
7
7
  class TextRun
8
8
  include Comparable
9
9
 
10
- attr_reader :origin, :width, :font_size, :text
10
+ attr_reader :origin
11
+ attr_reader :width
12
+ attr_reader :font_size
13
+ attr_reader :text
11
14
 
12
15
  alias :to_s :text
13
16
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -51,7 +51,7 @@ class PDF::Reader
51
51
  # displacement to speed up processing documents that use vertical
52
52
  # writing systems
53
53
  #
54
- def multiply!(a,b=nil,c=nil, d=nil,e=nil,f=nil)
54
+ def multiply!(a,b,c, d,e,f)
55
55
  if a == 1 && b == 0 && c == 0 && d == 1 && e == 0 && f == 0
56
56
  # the identity matrix, no effect
57
57
  self
@@ -164,12 +164,12 @@ class PDF::Reader
164
164
  # [ e f 1 ] [ e f 1 ]
165
165
  #
166
166
  def regular_multiply!(a2,b2,c2,d2,e2,f2)
167
- newa = (@a * a2) + (@b * c2) + (0 * e2)
168
- newb = (@a * b2) + (@b * d2) + (0 * f2)
169
- newc = (@c * a2) + (@d * c2) + (0 * e2)
170
- newd = (@c * b2) + (@d * d2) + (0 * f2)
171
- newe = (@e * a2) + (@f * c2) + (1 * e2)
172
- newf = (@e * b2) + (@f * d2) + (1 * f2)
167
+ newa = (@a * a2) + (@b * c2) + (e2 * 0)
168
+ newb = (@a * b2) + (@b * d2) + (f2 * 0)
169
+ newc = (@c * a2) + (@d * c2) + (e2 * 0)
170
+ newd = (@c * b2) + (@d * d2) + (f2 * 0)
171
+ newe = (@e * a2) + (@f * c2) + (e2 * 1)
172
+ newf = (@e * b2) + (@f * d2) + (f2 * 1)
173
173
  @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
174
174
  end
175
175
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  require 'afm'
@@ -73,7 +73,7 @@ class PDF::Reader
73
73
  #
74
74
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
75
75
  def [](ref)
76
- @xref[ref.id][ref.gen]
76
+ @xref.fetch(ref.id, {}).fetch(ref.gen)
77
77
  rescue
78
78
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
79
79
  end
@@ -82,8 +82,8 @@ class PDF::Reader
82
82
  def each(&block)
83
83
  ids = @xref.keys.sort
84
84
  ids.each do |id|
85
- gen = @xref[id].keys.sort[-1]
86
- yield PDF::Reader::Reference.new(id, gen)
85
+ gen = @xref.fetch(id, {}).keys.sort[-1]
86
+ yield PDF::Reader::Reference.new(id, gen.to_i)
87
87
  end
88
88
  end
89
89
  ################################################################################
data/lib/pdf/reader.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -128,7 +128,7 @@ module PDF
128
128
  doc_strings_to_utf8(dict)
129
129
  end
130
130
 
131
- # Return a Hash with extra metadata provided by the author of the PDF file. Not
131
+ # Return a String with extra XML metadata provided by the author of the PDF file. Not
132
132
  # always present.
133
133
  #
134
134
  def metadata
@@ -182,7 +182,7 @@ module PDF
182
182
  #
183
183
  # reader.pages.each do |page|
184
184
  # puts page.fonts
185
- # puts page.images
185
+ # puts page.rectangles
186
186
  # puts page.text
187
187
  # end
188
188
  #
@@ -272,13 +272,7 @@ module PDF
272
272
  end
273
273
 
274
274
  def root
275
- @root ||= begin
276
- obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
277
- unless obj.kind_of?(::Hash)
278
- raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
279
- end
280
- obj
281
- end
275
+ @root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
282
276
  end
283
277
 
284
278
  end
@@ -315,6 +309,7 @@ require 'pdf/reader/print_receiver'
315
309
  require 'pdf/reader/rectangle'
316
310
  require 'pdf/reader/reference'
317
311
  require 'pdf/reader/register_receiver'
312
+ require 'pdf/reader/no_text_filter'
318
313
  require 'pdf/reader/null_security_handler'
319
314
  require 'pdf/reader/security_handler_factory'
320
315
  require 'pdf/reader/standard_key_builder'