pdf-reader 2.10.0 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 55bfae4c5211a0f3ac70845500183e237d3d5f9cc81d548a27f4b8c5fd5acfc9
4
- data.tar.gz: 9e9000474695100c4874afd9abf5f6290ea0b59ffa773c3d03a4a3fc3d2a0a4c
3
+ metadata.gz: e3b00946c8b23b65d19ace187550b15bb3fd2537e518c778f4c12da28672c9d8
4
+ data.tar.gz: 4c2ebeb19dada9f257fa65c2add2f2f6d64f011cb13e997533a4b63fc81baa6d
5
5
  SHA512:
6
- metadata.gz: 2dc96f064c3b233bd499a5a8140bb6f61fd1bdadb2582ac9bc569adff911e63aaadc16188d6624e0ed766481d453912bf5e0057add84262d753681cc40f51776
7
- data.tar.gz: 64dfac4dd2b73a5302be95c47e74e7ca8b94fc6daeaea30c7a33cca6b4be79b68facc571d209f44a59de2632f6241166a290fe0f2835498b2b7d611906b05a31
6
+ metadata.gz: 99c9ac879424056221f616d7f7299d03dfc9906c6b81c333ad255439780cf56d2dfc0c31a62347a7a163bcdb4075f8d0c914e2deeebb5d78e8ebc34e19cd7abc
7
+ data.tar.gz: 50ef8b5e1061dd1d6b24a7727b5537664bcb22473757274b4cc2b92c89b9ba5ea7516f055571f5c8b72d678f7cef549858631408c86a6984196ba7d1773daaca
data/CHANGELOG CHANGED
@@ -1,3 +1,12 @@
1
+ v2.12.0 (26th December 2023)
2
+ - Fix a sorbet method signature (http://github.com/yob/pdf-reader/pull/512)
3
+ - Reduce allocations when parsing PDFs with hex strings (http://github.com/yob/pdf-reader/pull/528)
4
+ - Fix text extraction of some rare unicode codepoints (http://github.com/yob/pdf-reader/pull/529)
5
+
6
+ v2.11.0 (26th October 2022)
7
+ - Various bug fixes
8
+ - Expanded sorbet type annotations
9
+
1
10
  v2.10.0 (12th May 2022)
2
11
  - Various bug fixes
3
12
  - Expanded sorbet type annotations
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 28
17
+ cane.max_violations = 33
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -300,13 +300,12 @@ class PDF::Reader
300
300
  # we find a closing >
301
301
  #
302
302
  def prepare_hex_token
303
- finished = :false
304
303
  str = "".dup
305
304
 
306
- until finished == :true
305
+ loop do
307
306
  byte = @io.getbyte
308
307
  if byte.nil?
309
- finished = :true # unbalanced params
308
+ break
310
309
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
311
310
  str << byte
312
311
  elsif byte <= 32
@@ -315,7 +314,7 @@ class PDF::Reader
315
314
  @tokens << str if str.size > 0
316
315
  @tokens << ">" if byte != 0x3E # '>'
317
316
  @tokens << byte.chr
318
- finished = :true
317
+ break
319
318
  end
320
319
  end
321
320
  end
@@ -52,7 +52,9 @@ class PDF::Reader
52
52
 
53
53
  # this is the form 10 20 123 where all index between 10 and 20 have width 123
54
54
  def parse_second_form(first, final, width)
55
- raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}" unless first < final
55
+ if first > final
56
+ raise MalformedPDFError, "CidWidths: #{first} must be less than #{final}"
57
+ end
56
58
 
57
59
  (first..final).inject({}) { |accum, index|
58
60
  accum[index] = width
@@ -118,8 +118,8 @@ class PDF::Reader
118
118
  result = []
119
119
  while unpacked_string.any? do
120
120
  if unpacked_string.size >= 2 &&
121
- unpacked_string.first.to_i > 0xD800 &&
122
- unpacked_string.first.to_i < 0xDBFF
121
+ unpacked_string.first.to_i >= 0xD800 &&
122
+ unpacked_string.first.to_i <= 0xDBFF
123
123
  # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
124
124
  # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
125
125
  # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
@@ -76,9 +76,9 @@ class PDF::Reader
76
76
  diff.each do |val|
77
77
  if val.kind_of?(Numeric)
78
78
  byte = val.to_i
79
- else
79
+ elsif codepoint = glyphlist.name_to_unicode(val)
80
80
  @differences[byte] = val
81
- @mapping[byte] = glyphlist.name_to_unicode(val)
81
+ @mapping[byte] = codepoint
82
82
  byte += 1
83
83
  end
84
84
  end
@@ -119,7 +119,7 @@ class PDF::Reader
119
119
  # => [:A]
120
120
  #
121
121
  def int_to_name(glyph_code)
122
- if @enc_name == "Identity-H" || @enc_name == "Identity-V"
122
+ if @enc_name == :"Identity-H" || @enc_name == :"Identity-V"
123
123
  []
124
124
  elsif differences[glyph_code]
125
125
  [differences[glyph_code]]
@@ -143,7 +143,6 @@ class PDF::Reader
143
143
  CONTROL_CHARS.include?(i) ? [i, UNKNOWN_CHAR] : [i,i]
144
144
  }
145
145
  mapping = Hash[tuples]
146
- mapping[nil] = UNKNOWN_CHAR
147
146
  mapping
148
147
  end
149
148
 
@@ -167,7 +166,7 @@ class PDF::Reader
167
166
  end
168
167
 
169
168
  def convert_to_utf8(str)
170
- ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
169
+ ret = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }.pack("U*")
171
170
  ret.force_encoding("UTF-8")
172
171
  ret
173
172
  end
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -82,8 +82,8 @@ class PDF::Reader
82
82
  glyph_width_in_glyph_space = glyph_width(code_point)
83
83
 
84
84
  if @subtype == :Type3
85
- x1, y1 = font_matrix_transform(0,0)
86
- x2, y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
85
+ x1, _y1 = font_matrix_transform(0,0)
86
+ x2, _y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
87
87
  (x2 - x1).abs.round(2)
88
88
  else
89
89
  glyph_width_in_glyph_space / 1000.0
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -33,10 +33,18 @@ class PDF::Reader
33
33
  #
34
34
  class GlyphHash # :nodoc:
35
35
  def initialize
36
+ @@by_codepoint_cache ||= nil
37
+ @@by_name_cache ||= nil
38
+
36
39
  # only parse the glyph list once, and cache the results (for performance)
37
- adobe = @@cache ||= load_adobe_glyph_mapping
38
- @by_name = adobe.first
39
- @by_codepoint = adobe.last
40
+ if @@by_codepoint_cache != nil && @@by_name_cache != nil
41
+ @by_name = @@by_name_cache
42
+ @by_codepoint = @@by_codepoint_cache
43
+ else
44
+ by_name, by_codepoint = load_adobe_glyph_mapping
45
+ @by_name = @@by_name_cache ||= by_name
46
+ @by_codepoint = @@by_codepoint_cache ||= by_codepoint
47
+ end
40
48
  end
41
49
 
42
50
  # attempt to convert a PDF Name to a unicode codepoint. Returns nil
@@ -127,7 +135,7 @@ class PDF::Reader
127
135
  end
128
136
  end
129
137
 
130
- [keyed_by_name.freeze, keyed_by_codepoint.freeze]
138
+ return keyed_by_name.freeze, keyed_by_codepoint.freeze
131
139
  end
132
140
 
133
141
  end
@@ -42,7 +42,7 @@ module PDF
42
42
  while bits_left_in_chunk > 0 and @current_pos < @data.size
43
43
  chunk = 0 if chunk < 0
44
44
  codepoint = @data[@current_pos, 1].to_s.unpack("C*")[0].to_i
45
- current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
45
+ current_byte = codepoint & (2**@bits_left_in_byte - 1).to_i #clear consumed bits
46
46
  dif = bits_left_in_chunk - @bits_left_in_byte
47
47
  if dif > 0 then current_byte <<= dif
48
48
  elsif dif < 0 then current_byte >>= dif.abs
@@ -0,0 +1,14 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class NoTextFilter
8
+
9
+ def self.exclude_empty_strings(runs)
10
+ runs.reject { |run| run.text.to_s.size == 0 }
11
+ end
12
+ end
13
+ end
14
+
@@ -2,6 +2,8 @@
2
2
  # typed: true
3
3
  # frozen_string_literal: true
4
4
 
5
+ require 'tempfile'
6
+
5
7
  class PDF::Reader
6
8
  # Provides low level access to the objects in a PDF file via a hash-like
7
9
  # object.
@@ -566,7 +568,7 @@ class PDF::Reader
566
568
  end
567
569
 
568
570
  def object_streams
569
- @object_stream ||= {}
571
+ @object_streams ||= {}
570
572
  end
571
573
 
572
574
  # returns an array of object references for all pages in this object store. The ordering of
@@ -591,18 +593,18 @@ class PDF::Reader
591
593
 
592
594
  def read_version
593
595
  @io.seek(0)
594
- _m, version = *@io.read(10).match(/PDF-(\d.\d)/)
596
+ _m, version = *@io.read(10).to_s.match(/PDF-(\d.\d)/)
595
597
  @io.seek(0)
596
598
  version.to_f
597
599
  end
598
600
 
599
601
  def extract_io_from(input)
600
- if input.respond_to?(:seek) && input.respond_to?(:read)
602
+ if input.is_a?(IO) || input.is_a?(StringIO) || input.is_a?(Tempfile)
601
603
  input
602
604
  elsif File.file?(input.to_s)
603
- StringIO.new read_as_binary(input)
605
+ StringIO.new read_as_binary(input.to_s)
604
606
  else
605
- raise ArgumentError, "input must be an IO-like object or a filename"
607
+ raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
606
608
  end
607
609
  end
608
610
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -1,6 +1,6 @@
1
- # typed: true
2
1
  # coding: utf-8
3
2
  # frozen_string_literal: true
3
+ # typed: strict
4
4
 
5
5
  class PDF::Reader
6
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  module PDF
@@ -43,10 +43,10 @@ module PDF
43
43
  #
44
44
  def initialize(objects, pagenum, options = {})
45
45
  @objects, @pagenum = objects, pagenum
46
- @page_object = objects.deref_hash(objects.page_references[pagenum - 1])
46
+ @page_object = objects.deref_hash(objects.page_references[pagenum - 1]) || {}
47
47
  @cache = options[:cache] || {}
48
48
 
49
- unless @page_object.is_a?(::Hash)
49
+ if @page_object.empty?
50
50
  raise InvalidPageError, "Invalid page: #{pagenum}"
51
51
  end
52
52
  end
@@ -250,8 +250,8 @@ module PDF
250
250
  params = []
251
251
 
252
252
  while (token = parser.parse_token(PagesStrategy::OPERATORS))
253
- if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
254
- callback(receivers, PagesStrategy::OPERATORS[token], params)
253
+ if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
254
+ callback(receivers, method_name, params)
255
255
  params.clear
256
256
  else
257
257
  params << token
@@ -263,9 +263,26 @@ module PDF
263
263
 
264
264
  # calls the name callback method on each receiver object with params as the arguments
265
265
  #
266
+ # The silly style here is because sorbet won't let me use splat arguments
267
+ #
266
268
  def callback(receivers, name, params=[])
267
269
  receivers.each do |receiver|
268
- receiver.send(name, *params) if receiver.respond_to?(name)
270
+ if receiver.respond_to?(name)
271
+ case params.size
272
+ when 0 then receiver.send(name)
273
+ when 1 then receiver.send(name, params[0])
274
+ when 2 then receiver.send(name, params[0], params[1])
275
+ when 3 then receiver.send(name, params[0], params[1], params[2])
276
+ when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
277
+ when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
278
+ when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
279
+ when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
280
+ when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
281
+ when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
282
+ else
283
+ receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
284
+ end
285
+ end
269
286
  end
270
287
  end
271
288
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  require 'pdf/reader/overlapping_runs_filter'
@@ -62,6 +62,8 @@ module PDF
62
62
  runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
63
  end
64
64
 
65
+ runs = NoTextFilter.exclude_empty_strings(runs)
66
+
65
67
  if opts.fetch(:merge, true)
66
68
  runs = merge_runs(runs)
67
69
  end
@@ -173,9 +173,7 @@ class PDF::Reader
173
173
 
174
174
  # add a missing digit if required, as required by the spec
175
175
  str << "0" unless str.size % 2 == 0
176
- str.chars.each_slice(2).map { |nibbles|
177
- nibbles.join("").hex.chr
178
- }.join.force_encoding("binary")
176
+ [str].pack('H*')
179
177
  end
180
178
  ################################################################################
181
179
  # Reads a PDF String from the buffer and converts it to a Ruby String
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -31,7 +31,8 @@ class PDF::Reader
31
31
  ################################################################################
32
32
  # An internal PDF::Reader class that represents an indirect reference to a PDF Object
33
33
  class Reference
34
- attr_reader :id, :gen
34
+ attr_reader :id
35
+ attr_reader :gen
35
36
  ################################################################################
36
37
  # Create a new Reference to an object with the specified id and revision number
37
38
  def initialize(id, gen)
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  module PDF
@@ -1,5 +1,5 @@
1
1
  # encoding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -7,7 +7,10 @@ class PDF::Reader
7
7
  class TextRun
8
8
  include Comparable
9
9
 
10
- attr_reader :origin, :width, :font_size, :text
10
+ attr_reader :origin
11
+ attr_reader :width
12
+ attr_reader :font_size
13
+ attr_reader :text
11
14
 
12
15
  alias :to_s :text
13
16
 
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  class PDF::Reader
@@ -51,7 +51,7 @@ class PDF::Reader
51
51
  # displacement to speed up processing documents that use vertical
52
52
  # writing systems
53
53
  #
54
- def multiply!(a,b=nil,c=nil, d=nil,e=nil,f=nil)
54
+ def multiply!(a,b,c, d,e,f)
55
55
  if a == 1 && b == 0 && c == 0 && d == 1 && e == 0 && f == 0
56
56
  # the identity matrix, no effect
57
57
  self
@@ -164,12 +164,12 @@ class PDF::Reader
164
164
  # [ e f 1 ] [ e f 1 ]
165
165
  #
166
166
  def regular_multiply!(a2,b2,c2,d2,e2,f2)
167
- newa = (@a * a2) + (@b * c2) + (0 * e2)
168
- newb = (@a * b2) + (@b * d2) + (0 * f2)
169
- newc = (@c * a2) + (@d * c2) + (0 * e2)
170
- newd = (@c * b2) + (@d * d2) + (0 * f2)
171
- newe = (@e * a2) + (@f * c2) + (1 * e2)
172
- newf = (@e * b2) + (@f * d2) + (1 * f2)
167
+ newa = (@a * a2) + (@b * c2) + (e2 * 0)
168
+ newb = (@a * b2) + (@b * d2) + (f2 * 0)
169
+ newc = (@c * a2) + (@d * c2) + (e2 * 0)
170
+ newd = (@c * b2) + (@d * d2) + (f2 * 0)
171
+ newe = (@e * a2) + (@f * c2) + (e2 * 1)
172
+ newf = (@e * b2) + (@f * d2) + (f2 * 1)
173
173
  @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
174
174
  end
175
175
 
@@ -73,7 +73,7 @@ class PDF::Reader
73
73
  #
74
74
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
75
75
  def [](ref)
76
- @xref[ref.id][ref.gen]
76
+ @xref.fetch(ref.id, {}).fetch(ref.gen)
77
77
  rescue
78
78
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
79
79
  end
@@ -82,8 +82,8 @@ class PDF::Reader
82
82
  def each(&block)
83
83
  ids = @xref.keys.sort
84
84
  ids.each do |id|
85
- gen = @xref[id].keys.sort[-1]
86
- yield PDF::Reader::Reference.new(id, gen)
85
+ gen = @xref.fetch(id, {}).keys.sort[-1]
86
+ yield PDF::Reader::Reference.new(id, gen.to_i)
87
87
  end
88
88
  end
89
89
  ################################################################################
data/lib/pdf/reader.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -128,7 +128,7 @@ module PDF
128
128
  doc_strings_to_utf8(dict)
129
129
  end
130
130
 
131
- # Return a Hash with extra metadata provided by the author of the PDF file. Not
131
+ # Return a String with extra XML metadata provided by the author of the PDF file. Not
132
132
  # always present.
133
133
  #
134
134
  def metadata
@@ -182,7 +182,7 @@ module PDF
182
182
  #
183
183
  # reader.pages.each do |page|
184
184
  # puts page.fonts
185
- # puts page.images
185
+ # puts page.rectangles
186
186
  # puts page.text
187
187
  # end
188
188
  #
@@ -272,13 +272,7 @@ module PDF
272
272
  end
273
273
 
274
274
  def root
275
- @root ||= begin
276
- obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
277
- unless obj.kind_of?(::Hash)
278
- raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
279
- end
280
- obj
281
- end
275
+ @root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
282
276
  end
283
277
 
284
278
  end
@@ -315,6 +309,7 @@ require 'pdf/reader/print_receiver'
315
309
  require 'pdf/reader/rectangle'
316
310
  require 'pdf/reader/reference'
317
311
  require 'pdf/reader/register_receiver'
312
+ require 'pdf/reader/no_text_filter'
318
313
  require 'pdf/reader/null_security_handler'
319
314
  require 'pdf/reader/security_handler_factory'
320
315
  require 'pdf/reader/standard_key_builder'