pdf-reader 2.4.1 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +40 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +63 -21
  8. data/lib/pdf/reader/cid_widths.rb +1 -0
  9. data/lib/pdf/reader/cmap.rb +5 -3
  10. data/lib/pdf/reader/encoding.rb +3 -2
  11. data/lib/pdf/reader/error.rb +11 -3
  12. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  13. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  14. data/lib/pdf/reader/filter/depredict.rb +10 -8
  15. data/lib/pdf/reader/filter/flate.rb +27 -14
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +1 -0
  18. data/lib/pdf/reader/filter/run_length.rb +19 -13
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +1 -0
  21. data/lib/pdf/reader/font_descriptor.rb +1 -0
  22. data/lib/pdf/reader/form_xobject.rb +1 -0
  23. data/lib/pdf/reader/glyph_hash.rb +16 -9
  24. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  25. data/lib/pdf/reader/lzw.rb +4 -2
  26. data/lib/pdf/reader/null_security_handler.rb +1 -0
  27. data/lib/pdf/reader/object_cache.rb +1 -0
  28. data/lib/pdf/reader/object_hash.rb +8 -3
  29. data/lib/pdf/reader/object_stream.rb +1 -0
  30. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  31. data/lib/pdf/reader/page.rb +60 -9
  32. data/lib/pdf/reader/page_layout.rb +37 -23
  33. data/lib/pdf/reader/page_state.rb +18 -23
  34. data/lib/pdf/reader/page_text_receiver.rb +28 -5
  35. data/lib/pdf/reader/pages_strategy.rb +1 -0
  36. data/lib/pdf/reader/parser.rb +12 -7
  37. data/lib/pdf/reader/point.rb +25 -0
  38. data/lib/pdf/reader/print_receiver.rb +1 -0
  39. data/lib/pdf/reader/rectangle.rb +95 -0
  40. data/lib/pdf/reader/reference.rb +1 -0
  41. data/lib/pdf/reader/register_receiver.rb +1 -0
  42. data/lib/pdf/reader/resource_methods.rb +5 -0
  43. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  45. data/lib/pdf/reader/stream.rb +1 -0
  46. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  47. data/lib/pdf/reader/text_run.rb +1 -0
  48. data/lib/pdf/reader/token.rb +1 -0
  49. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  50. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  52. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  56. data/lib/pdf/reader/width_calculator.rb +1 -0
  57. data/lib/pdf/reader/xref.rb +7 -1
  58. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  59. data/lib/pdf/reader.rb +14 -4
  60. data/lib/pdf-reader.rb +1 -0
  61. data/rbi/pdf-reader.rbi +1744 -0
  62. metadata +17 -13
  63. data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2b38615953615bfbca1a80ab344f26166377d8c08d2ba2e05badf43c10682415
4
- data.tar.gz: 658b5d05a14300ad056ee31c10ea998533ccb1b91600e8bc9097070605d003ea
3
+ metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
4
+ data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
5
5
  SHA512:
6
- metadata.gz: 210b0bee8c4ac009808555c8ba945f3b17b85af22126ac1440eb9b49d91f542f1974b0984efb22726985f2cf8e03440511ebc4664ac5c4d91a6bddea9a43687e
7
- data.tar.gz: 8fb60cb59dc4430179a4b9ba83d30ae6dc23aa13dbef5e8febe1569311ddf7e531783da7e7dd0a6542f0087748e97898af56d540e7c088832f213b48059aa7d3
6
+ metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
7
+ data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
data/CHANGELOG CHANGED
@@ -1,3 +1,43 @@
1
+ v2.7.0 (13th December 2021)
2
+ - Include RBI type files in the gem
3
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
4
+ now be type checked by sorbet
5
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
6
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
7
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
8
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
9
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
10
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
11
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
12
+
13
+ v2.6.0 (12th November 2021)
14
+ - Text extraction improvements
15
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
16
+ - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
17
+ - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
18
+ - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
19
+ - Performance improvements
20
+ - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
21
+ - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
22
+ - Successfully parse more files
23
+ - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
24
+ - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
25
+ - Increase the amount of junk bytes we detect and skip at the end of a file (382)
26
+ - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
27
+ - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
28
+ - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
29
+
30
+ v2.5.0 (6th June 2021)
31
+ - bump minimum ruby version to 2.0
32
+ - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
33
+ - Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
34
+ - Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
35
+ - Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
36
+
37
+ v2.4.2 (28th January 2021)
38
+ - relax ASCII85 dependency to allow 1.x
39
+ - improved support for decompressing objects with slightly malformed zlib data
40
+
1
41
  v.2.4.1 (24th September 2020)
2
42
  - Re-vendor font metrics from Adobe to clarify their license
3
43
 
data/README.md CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
166
166
  The easiest way to explain how this works in practice is to show some examples.
167
167
  Check out the examples/ directory for a few files.
168
168
 
169
+ # Alternate Decoder
170
+
171
+ For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
172
+
173
+ First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
174
+
175
+ ```ruby
176
+ require "pdf-reader"
177
+ require "ascii85_native"
178
+ ```
179
+
180
+ Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
181
+
169
182
  # Known Limitations
170
183
 
171
184
  Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
176
189
 
177
190
  * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
191
 
179
- * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
192
+ * PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
193
+
194
+ * Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
195
 
181
196
  * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
197
 
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 31
17
+ cane.max_violations = 32
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -17,8 +17,8 @@ module ExtractFonts
17
17
  return count if page.fonts.nil? || page.fonts.empty?
18
18
 
19
19
  page.fonts.each do |label, font|
20
- next if complete_refs[font]
21
- complete_refs[font] = true
20
+ next if complete_refs[label]
21
+ complete_refs[label] = true
22
22
 
23
23
  process_font(page, font)
24
24
 
@@ -39,7 +39,7 @@ module ExtractFonts
39
39
  when :TrueType, :CIDFontType2 then
40
40
  ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
41
41
  else
42
- $stderr.puts "unsupported font type #{font[:Subtype]}"
42
+ $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
43
43
  end
44
44
  end
45
45
 
@@ -68,10 +68,15 @@ module ExtractFonts
68
68
  end
69
69
  end
70
70
 
71
- filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
71
+ if ARGV.size == 0 # default file name
72
+ ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
73
+ end
74
+
72
75
  extractor = ExtractFonts::Extractor.new
73
76
 
74
- PDF::Reader.open(filename) do |reader|
75
- page = reader.page(1)
76
- extractor.page(page)
77
+ ARGV.each do |arg|
78
+ PDF::Reader.open(arg) do |reader|
79
+ page = reader.page(1)
80
+ extractor.page(page)
81
+ end
77
82
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -48,6 +49,15 @@ class PDF::Reader
48
49
  ID = "ID"
49
50
  FWD_SLASH = "/"
50
51
  NULL_BYTE = "\x00"
52
+ CR = "\r"
53
+ LF = "\n"
54
+ CRLF = "\r\n"
55
+ WHITE_SPACE = [LF, CR, ' ']
56
+
57
+ # Quite a few PDFs have trailing junk.
58
+ # This can be several k of nuls in some cases
59
+ # Allow for this here
60
+ TRAILING_BYTECOUNT = 5000
51
61
 
52
62
  attr_reader :pos
53
63
 
@@ -86,9 +96,12 @@ class PDF::Reader
86
96
  #
87
97
  # options:
88
98
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
99
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
100
+ # that is sitting under the io cursor.
101
+ # Note:
102
+ # Skipping a bare CR is not spec-compliant.
103
+ # This is because the data may start with LF.
104
+ # However we check for CRLF first, so the ambiguity is avoided.
92
105
  def read(bytes, opts = {})
93
106
  reset_pos
94
107
 
@@ -97,9 +110,9 @@ class PDF::Reader
97
110
  str = @io.read(2)
98
111
  if str.nil?
99
112
  return nil
100
- elsif str == "\r\n"
113
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
114
  # do nothing
102
- elsif str[0,1] == "\n"
115
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
116
  @io.seek(-1, IO::SEEK_CUR)
104
117
  else
105
118
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +140,8 @@ class PDF::Reader
127
140
  #
128
141
  def find_first_xref_offset
129
142
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
143
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
144
+ data = @io.read(TRAILING_BYTECOUNT)
132
145
 
133
146
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
147
  lines = data.split(/[\n\r]+/).reverse
@@ -217,7 +230,9 @@ class PDF::Reader
217
230
  return if @tokens.size < 3
218
231
  return if @tokens[2] != "R"
219
232
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
233
+ # must match whole tokens
234
+ digits_only = %r{\A\d+\z}
235
+ if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
221
236
  @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
237
  @tokens[1] = nil
223
238
  @tokens[2] = nil
@@ -225,24 +240,51 @@ class PDF::Reader
225
240
  end
226
241
  end
227
242
 
243
+ # Extract data between ID and EI
244
+ # If the EI follows white-space the space is dropped from the data
245
+ # The EI must be followed by white-space or end of buffer
246
+ # This is to reduce the chance of accidentally matching an embedded EI
228
247
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
248
+ idstart = @io.pos
249
+ chr = prevchr = nil
250
+ eisize = 0 # how many chars in the end marker
251
+ seeking = 'E' # what are we looking for now?
252
+ loop do
234
253
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
254
+ break if chr.nil?
255
+ case seeking
256
+ when 'E'
257
+ if chr == 'E'
258
+ seeking = 'I'
259
+ if WHITE_SPACE.include? prevchr
260
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
261
+ else # assume the EI immediately follows the data
262
+ eisize = 2 # leave prevchr in data
263
+ end
264
+ end
265
+ when 'I'
266
+ if chr == 'I'
267
+ seeking = :END
268
+ else
269
+ seeking = 'E'
270
+ end
271
+ when :END
272
+ if WHITE_SPACE.include? chr
273
+ eisize += 1 # Drop trailer
274
+ break
275
+ else
276
+ seeking = 'E'
277
+ end
239
278
  end
279
+ prevchr = chr
240
280
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
281
+ unless seeking == :END
282
+ raise MalformedPDFError, "EI terminator not found"
283
+ end
284
+ eiend = @io.pos
285
+ @io.seek(idstart, IO::SEEK_SET)
286
+ str = @io.read(eiend - eisize - idstart) # get the ID content
244
287
  @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
246
288
  end
247
289
 
248
290
  # if we're currently inside a hex string, read hex nibbles until
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,6 +33,7 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
38
  "begincodespacerange" => 1,
37
39
  "endcodespacerange" => 1,
@@ -53,7 +55,7 @@ class PDF::Reader
53
55
 
54
56
  def process_data(data)
55
57
  parser = build_parser(data)
56
- mode = nil
58
+ mode = :none
57
59
  instructions = []
58
60
 
59
61
  while token = parser.parse_token(CMAP_KEYWORDS)
@@ -62,13 +64,13 @@ class PDF::Reader
62
64
  elsif token == "endbfchar"
63
65
  process_bfchar_instructions(instructions)
64
66
  instructions = []
65
- mode = nil
67
+ mode = :none
66
68
  elsif token == "beginbfrange"
67
69
  mode = :range
68
70
  elsif token == "endbfrange"
69
71
  process_bfrange_instructions(instructions)
70
72
  instructions = []
71
- mode = nil
73
+ mode = :none
72
74
  elsif mode == :char || mode == :range
73
75
  instructions << token
74
76
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
68
69
  #
69
70
  # [25, :A, :B]
70
71
  def differences=(diff)
71
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
72
73
 
73
74
  @differences = {}
74
75
  byte = 0
@@ -208,7 +209,7 @@ class PDF::Reader
208
209
  def load_mapping(file)
209
210
  File.open(file, "r:BINARY") do |f|
210
211
  f.each do |l|
211
- _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
212
+ _m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
212
213
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
213
214
  end
214
215
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -33,19 +34,26 @@ class PDF::Reader
33
34
  def self.str_assert(lvalue, rvalue, chars=nil)
34
35
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
35
36
  lvalue = lvalue[0,chars] if chars
36
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
37
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
37
38
  end
38
39
  ################################################################################
39
40
  def self.str_assert_not(lvalue, rvalue, chars=nil)
40
41
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
41
42
  lvalue = lvalue[0,chars] if chars
42
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
43
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
43
44
  end
44
45
  ################################################################################
45
46
  def self.assert_equal(lvalue, rvalue)
46
- raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
47
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_not_nil(object, name)
55
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
56
+ end
49
57
  end
50
58
 
51
59
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -17,7 +19,11 @@ class PDF::Reader
17
19
  #
18
20
  def filter(data)
19
21
  data = "<~#{data}" unless data.to_s[0,2] == "<~"
20
- ::Ascii85::decode(data)
22
+ if defined?(::Ascii85Native)
23
+ ::Ascii85Native::decode(data)
24
+ else
25
+ ::Ascii85::decode(data)
26
+ end
21
27
  rescue Exception => e
22
28
  # Oops, there was a problem decoding the stream
23
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  # some filter implementations support preprocessing of the data to
7
8
  # improve compression
8
9
  class Depredict
10
+
9
11
  def initialize(options = {})
10
12
  @options = options || {}
11
13
  end
@@ -34,7 +36,7 @@ class PDF::Reader
34
36
  ################################################################################
35
37
  def tiff_depredict(data)
36
38
  data = data.unpack("C*")
37
- unfiltered = []
39
+ unfiltered = ''
38
40
  bpc = @options[:BitsPerComponent] || 8
39
41
  pixel_bits = bpc * @options[:Colors]
40
42
  pixel_bytes = pixel_bits / 8
@@ -51,11 +53,11 @@ class PDF::Reader
51
53
  left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
52
54
  row_data[index] = (byte + left) % 256
53
55
  end
54
- unfiltered += row_data
56
+ unfiltered += row_data.pack("C*")
55
57
  pos += line_len
56
58
  end
57
59
 
58
- unfiltered.pack("C*")
60
+ unfiltered
59
61
  end
60
62
  ################################################################################
61
63
  def png_depredict(data)
@@ -67,7 +69,7 @@ class PDF::Reader
67
69
  scanline_length = (pixel_bytes * @options[:Columns]) + 1
68
70
  row = 0
69
71
  pixels = []
70
- paeth, pa, pb, pc = nil
72
+ paeth, pa, pb, pc = 0, 0, 0, 0
71
73
  until data.empty? do
72
74
  row_data = data.slice! 0, scanline_length
73
75
  filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
94
96
  row_data[index] = (byte + ((left + upper)/2).floor) % 256
95
97
  end
96
98
  when 4 # Paeth
97
- left = upper = upper_left = nil
99
+ left = upper = upper_left = 0
98
100
  row_data.each_with_index do |byte, index|
99
101
  col = index / pixel_bytes
100
102
 
101
- left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
103
+ left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
102
104
  if row.zero?
103
105
  upper = upper_left = 0
104
106
  else
105
- upper = pixels[row-1][col][index % pixel_bytes]
107
+ upper = Integer(pixels[row-1][col][index % pixel_bytes])
106
108
  upper_left = col.zero? ? 0 :
107
- pixels[row-1][col-1][index % pixel_bytes]
109
+ Integer(pixels[row-1][col-1][index % pixel_bytes])
108
110
  end
109
111
 
110
112
  p = left + upper - upper_left
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
 
@@ -8,7 +9,9 @@ class PDF::Reader
8
9
  module Filter # :nodoc:
9
10
  # implementation of the Flate (zlib) stream filter
10
11
  class Flate
12
+
11
13
  ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
14
+ ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
12
15
 
13
16
  def initialize(options = {})
14
17
  @options = options
@@ -17,24 +20,34 @@ class PDF::Reader
17
20
  ################################################################################
18
21
  # Decode the specified data with the Zlib compression algorithm
19
22
  def filter(data)
20
- deflated = nil
23
+ deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
24
+
25
+ if deflated.nil?
26
+ raise MalformedPDFError,
27
+ "Error while inflating a compressed stream (no suitable inflation algorithm found)"
28
+ end
29
+ Depredict.new(@options).filter(deflated)
30
+ end
31
+
32
+ private
33
+
34
+ def zlib_inflate(data)
21
35
  begin
22
- deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
23
- rescue Zlib::DataError => e
36
+ return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
37
+ rescue Zlib::DataError
24
38
  # by default, Ruby's Zlib assumes the data it's inflating
25
39
  # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
26
- # fails, then use a lightly-documented 'feature' to attempt to inflate
27
- # the data as a raw RFC1951 stream.
28
- #
29
- # See
30
- # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
31
- deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
40
+ # fails, swallow the exception and attempt to inflate the data as a raw
41
+ # RFC1951 stream.
32
42
  end
33
- Depredict.new(@options).filter(deflated)
34
- rescue Exception => e
35
- # Oops, there was a problem inflating the stream
36
- raise MalformedPDFError,
37
- "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
43
+
44
+ begin
45
+ return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
46
+ rescue StandardError
47
+ # swallow this one too, so we can try some other fallback options
48
+ end
49
+
50
+ nil
38
51
  end
39
52
  end
40
53
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the LZW stream filter
8
9
  class Lzw
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
6
7
  module Filter # :nodoc:
7
8
  # implementation of the run length stream filter
8
9
  class RunLength
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
20
22
  length = data.getbyte(pos)
21
23
  pos += 1
22
24
 
23
- case
24
- when length == 128
25
- break
26
- when length < 128
27
- # When the length is < 128, we copy the following length+1 bytes
28
- # literally.
29
- out << data[pos, length + 1]
30
- pos += length
31
- else
32
- # When the length is > 128, we copy the next byte (257 - length)
33
- # times; i.e., "\xFA\x00" ([250, 0]) will expand to
34
- # "\x00\x00\x00\x00\x00\x00\x00".
35
- out << data[pos, 1] * (257 - length)
25
+ unless length.nil?
26
+ case
27
+ # nothing
28
+ when length == 128
29
+ break
30
+ when length < 128
31
+ # When the length is < 128, we copy the following length+1 bytes
32
+ # literally.
33
+ out << data[pos, length + 1]
34
+ pos += length
35
+ else
36
+ # When the length is > 128, we copy the next byte (257 - length)
37
+ # times; i.e., "\xFA\x00" ([250, 0]) will expand to
38
+ # "\x00\x00\x00\x00\x00\x00\x00".
39
+ previous_byte = data[pos, 1] || ""
40
+ out << previous_byte * (257 - length)
41
+ end
36
42
  end
37
43
 
38
44
  pos += 1
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################