pdf-reader 2.4.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +40 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +63 -21
  8. data/lib/pdf/reader/cid_widths.rb +1 -0
  9. data/lib/pdf/reader/cmap.rb +5 -3
  10. data/lib/pdf/reader/encoding.rb +3 -2
  11. data/lib/pdf/reader/error.rb +11 -3
  12. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  13. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  14. data/lib/pdf/reader/filter/depredict.rb +10 -8
  15. data/lib/pdf/reader/filter/flate.rb +27 -14
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +1 -0
  18. data/lib/pdf/reader/filter/run_length.rb +19 -13
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +1 -0
  21. data/lib/pdf/reader/font_descriptor.rb +1 -0
  22. data/lib/pdf/reader/form_xobject.rb +1 -0
  23. data/lib/pdf/reader/glyph_hash.rb +16 -9
  24. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  25. data/lib/pdf/reader/lzw.rb +4 -2
  26. data/lib/pdf/reader/null_security_handler.rb +1 -0
  27. data/lib/pdf/reader/object_cache.rb +1 -0
  28. data/lib/pdf/reader/object_hash.rb +8 -3
  29. data/lib/pdf/reader/object_stream.rb +1 -0
  30. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  31. data/lib/pdf/reader/page.rb +60 -9
  32. data/lib/pdf/reader/page_layout.rb +37 -23
  33. data/lib/pdf/reader/page_state.rb +18 -23
  34. data/lib/pdf/reader/page_text_receiver.rb +28 -5
  35. data/lib/pdf/reader/pages_strategy.rb +1 -0
  36. data/lib/pdf/reader/parser.rb +12 -7
  37. data/lib/pdf/reader/point.rb +25 -0
  38. data/lib/pdf/reader/print_receiver.rb +1 -0
  39. data/lib/pdf/reader/rectangle.rb +95 -0
  40. data/lib/pdf/reader/reference.rb +1 -0
  41. data/lib/pdf/reader/register_receiver.rb +1 -0
  42. data/lib/pdf/reader/resource_methods.rb +5 -0
  43. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  45. data/lib/pdf/reader/stream.rb +1 -0
  46. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  47. data/lib/pdf/reader/text_run.rb +1 -0
  48. data/lib/pdf/reader/token.rb +1 -0
  49. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  50. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  52. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  56. data/lib/pdf/reader/width_calculator.rb +1 -0
  57. data/lib/pdf/reader/xref.rb +7 -1
  58. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  59. data/lib/pdf/reader.rb +14 -4
  60. data/lib/pdf-reader.rb +1 -0
  61. data/rbi/pdf-reader.rbi +1744 -0
  62. metadata +17 -13
  63. data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2b38615953615bfbca1a80ab344f26166377d8c08d2ba2e05badf43c10682415
4
- data.tar.gz: 658b5d05a14300ad056ee31c10ea998533ccb1b91600e8bc9097070605d003ea
3
+ metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
4
+ data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
5
5
  SHA512:
6
- metadata.gz: 210b0bee8c4ac009808555c8ba945f3b17b85af22126ac1440eb9b49d91f542f1974b0984efb22726985f2cf8e03440511ebc4664ac5c4d91a6bddea9a43687e
7
- data.tar.gz: 8fb60cb59dc4430179a4b9ba83d30ae6dc23aa13dbef5e8febe1569311ddf7e531783da7e7dd0a6542f0087748e97898af56d540e7c088832f213b48059aa7d3
6
+ metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
7
+ data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
data/CHANGELOG CHANGED
@@ -1,3 +1,43 @@
1
+ v2.7.0 (13th December 2021)
2
+ - Include RBI type files in the gem
3
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
4
+ now be type checked by sorbet
5
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
6
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
7
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
8
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
9
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
10
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
11
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
12
+
13
+ v2.6.0 (12th November 2021)
14
+ - Text extraction improvements
15
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
16
+ - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
17
+ - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
18
+ - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
19
+ - Performance improvements
20
+ - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
21
+ - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
22
+ - Successfully parse more files
23
+ - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
24
+ - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
25
+ - Increase the amount of junk bytes we detect and skip at the end of a file (382)
26
+ - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
27
+ - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
28
+ - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
29
+
30
+ v2.5.0 (6th June 2021)
31
+ - bump minimum ruby version to 2.0
32
+ - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
33
+ - Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
34
+ - Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
35
+ - Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
36
+
37
+ v2.4.2 (28th January 2021)
38
+ - relax ASCII85 dependency to allow 1.x
39
+ - improved support for decompressing objects with slightly malformed zlib data
40
+
1
41
  v.2.4.1 (24th September 2020)
2
42
  - Re-vendor font metrics from Adobe to clarify their license
3
43
 
data/README.md CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
166
166
  The easiest way to explain how this works in practice is to show some examples.
167
167
  Check out the examples/ directory for a few files.
168
168
 
169
+ # Alternate Decoder
170
+
171
+ For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
172
+
173
+ First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
174
+
175
+ ```ruby
176
+ require "pdf-reader"
177
+ require "ascii85_native"
178
+ ```
179
+
180
+ Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
181
+
169
182
  # Known Limitations
170
183
 
171
184
  Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
176
189
 
177
190
  * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
191
 
179
- * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
192
+ * PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
193
+
194
+ * Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
195
 
181
196
  * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
197
 
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 31
17
+ cane.max_violations = 32
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -17,8 +17,8 @@ module ExtractFonts
17
17
  return count if page.fonts.nil? || page.fonts.empty?
18
18
 
19
19
  page.fonts.each do |label, font|
20
- next if complete_refs[font]
21
- complete_refs[font] = true
20
+ next if complete_refs[label]
21
+ complete_refs[label] = true
22
22
 
23
23
  process_font(page, font)
24
24
 
@@ -39,7 +39,7 @@ module ExtractFonts
39
39
  when :TrueType, :CIDFontType2 then
40
40
  ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
41
41
  else
42
- $stderr.puts "unsupported font type #{font[:Subtype]}"
42
+ $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
43
43
  end
44
44
  end
45
45
 
@@ -68,10 +68,15 @@ module ExtractFonts
68
68
  end
69
69
  end
70
70
 
71
- filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
71
+ if ARGV.size == 0 # default file name
72
+ ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
73
+ end
74
+
72
75
  extractor = ExtractFonts::Extractor.new
73
76
 
74
- PDF::Reader.open(filename) do |reader|
75
- page = reader.page(1)
76
- extractor.page(page)
77
+ ARGV.each do |arg|
78
+ PDF::Reader.open(arg) do |reader|
79
+ page = reader.page(1)
80
+ extractor.page(page)
81
+ end
77
82
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -48,6 +49,15 @@ class PDF::Reader
48
49
  ID = "ID"
49
50
  FWD_SLASH = "/"
50
51
  NULL_BYTE = "\x00"
52
+ CR = "\r"
53
+ LF = "\n"
54
+ CRLF = "\r\n"
55
+ WHITE_SPACE = [LF, CR, ' ']
56
+
57
+ # Quite a few PDFs have trailing junk.
58
+ # This can be several k of nuls in some cases
59
+ # Allow for this here
60
+ TRAILING_BYTECOUNT = 5000
51
61
 
52
62
  attr_reader :pos
53
63
 
@@ -86,9 +96,12 @@ class PDF::Reader
86
96
  #
87
97
  # options:
88
98
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
99
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
100
+ # that is sitting under the io cursor.
101
+ # Note:
102
+ # Skipping a bare CR is not spec-compliant.
103
+ # This is because the data may start with LF.
104
+ # However we check for CRLF first, so the ambiguity is avoided.
92
105
  def read(bytes, opts = {})
93
106
  reset_pos
94
107
 
@@ -97,9 +110,9 @@ class PDF::Reader
97
110
  str = @io.read(2)
98
111
  if str.nil?
99
112
  return nil
100
- elsif str == "\r\n"
113
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
114
  # do nothing
102
- elsif str[0,1] == "\n"
115
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
116
  @io.seek(-1, IO::SEEK_CUR)
104
117
  else
105
118
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +140,8 @@ class PDF::Reader
127
140
  #
128
141
  def find_first_xref_offset
129
142
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
143
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
144
+ data = @io.read(TRAILING_BYTECOUNT)
132
145
 
133
146
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
147
  lines = data.split(/[\n\r]+/).reverse
@@ -217,7 +230,9 @@ class PDF::Reader
217
230
  return if @tokens.size < 3
218
231
  return if @tokens[2] != "R"
219
232
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
233
+ # must match whole tokens
234
+ digits_only = %r{\A\d+\z}
235
+ if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
221
236
  @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
237
  @tokens[1] = nil
223
238
  @tokens[2] = nil
@@ -225,24 +240,51 @@ class PDF::Reader
225
240
  end
226
241
  end
227
242
 
243
+ # Extract data between ID and EI
244
+ # If the EI follows white-space the space is dropped from the data
245
+ # The EI must followed by white-space or end of buffer
246
+ # This is to reduce the chance of accidentally matching an embedded EI
228
247
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
248
+ idstart = @io.pos
249
+ chr = prevchr = nil
250
+ eisize = 0 # how many chars in the end marker
251
+ seeking = 'E' # what are we looking for now?
252
+ loop do
234
253
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
254
+ break if chr.nil?
255
+ case seeking
256
+ when 'E'
257
+ if chr == 'E'
258
+ seeking = 'I'
259
+ if WHITE_SPACE.include? prevchr
260
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
261
+ else # assume the EI immediately follows the data
262
+ eisize = 2 # leave prevchr in data
263
+ end
264
+ end
265
+ when 'I'
266
+ if chr == 'I'
267
+ seeking = :END
268
+ else
269
+ seeking = 'E'
270
+ end
271
+ when :END
272
+ if WHITE_SPACE.include? chr
273
+ eisize += 1 # Drop trailer
274
+ break
275
+ else
276
+ seeking = 'E'
277
+ end
239
278
  end
279
+ prevchr = chr
240
280
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
281
+ unless seeking == :END
282
+ raise MalformedPDFError, "EI terminator not found"
283
+ end
284
+ eiend = @io.pos
285
+ @io.seek(idstart, IO::SEEK_SET)
286
+ str = @io.read(eiend - eisize - idstart) # get the ID content
244
287
  @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
246
288
  end
247
289
 
248
290
  # if we're currently inside a hex string, read hex nibbles until
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,6 +33,7 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
38
  "begincodespacerange" => 1,
37
39
  "endcodespacerange" => 1,
@@ -53,7 +55,7 @@ class PDF::Reader
53
55
 
54
56
  def process_data(data)
55
57
  parser = build_parser(data)
56
- mode = nil
58
+ mode = :none
57
59
  instructions = []
58
60
 
59
61
  while token = parser.parse_token(CMAP_KEYWORDS)
@@ -62,13 +64,13 @@ class PDF::Reader
62
64
  elsif token == "endbfchar"
63
65
  process_bfchar_instructions(instructions)
64
66
  instructions = []
65
- mode = nil
67
+ mode = :none
66
68
  elsif token == "beginbfrange"
67
69
  mode = :range
68
70
  elsif token == "endbfrange"
69
71
  process_bfrange_instructions(instructions)
70
72
  instructions = []
71
- mode = nil
73
+ mode = :none
72
74
  elsif mode == :char || mode == :range
73
75
  instructions << token
74
76
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -68,7 +69,7 @@ class PDF::Reader
68
69
  #
69
70
  # [25, :A, :B]
70
71
  def differences=(diff)
71
- raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)
72
+ PDF::Reader::Error.validate_type(diff, "diff", Array)
72
73
 
73
74
  @differences = {}
74
75
  byte = 0
@@ -208,7 +209,7 @@ class PDF::Reader
208
209
  def load_mapping(file)
209
210
  File.open(file, "r:BINARY") do |f|
210
211
  f.each do |l|
211
- _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
212
+ _m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
212
213
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
213
214
  end
214
215
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -33,19 +34,26 @@ class PDF::Reader
33
34
  def self.str_assert(lvalue, rvalue, chars=nil)
34
35
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
35
36
  lvalue = lvalue[0,chars] if chars
36
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
37
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
37
38
  end
38
39
  ################################################################################
39
40
  def self.str_assert_not(lvalue, rvalue, chars=nil)
40
41
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
41
42
  lvalue = lvalue[0,chars] if chars
42
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
43
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
43
44
  end
44
45
  ################################################################################
45
46
  def self.assert_equal(lvalue, rvalue)
46
- raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
47
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
48
  end
48
49
  ################################################################################
50
+ def self.validate_type(object, name, klass)
51
+ raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
52
+ end
53
+ ################################################################################
54
+ def self.validate_not_nil(object, name)
55
+ raise ArgumentError, "#{object} must not be nil" if object.nil?
56
+ end
49
57
  end
50
58
 
51
59
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'ascii85'
@@ -7,6 +8,7 @@ class PDF::Reader
7
8
  module Filter # :nodoc:
8
9
  # implementation of the Ascii85 filter
9
10
  class Ascii85
11
+
10
12
  def initialize(options = {})
11
13
  @options = options
12
14
  end
@@ -17,7 +19,11 @@ class PDF::Reader
17
19
  #
18
20
  def filter(data)
19
21
  data = "<~#{data}" unless data.to_s[0,2] == "<~"
20
- ::Ascii85::decode(data)
22
+ if defined?(::Ascii85Native)
23
+ ::Ascii85Native::decode(data)
24
+ else
25
+ ::Ascii85::decode(data)
26
+ end
21
27
  rescue Exception => e
22
28
  # Oops, there was a problem decoding the stream
23
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the AsciiHex stream filter
8
9
  class AsciiHex
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -16,9 +18,12 @@ class PDF::Reader
16
18
  def filter(data)
17
19
  data.chop! if data[-1,1] == ">"
18
20
  data = data[1,data.size] if data[0,1] == "<"
21
+
22
+ return "" if data.nil?
23
+
19
24
  data.gsub!(/[^A-Fa-f0-9]/,"")
20
25
  data << "0" if data.size % 2 == 1
21
- data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
26
+ data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
22
27
  rescue Exception => e
23
28
  # Oops, there was a problem decoding the stream
24
29
  raise MalformedPDFError,
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  # some filter implementations support preprocessing of the data to
7
8
  # improve compression
8
9
  class Depredict
10
+
9
11
  def initialize(options = {})
10
12
  @options = options || {}
11
13
  end
@@ -34,7 +36,7 @@ class PDF::Reader
34
36
  ################################################################################
35
37
  def tiff_depredict(data)
36
38
  data = data.unpack("C*")
37
- unfiltered = []
39
+ unfiltered = ''
38
40
  bpc = @options[:BitsPerComponent] || 8
39
41
  pixel_bits = bpc * @options[:Colors]
40
42
  pixel_bytes = pixel_bits / 8
@@ -51,11 +53,11 @@ class PDF::Reader
51
53
  left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
52
54
  row_data[index] = (byte + left) % 256
53
55
  end
54
- unfiltered += row_data
56
+ unfiltered += row_data.pack("C*")
55
57
  pos += line_len
56
58
  end
57
59
 
58
- unfiltered.pack("C*")
60
+ unfiltered
59
61
  end
60
62
  ################################################################################
61
63
  def png_depredict(data)
@@ -67,7 +69,7 @@ class PDF::Reader
67
69
  scanline_length = (pixel_bytes * @options[:Columns]) + 1
68
70
  row = 0
69
71
  pixels = []
70
- paeth, pa, pb, pc = nil
72
+ paeth, pa, pb, pc = 0, 0, 0, 0
71
73
  until data.empty? do
72
74
  row_data = data.slice! 0, scanline_length
73
75
  filter = row_data.shift
@@ -94,17 +96,17 @@ class PDF::Reader
94
96
  row_data[index] = (byte + ((left + upper)/2).floor) % 256
95
97
  end
96
98
  when 4 # Paeth
97
- left = upper = upper_left = nil
99
+ left = upper = upper_left = 0
98
100
  row_data.each_with_index do |byte, index|
99
101
  col = index / pixel_bytes
100
102
 
101
- left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
103
+ left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
102
104
  if row.zero?
103
105
  upper = upper_left = 0
104
106
  else
105
- upper = pixels[row-1][col][index % pixel_bytes]
107
+ upper = Integer(pixels[row-1][col][index % pixel_bytes])
106
108
  upper_left = col.zero? ? 0 :
107
- pixels[row-1][col-1][index % pixel_bytes]
109
+ Integer(pixels[row-1][col-1][index % pixel_bytes])
108
110
  end
109
111
 
110
112
  p = left + upper - upper_left
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
 
@@ -8,7 +9,9 @@ class PDF::Reader
8
9
  module Filter # :nodoc:
9
10
  # implementation of the Flate (zlib) stream filter
10
11
  class Flate
12
+
11
13
  ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
14
+ ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
12
15
 
13
16
  def initialize(options = {})
14
17
  @options = options
@@ -17,24 +20,34 @@ class PDF::Reader
17
20
  ################################################################################
18
21
  # Decode the specified data with the Zlib compression algorithm
19
22
  def filter(data)
20
- deflated = nil
23
+ deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
24
+
25
+ if deflated.nil?
26
+ raise MalformedPDFError,
27
+ "Error while inflating a compressed stream (no suitable inflation algorithm found)"
28
+ end
29
+ Depredict.new(@options).filter(deflated)
30
+ end
31
+
32
+ private
33
+
34
+ def zlib_inflate(data)
21
35
  begin
22
- deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
23
- rescue Zlib::DataError => e
36
+ return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
37
+ rescue Zlib::DataError
24
38
  # by default, Ruby's Zlib assumes the data it's inflating
25
39
  # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
26
- # fails, then use a lightly-documented 'feature' to attempt to inflate
27
- # the data as a raw RFC1951 stream.
28
- #
29
- # See
30
- # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
31
- deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
40
+ # fails, swallow the exception and attempt to inflate the data as a raw
41
+ # RFC1951 stream.
32
42
  end
33
- Depredict.new(@options).filter(deflated)
34
- rescue Exception => e
35
- # Oops, there was a problem inflating the stream
36
- raise MalformedPDFError,
37
- "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
43
+
44
+ begin
45
+ return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
46
+ rescue StandardError
47
+ # swallow this one too, so we can try some other fallback options
48
+ end
49
+
50
+ nil
38
51
  end
39
52
  end
40
53
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader
6
7
  module Filter # :nodoc:
7
8
  # implementation of the LZW stream filter
8
9
  class Lzw
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
6
7
  module Filter # :nodoc:
7
8
  # implementation of the run length stream filter
8
9
  class RunLength
10
+
9
11
  def initialize(options = {})
10
12
  @options = options
11
13
  end
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
20
22
  length = data.getbyte(pos)
21
23
  pos += 1
22
24
 
23
- case
24
- when length == 128
25
- break
26
- when length < 128
27
- # When the length is < 128, we copy the following length+1 bytes
28
- # literally.
29
- out << data[pos, length + 1]
30
- pos += length
31
- else
32
- # When the length is > 128, we copy the next byte (257 - length)
33
- # times; i.e., "\xFA\x00" ([250, 0]) will expand to
34
- # "\x00\x00\x00\x00\x00\x00\x00".
35
- out << data[pos, 1] * (257 - length)
25
+ unless length.nil?
26
+ case
27
+ # nothing
28
+ when length == 128
29
+ break
30
+ when length < 128
31
+ # When the length is < 128, we copy the following length+1 bytes
32
+ # literally.
33
+ out << data[pos, length + 1]
34
+ pos += length
35
+ else
36
+ # When the length is > 128, we copy the next byte (257 - length)
37
+ # times; i.e., "\xFA\x00" ([250, 0]) will expand to
38
+ # "\x00\x00\x00\x00\x00\x00\x00".
39
+ previous_byte = data[pos, 1] || ""
40
+ out << previous_byte * (257 - length)
41
+ end
36
42
  end
37
43
 
38
44
  pos += 1
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################