pdf-reader 2.9.2 → 2.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +39 -0
  3. data/README.md +33 -33
  4. data/Rakefile +2 -2
  5. data/lib/pdf/reader/advanced_text_run_filter.rb +152 -0
  6. data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
  7. data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
  8. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
  9. data/lib/pdf/reader/buffer.rb +39 -22
  10. data/lib/pdf/reader/cid_widths.rb +14 -6
  11. data/lib/pdf/reader/cmap.rb +16 -5
  12. data/lib/pdf/reader/encoding.rb +42 -18
  13. data/lib/pdf/reader/error.rb +6 -4
  14. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  15. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  16. data/lib/pdf/reader/filter/depredict.rb +6 -2
  17. data/lib/pdf/reader/filter/flate.rb +5 -2
  18. data/lib/pdf/reader/filter/lzw.rb +2 -0
  19. data/lib/pdf/reader/filter/null.rb +2 -0
  20. data/lib/pdf/reader/filter/run_length.rb +2 -0
  21. data/lib/pdf/reader/filter.rb +1 -0
  22. data/lib/pdf/reader/font.rb +99 -32
  23. data/lib/pdf/reader/font_descriptor.rb +79 -24
  24. data/lib/pdf/reader/form_xobject.rb +15 -1
  25. data/lib/pdf/reader/glyph_hash.rb +41 -8
  26. data/lib/pdf/reader/key_builder_v5.rb +17 -9
  27. data/lib/pdf/reader/lzw.rb +42 -16
  28. data/lib/pdf/reader/no_text_filter.rb +15 -0
  29. data/lib/pdf/reader/null_security_handler.rb +1 -0
  30. data/lib/pdf/reader/object_cache.rb +7 -2
  31. data/lib/pdf/reader/object_hash.rb +129 -16
  32. data/lib/pdf/reader/object_stream.rb +22 -5
  33. data/lib/pdf/reader/overlapping_runs_filter.rb +8 -2
  34. data/lib/pdf/reader/page.rb +66 -13
  35. data/lib/pdf/reader/page_layout.rb +26 -9
  36. data/lib/pdf/reader/page_state.rb +12 -3
  37. data/lib/pdf/reader/page_text_receiver.rb +16 -2
  38. data/lib/pdf/reader/pages_strategy.rb +1 -1
  39. data/lib/pdf/reader/parser.rb +52 -13
  40. data/lib/pdf/reader/point.rb +9 -2
  41. data/lib/pdf/reader/print_receiver.rb +2 -6
  42. data/lib/pdf/reader/rc4_security_handler.rb +2 -0
  43. data/lib/pdf/reader/rectangle.rb +24 -1
  44. data/lib/pdf/reader/reference.rb +13 -3
  45. data/lib/pdf/reader/register_receiver.rb +15 -2
  46. data/lib/pdf/reader/resources.rb +12 -2
  47. data/lib/pdf/reader/security_handler_factory.rb +13 -0
  48. data/lib/pdf/reader/standard_key_builder.rb +37 -23
  49. data/lib/pdf/reader/stream.rb +9 -3
  50. data/lib/pdf/reader/synchronized_cache.rb +6 -3
  51. data/lib/pdf/reader/text_run.rb +33 -3
  52. data/lib/pdf/reader/token.rb +1 -0
  53. data/lib/pdf/reader/transformation_matrix.rb +41 -10
  54. data/lib/pdf/reader/type_check.rb +53 -0
  55. data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
  56. data/lib/pdf/reader/validating_receiver.rb +29 -0
  57. data/lib/pdf/reader/width_calculator/built_in.rb +13 -5
  58. data/lib/pdf/reader/width_calculator/composite.rb +11 -3
  59. data/lib/pdf/reader/width_calculator/true_type.rb +14 -12
  60. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +8 -5
  61. data/lib/pdf/reader/width_calculator/type_zero.rb +8 -3
  62. data/lib/pdf/reader/xref.rb +31 -10
  63. data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
  64. data/lib/pdf/reader.rb +24 -12
  65. data/rbi/pdf-reader.rbi +1504 -1480
  66. metadata +34 -17
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc98ab07b3c66f13f663ea5faf8132b45d769912e0da737917dd054e38318ede
4
- data.tar.gz: 0f2928d9778b5b3ea8fca5e723a2b3fa6f275df70b02f1eb4385e077c535ac78
3
+ metadata.gz: 1be615eb6abc5557e61ba53958c7211fac0f1528e75dc54eff27ffb5554d7c80
4
+ data.tar.gz: 875221f31dc119cd0f7ae3cc0246b3bbb70f6127c0047ec924c8030e9186b55b
5
5
  SHA512:
6
- metadata.gz: 210cd8c8cef93b0e0fac1446c091c2a62772ffe8b1786627089e5a330ca7defd501df7cccc0b48d326d38ff74318b162e512220e8a4460260bebe7da0ef8b757
7
- data.tar.gz: 047e7f6641411557b1d3b50035dbdf55647c63deede273b6ce4442230b85372045494b81e88c1ffcaa09a7c5ea26823ee33b33c3bf82013328d0e32a95021284
6
+ metadata.gz: 4b4501ca72d06b5a569fdcc77f384131fbd85342f8da7a084a02210ec7a3821e8b9f1cad88685262d0cc4e993f7b0031bed5d510c353c7d8fb5fe28f97a2ea83
7
+ data.tar.gz: a4fe329f2d8ae7cc295cb17d573963ddab6c0cde52d6524ad182f4651dab8ba90215bcb1ecf60c7fcf248135aed152b50a1d34afa03b270b93c5a172ac4048b3
data/CHANGELOG CHANGED
@@ -1,3 +1,42 @@
1
+ v2.15.0 (13th August 2025)
2
+
3
+ - Overhaul sorbet types, moving from an external RBI file to inline comments in RBS syntax
4
+ - multiple PRs, but mainly https://github.com/yob/pdf-reader/pull/562
5
+ - See https://railsatscale.com/2025-04-23-rbs-support-for-sorbet/
6
+ - No impact expected for most users, but projects that use sorbet may find subtle changes in
7
+ the RBI file that is shipped with the gem
8
+ - Relax version requirements for dependency `afm`, allow 1.x (https://github.com/yob/pdf-reader/pull/557)
9
+ - Improve text positioning logic in some PDFs (https://github.com/yob/pdf-reader/pull/554)
10
+ - Multiple fixes for encrypted files
11
+ - Some files with passwords > 32 bytes long (https://github.com/yob/pdf-reader/pull/555)
12
+ - Some files that contain cipher text with a 16 byte IV and no further blocks (https://github.com/yob/pdf-reader/pull/561)
13
+ - Some files that encrypt data with no padding (https://github.com/yob/pdf-reader/pull/564)
14
+ - Add jruby 10 to CI matrix (https://github.com/yob/pdf-reader/pull/552)
15
+
16
+ v2.14.1 (4th February 2025)
17
+ - Fix issue in RBI signatures, introduced in v2.14.0 (https://github.com/yob/pdf-reader/pull/550)
18
+
19
+ v2.14.0 (29th January 2025)
20
+ - Raise minimum supported ruby to 2.1 (https://github.com/yob/pdf-reader/pull/543)
21
+ - Add support for filtering to Page#text (https://github.com/yob/pdf-reader/pull/545)
22
+
23
+ v2.13.0 (2nd November 2024)
24
+ - Permit Ascii85 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
25
+ - Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
26
+
27
+ v2.12.0 (26th December 2023)
28
+ - Fix a sorbet method signature (http://github.com/yob/pdf-reader/pull/512)
29
+ - Reduce allocations when parsing PDFs with hex strings (http://github.com/yob/pdf-reader/pull/528)
30
+ - Fix text extraction of some rare unicode codepoints (http://github.com/yob/pdf-reader/pull/529)
31
+
32
+ v2.11.0 (26th October 2022)
33
+ - Various bug fixes
34
+ - Expanded sorbet type annotations
35
+
36
+ v2.10.0 (12th May 2022)
37
+ - Various bug fixes
38
+ - Expanded sorbet type annotations
39
+
1
40
  v2.9.2 (20th February 2022)
2
41
  - Fix PDF::Reader::ObjectHash#page_references to return an Array of PDF::Reader::Reference (http://github.com/yob/pdf-reader/pull/444)
3
42
 
data/README.md CHANGED
@@ -20,7 +20,7 @@ page.
20
20
  The recommended installation method is via Rubygems.
21
21
 
22
22
  ```ruby
23
- gem install pdf-reader
23
+ gem install pdf-reader
24
24
  ```
25
25
 
26
26
  # Usage
@@ -30,23 +30,23 @@ level information (metadata, page count, bookmarks, etc) is available via
30
30
  this object.
31
31
 
32
32
  ```ruby
33
- reader = PDF::Reader.new("somefile.pdf")
33
+ reader = PDF::Reader.new("somefile.pdf")
34
34
 
35
- puts reader.pdf_version
36
- puts reader.info
37
- puts reader.metadata
38
- puts reader.page_count
35
+ puts reader.pdf_version
36
+ puts reader.info
37
+ puts reader.metadata
38
+ puts reader.page_count
39
39
  ```
40
40
 
41
41
  PDF::Reader.new accepts an IO stream or a filename. Here's an example with
42
42
  an IO stream:
43
43
 
44
44
  ```ruby
45
- require 'open-uri'
45
+ require 'open-uri'
46
46
 
47
- io = open('http://example.com/somefile.pdf')
48
- reader = PDF::Reader.new(io)
49
- puts reader.info
47
+ io = open('http://example.com/somefile.pdf')
48
+ reader = PDF::Reader.new(io)
49
+ puts reader.info
50
50
  ```
51
51
 
52
52
  If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
@@ -54,47 +54,47 @@ mode to ensure the file isn't mangled by ruby being 'helpful'. This is
54
54
  particularly important on windows and MRI >= 1.9.2.
55
55
 
56
56
  ```ruby
57
- File.open("somefile.pdf", "rb") do |io|
58
- reader = PDF::Reader.new(io)
59
- puts reader.info
60
- end
57
+ File.open("somefile.pdf", "rb") do |io|
58
+ reader = PDF::Reader.new(io)
59
+ puts reader.info
60
+ end
61
61
  ```
62
62
 
63
63
  PDF is a page based file format, so most visible information is available via
64
64
  page-based iteration
65
65
 
66
66
  ```ruby
67
- reader = PDF::Reader.new("somefile.pdf")
67
+ reader = PDF::Reader.new("somefile.pdf")
68
68
 
69
- reader.pages.each do |page|
70
- puts page.fonts
71
- puts page.text
72
- puts page.raw_content
73
- end
69
+ reader.pages.each do |page|
70
+ puts page.fonts
71
+ puts page.text
72
+ puts page.raw_content
73
+ end
74
74
  ```
75
75
 
76
76
  If you need to access the full program for rendering a page, use the walk() method
77
77
  of PDF::Reader::Page.
78
78
 
79
79
  ```ruby
80
- class RedGreenBlue
81
- def set_rgb_color_for_nonstroking(r, g, b)
82
- puts "R: #{r}, G: #{g}, B: #{b}"
83
- end
84
- end
85
-
86
- reader = PDF::Reader.new("somefile.pdf")
87
- page = reader.page(1)
88
- receiver = RedGreenBlue.new
89
- page.walk(receiver)
80
+ class RedGreenBlue
81
+ def set_rgb_color_for_nonstroking(r, g, b)
82
+ puts "R: #{r}, G: #{g}, B: #{b}"
83
+ end
84
+ end
85
+
86
+ reader = PDF::Reader.new("somefile.pdf")
87
+ page = reader.page(1)
88
+ receiver = RedGreenBlue.new
89
+ page.walk(receiver)
90
90
  ```
91
91
 
92
92
  For low level access to the objects in a PDF file, use the ObjectHash class like
93
93
  so:
94
94
 
95
95
  ```ruby
96
- reader = PDF::Reader.new("somefile.pdf")
97
- puts reader.objects.inspect
96
+ reader = PDF::Reader.new("somefile.pdf")
97
+ puts reader.objects.inspect
98
98
  ```
99
99
 
100
100
  # Text Encoding
@@ -141,7 +141,7 @@ the spec folder when you checkout a branch from Git.
141
141
  To remove any invalid CRLF characters added while checking out a branch from Git, run:
142
142
 
143
143
  ```ruby
144
- rake fix_integrity
144
+ rake fix_integrity
145
145
  ```
146
146
 
147
147
  # Maintainers
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 28
17
+ cane.max_violations = 33
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -41,7 +41,7 @@ end
41
41
  desc "Create a YAML file of integrity info for PDFs in the spec suite"
42
42
  task :integrity_yaml do
43
43
  data = {}
44
- Dir.glob("spec/data/**/*.*").sort.each do |path|
44
+ Dir.glob("spec/data/**/*.pdf").sort.each do |path|
45
45
  path_without_spec = path.gsub("spec/","")
46
46
  data[path_without_spec] = {
47
47
  :bytes => File.size(path),
@@ -0,0 +1,152 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+ # typed: strict
4
+
5
+ class PDF::Reader
6
+ # Filter a collection of TextRun objects based on a set of conditions.
7
+ # It can be used to filter text runs based on their attributes.
8
+ # The filter can return the text runs that match the conditions (only) or
9
+ # the text runs that do not match the conditions (exclude).
10
+ #
11
+ # You can filter the text runs based on all its attributes with the operators
12
+ # mentioned in VALID_OPERATORS.
13
+ # The filter can be nested with 'or' and 'and' conditions.
14
+ #
15
+ # Examples:
16
+ # 1. Single condition
17
+ # AdvancedTextRunFilter.exclude(text_runs, text: { include: 'sample' })
18
+ #
19
+ # 2. Multiple conditions (and)
20
+ # AdvancedTextRunFilter.exclude(text_runs, {
21
+ # font_size: { greater_than: 10, less_than: 15 }
22
+ # })
23
+ #
24
+ # 3. Multiple possible values (or)
25
+ # AdvancedTextRunFilter.exclude(text_runs, {
26
+ # font_size: { equal: [10, 12] }
27
+ # })
28
+ #
29
+ # 4. Complex AND/OR filter
30
+ # AdvancedTextRunFilter.exclude(text_runs, {
31
+ # and: [
32
+ # { font_size: { greater_than: 10 } },
33
+ # { or: [
34
+ # { text: { include: "sample" } },
35
+ # { width: { greater_than: 100 } }
36
+ # ]}
37
+ # ]
38
+ # })
39
+ class AdvancedTextRunFilter
40
+ VALID_OPERATORS = %i[
41
+ equal
42
+ not_equal
43
+ greater_than
44
+ less_than
45
+ greater_than_or_equal
46
+ less_than_or_equal
47
+ include
48
+ exclude
49
+ ] #: Array[Symbol]
50
+
51
+ #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
52
+ def self.only(text_runs, filter_hash)
53
+ new(text_runs, filter_hash).only
54
+ end
55
+
56
+ #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
57
+ def self.exclude(text_runs, filter_hash)
58
+ new(text_runs, filter_hash).exclude
59
+ end
60
+
61
+ #: Array[PDF::Reader::TextRun]
62
+ attr_reader :text_runs
63
+
64
+ #: Hash[Symbol, untyped]
65
+ attr_reader :filter_hash
66
+
67
+ #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> void
68
+ def initialize(text_runs, filter_hash)
69
+ @text_runs = text_runs
70
+ @filter_hash = filter_hash
71
+ end
72
+
73
+ #: () -> Array[PDF::Reader::TextRun]
74
+ def only
75
+ return text_runs if filter_hash.empty?
76
+ text_runs.select { |text_run| evaluate_filter(text_run) }
77
+ end
78
+
79
+ #: () -> Array[PDF::Reader::TextRun]
80
+ def exclude
81
+ return text_runs if filter_hash.empty?
82
+ text_runs.reject { |text_run| evaluate_filter(text_run) }
83
+ end
84
+
85
+ private
86
+
87
+ #: (PDF::Reader::TextRun) -> bool
88
+ def evaluate_filter(text_run)
89
+ if filter_hash[:or]
90
+ evaluate_or_filters(text_run, filter_hash[:or])
91
+ elsif filter_hash[:and]
92
+ evaluate_and_filters(text_run, filter_hash[:and])
93
+ else
94
+ evaluate_filters(text_run, filter_hash)
95
+ end
96
+ end
97
+
98
+ #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
99
+ def evaluate_or_filters(text_run, conditions)
100
+ conditions.any? do |condition|
101
+ evaluate_filters(text_run, condition)
102
+ end
103
+ end
104
+
105
+ #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
106
+ def evaluate_and_filters(text_run, conditions)
107
+ conditions.all? do |condition|
108
+ evaluate_filters(text_run, condition)
109
+ end
110
+ end
111
+
112
+ #: (PDF::Reader::TextRun, Hash[Symbol, untyped]) -> bool
113
+ def evaluate_filters(text_run, filter_hash)
114
+ filter_hash.all? do |attribute, conditions|
115
+ evaluate_attribute_conditions(text_run, attribute, conditions)
116
+ end
117
+ end
118
+
119
+ #: (PDF::Reader::TextRun, Symbol, Hash[Symbol, untyped]) -> bool
120
+ def evaluate_attribute_conditions(text_run, attribute, conditions)
121
+ conditions.all? do |operator, value|
122
+ unless VALID_OPERATORS.include?(operator)
123
+ raise ArgumentError, "Invalid operator: #{operator}"
124
+ end
125
+
126
+ apply_operator(text_run.send(attribute), operator, value)
127
+ end
128
+ end
129
+
130
+ #: (untyped, Symbol, untyped) -> bool
131
+ def apply_operator(attribute_value, operator, filter_value)
132
+ case operator
133
+ when :equal
134
+ Array(filter_value).include?(attribute_value)
135
+ when :not_equal
136
+ !Array(filter_value).include?(attribute_value)
137
+ when :greater_than
138
+ attribute_value > filter_value
139
+ when :less_than
140
+ attribute_value < filter_value
141
+ when :greater_than_or_equal
142
+ attribute_value >= filter_value
143
+ when :less_than_or_equal
144
+ attribute_value <= filter_value
145
+ when :include
146
+ Array(filter_value).any? { |v| attribute_value.to_s.include?(v.to_s) }
147
+ when :exclude
148
+ Array(filter_value).none? { |v| attribute_value.to_s.include?(v.to_s) }
149
+ end
150
+ end
151
+ end
152
+ end
@@ -11,6 +11,7 @@ class PDF::Reader
11
11
  #
12
12
  class AesV2SecurityHandler
13
13
 
14
+ #: (String) -> void
14
15
  def initialize(key)
15
16
  @encrypt_key = key
16
17
  end
@@ -21,10 +22,38 @@ class PDF::Reader
21
22
  #
22
23
  # version == 4 and CFM == AESV2
23
24
  #
25
+ # used to decrypt PDF streams (buf). Input data should be in bytesizes of
26
+ # a multiple of 16, anything else is an error. The first 16 bytes are the initialization
27
+ # vector, so any input of exactly 16 bytes decrypts to an empty string
28
+ #
24
29
  # buf - a string to decrypt
25
30
  # ref - a PDF::Reader::Reference for the object to decrypt
26
31
  #
32
+ #: (String, PDF::Reader::Reference) -> String
27
33
  def decrypt( buf, ref )
34
+ if buf.bytesize % 16 > 0
35
+ raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
36
+ elsif buf.bytesize == 16
37
+ return ""
38
+ else
39
+ begin
40
+ internal_decrypt(buf, ref)
41
+ rescue OpenSSL::Cipher::CipherError
42
+ # If we failed to decrypt it might be a padding error, so try again
43
+ # and assume no padding in the ciphertext. This will "suceed" but might
44
+ # return garbage if the key is incorrect but that's OK - well before this
45
+ # class is used we have confirmed the user provided key is correct so if
46
+ # this works without error we can be confident the returned plaintext is
47
+ # correct
48
+ internal_decrypt(buf, ref, false)
49
+ end
50
+ end
51
+ end
52
+
53
+ private
54
+
55
+ #: (String, PDF::Reader::Reference, ?bool) -> String
56
+ def internal_decrypt(buf, ref, padding = true)
28
57
  objKey = @encrypt_key.dup
29
58
  (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
59
  (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
@@ -32,6 +61,7 @@ class PDF::Reader
32
61
  length = objKey.length < 16 ? objKey.length : 16
33
62
  cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
63
  cipher.decrypt
64
+ cipher.padding = 0 unless padding
35
65
  cipher.key = Digest::MD5.digest(objKey)[0,length]
36
66
  cipher.iv = buf[0..15]
37
67
  cipher.update(buf[16..-1]) + cipher.final
@@ -12,27 +12,59 @@ class PDF::Reader
12
12
  #
13
13
  class AesV3SecurityHandler
14
14
 
15
+ #: (String) -> void
15
16
  def initialize(key)
17
+ if key.bytesize != 32
18
+ raise PDF::Reader::MalformedPDFError.new(
19
+ "AES-256 key must be exactly 32 bytes, got #{key.bytesize}"
20
+ )
21
+ end
16
22
  @encrypt_key = key
17
- @cipher = "AES-256-CBC"
23
+ @cipher = "AES-256-CBC" #: String
18
24
  end
19
25
 
20
26
  ##7.6.2 General Encryption Algorithm
21
27
  #
22
28
  # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
29
  #
24
- # used to decrypt RC4/AES encrypted PDF streams (buf)
30
+ # used to decrypt RC4/AES encrypted PDF streams (buf). Input data should be in bytesizes of
31
+ # a multiple of 16, anything else is an error. The first 16 bytes are the initialization
32
+ # vector, so any input of exactly 16 bytes decrypts to an empty string
25
33
  #
26
34
  # buf - a string to decrypt
27
35
  # ref - a PDF::Reader::Reference for the object to decrypt
28
36
  #
37
+ #: (String, PDF::Reader::Reference) -> String
29
38
  def decrypt( buf, ref )
39
+ if buf.bytesize % 16 > 0
40
+ raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
41
+ elsif buf.bytesize == 16
42
+ return ""
43
+ else
44
+ begin
45
+ internal_decrypt(buf, ref)
46
+ rescue OpenSSL::Cipher::CipherError
47
+ # If we failed to decrypt it might be a padding error, so try again
48
+ # and assume no padding in the ciphertext. This will "succeed" but might
49
+ # return garbage if the key is incorrect but that's OK - well before this
50
+ # class is used we have confirmed the user provided key is correct so if
51
+ # this works without error we can be confident the returned plaintext is
52
+ # correct
53
+ internal_decrypt(buf, ref, false)
54
+ end
55
+ end
56
+ end
57
+
58
+ private
59
+
60
+ #: (String, PDF::Reader::Reference, ?bool) -> String
61
+ def internal_decrypt(buf, ref, padding = true)
30
62
  cipher = OpenSSL::Cipher.new(@cipher)
31
63
  cipher.decrypt
64
+ cipher.padding = 0 unless padding
32
65
  cipher.key = @encrypt_key.dup
33
66
  cipher.iv = buf[0..15]
34
67
  cipher.update(buf[16..-1]) + cipher.final
35
68
  end
36
-
37
69
  end
38
70
  end
@@ -8,6 +8,7 @@ class PDF::Reader
8
8
  # MediaBox or CropBox, but could be a user specified rectangle too
9
9
  class BoundingRectangleRunsFilter
10
10
 
11
+ #: (Array[PDF::Reader::TextRun], PDF::Reader::Rectangle) -> Array[PDF::Reader::TextRun]
11
12
  def self.runs_within_rect(runs, rect)
12
13
  runs.select { |run| rect.contains?(run.origin) }
13
14
  end
@@ -1,5 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -38,30 +38,31 @@ class PDF::Reader
38
38
  # the raw tokens into objects we can work with (strings, ints, arrays, etc)
39
39
  #
40
40
  class Buffer
41
- TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
42
- TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
41
+ TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20] #: Array[Integer]
42
+ TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F] #: Array[Integer]
43
43
 
44
44
  # some strings for comparisons. Declaring them here avoids creating new
45
45
  # strings that need GC over and over
46
- LEFT_PAREN = "("
47
- LESS_THAN = "<"
48
- STREAM = "stream"
49
- ID = "ID"
50
- FWD_SLASH = "/"
51
- NULL_BYTE = "\x00"
52
- CR = "\r"
53
- LF = "\n"
54
- CRLF = "\r\n"
55
- WHITE_SPACE = [LF, CR, ' ']
46
+ LEFT_PAREN = "(" #: String
47
+ LESS_THAN = "<" #: String
48
+ STREAM = "stream" #: String
49
+ ID = "ID" #: String
50
+ FWD_SLASH = "/" #: String
51
+ NULL_BYTE = "\x00" #: String
52
+ CR = "\r" #: String
53
+ LF = "\n" #: String
54
+ CRLF = "\r\n" #: String
55
+ WHITE_SPACE = ["\n", "\r", ' '] #: Array[String]
56
56
 
57
57
  # Quite a few PDFs have trailing junk.
58
58
  # This can be several k of nuls in some cases
59
59
  # Allow for this here
60
- TRAILING_BYTECOUNT = 5000
60
+ TRAILING_BYTECOUNT = 5000 #: Integer
61
61
 
62
62
  # must match whole tokens
63
- DIGITS_ONLY = %r{\A\d+\z}
63
+ DIGITS_ONLY = %r{\A\d+\z} #: Regexp
64
64
 
65
+ #: Integer
65
66
  attr_reader :pos
66
67
 
67
68
  # Creates a new buffer.
@@ -76,17 +77,19 @@ class PDF::Reader
76
77
  # :content_stream - set to true if buffer will be tokenising a
77
78
  # content stream. Defaults to false
78
79
  #
80
+ #: ((StringIO | Tempfile | IO), ?Hash[Symbol, untyped]) -> void
79
81
  def initialize(io, opts = {})
80
82
  @io = io
81
- @tokens = []
82
- @in_content_stream = opts[:content_stream]
83
+ @tokens = [] #: Array[String | PDF::Reader::Reference]
84
+ @in_content_stream = opts[:content_stream] #: bool
83
85
 
84
86
  @io.seek(opts[:seek]) if opts[:seek]
85
- @pos = @io.pos
87
+ @pos = @io.pos #: Integer
86
88
  end
87
89
 
88
90
  # return true if there are no more tokens left
89
91
  #
92
+ #: () -> bool
90
93
  def empty?
91
94
  prepare_tokens if @tokens.size < 3
92
95
 
@@ -105,6 +108,7 @@ class PDF::Reader
105
108
  # Skipping a bare CR is not spec-compliant.
106
109
  # This is because the data may start with LF.
107
110
  # However we check for CRLF first, so the ambiguity is avoided.
111
+ #: (Integer, ?Hash[Symbol, untyped]) -> String?
108
112
  def read(bytes, opts = {})
109
113
  reset_pos
110
114
 
@@ -130,6 +134,7 @@ class PDF::Reader
130
134
  # return the next token from the source. Returns a string if a token
131
135
  # is found, nil if there are no tokens left.
132
136
  #
137
+ #: () -> (nil | String | PDF::Reader::Reference)
133
138
  def token
134
139
  reset_pos
135
140
  prepare_tokens if @tokens.size < 3
@@ -141,6 +146,7 @@ class PDF::Reader
141
146
 
142
147
  # return the byte offset where the first XRef table in the source can be found.
143
148
  #
149
+ #: () -> Integer
144
150
  def find_first_xref_offset
145
151
  check_size_is_non_zero
146
152
  @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
@@ -164,6 +170,7 @@ class PDF::Reader
164
170
 
165
171
  private
166
172
 
173
+ #: () -> void
167
174
  def check_size_is_non_zero
168
175
  @io.seek(-1, IO::SEEK_END)
169
176
  @io.seek(0)
@@ -173,12 +180,14 @@ class PDF::Reader
173
180
 
174
181
  # Returns true if this buffer is parsing a content stream
175
182
  #
183
+ #: () -> bool
176
184
  def in_content_stream?
177
185
  @in_content_stream ? true : false
178
186
  end
179
187
 
180
188
  # Some bastard moved our IO stream cursor. Restore it.
181
189
  #
190
+ #: () -> void
182
191
  def reset_pos
183
192
  @io.seek(@pos) if @io.pos != @pos
184
193
  end
@@ -186,12 +195,14 @@ class PDF::Reader
186
195
  # save the current position of the source IO stream. If someone else (like another buffer)
187
196
  # moves the cursor, we can then restore it.
188
197
  #
198
+ #: () -> void
189
199
  def save_pos
190
200
  @pos = @io.pos
191
201
  end
192
202
 
193
203
  # attempt to prime the buffer with the next few tokens.
194
204
  #
205
+ #: () -> void
195
206
  def prepare_tokens
196
207
  10.times do
197
208
  case state
@@ -208,6 +219,7 @@ class PDF::Reader
208
219
  # tokenising behaves slightly differently based on the current context.
209
220
  # Determine the current context/state by examining the last token we found
210
221
  #
222
+ #: () -> Symbol
211
223
  def state
212
224
  case @tokens.last
213
225
  when LEFT_PAREN then :literal_string
@@ -236,6 +248,7 @@ class PDF::Reader
236
248
  # indirect reference, so test for that case first and avoid the relatively
237
249
  # expensive regexp checks if possible.
238
250
  #
251
+ #: () -> void
239
252
  def merge_indirect_reference
240
253
  return if @tokens.size < 3
241
254
  return if @tokens[2] != "R"
@@ -253,6 +266,7 @@ class PDF::Reader
253
266
  # If the EI follows white-space the space is dropped from the data
254
267
  # The EI must followed by white-space or end of buffer
255
268
  # This is to reduce the chance of accidentally matching an embedded EI
269
+ #: () -> void
256
270
  def prepare_inline_token
257
271
  idstart = @io.pos
258
272
  prevchr = ''
@@ -299,14 +313,14 @@ class PDF::Reader
299
313
  # if we're currently inside a hex string, read hex nibbles until
300
314
  # we find a closing >
301
315
  #
316
+ #: () -> void
302
317
  def prepare_hex_token
303
- finished = :false
304
318
  str = "".dup
305
319
 
306
- until finished == :true
320
+ loop do
307
321
  byte = @io.getbyte
308
322
  if byte.nil?
309
- finished = :true # unbalanced params
323
+ break
310
324
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
311
325
  str << byte
312
326
  elsif byte <= 32
@@ -315,7 +329,7 @@ class PDF::Reader
315
329
  @tokens << str if str.size > 0
316
330
  @tokens << ">" if byte != 0x3E # '>'
317
331
  @tokens << byte.chr
318
- finished = :true
332
+ break
319
333
  end
320
334
  end
321
335
  end
@@ -329,6 +343,7 @@ class PDF::Reader
329
343
  # processing to fix things like escaped new lines, but that's someone else's
330
344
  # problem.
331
345
  #
346
+ #: () -> void
332
347
  def prepare_literal_token
333
348
  str = "".dup
334
349
  count = 1
@@ -359,6 +374,7 @@ class PDF::Reader
359
374
  # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
360
375
  # to read up on it.
361
376
  #
377
+ #: () -> void
362
378
  def prepare_regular_token
363
379
  tok = "".dup
364
380
 
@@ -436,6 +452,7 @@ class PDF::Reader
436
452
  # peek at the next character in the io stream, leaving the stream position
437
453
  # untouched
438
454
  #
455
+ #: () -> (Integer | nil)
439
456
  def peek_byte
440
457
  byte = @io.getbyte
441
458
  @io.seek(-1, IO::SEEK_CUR) if byte