pdf-reader 2.9.2 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +39 -0
- data/README.md +33 -33
- data/Rakefile +2 -2
- data/lib/pdf/reader/advanced_text_run_filter.rb +152 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
- data/lib/pdf/reader/buffer.rb +39 -22
- data/lib/pdf/reader/cid_widths.rb +14 -6
- data/lib/pdf/reader/cmap.rb +16 -5
- data/lib/pdf/reader/encoding.rb +42 -18
- data/lib/pdf/reader/error.rb +6 -4
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +6 -2
- data/lib/pdf/reader/filter/flate.rb +5 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +2 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +99 -32
- data/lib/pdf/reader/font_descriptor.rb +79 -24
- data/lib/pdf/reader/form_xobject.rb +15 -1
- data/lib/pdf/reader/glyph_hash.rb +41 -8
- data/lib/pdf/reader/key_builder_v5.rb +17 -9
- data/lib/pdf/reader/lzw.rb +42 -16
- data/lib/pdf/reader/no_text_filter.rb +15 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +7 -2
- data/lib/pdf/reader/object_hash.rb +129 -16
- data/lib/pdf/reader/object_stream.rb +22 -5
- data/lib/pdf/reader/overlapping_runs_filter.rb +8 -2
- data/lib/pdf/reader/page.rb +66 -13
- data/lib/pdf/reader/page_layout.rb +26 -9
- data/lib/pdf/reader/page_state.rb +12 -3
- data/lib/pdf/reader/page_text_receiver.rb +16 -2
- data/lib/pdf/reader/pages_strategy.rb +1 -1
- data/lib/pdf/reader/parser.rb +52 -13
- data/lib/pdf/reader/point.rb +9 -2
- data/lib/pdf/reader/print_receiver.rb +2 -6
- data/lib/pdf/reader/rc4_security_handler.rb +2 -0
- data/lib/pdf/reader/rectangle.rb +24 -1
- data/lib/pdf/reader/reference.rb +13 -3
- data/lib/pdf/reader/register_receiver.rb +15 -2
- data/lib/pdf/reader/resources.rb +12 -2
- data/lib/pdf/reader/security_handler_factory.rb +13 -0
- data/lib/pdf/reader/standard_key_builder.rb +37 -23
- data/lib/pdf/reader/stream.rb +9 -3
- data/lib/pdf/reader/synchronized_cache.rb +6 -3
- data/lib/pdf/reader/text_run.rb +33 -3
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +41 -10
- data/lib/pdf/reader/type_check.rb +53 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
- data/lib/pdf/reader/validating_receiver.rb +29 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +13 -5
- data/lib/pdf/reader/width_calculator/composite.rb +11 -3
- data/lib/pdf/reader/width_calculator/true_type.rb +14 -12
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +8 -5
- data/lib/pdf/reader/width_calculator/type_zero.rb +8 -3
- data/lib/pdf/reader/xref.rb +31 -10
- data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
- data/lib/pdf/reader.rb +24 -12
- data/rbi/pdf-reader.rbi +1504 -1480
- metadata +34 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1be615eb6abc5557e61ba53958c7211fac0f1528e75dc54eff27ffb5554d7c80
|
4
|
+
data.tar.gz: 875221f31dc119cd0f7ae3cc0246b3bbb70f6127c0047ec924c8030e9186b55b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4b4501ca72d06b5a569fdcc77f384131fbd85342f8da7a084a02210ec7a3821e8b9f1cad88685262d0cc4e993f7b0031bed5d510c353c7d8fb5fe28f97a2ea83
|
7
|
+
data.tar.gz: a4fe329f2d8ae7cc295cb17d573963ddab6c0cde52d6524ad182f4651dab8ba90215bcb1ecf60c7fcf248135aed152b50a1d34afa03b270b93c5a172ac4048b3
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,42 @@
|
|
1
|
+
v2.15.0 (13th August 2025)
|
2
|
+
|
3
|
+
- Overhaul sorbet types, moving from an external RBI file to inline comments in RBS syntax
|
4
|
+
- multiple PRs, but mainly https://github.com/yob/pdf-reader/pull/562
|
5
|
+
- See https://railsatscale.com/2025-04-23-rbs-support-for-sorbet/
|
6
|
+
- No impact expected for most users, but projects that use sorbet may find subtle changes in
|
7
|
+
the RBI file that is shipped with the gem
|
8
|
+
- Relax version requirements for dependency `afm`, allow 1.x (https://github.com/yob/pdf-reader/pull/557)
|
9
|
+
- Improve text positioning logic in some PDFs (https://github.com/yob/pdf-reader/pull/554)
|
10
|
+
- Multiple fixes for encrypted files
|
11
|
+
- Some files with passwords > 32 bytes long (https://github.com/yob/pdf-reader/pull/555)
|
12
|
+
- Some files that contain cipher text with a 16 byte IV and no further blocks (https://github.com/yob/pdf-reader/pull/561)
|
13
|
+
- Some files that contain data encrypted with no padding (https://github.com/yob/pdf-reader/pull/564)
|
14
|
+
- Add jruby 10 to CI matrix (https://github.com/yob/pdf-reader/pull/552)
|
15
|
+
|
16
|
+
v2.14.1 (4th February 2025)
|
17
|
+
- Fix issue in RBI signatures, introduced in v2.14.0 (https://github.com/yob/pdf-reader/pull/550)
|
18
|
+
|
19
|
+
v2.14.0 (29th January 2025)
|
20
|
+
- Raise minimum supported ruby to 2.1 (https://github.com/yob/pdf-reader/pull/543)
|
21
|
+
- Add support for filtering to Page#text (https://github.com/yob/pdf-reader/pull/545)
|
22
|
+
|
23
|
+
v2.13.0 (2nd November 2024)
|
24
|
+
- Permit Ascii85 v1.0 and v2.0 (https://github.com/yob/pdf-reader/pull/539)
|
25
|
+
- Allow StringIO type for PDF::Reader input (https://github.com/yob/pdf-reader/pull/535)
|
26
|
+
|
27
|
+
v2.12.0 (26th December 2023)
|
28
|
+
- Fix a sorbet method signature (http://github.com/yob/pdf-reader/pull/512)
|
29
|
+
- Reduce allocations when parsing PDFs with hex strings (http://github.com/yob/pdf-reader/pull/528)
|
30
|
+
- Fix text extraction of some rare unicode codepoints (http://github.com/yob/pdf-reader/pull/529)
|
31
|
+
|
32
|
+
v2.11.0 (26th October 2022)
|
33
|
+
- Various bug fixes
|
34
|
+
- Expanded sorbet type annotations
|
35
|
+
|
36
|
+
v2.10.0 (12th May 2022)
|
37
|
+
- Various bug fixes
|
38
|
+
- Expanded sorbet type annotations
|
39
|
+
|
1
40
|
v2.9.2 (20th February 2022)
|
2
41
|
- Fix PDF::Reader::ObjectHash#page_references to return an Array of PDF::Reader::Reference (http://github.com/yob/pdf-reader/pull/444)
|
3
42
|
|
data/README.md
CHANGED
@@ -20,7 +20,7 @@ page.
|
|
20
20
|
The recommended installation method is via Rubygems.
|
21
21
|
|
22
22
|
```ruby
|
23
|
-
|
23
|
+
gem install pdf-reader
|
24
24
|
```
|
25
25
|
|
26
26
|
# Usage
|
@@ -30,23 +30,23 @@ level information (metadata, page count, bookmarks, etc) is available via
|
|
30
30
|
this object.
|
31
31
|
|
32
32
|
```ruby
|
33
|
-
|
33
|
+
reader = PDF::Reader.new("somefile.pdf")
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
puts reader.pdf_version
|
36
|
+
puts reader.info
|
37
|
+
puts reader.metadata
|
38
|
+
puts reader.page_count
|
39
39
|
```
|
40
40
|
|
41
41
|
PDF::Reader.new accepts an IO stream or a filename. Here's an example with
|
42
42
|
an IO stream:
|
43
43
|
|
44
44
|
```ruby
|
45
|
-
|
45
|
+
require 'open-uri'
|
46
46
|
|
47
|
-
|
48
|
-
|
49
|
-
|
47
|
+
io = open('http://example.com/somefile.pdf')
|
48
|
+
reader = PDF::Reader.new(io)
|
49
|
+
puts reader.info
|
50
50
|
```
|
51
51
|
|
52
52
|
If you open a PDF with File#open or IO#open, I strongly recommend using "rb"
|
@@ -54,47 +54,47 @@ mode to ensure the file isn't mangled by ruby being 'helpful'. This is
|
|
54
54
|
particularly important on windows and MRI >= 1.9.2.
|
55
55
|
|
56
56
|
```ruby
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
57
|
+
File.open("somefile.pdf", "rb") do |io|
|
58
|
+
reader = PDF::Reader.new(io)
|
59
|
+
puts reader.info
|
60
|
+
end
|
61
61
|
```
|
62
62
|
|
63
63
|
PDF is a page based file format, so most visible information is available via
|
64
64
|
page-based iteration
|
65
65
|
|
66
66
|
```ruby
|
67
|
-
|
67
|
+
reader = PDF::Reader.new("somefile.pdf")
|
68
68
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
69
|
+
reader.pages.each do |page|
|
70
|
+
puts page.fonts
|
71
|
+
puts page.text
|
72
|
+
puts page.raw_content
|
73
|
+
end
|
74
74
|
```
|
75
75
|
|
76
76
|
If you need to access the full program for rendering a page, use the walk() method
|
77
77
|
of PDF::Reader::Page.
|
78
78
|
|
79
79
|
```ruby
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
80
|
+
class RedGreenBlue
|
81
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
82
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
reader = PDF::Reader.new("somefile.pdf")
|
87
|
+
page = reader.page(1)
|
88
|
+
receiver = RedGreenBlue.new
|
89
|
+
page.walk(receiver)
|
90
90
|
```
|
91
91
|
|
92
92
|
For low level access to the objects in a PDF file, use the ObjectHash class like
|
93
93
|
so:
|
94
94
|
|
95
95
|
```ruby
|
96
|
-
|
97
|
-
|
96
|
+
reader = PDF::Reader.new("somefile.pdf")
|
97
|
+
puts reader.objects.inspect
|
98
98
|
```
|
99
99
|
|
100
100
|
# Text Encoding
|
@@ -141,7 +141,7 @@ the spec folder when you checkout a branch from Git.
|
|
141
141
|
To remove any invalid CRLF characters added while checking out a branch from Git, run:
|
142
142
|
|
143
143
|
```ruby
|
144
|
-
|
144
|
+
rake fix_integrity
|
145
145
|
```
|
146
146
|
|
147
147
|
# Maintainers
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 33
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
@@ -41,7 +41,7 @@ end
|
|
41
41
|
desc "Create a YAML file of integrity info for PDFs in the spec suite"
|
42
42
|
task :integrity_yaml do
|
43
43
|
data = {}
|
44
|
-
Dir.glob("spec/data
|
44
|
+
Dir.glob("spec/data/**/*.pdf").sort.each do |path|
|
45
45
|
path_without_spec = path.gsub("spec/","")
|
46
46
|
data[path_without_spec] = {
|
47
47
|
:bytes => File.size(path),
|
@@ -0,0 +1,152 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
# typed: strict
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# Filter a collection of TextRun objects based on a set of conditions.
|
7
|
+
# It can be used to filter text runs based on their attributes.
|
8
|
+
# The filter can return the text runs that match the conditions (only) or
|
9
|
+
# the text runs that do not match the conditions (exclude).
|
10
|
+
#
|
11
|
+
# You can filter the text runs based on all its attributes with the operators
|
12
|
+
# mentioned in VALID_OPERATORS.
|
13
|
+
# The filter can be nested with 'or' and 'and' conditions.
|
14
|
+
#
|
15
|
+
# Examples:
|
16
|
+
# 1. Single condition
|
17
|
+
# AdvancedTextRunFilter.exclude(text_runs, text: { include: 'sample' })
|
18
|
+
#
|
19
|
+
# 2. Multiple conditions (and)
|
20
|
+
# AdvancedTextRunFilter.exclude(text_runs, {
|
21
|
+
# font_size: { greater_than: 10, less_than: 15 }
|
22
|
+
# })
|
23
|
+
#
|
24
|
+
# 3. Multiple possible values (or)
|
25
|
+
# AdvancedTextRunFilter.exclude(text_runs, {
|
26
|
+
# font_size: { equal: [10, 12] }
|
27
|
+
# })
|
28
|
+
#
|
29
|
+
# 4. Complex AND/OR filter
|
30
|
+
# AdvancedTextRunFilter.exclude(text_runs, {
|
31
|
+
# and: [
|
32
|
+
# { font_size: { greater_than: 10 } },
|
33
|
+
# { or: [
|
34
|
+
# { text: { include: "sample" } },
|
35
|
+
# { width: { greater_than: 100 } }
|
36
|
+
# ]}
|
37
|
+
# ]
|
38
|
+
# })
|
39
|
+
class AdvancedTextRunFilter
  # Operators that may appear as keys inside an attribute's condition hash.
  VALID_OPERATORS = %i[
    equal
    not_equal
    greater_than
    less_than
    greater_than_or_equal
    less_than_or_equal
    include
    exclude
  ] #: Array[Symbol]

  # Return only the text runs that match filter_hash.
  #
  #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
  def self.only(text_runs, filter_hash)
    new(text_runs, filter_hash).only
  end

  # Return the text runs that do NOT match filter_hash.
  #
  #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
  def self.exclude(text_runs, filter_hash)
    new(text_runs, filter_hash).exclude
  end

  #: Array[PDF::Reader::TextRun]
  attr_reader :text_runs

  #: Hash[Symbol, untyped]
  attr_reader :filter_hash

  # text_runs   - the collection to filter
  # filter_hash - either { attribute => { operator => value } }, or a hash with
  #               an :or / :and key holding an Array of such hashes (which may
  #               themselves be nested :or / :and hashes)
  #
  #: (Array[PDF::Reader::TextRun], Hash[Symbol, untyped]) -> void
  def initialize(text_runs, filter_hash)
    @text_runs = text_runs
    @filter_hash = filter_hash
  end

  # Select the runs that satisfy the filter. An empty filter returns the
  # original collection untouched.
  #
  #: () -> Array[PDF::Reader::TextRun]
  def only
    return text_runs if filter_hash.empty?
    text_runs.select { |text_run| evaluate_filter(text_run) }
  end

  # Reject the runs that satisfy the filter. An empty filter returns the
  # original collection untouched.
  #
  #: () -> Array[PDF::Reader::TextRun]
  def exclude
    return text_runs if filter_hash.empty?
    text_runs.reject { |text_run| evaluate_filter(text_run) }
  end

  private

  # Evaluate the top-level filter hash against a single text run.
  #
  #: (PDF::Reader::TextRun) -> bool
  def evaluate_filter(text_run)
    evaluate_condition(text_run, filter_hash)
  end

  # Evaluate one condition hash, at any nesting depth. An :or key takes
  # precedence over an :and key, which takes precedence over plain attribute
  # conditions. Routing nested hashes through here (rather than straight to
  # evaluate_filters) is what allows :or / :and groups to be nested inside
  # each other; otherwise the :or / :and key would be treated as an attribute
  # name and its Array of hashes as operator/value pairs.
  #
  #: (PDF::Reader::TextRun, Hash[Symbol, untyped]) -> bool
  def evaluate_condition(text_run, condition)
    if condition[:or]
      evaluate_or_filters(text_run, condition[:or])
    elsif condition[:and]
      evaluate_and_filters(text_run, condition[:and])
    else
      evaluate_filters(text_run, condition)
    end
  end

  # True when at least one of the conditions holds for the text run.
  #
  #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
  def evaluate_or_filters(text_run, conditions)
    conditions.any? do |condition|
      evaluate_condition(text_run, condition)
    end
  end

  # True when every one of the conditions holds for the text run.
  #
  #: (PDF::Reader::TextRun, Array[Hash[Symbol, untyped]]) -> bool
  def evaluate_and_filters(text_run, conditions)
    conditions.all? do |condition|
      evaluate_condition(text_run, condition)
    end
  end

  # Evaluate a flat { attribute => { operator => value } } hash; every
  # attribute must satisfy all of its operators.
  #
  #: (PDF::Reader::TextRun, Hash[Symbol, untyped]) -> bool
  def evaluate_filters(text_run, filter_hash)
    filter_hash.all? do |attribute, conditions|
      evaluate_attribute_conditions(text_run, attribute, conditions)
    end
  end

  # Check every operator/value pair for one attribute of the text run.
  # Raises ArgumentError when an operator is not listed in VALID_OPERATORS.
  #
  #: (PDF::Reader::TextRun, Symbol, Hash[Symbol, untyped]) -> bool
  def evaluate_attribute_conditions(text_run, attribute, conditions)
    conditions.all? do |operator, value|
      unless VALID_OPERATORS.include?(operator)
        raise ArgumentError, "Invalid operator: #{operator}"
      end

      apply_operator(text_run.send(attribute), operator, value)
    end
  end

  # Compare a single attribute value against the filter value.
  # :equal / :not_equal / :include / :exclude accept a single value or an
  # Array of candidate values; :include and :exclude compare string forms.
  #
  #: (untyped, Symbol, untyped) -> bool
  def apply_operator(attribute_value, operator, filter_value)
    case operator
    when :equal
      Array(filter_value).include?(attribute_value)
    when :not_equal
      !Array(filter_value).include?(attribute_value)
    when :greater_than
      attribute_value > filter_value
    when :less_than
      attribute_value < filter_value
    when :greater_than_or_equal
      attribute_value >= filter_value
    when :less_than_or_equal
      attribute_value <= filter_value
    when :include
      Array(filter_value).any? { |v| attribute_value.to_s.include?(v.to_s) }
    when :exclude
      Array(filter_value).none? { |v| attribute_value.to_s.include?(v.to_s) }
    end
  end
end
|
152
|
+
end
|
@@ -11,6 +11,7 @@ class PDF::Reader
|
|
11
11
|
#
|
12
12
|
class AesV2SecurityHandler
|
13
13
|
|
14
|
+
#: (String) -> void
|
14
15
|
def initialize(key)
|
15
16
|
@encrypt_key = key
|
16
17
|
end
|
@@ -21,10 +22,38 @@ class PDF::Reader
|
|
21
22
|
#
|
22
23
|
# version == 4 and CFM == AESV2
|
23
24
|
#
|
25
|
+
# used to decrypt PDF streams (buf). Input data should be in bytesizes of
|
26
|
+
# a multiple of 16, anything else is an error. The first 16 bytes are the initialization
|
27
|
+
# vector, so any input of exactly 16 bytes decrypts to an empty string
|
28
|
+
#
|
24
29
|
# buf - a string to decrypt
|
25
30
|
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
31
|
#
|
32
|
+
#: (String, PDF::Reader::Reference) -> String
|
27
33
|
def decrypt( buf, ref )
|
34
|
+
if buf.bytesize % 16 > 0
|
35
|
+
raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
|
36
|
+
elsif buf.bytesize == 16
|
37
|
+
return ""
|
38
|
+
else
|
39
|
+
begin
|
40
|
+
internal_decrypt(buf, ref)
|
41
|
+
rescue OpenSSL::Cipher::CipherError
|
42
|
+
# If we failed to decrypt it might be a padding error, so try again
|
43
|
+
# and assume no padding in the ciphertext. This will "succeed" but might
|
44
|
+
# return garbage if the key is incorrect but that's OK - well before this
|
45
|
+
# class is used we have confirmed the user provided key is correct so if
|
46
|
+
# this works without error we can be confident the returned plaintext is
|
47
|
+
# correct
|
48
|
+
internal_decrypt(buf, ref, false)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
#: (String, PDF::Reader::Reference, ?bool) -> String
|
56
|
+
def internal_decrypt(buf, ref, padding = true)
|
28
57
|
objKey = @encrypt_key.dup
|
29
58
|
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
59
|
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
@@ -32,6 +61,7 @@ class PDF::Reader
|
|
32
61
|
length = objKey.length < 16 ? objKey.length : 16
|
33
62
|
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
63
|
cipher.decrypt
|
64
|
+
cipher.padding = 0 unless padding
|
35
65
|
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
66
|
cipher.iv = buf[0..15]
|
37
67
|
cipher.update(buf[16..-1]) + cipher.final
|
@@ -12,27 +12,59 @@ class PDF::Reader
|
|
12
12
|
#
|
13
13
|
class AesV3SecurityHandler
|
14
14
|
|
15
|
+
#: (String) -> void
|
15
16
|
def initialize(key)
|
17
|
+
if key.bytesize != 32
|
18
|
+
raise PDF::Reader::MalformedPDFError.new(
|
19
|
+
"AES-256 key must be exactly 32 bytes, got #{key.bytesize}"
|
20
|
+
)
|
21
|
+
end
|
16
22
|
@encrypt_key = key
|
17
|
-
@cipher = "AES-256-CBC"
|
23
|
+
@cipher = "AES-256-CBC" #: String
|
18
24
|
end
|
19
25
|
|
20
26
|
##7.6.2 General Encryption Algorithm
|
21
27
|
#
|
22
28
|
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
29
|
#
|
24
|
-
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
30
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf). Input data should be in bytesizes of
|
31
|
+
# a multiple of 16, anything else is an error. The first 16 bytes are the initialization
|
32
|
+
# vector, so any input of exactly 16 bytes decrypts to an empty string
|
25
33
|
#
|
26
34
|
# buf - a string to decrypt
|
27
35
|
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
36
|
#
|
37
|
+
#: (String, PDF::Reader::Reference) -> String
|
29
38
|
def decrypt( buf, ref )
|
39
|
+
if buf.bytesize % 16 > 0
|
40
|
+
raise PDF::Reader::MalformedPDFError.new("Ciphertext not a multiple of 16")
|
41
|
+
elsif buf.bytesize == 16
|
42
|
+
return ""
|
43
|
+
else
|
44
|
+
begin
|
45
|
+
internal_decrypt(buf, ref)
|
46
|
+
rescue OpenSSL::Cipher::CipherError
|
47
|
+
# If we failed to decrypt it might be a padding error, so try again
|
48
|
+
# and assume no padding in the ciphertext. This will "succeed" but might
|
49
|
+
# return garbage if the key is incorrect but that's OK - well before this
|
50
|
+
# class is used we have confirmed the user provided key is correct so if
|
51
|
+
# this works without error we can be confident the returned plaintext is
|
52
|
+
# correct
|
53
|
+
internal_decrypt(buf, ref, false)
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
#: (String, PDF::Reader::Reference, ?bool) -> String
|
61
|
+
def internal_decrypt(buf, ref, padding = true)
|
30
62
|
cipher = OpenSSL::Cipher.new(@cipher)
|
31
63
|
cipher.decrypt
|
64
|
+
cipher.padding = 0 unless padding
|
32
65
|
cipher.key = @encrypt_key.dup
|
33
66
|
cipher.iv = buf[0..15]
|
34
67
|
cipher.update(buf[16..-1]) + cipher.final
|
35
68
|
end
|
36
|
-
|
37
69
|
end
|
38
70
|
end
|
@@ -8,6 +8,7 @@ class PDF::Reader
|
|
8
8
|
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
9
|
class BoundingRectangleRunsFilter
|
10
10
|
|
11
|
+
#: (Array[PDF::Reader::TextRun], PDF::Reader::Rectangle) -> Array[PDF::Reader::TextRun]
|
11
12
|
def self.runs_within_rect(runs, rect)
|
12
13
|
runs.select { |run| rect.contains?(run.origin) }
|
13
14
|
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
-
# typed:
|
2
|
+
# typed: strict
|
3
3
|
# frozen_string_literal: true
|
4
4
|
|
5
5
|
################################################################################
|
@@ -38,30 +38,31 @@ class PDF::Reader
|
|
38
38
|
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
|
39
39
|
#
|
40
40
|
class Buffer
|
41
|
-
TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
|
42
|
-
TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
|
41
|
+
TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20] #: Array[Integer]
|
42
|
+
TOKEN_DELIMITER=[0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F] #: Array[Integer]
|
43
43
|
|
44
44
|
# some strings for comparisons. Declaring them here avoids creating new
|
45
45
|
# strings that need GC over and over
|
46
|
-
LEFT_PAREN = "("
|
47
|
-
LESS_THAN = "<"
|
48
|
-
STREAM = "stream"
|
49
|
-
ID = "ID"
|
50
|
-
FWD_SLASH = "/"
|
51
|
-
NULL_BYTE = "\x00"
|
52
|
-
CR = "\r"
|
53
|
-
LF = "\n"
|
54
|
-
CRLF = "\r\n"
|
55
|
-
WHITE_SPACE = [
|
46
|
+
LEFT_PAREN = "(" #: String
|
47
|
+
LESS_THAN = "<" #: String
|
48
|
+
STREAM = "stream" #: String
|
49
|
+
ID = "ID" #: String
|
50
|
+
FWD_SLASH = "/" #: String
|
51
|
+
NULL_BYTE = "\x00" #: String
|
52
|
+
CR = "\r" #: String
|
53
|
+
LF = "\n" #: String
|
54
|
+
CRLF = "\r\n" #: String
|
55
|
+
WHITE_SPACE = ["\n", "\r", ' '] #: Array[String]
|
56
56
|
|
57
57
|
# Quite a few PDFs have trailing junk.
|
58
58
|
# This can be several k of nuls in some cases
|
59
59
|
# Allow for this here
|
60
|
-
TRAILING_BYTECOUNT = 5000
|
60
|
+
TRAILING_BYTECOUNT = 5000 #: Integer
|
61
61
|
|
62
62
|
# must match whole tokens
|
63
|
-
DIGITS_ONLY = %r{\A\d+\z}
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z} #: Regexp
|
64
64
|
|
65
|
+
#: Integer
|
65
66
|
attr_reader :pos
|
66
67
|
|
67
68
|
# Creates a new buffer.
|
@@ -76,17 +77,19 @@ class PDF::Reader
|
|
76
77
|
# :content_stream - set to true if buffer will be tokenising a
|
77
78
|
# content stream. Defaults to false
|
78
79
|
#
|
80
|
+
#: ((StringIO | Tempfile | IO), ?Hash[Symbol, untyped]) -> void
|
79
81
|
def initialize(io, opts = {})
|
80
82
|
@io = io
|
81
|
-
@tokens = []
|
82
|
-
@in_content_stream = opts[:content_stream]
|
83
|
+
@tokens = [] #: Array[String | PDF::Reader::Reference]
|
84
|
+
@in_content_stream = opts[:content_stream] #: bool
|
83
85
|
|
84
86
|
@io.seek(opts[:seek]) if opts[:seek]
|
85
|
-
@pos = @io.pos
|
87
|
+
@pos = @io.pos #: Integer
|
86
88
|
end
|
87
89
|
|
88
90
|
# return true if there are no more tokens left
|
89
91
|
#
|
92
|
+
#: () -> bool
|
90
93
|
def empty?
|
91
94
|
prepare_tokens if @tokens.size < 3
|
92
95
|
|
@@ -105,6 +108,7 @@ class PDF::Reader
|
|
105
108
|
# Skipping a bare CR is not spec-compliant.
|
106
109
|
# This is because the data may start with LF.
|
107
110
|
# However we check for CRLF first, so the ambiguity is avoided.
|
111
|
+
#: (Integer, ?Hash[Symbol, untyped]) -> String?
|
108
112
|
def read(bytes, opts = {})
|
109
113
|
reset_pos
|
110
114
|
|
@@ -130,6 +134,7 @@ class PDF::Reader
|
|
130
134
|
# return the next token from the source. Returns a string if a token
|
131
135
|
# is found, nil if there are no tokens left.
|
132
136
|
#
|
137
|
+
#: () -> (nil | String | PDF::Reader::Reference)
|
133
138
|
def token
|
134
139
|
reset_pos
|
135
140
|
prepare_tokens if @tokens.size < 3
|
@@ -141,6 +146,7 @@ class PDF::Reader
|
|
141
146
|
|
142
147
|
# return the byte offset where the first XRef table in the source can be found.
|
143
148
|
#
|
149
|
+
#: () -> Integer
|
144
150
|
def find_first_xref_offset
|
145
151
|
check_size_is_non_zero
|
146
152
|
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
@@ -164,6 +170,7 @@ class PDF::Reader
|
|
164
170
|
|
165
171
|
private
|
166
172
|
|
173
|
+
#: () -> void
|
167
174
|
def check_size_is_non_zero
|
168
175
|
@io.seek(-1, IO::SEEK_END)
|
169
176
|
@io.seek(0)
|
@@ -173,12 +180,14 @@ class PDF::Reader
|
|
173
180
|
|
174
181
|
# Returns true if this buffer is parsing a content stream
|
175
182
|
#
|
183
|
+
#: () -> bool
|
176
184
|
def in_content_stream?
|
177
185
|
@in_content_stream ? true : false
|
178
186
|
end
|
179
187
|
|
180
188
|
# Some bastard moved our IO stream cursor. Restore it.
|
181
189
|
#
|
190
|
+
#: () -> void
|
182
191
|
def reset_pos
|
183
192
|
@io.seek(@pos) if @io.pos != @pos
|
184
193
|
end
|
@@ -186,12 +195,14 @@ class PDF::Reader
|
|
186
195
|
# save the current position of the source IO stream. If someone else (like another buffer)
|
187
196
|
# moves the cursor, we can then restore it.
|
188
197
|
#
|
198
|
+
#: () -> void
|
189
199
|
def save_pos
|
190
200
|
@pos = @io.pos
|
191
201
|
end
|
192
202
|
|
193
203
|
# attempt to prime the buffer with the next few tokens.
|
194
204
|
#
|
205
|
+
#: () -> void
|
195
206
|
def prepare_tokens
|
196
207
|
10.times do
|
197
208
|
case state
|
@@ -208,6 +219,7 @@ class PDF::Reader
|
|
208
219
|
# tokenising behaves slightly differently based on the current context.
|
209
220
|
# Determine the current context/state by examining the last token we found
|
210
221
|
#
|
222
|
+
#: () -> Symbol
|
211
223
|
def state
|
212
224
|
case @tokens.last
|
213
225
|
when LEFT_PAREN then :literal_string
|
@@ -236,6 +248,7 @@ class PDF::Reader
|
|
236
248
|
# indirect reference, so test for that case first and avoid the relatively
|
237
249
|
# expensive regexp checks if possible.
|
238
250
|
#
|
251
|
+
#: () -> void
|
239
252
|
def merge_indirect_reference
|
240
253
|
return if @tokens.size < 3
|
241
254
|
return if @tokens[2] != "R"
|
@@ -253,6 +266,7 @@ class PDF::Reader
|
|
253
266
|
# If the EI follows white-space the space is dropped from the data
|
254
267
|
# The EI must be followed by white-space or end of buffer
|
255
268
|
# This is to reduce the chance of accidentally matching an embedded EI
|
269
|
+
#: () -> void
|
256
270
|
def prepare_inline_token
|
257
271
|
idstart = @io.pos
|
258
272
|
prevchr = ''
|
@@ -299,14 +313,14 @@ class PDF::Reader
|
|
299
313
|
# if we're currently inside a hex string, read hex nibbles until
|
300
314
|
# we find a closing >
|
301
315
|
#
|
316
|
+
#: () -> void
|
302
317
|
def prepare_hex_token
|
303
|
-
finished = :false
|
304
318
|
str = "".dup
|
305
319
|
|
306
|
-
|
320
|
+
loop do
|
307
321
|
byte = @io.getbyte
|
308
322
|
if byte.nil?
|
309
|
-
|
323
|
+
break
|
310
324
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
311
325
|
str << byte
|
312
326
|
elsif byte <= 32
|
@@ -315,7 +329,7 @@ class PDF::Reader
|
|
315
329
|
@tokens << str if str.size > 0
|
316
330
|
@tokens << ">" if byte != 0x3E # '>'
|
317
331
|
@tokens << byte.chr
|
318
|
-
|
332
|
+
break
|
319
333
|
end
|
320
334
|
end
|
321
335
|
end
|
@@ -329,6 +343,7 @@ class PDF::Reader
|
|
329
343
|
# processing to fix things like escaped new lines, but that's someone else's
|
330
344
|
# problem.
|
331
345
|
#
|
346
|
+
#: () -> void
|
332
347
|
def prepare_literal_token
|
333
348
|
str = "".dup
|
334
349
|
count = 1
|
@@ -359,6 +374,7 @@ class PDF::Reader
|
|
359
374
|
# What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
|
360
375
|
# to read up on it.
|
361
376
|
#
|
377
|
+
#: () -> void
|
362
378
|
def prepare_regular_token
|
363
379
|
tok = "".dup
|
364
380
|
|
@@ -436,6 +452,7 @@ class PDF::Reader
|
|
436
452
|
# peek at the next character in the io stream, leaving the stream position
|
437
453
|
# untouched
|
438
454
|
#
|
455
|
+
#: () -> (Integer | nil)
|
439
456
|
def peek_byte
|
440
457
|
byte = @io.getbyte
|
441
458
|
@io.seek(-1, IO::SEEK_CUR) if byte
|