pdf-reader 2.5.0 → 2.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +42 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +90 -46
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +65 -50
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +19 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +11 -9
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +2 -1
- data/lib/pdf/reader/font.rb +72 -16
- data/lib/pdf/reader/font_descriptor.rb +19 -17
- data/lib/pdf/reader/form_xobject.rb +15 -5
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +252 -44
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +99 -19
- data/lib/pdf/reader/page_layout.rb +36 -37
- data/lib/pdf/reader/page_state.rb +12 -11
- data/lib/pdf/reader/page_text_receiver.rb +57 -10
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +23 -12
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +2 -1
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +27 -4
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +46 -15
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1978 -0
- metadata +21 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2b4616131d0ad73c4ef2c4992ae79d4fde420d6857aba60e8dfac9b088a0b915
|
4
|
+
data.tar.gz: f93f481d7f76af426420dbf507a88e8ecead8ec84690781f42de3b7b5ffbd1bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86dbe3450a11e0deb3f5db98625375b252cc25f289d76c98b5de48342d1b4957de81c1c2b6cce53d7d09738e9576bd48213c92166d48911c1f45ad6a77f195a5
|
7
|
+
data.tar.gz: ee852ff644a095bae93eb7cc30c6d070c8c6adda4f9bfadecf938bf3ba2723fed08c75a3bf15ba30fcf8fded7ad6a5b74dad8a3f512823798686350f24b912eb
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,45 @@
|
|
1
|
+
v2.9.0 (24th January 2022)
|
2
|
+
- Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
|
3
|
+
- Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
|
4
|
+
- For sorbet users, additional type annotations are included in the gem
|
5
|
+
|
6
|
+
v2.8.0 (28th December 2021)
|
7
|
+
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
8
|
+
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
9
|
+
- including extracting the text for only part of the page
|
10
|
+
- Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
|
11
|
+
- Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
|
12
|
+
- Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
|
13
|
+
|
14
|
+
v2.7.0 (13th December 2021)
|
15
|
+
- Include RBI type files in the gem
|
16
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
17
|
+
now be type checked by sorbet
|
18
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
19
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
20
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
21
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
22
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
23
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
24
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
25
|
+
|
26
|
+
v2.6.0 (12th November 2021)
|
27
|
+
- Text extraction improvements
|
28
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
29
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
30
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
31
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
32
|
+
- Performance improvements
|
33
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
34
|
+
- Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
35
|
+
- Successfully parse more files
|
36
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
37
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
38
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
|
39
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
40
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
41
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
42
|
+
|
1
43
|
v2.5.0 (6th June 2021)
|
2
44
|
- bump minimum ruby version to 2.0
|
3
45
|
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
196
|
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 28
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|
data/examples/rspec.rb
CHANGED
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest/md5'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
|
9
|
+
# Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
|
10
|
+
# a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
|
11
|
+
#
|
12
|
+
class AesV2SecurityHandler
|
13
|
+
|
14
|
+
def initialize(key)
|
15
|
+
@encrypt_key = key
|
16
|
+
end
|
17
|
+
|
18
|
+
##7.6.2 General Encryption Algorithm
|
19
|
+
#
|
20
|
+
# Algorithm 1: Encryption of data using the AES-128-CBC algorithm
|
21
|
+
#
|
22
|
+
# version == 4 and CFM == AESV2
|
23
|
+
#
|
24
|
+
# buf - a string to decrypt
|
25
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
|
+
#
|
27
|
+
def decrypt( buf, ref )
|
28
|
+
objKey = @encrypt_key.dup
|
29
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
31
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
32
|
+
length = objKey.length < 16 ? objKey.length : 16
|
33
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
|
+
cipher.decrypt
|
35
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
|
+
cipher.iv = buf[0..15]
|
37
|
+
cipher.update(buf[16..-1]) + cipher.final
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest'
|
6
|
+
require 'openssl'
|
7
|
+
|
8
|
+
class PDF::Reader
|
9
|
+
|
10
|
+
# Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
|
11
|
+
# Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
|
12
|
+
#
|
13
|
+
class AesV3SecurityHandler
|
14
|
+
|
15
|
+
def initialize(key)
|
16
|
+
@encrypt_key = key
|
17
|
+
@cipher = "AES-256-CBC"
|
18
|
+
end
|
19
|
+
|
20
|
+
##7.6.2 General Encryption Algorithm
|
21
|
+
#
|
22
|
+
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
|
+
#
|
24
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
25
|
+
#
|
26
|
+
# buf - a string to decrypt
|
27
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
|
+
#
|
29
|
+
def decrypt( buf, ref )
|
30
|
+
cipher = OpenSSL::Cipher.new(@cipher)
|
31
|
+
cipher.decrypt
|
32
|
+
cipher.key = @encrypt_key.dup
|
33
|
+
cipher.iv = buf[0..15]
|
34
|
+
cipher.update(buf[16..-1]) + cipher.final
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Filter out text/characters that are positioned outside a rectangle. Usually the page
|
8
|
+
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
|
+
class BoundingRectangleRunsFilter
|
10
|
+
|
11
|
+
def self.runs_within_rect(runs, rect)
|
12
|
+
runs.select { |run| rect.contains?(run.origin) }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -48,6 +49,18 @@ class PDF::Reader
|
|
48
49
|
ID = "ID"
|
49
50
|
FWD_SLASH = "/"
|
50
51
|
NULL_BYTE = "\x00"
|
52
|
+
CR = "\r"
|
53
|
+
LF = "\n"
|
54
|
+
CRLF = "\r\n"
|
55
|
+
WHITE_SPACE = [LF, CR, ' ']
|
56
|
+
|
57
|
+
# Quite a few PDFs have trailing junk.
|
58
|
+
# This can be several k of nuls in some cases
|
59
|
+
# Allow for this here
|
60
|
+
TRAILING_BYTECOUNT = 5000
|
61
|
+
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
51
64
|
|
52
65
|
attr_reader :pos
|
53
66
|
|
@@ -86,9 +99,12 @@ class PDF::Reader
|
|
86
99
|
#
|
87
100
|
# options:
|
88
101
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
102
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
103
|
+
# that is sitting under the io cursor.
|
104
|
+
# Note:
|
105
|
+
# Skipping a bare CR is not spec-compliant.
|
106
|
+
# This is because the data may start with LF.
|
107
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
108
|
def read(bytes, opts = {})
|
93
109
|
reset_pos
|
94
110
|
|
@@ -97,9 +113,9 @@ class PDF::Reader
|
|
97
113
|
str = @io.read(2)
|
98
114
|
if str.nil?
|
99
115
|
return nil
|
100
|
-
elsif str ==
|
116
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
117
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
118
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
119
|
@io.seek(-1, IO::SEEK_CUR)
|
104
120
|
else
|
105
121
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +143,10 @@ class PDF::Reader
|
|
127
143
|
#
|
128
144
|
def find_first_xref_offset
|
129
145
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
146
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
147
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
148
|
+
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
132
150
|
|
133
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
152
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -136,7 +154,12 @@ class PDF::Reader
|
|
136
154
|
|
137
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
138
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
139
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
140
163
|
end
|
141
164
|
|
142
165
|
private
|
@@ -217,45 +240,73 @@ class PDF::Reader
|
|
217
240
|
return if @tokens.size < 3
|
218
241
|
return if @tokens[2] != "R"
|
219
242
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
@tokens[
|
224
|
-
@tokens.
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
225
249
|
end
|
226
250
|
end
|
227
251
|
|
252
|
+
# Extract data between ID and EI
|
253
|
+
# If the EI follows white-space the space is dropped from the data
|
254
|
+
# The EI must be followed by white-space or end of buffer
|
255
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
256
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
257
|
+
idstart = @io.pos
|
258
|
+
prevchr = ''
|
259
|
+
eisize = 0 # how many chars in the end marker
|
260
|
+
seeking = 'E' # what are we looking for now?
|
261
|
+
loop do
|
234
262
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
263
|
+
break if chr.nil?
|
264
|
+
case seeking
|
265
|
+
when 'E'
|
266
|
+
if chr == 'E'
|
267
|
+
seeking = 'I'
|
268
|
+
if WHITE_SPACE.include? prevchr
|
269
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
270
|
+
else # assume the EI immediately follows the data
|
271
|
+
eisize = 2 # leave prevchr in data
|
272
|
+
end
|
273
|
+
end
|
274
|
+
when 'I'
|
275
|
+
if chr == 'I'
|
276
|
+
seeking = ''
|
277
|
+
else
|
278
|
+
seeking = 'E'
|
279
|
+
end
|
280
|
+
when ''
|
281
|
+
if WHITE_SPACE.include? chr
|
282
|
+
eisize += 1 # Drop trailer
|
283
|
+
break
|
284
|
+
else
|
285
|
+
seeking = 'E'
|
286
|
+
end
|
239
287
|
end
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
240
289
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
@io.seek(
|
290
|
+
unless seeking == ''
|
291
|
+
raise MalformedPDFError, "EI terminator not found"
|
292
|
+
end
|
293
|
+
eiend = @io.pos
|
294
|
+
@io.seek(idstart, IO::SEEK_SET)
|
295
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
296
|
+
@tokens << str.freeze if str
|
246
297
|
end
|
247
298
|
|
248
299
|
# if we're currently inside a hex string, read hex nibbles until
|
249
300
|
# we find a closing >
|
250
301
|
#
|
251
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
252
304
|
str = "".dup
|
253
|
-
finished = false
|
254
305
|
|
255
|
-
|
306
|
+
until finished == :true
|
256
307
|
byte = @io.getbyte
|
257
308
|
if byte.nil?
|
258
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
259
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
260
311
|
str << byte
|
261
312
|
elsif byte <= 32
|
@@ -264,7 +315,7 @@ class PDF::Reader
|
|
264
315
|
@tokens << str if str.size > 0
|
265
316
|
@tokens << ">" if byte != 0x3E # '>'
|
266
317
|
@tokens << byte.chr
|
267
|
-
finished = true
|
318
|
+
finished = :true
|
268
319
|
end
|
269
320
|
end
|
270
321
|
end
|
@@ -311,14 +362,17 @@ class PDF::Reader
|
|
311
362
|
def prepare_regular_token
|
312
363
|
tok = "".dup
|
313
364
|
|
314
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
315
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
316
371
|
when 0x25
|
317
372
|
# comment, ignore everything until the next EOL char
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
322
376
|
end
|
323
377
|
when *TOKEN_WHITESPACE
|
324
378
|
# white space, token finished
|
@@ -388,15 +442,5 @@ class PDF::Reader
|
|
388
442
|
byte
|
389
443
|
end
|
390
444
|
|
391
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
392
|
-
# into higher level tokens. This methods adds a to_token() method
|
393
|
-
# to tokens that should remain as strings.
|
394
|
-
#
|
395
|
-
def string_token(token)
|
396
|
-
def token.to_token
|
397
|
-
to_s
|
398
|
-
end
|
399
|
-
token
|
400
|
-
end
|
401
445
|
end
|
402
446
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,16 +33,17 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
|
-
"begincodespacerange" =>
|
37
|
-
"endcodespacerange" =>
|
38
|
-
"beginbfchar" =>
|
39
|
-
"endbfchar" =>
|
40
|
-
"beginbfrange" =>
|
41
|
-
"endbfrange" =>
|
42
|
-
"begin" =>
|
43
|
-
"begincmap" =>
|
44
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
45
47
|
}
|
46
48
|
|
47
49
|
attr_reader :map
|
@@ -51,30 +53,6 @@ class PDF::Reader
|
|
51
53
|
process_data(data)
|
52
54
|
end
|
53
55
|
|
54
|
-
def process_data(data)
|
55
|
-
parser = build_parser(data)
|
56
|
-
mode = nil
|
57
|
-
instructions = []
|
58
|
-
|
59
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
60
|
-
if token == "beginbfchar"
|
61
|
-
mode = :char
|
62
|
-
elsif token == "endbfchar"
|
63
|
-
process_bfchar_instructions(instructions)
|
64
|
-
instructions = []
|
65
|
-
mode = nil
|
66
|
-
elsif token == "beginbfrange"
|
67
|
-
mode = :range
|
68
|
-
elsif token == "endbfrange"
|
69
|
-
process_bfrange_instructions(instructions)
|
70
|
-
instructions = []
|
71
|
-
mode = nil
|
72
|
-
elsif mode == :char || mode == :range
|
73
|
-
instructions << token
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
56
|
def size
|
79
57
|
@map.size
|
80
58
|
end
|
@@ -84,13 +62,40 @@ class PDF::Reader
|
|
84
62
|
# Returns an array of Integers.
|
85
63
|
#
|
86
64
|
def decode(c)
|
87
|
-
|
88
|
-
return c unless Integer === c
|
89
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
90
66
|
end
|
91
67
|
|
92
68
|
private
|
93
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
94
99
|
def build_parser(instructions)
|
95
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
96
101
|
Parser.new(buffer)
|
@@ -105,7 +110,6 @@ class PDF::Reader
|
|
105
110
|
# exception when we try converting broken UTF-16 to UTF-8
|
106
111
|
#
|
107
112
|
def str_to_int(str)
|
108
|
-
return nil if str.nil? || str.size == 0
|
109
113
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
110
114
|
str.unpack("C*")
|
111
115
|
else # UTF-16
|
@@ -113,12 +117,15 @@ class PDF::Reader
|
|
113
117
|
end
|
114
118
|
result = []
|
115
119
|
while unpacked_string.any? do
|
116
|
-
if unpacked_string.size >= 2 &&
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
117
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
118
124
|
# lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
|
119
125
|
# low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
120
|
-
|
121
|
-
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
122
129
|
else
|
123
130
|
result << unpacked_string.shift
|
124
131
|
end
|
@@ -128,9 +135,11 @@ class PDF::Reader
|
|
128
135
|
|
129
136
|
def process_bfchar_instructions(instructions)
|
130
137
|
instructions.each_slice(2) do |one, two|
|
131
|
-
find = str_to_int(one)
|
132
|
-
replace = str_to_int(two)
|
133
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
134
143
|
end
|
135
144
|
end
|
136
145
|
|
@@ -141,30 +150,36 @@ class PDF::Reader
|
|
141
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
142
151
|
bfrange_type_two(start, finish, to)
|
143
152
|
else
|
144
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
145
154
|
end
|
146
155
|
end
|
147
156
|
end
|
148
157
|
|
149
158
|
def bfrange_type_one(start_code, end_code, dst)
|
150
|
-
start_code = str_to_int(start_code)
|
151
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
152
161
|
dst = str_to_int(dst)
|
153
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
154
165
|
# add all values in the range to our mapping
|
155
166
|
(start_code..end_code).each_with_index do |val, idx|
|
156
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
157
168
|
end
|
158
169
|
end
|
159
170
|
|
160
171
|
def bfrange_type_two(start_code, end_code, dst)
|
161
|
-
start_code = str_to_int(start_code)
|
162
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
163
177
|
from_range = (start_code..end_code)
|
164
178
|
|
165
179
|
# add all values in the range to our mapping
|
166
180
|
from_range.each_with_index do |val, idx|
|
167
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
168
183
|
end
|
169
184
|
end
|
170
185
|
end
|