pdf-reader 2.5.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +42 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +90 -46
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +65 -50
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +19 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +11 -9
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +2 -1
- data/lib/pdf/reader/font.rb +72 -16
- data/lib/pdf/reader/font_descriptor.rb +19 -17
- data/lib/pdf/reader/form_xobject.rb +15 -5
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +252 -44
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +99 -19
- data/lib/pdf/reader/page_layout.rb +36 -37
- data/lib/pdf/reader/page_state.rb +12 -11
- data/lib/pdf/reader/page_text_receiver.rb +57 -10
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +23 -12
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +2 -1
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/type_check.rb +52 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +27 -4
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +46 -15
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1978 -0
- metadata +21 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2b4616131d0ad73c4ef2c4992ae79d4fde420d6857aba60e8dfac9b088a0b915
|
4
|
+
data.tar.gz: f93f481d7f76af426420dbf507a88e8ecead8ec84690781f42de3b7b5ffbd1bd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86dbe3450a11e0deb3f5db98625375b252cc25f289d76c98b5de48342d1b4957de81c1c2b6cce53d7d09738e9576bd48213c92166d48911c1f45ad6a77f195a5
|
7
|
+
data.tar.gz: ee852ff644a095bae93eb7cc30c6d070c8c6adda4f9bfadecf938bf3ba2723fed08c75a3bf15ba30fcf8fded7ad6a5b74dad8a3f512823798686350f24b912eb
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,45 @@
|
|
1
|
+
v2.9.0 (24th January 2022)
|
2
|
+
- Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
|
3
|
+
- Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
|
4
|
+
- For sorbet users, additional type annotations are included in the gem
|
5
|
+
|
6
|
+
v2.8.0 (28th December 2021)
|
7
|
+
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
8
|
+
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
9
|
+
- including extracting the text for only part of the page
|
10
|
+
- Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
|
11
|
+
- Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
|
12
|
+
- Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
|
13
|
+
|
14
|
+
v2.7.0 (13th December 2021)
|
15
|
+
- Include RBI type files in the gem
|
16
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
17
|
+
now be type checked by sorbet
|
18
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
19
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
20
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
21
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
22
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
23
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
24
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
25
|
+
|
26
|
+
v2.6.0 (12th November 2021)
|
27
|
+
- Text extraction improvements
|
28
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
29
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
30
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
31
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
32
|
+
- Performance improvements
|
33
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
34
|
+
- Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
35
|
+
- Successfully parse more files
|
36
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
37
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
38
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
|
39
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
40
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
41
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
42
|
+
|
1
43
|
v2.5.0 (6th June 2021)
|
2
44
|
- bump minimum ruby version to 2.0
|
3
45
|
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
196
|
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 28
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|
data/examples/rspec.rb
CHANGED
@@ -0,0 +1,41 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest/md5'
|
6
|
+
|
7
|
+
class PDF::Reader
|
8
|
+
|
9
|
+
# Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
|
10
|
+
# a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
|
11
|
+
#
|
12
|
+
class AesV2SecurityHandler
|
13
|
+
|
14
|
+
def initialize(key)
|
15
|
+
@encrypt_key = key
|
16
|
+
end
|
17
|
+
|
18
|
+
##7.6.2 General Encryption Algorithm
|
19
|
+
#
|
20
|
+
# Algorithm 1: Encryption of data using the AES-128-CBC algorithm
|
21
|
+
#
|
22
|
+
# version == 4 and CFM == AESV2
|
23
|
+
#
|
24
|
+
# buf - a string to decrypt
|
25
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
26
|
+
#
|
27
|
+
def decrypt( buf, ref )
|
28
|
+
objKey = @encrypt_key.dup
|
29
|
+
(0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
|
30
|
+
(0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
|
31
|
+
objKey << 'sAlT' # Algorithm 1, b)
|
32
|
+
length = objKey.length < 16 ? objKey.length : 16
|
33
|
+
cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
|
34
|
+
cipher.decrypt
|
35
|
+
cipher.key = Digest::MD5.digest(objKey)[0,length]
|
36
|
+
cipher.iv = buf[0..15]
|
37
|
+
cipher.update(buf[16..-1]) + cipher.final
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
require 'digest'
|
6
|
+
require 'openssl'
|
7
|
+
|
8
|
+
class PDF::Reader
|
9
|
+
|
10
|
+
# Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
|
11
|
+
# Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
|
12
|
+
#
|
13
|
+
class AesV3SecurityHandler
|
14
|
+
|
15
|
+
def initialize(key)
|
16
|
+
@encrypt_key = key
|
17
|
+
@cipher = "AES-256-CBC"
|
18
|
+
end
|
19
|
+
|
20
|
+
##7.6.2 General Encryption Algorithm
|
21
|
+
#
|
22
|
+
# Algorithm 1: Encryption of data using the RC4 or AES algorithms
|
23
|
+
#
|
24
|
+
# used to decrypt RC4/AES encrypted PDF streams (buf)
|
25
|
+
#
|
26
|
+
# buf - a string to decrypt
|
27
|
+
# ref - a PDF::Reader::Reference for the object to decrypt
|
28
|
+
#
|
29
|
+
def decrypt( buf, ref )
|
30
|
+
cipher = OpenSSL::Cipher.new(@cipher)
|
31
|
+
cipher.decrypt
|
32
|
+
cipher.key = @encrypt_key.dup
|
33
|
+
cipher.iv = buf[0..15]
|
34
|
+
cipher.update(buf[16..-1]) + cipher.final
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Filter out text/characters that are positioned outside a rectangle. Usually the page
|
8
|
+
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
|
+
class BoundingRectangleRunsFilter
|
10
|
+
|
11
|
+
def self.runs_within_rect(runs, rect)
|
12
|
+
runs.select { |run| rect.contains?(run.origin) }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -48,6 +49,18 @@ class PDF::Reader
|
|
48
49
|
ID = "ID"
|
49
50
|
FWD_SLASH = "/"
|
50
51
|
NULL_BYTE = "\x00"
|
52
|
+
CR = "\r"
|
53
|
+
LF = "\n"
|
54
|
+
CRLF = "\r\n"
|
55
|
+
WHITE_SPACE = [LF, CR, ' ']
|
56
|
+
|
57
|
+
# Quite a few PDFs have trailing junk.
|
58
|
+
# This can be several k of nuls in some cases
|
59
|
+
# Allow for this here
|
60
|
+
TRAILING_BYTECOUNT = 5000
|
61
|
+
|
62
|
+
# must match whole tokens
|
63
|
+
DIGITS_ONLY = %r{\A\d+\z}
|
51
64
|
|
52
65
|
attr_reader :pos
|
53
66
|
|
@@ -86,9 +99,12 @@ class PDF::Reader
|
|
86
99
|
#
|
87
100
|
# options:
|
88
101
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
102
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
103
|
+
# that is sitting under the io cursor.
|
104
|
+
# Note:
|
105
|
+
# Skipping a bare CR is not spec-compliant.
|
106
|
+
# This is because the data may start with LF.
|
107
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
108
|
def read(bytes, opts = {})
|
93
109
|
reset_pos
|
94
110
|
|
@@ -97,9 +113,9 @@ class PDF::Reader
|
|
97
113
|
str = @io.read(2)
|
98
114
|
if str.nil?
|
99
115
|
return nil
|
100
|
-
elsif str ==
|
116
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
117
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
118
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
119
|
@io.seek(-1, IO::SEEK_CUR)
|
104
120
|
else
|
105
121
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +143,10 @@ class PDF::Reader
|
|
127
143
|
#
|
128
144
|
def find_first_xref_offset
|
129
145
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
146
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
147
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
148
|
+
|
149
|
+
raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
|
132
150
|
|
133
151
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
152
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -136,7 +154,12 @@ class PDF::Reader
|
|
136
154
|
|
137
155
|
raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
|
138
156
|
raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
|
139
|
-
lines[eof_index+1].to_i
|
157
|
+
offset = lines[eof_index+1].to_i
|
158
|
+
|
159
|
+
# a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
|
160
|
+
# corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
|
161
|
+
raise MalformedPDFError, "invalid xref offset" if offset < 0
|
162
|
+
offset
|
140
163
|
end
|
141
164
|
|
142
165
|
private
|
@@ -217,45 +240,73 @@ class PDF::Reader
|
|
217
240
|
return if @tokens.size < 3
|
218
241
|
return if @tokens[2] != "R"
|
219
242
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
@tokens[
|
224
|
-
@tokens.
|
243
|
+
token_one = @tokens[0]
|
244
|
+
token_two = @tokens[1]
|
245
|
+
if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
|
246
|
+
@tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
|
247
|
+
@tokens.delete_at(2)
|
248
|
+
@tokens.delete_at(1)
|
225
249
|
end
|
226
250
|
end
|
227
251
|
|
252
|
+
# Extract data between ID and EI
|
253
|
+
# If the EI follows white-space the space is dropped from the data
|
254
|
+
# The EI must be followed by white-space or end of buffer
|
255
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
256
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
257
|
+
idstart = @io.pos
|
258
|
+
prevchr = ''
|
259
|
+
eisize = 0 # how many chars in the end marker
|
260
|
+
seeking = 'E' # what are we looking for now?
|
261
|
+
loop do
|
234
262
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
263
|
+
break if chr.nil?
|
264
|
+
case seeking
|
265
|
+
when 'E'
|
266
|
+
if chr == 'E'
|
267
|
+
seeking = 'I'
|
268
|
+
if WHITE_SPACE.include? prevchr
|
269
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
270
|
+
else # assume the EI immediately follows the data
|
271
|
+
eisize = 2 # leave prevchr in data
|
272
|
+
end
|
273
|
+
end
|
274
|
+
when 'I'
|
275
|
+
if chr == 'I'
|
276
|
+
seeking = ''
|
277
|
+
else
|
278
|
+
seeking = 'E'
|
279
|
+
end
|
280
|
+
when ''
|
281
|
+
if WHITE_SPACE.include? chr
|
282
|
+
eisize += 1 # Drop trailer
|
283
|
+
break
|
284
|
+
else
|
285
|
+
seeking = 'E'
|
286
|
+
end
|
239
287
|
end
|
288
|
+
prevchr = chr.is_a?(String) ? chr : ''
|
240
289
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
@io.seek(
|
290
|
+
unless seeking == ''
|
291
|
+
raise MalformedPDFError, "EI terminator not found"
|
292
|
+
end
|
293
|
+
eiend = @io.pos
|
294
|
+
@io.seek(idstart, IO::SEEK_SET)
|
295
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
296
|
+
@tokens << str.freeze if str
|
246
297
|
end
|
247
298
|
|
248
299
|
# if we're currently inside a hex string, read hex nibbles until
|
249
300
|
# we find a closing >
|
250
301
|
#
|
251
302
|
def prepare_hex_token
|
303
|
+
finished = :false
|
252
304
|
str = "".dup
|
253
|
-
finished = false
|
254
305
|
|
255
|
-
|
306
|
+
until finished == :true
|
256
307
|
byte = @io.getbyte
|
257
308
|
if byte.nil?
|
258
|
-
finished = true # unbalanced params
|
309
|
+
finished = :true # unbalanced params
|
259
310
|
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
260
311
|
str << byte
|
261
312
|
elsif byte <= 32
|
@@ -264,7 +315,7 @@ class PDF::Reader
|
|
264
315
|
@tokens << str if str.size > 0
|
265
316
|
@tokens << ">" if byte != 0x3E # '>'
|
266
317
|
@tokens << byte.chr
|
267
|
-
finished = true
|
318
|
+
finished = :true
|
268
319
|
end
|
269
320
|
end
|
270
321
|
end
|
@@ -311,14 +362,17 @@ class PDF::Reader
|
|
311
362
|
def prepare_regular_token
|
312
363
|
tok = "".dup
|
313
364
|
|
314
|
-
|
365
|
+
loop do
|
366
|
+
byte = @io.getbyte
|
367
|
+
|
315
368
|
case byte
|
369
|
+
when nil
|
370
|
+
break
|
316
371
|
when 0x25
|
317
372
|
# comment, ignore everything until the next EOL char
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
373
|
+
loop do
|
374
|
+
commentbyte = @io.getbyte
|
375
|
+
break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
|
322
376
|
end
|
323
377
|
when *TOKEN_WHITESPACE
|
324
378
|
# white space, token finished
|
@@ -388,15 +442,5 @@ class PDF::Reader
|
|
388
442
|
byte
|
389
443
|
end
|
390
444
|
|
391
|
-
# for a handful of tokens we want to tell the parser how to convert them
|
392
|
-
# into higher level tokens. This methods adds a to_token() method
|
393
|
-
# to tokens that should remain as strings.
|
394
|
-
#
|
395
|
-
def string_token(token)
|
396
|
-
def token.to_token
|
397
|
-
to_s
|
398
|
-
end
|
399
|
-
token
|
400
|
-
end
|
401
445
|
end
|
402
446
|
end
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,16 +33,17 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
|
-
"begincodespacerange" =>
|
37
|
-
"endcodespacerange" =>
|
38
|
-
"beginbfchar" =>
|
39
|
-
"endbfchar" =>
|
40
|
-
"beginbfrange" =>
|
41
|
-
"endbfrange" =>
|
42
|
-
"begin" =>
|
43
|
-
"begincmap" =>
|
44
|
-
"def" =>
|
38
|
+
"begincodespacerange" => :noop,
|
39
|
+
"endcodespacerange" => :noop,
|
40
|
+
"beginbfchar" => :noop,
|
41
|
+
"endbfchar" => :noop,
|
42
|
+
"beginbfrange" => :noop,
|
43
|
+
"endbfrange" => :noop,
|
44
|
+
"begin" => :noop,
|
45
|
+
"begincmap" => :noop,
|
46
|
+
"def" => :noop
|
45
47
|
}
|
46
48
|
|
47
49
|
attr_reader :map
|
@@ -51,30 +53,6 @@ class PDF::Reader
|
|
51
53
|
process_data(data)
|
52
54
|
end
|
53
55
|
|
54
|
-
def process_data(data)
|
55
|
-
parser = build_parser(data)
|
56
|
-
mode = nil
|
57
|
-
instructions = []
|
58
|
-
|
59
|
-
while token = parser.parse_token(CMAP_KEYWORDS)
|
60
|
-
if token == "beginbfchar"
|
61
|
-
mode = :char
|
62
|
-
elsif token == "endbfchar"
|
63
|
-
process_bfchar_instructions(instructions)
|
64
|
-
instructions = []
|
65
|
-
mode = nil
|
66
|
-
elsif token == "beginbfrange"
|
67
|
-
mode = :range
|
68
|
-
elsif token == "endbfrange"
|
69
|
-
process_bfrange_instructions(instructions)
|
70
|
-
instructions = []
|
71
|
-
mode = nil
|
72
|
-
elsif mode == :char || mode == :range
|
73
|
-
instructions << token
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
|
78
56
|
def size
|
79
57
|
@map.size
|
80
58
|
end
|
@@ -84,13 +62,40 @@ class PDF::Reader
|
|
84
62
|
# Returns an array of Integers.
|
85
63
|
#
|
86
64
|
def decode(c)
|
87
|
-
|
88
|
-
return c unless Integer === c
|
89
|
-
@map[c]
|
65
|
+
@map.fetch(c, [])
|
90
66
|
end
|
91
67
|
|
92
68
|
private
|
93
69
|
|
70
|
+
def process_data(data, initial_mode = :none)
|
71
|
+
parser = build_parser(data)
|
72
|
+
mode = initial_mode
|
73
|
+
instructions = []
|
74
|
+
|
75
|
+
while token = parser.parse_token(CMAP_KEYWORDS)
|
76
|
+
if token.is_a?(String) || token.is_a?(Array)
|
77
|
+
if token == "beginbfchar"
|
78
|
+
mode = :char
|
79
|
+
elsif token == "endbfchar"
|
80
|
+
process_bfchar_instructions(instructions)
|
81
|
+
instructions = []
|
82
|
+
mode = :none
|
83
|
+
elsif token == "beginbfrange"
|
84
|
+
mode = :range
|
85
|
+
elsif token == "endbfrange"
|
86
|
+
process_bfrange_instructions(instructions)
|
87
|
+
instructions = []
|
88
|
+
mode = :none
|
89
|
+
elsif mode == :char
|
90
|
+
instructions << token.to_s
|
91
|
+
elsif mode == :range
|
92
|
+
instructions << token
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
|
94
99
|
def build_parser(instructions)
|
95
100
|
buffer = Buffer.new(StringIO.new(instructions))
|
96
101
|
Parser.new(buffer)
|
@@ -105,7 +110,6 @@ class PDF::Reader
|
|
105
110
|
# exception when we try converting broken UTF-16 to UTF-8
|
106
111
|
#
|
107
112
|
def str_to_int(str)
|
108
|
-
return nil if str.nil? || str.size == 0
|
109
113
|
unpacked_string = if str.bytesize == 1 # UTF-8
|
110
114
|
str.unpack("C*")
|
111
115
|
else # UTF-16
|
@@ -113,12 +117,15 @@ class PDF::Reader
|
|
113
117
|
end
|
114
118
|
result = []
|
115
119
|
while unpacked_string.any? do
|
116
|
-
if unpacked_string.size >= 2 &&
|
120
|
+
if unpacked_string.size >= 2 &&
|
121
|
+
unpacked_string.first.to_i > 0xD800 &&
|
122
|
+
unpacked_string.first.to_i < 0xDBFF
|
117
123
|
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
118
124
|
# lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
|
119
125
|
# low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
120
|
-
|
121
|
-
|
126
|
+
point_one = unpacked_string.shift.to_i
|
127
|
+
point_two = unpacked_string.shift.to_i
|
128
|
+
result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
|
122
129
|
else
|
123
130
|
result << unpacked_string.shift
|
124
131
|
end
|
@@ -128,9 +135,11 @@ class PDF::Reader
|
|
128
135
|
|
129
136
|
def process_bfchar_instructions(instructions)
|
130
137
|
instructions.each_slice(2) do |one, two|
|
131
|
-
find = str_to_int(one)
|
132
|
-
replace = str_to_int(two)
|
133
|
-
|
138
|
+
find = str_to_int(one.to_s)
|
139
|
+
replace = str_to_int(two.to_s)
|
140
|
+
if find.any? && replace.any?
|
141
|
+
@map[find.first.to_i] = replace
|
142
|
+
end
|
134
143
|
end
|
135
144
|
end
|
136
145
|
|
@@ -141,30 +150,36 @@ class PDF::Reader
|
|
141
150
|
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
142
151
|
bfrange_type_two(start, finish, to)
|
143
152
|
else
|
144
|
-
raise "invalid bfrange section"
|
153
|
+
raise MalformedPDFError, "invalid bfrange section"
|
145
154
|
end
|
146
155
|
end
|
147
156
|
end
|
148
157
|
|
149
158
|
def bfrange_type_one(start_code, end_code, dst)
|
150
|
-
start_code = str_to_int(start_code)
|
151
|
-
end_code = str_to_int(end_code)
|
159
|
+
start_code = str_to_int(start_code).first
|
160
|
+
end_code = str_to_int(end_code).first
|
152
161
|
dst = str_to_int(dst)
|
153
162
|
|
163
|
+
return if start_code.nil? || end_code.nil?
|
164
|
+
|
154
165
|
# add all values in the range to our mapping
|
155
166
|
(start_code..end_code).each_with_index do |val, idx|
|
156
|
-
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
167
|
+
@map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
|
157
168
|
end
|
158
169
|
end
|
159
170
|
|
160
171
|
def bfrange_type_two(start_code, end_code, dst)
|
161
|
-
start_code = str_to_int(start_code)
|
162
|
-
end_code = str_to_int(end_code)
|
172
|
+
start_code = str_to_int(start_code).first
|
173
|
+
end_code = str_to_int(end_code).first
|
174
|
+
|
175
|
+
return if start_code.nil? || end_code.nil?
|
176
|
+
|
163
177
|
from_range = (start_code..end_code)
|
164
178
|
|
165
179
|
# add all values in the range to our mapping
|
166
180
|
from_range.each_with_index do |val, idx|
|
167
|
-
|
181
|
+
dst_char = dst[idx]
|
182
|
+
@map[val.to_i] = str_to_int(dst_char) if dst_char
|
168
183
|
end
|
169
184
|
end
|
170
185
|
end
|