pdf-reader 2.4.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +40 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +27 -14
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +37 -23
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +28 -5
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +12 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +17 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
|
4
|
+
data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
|
7
|
+
data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,43 @@
|
|
1
|
+
v2.7.0 (13th December 2021)
|
2
|
+
- Include RBI type files in the gem
|
3
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
4
|
+
now be type checked by sorbet
|
5
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
6
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
7
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
8
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
9
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
10
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
11
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
12
|
+
|
13
|
+
v2.6.0 (12th November 2021)
|
14
|
+
- Text extraction improvements
|
15
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
16
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
17
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
18
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
19
|
+
- Performance improvements
|
20
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
21
|
+
- Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
22
|
+
- Successfully parse more files
|
23
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
24
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
25
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
|
26
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
27
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
28
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
29
|
+
|
30
|
+
v2.5.0 (6th June 2021)
|
31
|
+
- bump minimum ruby version to 2.0
|
32
|
+
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
33
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
34
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
35
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
36
|
+
|
37
|
+
v2.4.2 (28th January 2021)
|
38
|
+
- relax ASCII85 dependency to allow 1.x
|
39
|
+
- improved support for decompressing objects with slightly malformed zlib data
|
40
|
+
|
1
41
|
v.2.4.1 (24th September 2020)
|
2
42
|
- Re-vendor font metrics from Adobe to clarify their license
|
3
43
|
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
196
|
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 32
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|
data/examples/rspec.rb
CHANGED
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -48,6 +49,15 @@ class PDF::Reader
|
|
48
49
|
ID = "ID"
|
49
50
|
FWD_SLASH = "/"
|
50
51
|
NULL_BYTE = "\x00"
|
52
|
+
CR = "\r"
|
53
|
+
LF = "\n"
|
54
|
+
CRLF = "\r\n"
|
55
|
+
WHITE_SPACE = [LF, CR, ' ']
|
56
|
+
|
57
|
+
# Quite a few PDFs have trailing junk.
|
58
|
+
# This can be several k of nuls in some cases
|
59
|
+
# Allow for this here
|
60
|
+
TRAILING_BYTECOUNT = 5000
|
51
61
|
|
52
62
|
attr_reader :pos
|
53
63
|
|
@@ -86,9 +96,12 @@ class PDF::Reader
|
|
86
96
|
#
|
87
97
|
# options:
|
88
98
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
99
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
100
|
+
# that is sitting under the io cursor.
|
101
|
+
# Note:
|
102
|
+
# Skipping a bare CR is not spec-compliant.
|
103
|
+
# This is because the data may start with LF.
|
104
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
105
|
def read(bytes, opts = {})
|
93
106
|
reset_pos
|
94
107
|
|
@@ -97,9 +110,9 @@ class PDF::Reader
|
|
97
110
|
str = @io.read(2)
|
98
111
|
if str.nil?
|
99
112
|
return nil
|
100
|
-
elsif str ==
|
113
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
114
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
115
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
116
|
@io.seek(-1, IO::SEEK_CUR)
|
104
117
|
else
|
105
118
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +140,8 @@ class PDF::Reader
|
|
127
140
|
#
|
128
141
|
def find_first_xref_offset
|
129
142
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
143
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
144
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
132
145
|
|
133
146
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
147
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -217,7 +230,9 @@ class PDF::Reader
|
|
217
230
|
return if @tokens.size < 3
|
218
231
|
return if @tokens[2] != "R"
|
219
232
|
|
220
|
-
|
233
|
+
# must match whole tokens
|
234
|
+
digits_only = %r{\A\d+\z}
|
235
|
+
if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
|
221
236
|
@tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
|
222
237
|
@tokens[1] = nil
|
223
238
|
@tokens[2] = nil
|
@@ -225,24 +240,51 @@ class PDF::Reader
|
|
225
240
|
end
|
226
241
|
end
|
227
242
|
|
243
|
+
# Extract data between ID and EI
|
244
|
+
# If the EI follows white-space the space is dropped from the data
|
245
|
+
# The EI must be followed by white-space or end of buffer
|
246
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
247
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
248
|
+
idstart = @io.pos
|
249
|
+
chr = prevchr = nil
|
250
|
+
eisize = 0 # how many chars in the end marker
|
251
|
+
seeking = 'E' # what are we looking for now?
|
252
|
+
loop do
|
234
253
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
254
|
+
break if chr.nil?
|
255
|
+
case seeking
|
256
|
+
when 'E'
|
257
|
+
if chr == 'E'
|
258
|
+
seeking = 'I'
|
259
|
+
if WHITE_SPACE.include? prevchr
|
260
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
261
|
+
else # assume the EI immediately follows the data
|
262
|
+
eisize = 2 # leave prevchr in data
|
263
|
+
end
|
264
|
+
end
|
265
|
+
when 'I'
|
266
|
+
if chr == 'I'
|
267
|
+
seeking = :END
|
268
|
+
else
|
269
|
+
seeking = 'E'
|
270
|
+
end
|
271
|
+
when :END
|
272
|
+
if WHITE_SPACE.include? chr
|
273
|
+
eisize += 1 # Drop trailer
|
274
|
+
break
|
275
|
+
else
|
276
|
+
seeking = 'E'
|
277
|
+
end
|
239
278
|
end
|
279
|
+
prevchr = chr
|
240
280
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
281
|
+
unless seeking == :END
|
282
|
+
raise MalformedPDFError, "EI terminator not found"
|
283
|
+
end
|
284
|
+
eiend = @io.pos
|
285
|
+
@io.seek(idstart, IO::SEEK_SET)
|
286
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
244
287
|
@tokens << string_token(str)
|
245
|
-
@io.seek(-3, IO::SEEK_CUR) unless chr.nil?
|
246
288
|
end
|
247
289
|
|
248
290
|
# if we're currently inside a hex string, read hex nibbles until
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,6 +33,7 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
38
|
"begincodespacerange" => 1,
|
37
39
|
"endcodespacerange" => 1,
|
@@ -53,7 +55,7 @@ class PDF::Reader
|
|
53
55
|
|
54
56
|
def process_data(data)
|
55
57
|
parser = build_parser(data)
|
56
|
-
mode =
|
58
|
+
mode = :none
|
57
59
|
instructions = []
|
58
60
|
|
59
61
|
while token = parser.parse_token(CMAP_KEYWORDS)
|
@@ -62,13 +64,13 @@ class PDF::Reader
|
|
62
64
|
elsif token == "endbfchar"
|
63
65
|
process_bfchar_instructions(instructions)
|
64
66
|
instructions = []
|
65
|
-
mode =
|
67
|
+
mode = :none
|
66
68
|
elsif token == "beginbfrange"
|
67
69
|
mode = :range
|
68
70
|
elsif token == "endbfrange"
|
69
71
|
process_bfrange_instructions(instructions)
|
70
72
|
instructions = []
|
71
|
-
mode =
|
73
|
+
mode = :none
|
72
74
|
elsif mode == :char || mode == :range
|
73
75
|
instructions << token
|
74
76
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -68,7 +69,7 @@ class PDF::Reader
|
|
68
69
|
#
|
69
70
|
# [25, :A, :B]
|
70
71
|
def differences=(diff)
|
71
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
72
73
|
|
73
74
|
@differences = {}
|
74
75
|
byte = 0
|
@@ -208,7 +209,7 @@ class PDF::Reader
|
|
208
209
|
def load_mapping(file)
|
209
210
|
File.open(file, "r:BINARY") do |f|
|
210
211
|
f.each do |l|
|
211
|
-
_m, single_byte, unicode = *l.match(
|
212
|
+
_m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
|
212
213
|
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
213
214
|
end
|
214
215
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -33,19 +34,26 @@ class PDF::Reader
|
|
33
34
|
def self.str_assert(lvalue, rvalue, chars=nil)
|
34
35
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
35
36
|
lvalue = lvalue[0,chars] if chars
|
36
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
|
37
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
37
38
|
end
|
38
39
|
################################################################################
|
39
40
|
def self.str_assert_not(lvalue, rvalue, chars=nil)
|
40
41
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
41
42
|
lvalue = lvalue[0,chars] if chars
|
42
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
|
43
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
|
43
44
|
end
|
44
45
|
################################################################################
|
45
46
|
def self.assert_equal(lvalue, rvalue)
|
46
|
-
raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
|
47
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_not_nil(object, name)
|
55
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
|
+
end
|
49
57
|
end
|
50
58
|
|
51
59
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -17,7 +19,11 @@ class PDF::Reader
|
|
17
19
|
#
|
18
20
|
def filter(data)
|
19
21
|
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
20
|
-
::
|
22
|
+
if defined?(::Ascii85Native)
|
23
|
+
::Ascii85Native::decode(data)
|
24
|
+
else
|
25
|
+
::Ascii85::decode(data)
|
26
|
+
end
|
21
27
|
rescue Exception => e
|
22
28
|
# Oops, there was a problem decoding the stream
|
23
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options || {}
|
11
13
|
end
|
@@ -34,7 +36,7 @@ class PDF::Reader
|
|
34
36
|
################################################################################
|
35
37
|
def tiff_depredict(data)
|
36
38
|
data = data.unpack("C*")
|
37
|
-
unfiltered =
|
39
|
+
unfiltered = ''
|
38
40
|
bpc = @options[:BitsPerComponent] || 8
|
39
41
|
pixel_bits = bpc * @options[:Colors]
|
40
42
|
pixel_bytes = pixel_bits / 8
|
@@ -51,11 +53,11 @@ class PDF::Reader
|
|
51
53
|
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
52
54
|
row_data[index] = (byte + left) % 256
|
53
55
|
end
|
54
|
-
unfiltered += row_data
|
56
|
+
unfiltered += row_data.pack("C*")
|
55
57
|
pos += line_len
|
56
58
|
end
|
57
59
|
|
58
|
-
unfiltered
|
60
|
+
unfiltered
|
59
61
|
end
|
60
62
|
################################################################################
|
61
63
|
def png_depredict(data)
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
|
@@ -8,7 +9,9 @@ class PDF::Reader
|
|
8
9
|
module Filter # :nodoc:
|
9
10
|
# implementation of the Flate (zlib) stream filter
|
10
11
|
class Flate
|
12
|
+
|
11
13
|
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
14
|
+
ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
|
12
15
|
|
13
16
|
def initialize(options = {})
|
14
17
|
@options = options
|
@@ -17,24 +20,34 @@ class PDF::Reader
|
|
17
20
|
################################################################################
|
18
21
|
# Decode the specified data with the Zlib compression algorithm
|
19
22
|
def filter(data)
|
20
|
-
deflated =
|
23
|
+
deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
|
24
|
+
|
25
|
+
if deflated.nil?
|
26
|
+
raise MalformedPDFError,
|
27
|
+
"Error while inflating a compressed stream (no suitable inflation algorithm found)"
|
28
|
+
end
|
29
|
+
Depredict.new(@options).filter(deflated)
|
30
|
+
end
|
31
|
+
|
32
|
+
private
|
33
|
+
|
34
|
+
def zlib_inflate(data)
|
21
35
|
begin
|
22
|
-
|
23
|
-
rescue Zlib::DataError
|
36
|
+
return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
37
|
+
rescue Zlib::DataError
|
24
38
|
# by default, Ruby's Zlib assumes the data it's inflating
|
25
39
|
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
26
|
-
# fails,
|
27
|
-
#
|
28
|
-
#
|
29
|
-
# See
|
30
|
-
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
31
|
-
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
40
|
+
# fails, swallow the exception and attempt to inflate the data as a raw
|
41
|
+
# RFC1951 stream.
|
32
42
|
end
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
43
|
+
|
44
|
+
begin
|
45
|
+
return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
|
46
|
+
rescue StandardError
|
47
|
+
# swallow this one too, so we can try some other fallback options
|
48
|
+
end
|
49
|
+
|
50
|
+
nil
|
38
51
|
end
|
39
52
|
end
|
40
53
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the run length stream filter
|
8
9
|
class RunLength
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
|
|
20
22
|
length = data.getbyte(pos)
|
21
23
|
pos += 1
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
unless length.nil?
|
26
|
+
case
|
27
|
+
# nothing
|
28
|
+
when length == 128
|
29
|
+
break
|
30
|
+
when length < 128
|
31
|
+
# When the length is < 128, we copy the following length+1 bytes
|
32
|
+
# literally.
|
33
|
+
out << data[pos, length + 1]
|
34
|
+
pos += length
|
35
|
+
else
|
36
|
+
# When the length is > 128, we copy the next byte (257 - length)
|
37
|
+
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
38
|
+
# "\x00\x00\x00\x00\x00\x00\x00".
|
39
|
+
previous_byte = data[pos, 1] || ""
|
40
|
+
out << previous_byte * (257 - length)
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
pos += 1
|
data/lib/pdf/reader/filter.rb
CHANGED