pdf-reader 2.4.2 → 2.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +44 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +4 -2
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +44 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +73 -11
- data/lib/pdf/reader/page_layout.rb +37 -37
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +68 -6
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +15 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +14 -6
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +29 -6
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1763 -0
- metadata +12 -7
- data/lib/pdf/reader/orientation_detector.rb +0 -34
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6182ffd59631afba6a2c234547a428382b1ec2d7b414d89830b1143f1a0e1704
|
4
|
+
data.tar.gz: 6c0e6a7d32cf24912edc3aa96d72b7f70497d2fdd0e0913b86f871bbf9fa104f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 42dafbe0c36ce838da4c3120bf2187efde647e486971896d9a9c59c37dac3da0f2ccf3ecd98d8dd1d3acc5404bfcf26e64a327d7797648646afd6b40be02fec2
|
7
|
+
data.tar.gz: 40f0b0958024b558d6aca7eb2b3b6f042f034059c8fca52ce97fab7d55a39c313797605341331c65efd1099a1310ccbe386c354024dbd3cbc61c1d96c423842d
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,47 @@
|
|
1
|
+
v2.8.0 (28th December 2021)
|
2
|
+
- Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
|
3
|
+
- Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
|
4
|
+
- including extracting the text for only part of the page
|
5
|
+
- Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
|
6
|
+
- Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
|
7
|
+
- Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
|
8
|
+
|
9
|
+
v2.7.0 (13th December 2021)
|
10
|
+
- Include RBI type files in the gem
|
11
|
+
- Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
|
12
|
+
now be type checked by sorbet
|
13
|
+
- Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
|
14
|
+
- Improved text extraction on some rotated pages, and rotated text on normal pages
|
15
|
+
- Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
|
16
|
+
- Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
|
17
|
+
- Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
|
18
|
+
- Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
|
19
|
+
- Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
|
20
|
+
|
21
|
+
v2.6.0 (12th November 2021)
|
22
|
+
- Text extraction improvements
|
23
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
24
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
25
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
26
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
27
|
+
- Performance improvements
|
28
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
29
|
+
- Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
30
|
+
- Successfully parse more files
|
31
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
32
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
33
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (382)
|
34
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
35
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
36
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
37
|
+
|
38
|
+
v2.5.0 (6th June 2021)
|
39
|
+
- bump minimum ruby version to 2.0
|
40
|
+
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
41
|
+
- Fix some character spacing issues with the TJ operator [#343](https://github.com/yob/pdf-reader/pull/343)
|
42
|
+
- Fix crash with some encrypted PDFs [#348](https://github.com/yob/pdf-reader/pull/348/)
|
43
|
+
- Fix positions of text on some PDFs with pages rotated 90° [#350](https://github.com/yob/pdf-reader/pull/350/)
|
44
|
+
|
1
45
|
v2.4.2 (28th January 2021)
|
2
46
|
- relax ASCII85 dependency to allow 1.x
|
3
47
|
- improved support for decompressing objects with slightly malformed zlib data
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
196
|
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 28
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|
data/examples/rspec.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
|
7
|
+
# Filter out text/characters that are positioned outside a rectangle. Usually the page
|
8
|
+
# MediaBox or CropBox, but could be a user specified rectangle too
|
9
|
+
class BoundingRectangleRunsFilter
|
10
|
+
|
11
|
+
def self.runs_within_rect(runs, rect)
|
12
|
+
runs.select { |run| rect.contains?(run.origin) }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: ASCII-8BIT
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -48,6 +49,15 @@ class PDF::Reader
|
|
48
49
|
ID = "ID"
|
49
50
|
FWD_SLASH = "/"
|
50
51
|
NULL_BYTE = "\x00"
|
52
|
+
CR = "\r"
|
53
|
+
LF = "\n"
|
54
|
+
CRLF = "\r\n"
|
55
|
+
WHITE_SPACE = [LF, CR, ' ']
|
56
|
+
|
57
|
+
# Quite a few PDFs have trailing junk.
|
58
|
+
# This can be several k of nuls in some cases
|
59
|
+
# Allow for this here
|
60
|
+
TRAILING_BYTECOUNT = 5000
|
51
61
|
|
52
62
|
attr_reader :pos
|
53
63
|
|
@@ -86,9 +96,12 @@ class PDF::Reader
|
|
86
96
|
#
|
87
97
|
# options:
|
88
98
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
99
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
100
|
+
# that is sitting under the io cursor.
|
101
|
+
# Note:
|
102
|
+
# Skipping a bare CR is not spec-compliant.
|
103
|
+
# This is because the data may start with LF.
|
104
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
105
|
def read(bytes, opts = {})
|
93
106
|
reset_pos
|
94
107
|
|
@@ -97,9 +110,9 @@ class PDF::Reader
|
|
97
110
|
str = @io.read(2)
|
98
111
|
if str.nil?
|
99
112
|
return nil
|
100
|
-
elsif str ==
|
113
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
114
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
115
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
116
|
@io.seek(-1, IO::SEEK_CUR)
|
104
117
|
else
|
105
118
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +140,8 @@ class PDF::Reader
|
|
127
140
|
#
|
128
141
|
def find_first_xref_offset
|
129
142
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
143
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
144
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
132
145
|
|
133
146
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
147
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -217,7 +230,9 @@ class PDF::Reader
|
|
217
230
|
return if @tokens.size < 3
|
218
231
|
return if @tokens[2] != "R"
|
219
232
|
|
220
|
-
|
233
|
+
# must match whole tokens
|
234
|
+
digits_only = %r{\A\d+\z}
|
235
|
+
if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
|
221
236
|
@tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
|
222
237
|
@tokens[1] = nil
|
223
238
|
@tokens[2] = nil
|
@@ -225,24 +240,51 @@ class PDF::Reader
|
|
225
240
|
end
|
226
241
|
end
|
227
242
|
|
243
|
+
# Extract data between ID and EI
|
244
|
+
# If the EI follows white-space the space is dropped from the data
|
245
|
+
# The EI must be followed by white-space or end of buffer
|
246
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
247
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
248
|
+
idstart = @io.pos
|
249
|
+
chr = prevchr = nil
|
250
|
+
eisize = 0 # how many chars in the end marker
|
251
|
+
seeking = 'E' # what are we looking for now?
|
252
|
+
loop do
|
234
253
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
254
|
+
break if chr.nil?
|
255
|
+
case seeking
|
256
|
+
when 'E'
|
257
|
+
if chr == 'E'
|
258
|
+
seeking = 'I'
|
259
|
+
if WHITE_SPACE.include? prevchr
|
260
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
261
|
+
else # assume the EI immediately follows the data
|
262
|
+
eisize = 2 # leave prevchr in data
|
263
|
+
end
|
264
|
+
end
|
265
|
+
when 'I'
|
266
|
+
if chr == 'I'
|
267
|
+
seeking = :END
|
268
|
+
else
|
269
|
+
seeking = 'E'
|
270
|
+
end
|
271
|
+
when :END
|
272
|
+
if WHITE_SPACE.include? chr
|
273
|
+
eisize += 1 # Drop trailer
|
274
|
+
break
|
275
|
+
else
|
276
|
+
seeking = 'E'
|
277
|
+
end
|
239
278
|
end
|
279
|
+
prevchr = chr
|
240
280
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
281
|
+
unless seeking == :END
|
282
|
+
raise MalformedPDFError, "EI terminator not found"
|
283
|
+
end
|
284
|
+
eiend = @io.pos
|
285
|
+
@io.seek(idstart, IO::SEEK_SET)
|
286
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
244
287
|
@tokens << string_token(str)
|
245
|
-
@io.seek(-3, IO::SEEK_CUR) unless chr.nil?
|
246
288
|
end
|
247
289
|
|
248
290
|
# if we're currently inside a hex string, read hex nibbles until
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -32,6 +33,7 @@ class PDF::Reader
|
|
32
33
|
# extracting various useful information.
|
33
34
|
#
|
34
35
|
class CMap # :nodoc:
|
36
|
+
|
35
37
|
CMAP_KEYWORDS = {
|
36
38
|
"begincodespacerange" => 1,
|
37
39
|
"endcodespacerange" => 1,
|
@@ -53,7 +55,7 @@ class PDF::Reader
|
|
53
55
|
|
54
56
|
def process_data(data)
|
55
57
|
parser = build_parser(data)
|
56
|
-
mode =
|
58
|
+
mode = :none
|
57
59
|
instructions = []
|
58
60
|
|
59
61
|
while token = parser.parse_token(CMAP_KEYWORDS)
|
@@ -62,13 +64,13 @@ class PDF::Reader
|
|
62
64
|
elsif token == "endbfchar"
|
63
65
|
process_bfchar_instructions(instructions)
|
64
66
|
instructions = []
|
65
|
-
mode =
|
67
|
+
mode = :none
|
66
68
|
elsif token == "beginbfrange"
|
67
69
|
mode = :range
|
68
70
|
elsif token == "endbfrange"
|
69
71
|
process_bfrange_instructions(instructions)
|
70
72
|
instructions = []
|
71
|
-
mode =
|
73
|
+
mode = :none
|
72
74
|
elsif mode == :char || mode == :range
|
73
75
|
instructions << token
|
74
76
|
end
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -68,7 +69,7 @@ class PDF::Reader
|
|
68
69
|
#
|
69
70
|
# [25, :A, :B]
|
70
71
|
def differences=(diff)
|
71
|
-
|
72
|
+
PDF::Reader::Error.validate_type(diff, "diff", Array)
|
72
73
|
|
73
74
|
@differences = {}
|
74
75
|
byte = 0
|
@@ -208,7 +209,7 @@ class PDF::Reader
|
|
208
209
|
def load_mapping(file)
|
209
210
|
File.open(file, "r:BINARY") do |f|
|
210
211
|
f.each do |l|
|
211
|
-
_m, single_byte, unicode = *l.match(
|
212
|
+
_m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
|
212
213
|
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
213
214
|
end
|
214
215
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -33,19 +34,26 @@ class PDF::Reader
|
|
33
34
|
def self.str_assert(lvalue, rvalue, chars=nil)
|
34
35
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
35
36
|
lvalue = lvalue[0,chars] if chars
|
36
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
|
37
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
37
38
|
end
|
38
39
|
################################################################################
|
39
40
|
def self.str_assert_not(lvalue, rvalue, chars=nil)
|
40
41
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
41
42
|
lvalue = lvalue[0,chars] if chars
|
42
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
|
43
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
|
43
44
|
end
|
44
45
|
################################################################################
|
45
46
|
def self.assert_equal(lvalue, rvalue)
|
46
|
-
raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
|
47
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
48
|
end
|
48
49
|
################################################################################
|
50
|
+
def self.validate_type(object, name, klass)
|
51
|
+
raise ArgumentError, "#{name} (#{object}) must be a #{klass}" unless object.is_a?(klass)
|
52
|
+
end
|
53
|
+
################################################################################
|
54
|
+
def self.validate_not_nil(object, name)
|
55
|
+
raise ArgumentError, "#{object} must not be nil" if object.nil?
|
56
|
+
end
|
49
57
|
end
|
50
58
|
|
51
59
|
################################################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'ascii85'
|
@@ -7,6 +8,7 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Ascii85 filter
|
9
10
|
class Ascii85
|
11
|
+
|
10
12
|
def initialize(options = {})
|
11
13
|
@options = options
|
12
14
|
end
|
@@ -17,7 +19,11 @@ class PDF::Reader
|
|
17
19
|
#
|
18
20
|
def filter(data)
|
19
21
|
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
20
|
-
::
|
22
|
+
if defined?(::Ascii85Native)
|
23
|
+
::Ascii85Native::decode(data)
|
24
|
+
else
|
25
|
+
::Ascii85::decode(data)
|
26
|
+
end
|
21
27
|
rescue Exception => e
|
22
28
|
# Oops, there was a problem decoding the stream
|
23
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the AsciiHex stream filter
|
8
9
|
class AsciiHex
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -16,9 +18,12 @@ class PDF::Reader
|
|
16
18
|
def filter(data)
|
17
19
|
data.chop! if data[-1,1] == ">"
|
18
20
|
data = data[1,data.size] if data[0,1] == "<"
|
21
|
+
|
22
|
+
return "" if data.nil?
|
23
|
+
|
19
24
|
data.gsub!(/[^A-Fa-f0-9]/,"")
|
20
25
|
data << "0" if data.size % 2 == 1
|
21
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
26
|
+
data.scan(/.{2}/).flatten.map { |s| s.hex.chr }.join("")
|
22
27
|
rescue Exception => e
|
23
28
|
# Oops, there was a problem decoding the stream
|
24
29
|
raise MalformedPDFError,
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -6,6 +7,7 @@ class PDF::Reader
|
|
6
7
|
# some filter implementations support preprocessing of the data to
|
7
8
|
# improve compression
|
8
9
|
class Depredict
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options || {}
|
11
13
|
end
|
@@ -34,7 +36,7 @@ class PDF::Reader
|
|
34
36
|
################################################################################
|
35
37
|
def tiff_depredict(data)
|
36
38
|
data = data.unpack("C*")
|
37
|
-
unfiltered =
|
39
|
+
unfiltered = ''
|
38
40
|
bpc = @options[:BitsPerComponent] || 8
|
39
41
|
pixel_bits = bpc * @options[:Colors]
|
40
42
|
pixel_bytes = pixel_bits / 8
|
@@ -51,11 +53,11 @@ class PDF::Reader
|
|
51
53
|
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
52
54
|
row_data[index] = (byte + left) % 256
|
53
55
|
end
|
54
|
-
unfiltered += row_data
|
56
|
+
unfiltered += row_data.pack("C*")
|
55
57
|
pos += line_len
|
56
58
|
end
|
57
59
|
|
58
|
-
unfiltered
|
60
|
+
unfiltered
|
59
61
|
end
|
60
62
|
################################################################################
|
61
63
|
def png_depredict(data)
|
@@ -67,7 +69,7 @@ class PDF::Reader
|
|
67
69
|
scanline_length = (pixel_bytes * @options[:Columns]) + 1
|
68
70
|
row = 0
|
69
71
|
pixels = []
|
70
|
-
paeth, pa, pb, pc =
|
72
|
+
paeth, pa, pb, pc = 0, 0, 0, 0
|
71
73
|
until data.empty? do
|
72
74
|
row_data = data.slice! 0, scanline_length
|
73
75
|
filter = row_data.shift
|
@@ -94,17 +96,17 @@ class PDF::Reader
|
|
94
96
|
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
95
97
|
end
|
96
98
|
when 4 # Paeth
|
97
|
-
left = upper = upper_left =
|
99
|
+
left = upper = upper_left = 0
|
98
100
|
row_data.each_with_index do |byte, index|
|
99
101
|
col = index / pixel_bytes
|
100
102
|
|
101
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
103
|
+
left = index < pixel_bytes ? 0 : Integer(row_data[index - pixel_bytes])
|
102
104
|
if row.zero?
|
103
105
|
upper = upper_left = 0
|
104
106
|
else
|
105
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
107
|
+
upper = Integer(pixels[row-1][col][index % pixel_bytes])
|
106
108
|
upper_left = col.zero? ? 0 :
|
107
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
109
|
+
Integer(pixels[row-1][col-1][index % pixel_bytes])
|
108
110
|
end
|
109
111
|
|
110
112
|
p = left + upper - upper_left
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
|
@@ -8,6 +9,7 @@ class PDF::Reader
|
|
8
9
|
module Filter # :nodoc:
|
9
10
|
# implementation of the Flate (zlib) stream filter
|
10
11
|
class Flate
|
12
|
+
|
11
13
|
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
12
14
|
ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
|
13
15
|
|
@@ -32,7 +34,7 @@ class PDF::Reader
|
|
32
34
|
def zlib_inflate(data)
|
33
35
|
begin
|
34
36
|
return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
35
|
-
rescue Zlib::DataError
|
37
|
+
rescue Zlib::DataError
|
36
38
|
# by default, Ruby's Zlib assumes the data it's inflating
|
37
39
|
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
38
40
|
# fails, swallow the exception and attempt to inflate the data as a raw
|
@@ -41,7 +43,7 @@ class PDF::Reader
|
|
41
43
|
|
42
44
|
begin
|
43
45
|
return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
|
44
|
-
rescue StandardError
|
46
|
+
rescue StandardError
|
45
47
|
# swallow this one too, so we can try some other fallback options
|
46
48
|
end
|
47
49
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
#
|
@@ -6,6 +7,7 @@ class PDF::Reader # :nodoc:
|
|
6
7
|
module Filter # :nodoc:
|
7
8
|
# implementation of the run length stream filter
|
8
9
|
class RunLength
|
10
|
+
|
9
11
|
def initialize(options = {})
|
10
12
|
@options = options
|
11
13
|
end
|
@@ -20,19 +22,23 @@ class PDF::Reader # :nodoc:
|
|
20
22
|
length = data.getbyte(pos)
|
21
23
|
pos += 1
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
25
|
+
unless length.nil?
|
26
|
+
case
|
27
|
+
# nothing
|
28
|
+
when length == 128
|
29
|
+
break
|
30
|
+
when length < 128
|
31
|
+
# When the length is < 128, we copy the following length+1 bytes
|
32
|
+
# literally.
|
33
|
+
out << data[pos, length + 1]
|
34
|
+
pos += length
|
35
|
+
else
|
36
|
+
# When the length is > 128, we copy the next byte (257 - length)
|
37
|
+
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
38
|
+
# "\x00\x00\x00\x00\x00\x00\x00".
|
39
|
+
previous_byte = data[pos, 1] || ""
|
40
|
+
out << previous_byte * (257 - length)
|
41
|
+
end
|
36
42
|
end
|
37
43
|
|
38
44
|
pos += 1
|