pdf-reader 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -0
- data/bin/pdf_text +0 -2
- data/examples/extract_images.rb +11 -6
- data/lib/pdf/reader.rb +11 -5
- data/lib/pdf/reader/buffer.rb +48 -42
- data/lib/pdf/reader/cmap.rb +26 -11
- data/lib/pdf/reader/filter.rb +11 -234
- data/lib/pdf/reader/filter/ascii85.rb +25 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +26 -0
- data/lib/pdf/reader/filter/depredict.rb +138 -0
- data/lib/pdf/reader/filter/flate.rb +38 -0
- data/lib/pdf/reader/filter/lzw.rb +18 -0
- data/lib/pdf/reader/filter/null.rb +15 -0
- data/lib/pdf/reader/filter/run_length.rb +46 -0
- data/lib/pdf/reader/font.rb +1 -1
- data/lib/pdf/reader/form_xobject.rb +25 -4
- data/lib/pdf/reader/glyph_hash.rb +3 -2
- data/lib/pdf/reader/object_cache.rb +39 -16
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page.rb +7 -1
- data/lib/pdf/reader/page_state.rb +2 -1
- data/lib/pdf/reader/stream.rb +1 -1
- data/lib/pdf/reader/xref.rb +23 -4
- metadata +99 -46
data/CHANGELOG
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
v1.2.0 (28th August 2012)
|
2
|
+
- Feature: correctly extract text using surrogate pairs and ligatures
|
3
|
+
(thanks Nathaniel Madura)
|
4
|
+
- Speed optimisation: cache tokenised Form XObjects to avoid re-parsing them
|
5
|
+
- Feature: support opening documents with some junk bytes prepended to file
|
6
|
+
(thanks Paul Gallagher)
|
7
|
+
- Acrobat does this, so it seemed reasonable to add support
|
8
|
+
|
1
9
|
v1.1.1 (9th May 2012)
|
2
10
|
- bugfix release to improve parsing of some PDFs
|
3
11
|
|
data/bin/pdf_text
CHANGED
data/examples/extract_images.rb
CHANGED
@@ -86,14 +86,15 @@ module ExtractImages
|
|
86
86
|
tiff = header.dup
|
87
87
|
tiff << short_tag.call( 256, 1, w ) # image width
|
88
88
|
tiff << short_tag.call( 257, 1, h ) # image height
|
89
|
-
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel
|
89
|
+
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12) + 4)) # bits per pixel
|
90
90
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
91
91
|
tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
|
92
|
-
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) +
|
92
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 20) ) # data offset
|
93
93
|
tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
|
94
94
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
95
95
|
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
96
96
|
tiff << long_tag.call( 332, 1, 1) # inkset - CMYK
|
97
|
+
tiff << [0].pack("I") # next IFD pointer
|
97
98
|
tiff << [bpc, bpc, bpc, bpc].pack("IIII")
|
98
99
|
tiff << stream.unfiltered_data
|
99
100
|
File.open(filename, "wb") { |file| file.write tiff }
|
@@ -119,10 +120,12 @@ module ExtractImages
|
|
119
120
|
tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
|
120
121
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
121
122
|
tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
|
122
|
-
tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
|
123
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 4) ) # data offset
|
123
124
|
tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
|
124
125
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
125
126
|
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
127
|
+
tiff << [0].pack("I") # next IFD pointer
|
128
|
+
p stream.unfiltered_data.size
|
126
129
|
tiff << stream.unfiltered_data
|
127
130
|
File.open(filename, "wb") { |file| file.write tiff }
|
128
131
|
end
|
@@ -144,12 +147,13 @@ module ExtractImages
|
|
144
147
|
tiff = header.dup
|
145
148
|
tiff << short_tag.call( 256, 1, w ) # image width
|
146
149
|
tiff << short_tag.call( 257, 1, h ) # image height
|
147
|
-
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
|
150
|
+
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12) + 4)) # bits per pixel
|
148
151
|
tiff << short_tag.call( 259, 1, 1 ) # compression
|
149
152
|
tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
|
150
|
-
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) +
|
153
|
+
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 16) ) # data offset
|
151
154
|
tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
|
152
155
|
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
156
|
+
tiff << [0].pack("I") # next IFD pointer
|
153
157
|
tiff << [bpc, bpc, bpc].pack("III")
|
154
158
|
tiff << stream.unfiltered_data
|
155
159
|
File.open(filename, "wb") { |file| file.write tiff }
|
@@ -209,8 +213,9 @@ module ExtractImages
|
|
209
213
|
+ short_tag.call( 256, cols ) \
|
210
214
|
+ short_tag.call( 257, h ) \
|
211
215
|
+ short_tag.call( 259, 4 ) \
|
212
|
-
+ long_tag.call( 273, (10 + (5*12)) ) \
|
216
|
+
+ long_tag.call( 273, (10 + (5*12) + 4) ) \
|
213
217
|
+ long_tag.call( 279, len) \
|
218
|
+
+ [0].pack("I") \
|
214
219
|
+ stream.data
|
215
220
|
File.open(filename, "wb") { |file| file.write tiff }
|
216
221
|
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -25,9 +25,6 @@
|
|
25
25
|
################################################################################
|
26
26
|
|
27
27
|
require 'stringio'
|
28
|
-
require 'zlib'
|
29
|
-
|
30
|
-
require 'ascii85'
|
31
28
|
|
32
29
|
module PDF
|
33
30
|
################################################################################
|
@@ -113,6 +110,8 @@ module PDF
|
|
113
110
|
#
|
114
111
|
def initialize(input = nil, opts = {})
|
115
112
|
if input # support the deprecated Reader API
|
113
|
+
@cache = PDF::Reader::ObjectCache.new
|
114
|
+
opts.merge!(:cache => @cache)
|
116
115
|
@objects = PDF::Reader::ObjectHash.new(input, opts)
|
117
116
|
end
|
118
117
|
end
|
@@ -222,7 +221,7 @@ module PDF
|
|
222
221
|
#
|
223
222
|
def pages
|
224
223
|
(1..self.page_count).map { |num|
|
225
|
-
PDF::Reader::Page.new(@objects, num)
|
224
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
226
225
|
}
|
227
226
|
end
|
228
227
|
|
@@ -241,7 +240,7 @@ module PDF
|
|
241
240
|
def page(num)
|
242
241
|
num = num.to_i
|
243
242
|
raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if num < 1 || num > self.page_count
|
244
|
-
PDF::Reader::Page.new(@objects, num)
|
243
|
+
PDF::Reader::Page.new(@objects, num, :cache => @cache)
|
245
244
|
end
|
246
245
|
|
247
246
|
|
@@ -338,6 +337,13 @@ require 'pdf/reader/cmap'
|
|
338
337
|
require 'pdf/reader/encoding'
|
339
338
|
require 'pdf/reader/error'
|
340
339
|
require 'pdf/reader/filter'
|
340
|
+
require 'pdf/reader/filter/ascii85'
|
341
|
+
require 'pdf/reader/filter/ascii_hex'
|
342
|
+
require 'pdf/reader/filter/depredict'
|
343
|
+
require 'pdf/reader/filter/flate'
|
344
|
+
require 'pdf/reader/filter/lzw'
|
345
|
+
require 'pdf/reader/filter/null'
|
346
|
+
require 'pdf/reader/filter/run_length'
|
341
347
|
require 'pdf/reader/font'
|
342
348
|
require 'pdf/reader/form_xobject'
|
343
349
|
require 'pdf/reader/glyph_hash'
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -36,7 +36,7 @@ class PDF::Reader
|
|
36
36
|
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
|
37
37
|
#
|
38
38
|
class Buffer
|
39
|
-
TOKEN_WHITESPACE=[
|
39
|
+
TOKEN_WHITESPACE=[0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
|
40
40
|
|
41
41
|
attr_reader :pos
|
42
42
|
|
@@ -232,18 +232,17 @@ class PDF::Reader
|
|
232
232
|
finished = false
|
233
233
|
|
234
234
|
while !finished
|
235
|
-
|
236
|
-
|
237
|
-
if chr.nil?
|
235
|
+
byte = @io.getbyte
|
236
|
+
if byte.nil?
|
238
237
|
finished = true # unbalanced params
|
239
|
-
elsif (48..57).include?(
|
240
|
-
str << chr
|
241
|
-
elsif
|
238
|
+
elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
|
239
|
+
str << byte.chr
|
240
|
+
elsif byte <= 32
|
242
241
|
# ignore it
|
243
242
|
else
|
244
243
|
@tokens << str if str.size > 0
|
245
|
-
@tokens << ">" if
|
246
|
-
@tokens << chr
|
244
|
+
@tokens << ">" if byte != 0x3E # '>'
|
245
|
+
@tokens << byte.chr
|
247
246
|
finished = true
|
248
247
|
end
|
249
248
|
end
|
@@ -263,19 +262,19 @@ class PDF::Reader
|
|
263
262
|
count = 1
|
264
263
|
|
265
264
|
while count > 0
|
266
|
-
|
267
|
-
if
|
265
|
+
byte = @io.getbyte
|
266
|
+
if byte.nil?
|
268
267
|
count = 0 # unbalanced params
|
269
|
-
elsif
|
270
|
-
str << chr << @io.
|
271
|
-
elsif
|
268
|
+
elsif byte == 0x5C
|
269
|
+
str << byte.chr << @io.getbyte.chr
|
270
|
+
elsif byte == 0x28 # "("
|
272
271
|
str << "("
|
273
272
|
count += 1
|
274
|
-
elsif
|
273
|
+
elsif byte == 0x29 # ")"
|
275
274
|
count -= 1
|
276
275
|
str << ")" unless count == 0
|
277
276
|
else
|
278
|
-
str << chr unless count == 0
|
277
|
+
str << byte.chr unless count == 0
|
279
278
|
end
|
280
279
|
end
|
281
280
|
|
@@ -291,61 +290,68 @@ class PDF::Reader
|
|
291
290
|
def prepare_regular_token
|
292
291
|
tok = ""
|
293
292
|
|
294
|
-
while
|
295
|
-
case
|
296
|
-
when
|
293
|
+
while byte = @io.getbyte
|
294
|
+
case byte
|
295
|
+
when 0x25
|
297
296
|
# comment, ignore everything until the next EOL char
|
298
297
|
done = false
|
299
298
|
while !done
|
300
|
-
|
301
|
-
done = true if
|
299
|
+
byte = @io.getbyte
|
300
|
+
done = true if byte.nil? || byte == 0x0A || byte == 0x0D
|
302
301
|
end
|
303
302
|
when *TOKEN_WHITESPACE
|
304
303
|
# white space, token finished
|
305
304
|
@tokens << tok if tok.size > 0
|
306
305
|
|
307
306
|
#If the token was empty, chomp the rest of the whitespace too
|
308
|
-
while TOKEN_WHITESPACE.include?(
|
309
|
-
@io.
|
307
|
+
while TOKEN_WHITESPACE.include?(peek_byte) && tok.size == 0
|
308
|
+
@io.getbyte
|
310
309
|
end
|
311
310
|
tok = ""
|
312
311
|
break
|
313
|
-
when
|
312
|
+
when 0x3C
|
314
313
|
# opening delimiter '<', start of new token
|
315
314
|
@tokens << tok if tok.size > 0
|
316
|
-
|
317
|
-
|
315
|
+
if peek_byte == 0x3C # check if token is actually '<<'
|
316
|
+
@io.getbyte
|
317
|
+
@tokens << "<<"
|
318
|
+
else
|
319
|
+
@tokens << "<"
|
320
|
+
end
|
318
321
|
tok = ""
|
319
322
|
break
|
320
|
-
when
|
323
|
+
when 0x3E
|
321
324
|
# closing delimiter '>', start of new token
|
322
325
|
@tokens << tok if tok.size > 0
|
323
|
-
|
324
|
-
|
326
|
+
if peek_byte == 0x3E # check if token is actually '>>'
|
327
|
+
@io.getbyte
|
328
|
+
@tokens << ">>"
|
329
|
+
else
|
330
|
+
@tokens << byte.chr
|
331
|
+
end
|
325
332
|
tok = ""
|
326
333
|
break
|
327
|
-
when
|
334
|
+
when 0x28, 0x5B, 0x7B
|
328
335
|
# opening delimiter, start of new token
|
329
336
|
@tokens << tok if tok.size > 0
|
330
|
-
@tokens << chr
|
337
|
+
@tokens << byte.chr
|
331
338
|
tok = ""
|
332
339
|
break
|
333
|
-
when
|
340
|
+
when 0x29, 0x5D, 0x7D
|
334
341
|
# closing delimiter
|
335
342
|
@tokens << tok if tok.size > 0
|
336
|
-
@tokens << chr
|
343
|
+
@tokens << byte.chr
|
337
344
|
tok = ""
|
338
345
|
break
|
339
|
-
when
|
346
|
+
when 0x2F
|
340
347
|
# PDF name, start of new token
|
341
348
|
@tokens << tok if tok.size > 0
|
342
|
-
@tokens << chr
|
343
|
-
|
344
|
-
@tokens << "" if chr == "/" && [nil, " ", "\n"].include?(next_char)
|
349
|
+
@tokens << byte.chr
|
350
|
+
@tokens << "" if byte == 0x2F && [nil, 0x20, 0x0A].include?(peek_byte)
|
345
351
|
tok = ""
|
346
352
|
break
|
347
353
|
else
|
348
|
-
tok << chr
|
354
|
+
tok << byte.chr
|
349
355
|
end
|
350
356
|
end
|
351
357
|
|
@@ -355,10 +361,10 @@ class PDF::Reader
|
|
355
361
|
# peek at the next character in the io stream, leaving the stream position
|
356
362
|
# untouched
|
357
363
|
#
|
358
|
-
def
|
359
|
-
|
360
|
-
@io.seek(-1, IO::SEEK_CUR)
|
361
|
-
|
364
|
+
def peek_byte
|
365
|
+
byte = @io.getbyte
|
366
|
+
@io.seek(-1, IO::SEEK_CUR) if byte
|
367
|
+
byte
|
362
368
|
end
|
363
369
|
|
364
370
|
# for a handful of tokens we want to tell the parser how to convert them
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -60,6 +60,10 @@ class PDF::Reader
|
|
60
60
|
@map.size
|
61
61
|
end
|
62
62
|
|
63
|
+
# Convert a glyph code into one or more Codepoints.
|
64
|
+
#
|
65
|
+
# Returns an array of Fixnums.
|
66
|
+
#
|
63
67
|
def decode(c)
|
64
68
|
# TODO: implement the conversion
|
65
69
|
return c unless c.class == Fixnum
|
@@ -74,12 +78,23 @@ class PDF::Reader
|
|
74
78
|
end
|
75
79
|
|
76
80
|
def str_to_int(str)
|
77
|
-
return nil if str.nil? || str.size == 0
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
+
return nil if str.nil? || str.size == 0
|
82
|
+
unpacked_string = if str.size == 1 # UTF-8
|
83
|
+
str.unpack("C*")
|
84
|
+
else # UTF-16
|
85
|
+
str.unpack("n*")
|
86
|
+
end
|
87
|
+
if unpacked_string.length == 1
|
88
|
+
unpacked_string
|
89
|
+
elsif unpacked_string.length == 2 && (unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF)
|
90
|
+
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
91
|
+
# let's convert to UTF-32. (the high surrogate is between 0xD800-0xDBFF, the
|
92
|
+
# low surrogate is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
|
93
|
+
[(unpacked_string[0] - 0xD800) * 0x400 + (unpacked_string[1] - 0xDC00) + 0x10000]
|
81
94
|
else
|
82
|
-
|
95
|
+
# it is a bad idea to just return the first 16 bits, as this doesn't allow
|
96
|
+
# for ligatures for example fi (U+0066 U+0069)
|
97
|
+
unpacked_string
|
83
98
|
end
|
84
99
|
end
|
85
100
|
|
@@ -88,7 +103,7 @@ class PDF::Reader
|
|
88
103
|
find = str_to_int(parser.parse_token)
|
89
104
|
replace = str_to_int(parser.parse_token)
|
90
105
|
while find && replace
|
91
|
-
@map[find] = replace
|
106
|
+
@map[find[0]] = replace
|
92
107
|
find = str_to_int(parser.parse_token)
|
93
108
|
replace = str_to_int(parser.parse_token)
|
94
109
|
end
|
@@ -114,21 +129,21 @@ class PDF::Reader
|
|
114
129
|
end
|
115
130
|
|
116
131
|
def bfrange_type_one(start_code, end_code, dst)
|
117
|
-
start_code = str_to_int(start_code)
|
118
|
-
end_code = str_to_int(end_code)
|
132
|
+
start_code = str_to_int(start_code)[0]
|
133
|
+
end_code = str_to_int(end_code)[0]
|
119
134
|
dst = str_to_int(dst)
|
120
135
|
|
121
136
|
# add all values in the range to our mapping
|
122
137
|
(start_code..end_code).each_with_index do |val, idx|
|
123
|
-
@map[val] = dst + idx
|
138
|
+
@map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
|
124
139
|
# ensure a single range does not exceed 255 chars
|
125
140
|
raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
|
126
141
|
end
|
127
142
|
end
|
128
143
|
|
129
144
|
def bfrange_type_two(start_code, end_code, dst)
|
130
|
-
start_code = str_to_int(start_code)
|
131
|
-
end_code = str_to_int(end_code)
|
145
|
+
start_code = str_to_int(start_code)[0]
|
146
|
+
end_code = str_to_int(end_code)[0]
|
132
147
|
from_range = (start_code..end_code)
|
133
148
|
|
134
149
|
# add all values in the range to our mapping
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -22,7 +22,6 @@
|
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
#
|
24
24
|
################################################################################
|
25
|
-
require 'zlib'
|
26
25
|
|
27
26
|
class PDF::Reader
|
28
27
|
################################################################################
|
@@ -30,7 +29,7 @@ class PDF::Reader
|
|
30
29
|
# support for features like compression and encryption. This class is for decoding that
|
31
30
|
# content.
|
32
31
|
#
|
33
|
-
|
32
|
+
module Filter # :nodoc:
|
34
33
|
|
35
34
|
################################################################################
|
36
35
|
# creates a new filter for decoding content.
|
@@ -38,242 +37,20 @@ class PDF::Reader
|
|
38
37
|
# Filters that are only used to encode image data are accepted, but the data is
|
39
38
|
# returned untouched. At this stage PDF::Reader has no need to decode images.
|
40
39
|
#
|
41
|
-
def
|
42
|
-
@options = options
|
43
|
-
|
40
|
+
def self.with(name, options = {})
|
44
41
|
case name.to_sym
|
45
|
-
when :ASCII85Decode then
|
46
|
-
when :ASCIIHexDecode then
|
47
|
-
when :CCITTFaxDecode then
|
48
|
-
when :DCTDecode then
|
49
|
-
when :FlateDecode then
|
50
|
-
when :JBIG2Decode then
|
51
|
-
when :JPXDecode then
|
52
|
-
when :LZWDecode then
|
53
|
-
when :RunLengthDecode then
|
42
|
+
when :ASCII85Decode then PDF::Reader::Filter::Ascii85.new(options)
|
43
|
+
when :ASCIIHexDecode then PDF::Reader::Filter::AsciiHex.new(options)
|
44
|
+
when :CCITTFaxDecode then PDF::Reader::Filter::Null.new(options)
|
45
|
+
when :DCTDecode then PDF::Reader::Filter::Null.new(options)
|
46
|
+
when :FlateDecode then PDF::Reader::Filter::Flate.new(options)
|
47
|
+
when :JBIG2Decode then PDF::Reader::Filter::Null.new(options)
|
48
|
+
when :JPXDecode then PDF::Reader::Filter::Null.new(options)
|
49
|
+
when :LZWDecode then PDF::Reader::Filter::Lzw.new(options)
|
50
|
+
when :RunLengthDecode then PDF::Reader::Filter::RunLength.new(options)
|
54
51
|
else
|
55
52
|
raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
56
53
|
end
|
57
54
|
end
|
58
|
-
################################################################################
|
59
|
-
# attempts to decode the specified data with the current filter
|
60
|
-
#
|
61
|
-
# Filters that are only used to encode image data are accepted, but the data is
|
62
|
-
# returned untouched. At this stage PDF::Reader has no need to decode images.
|
63
|
-
#
|
64
|
-
def filter (data)
|
65
|
-
# leave the data untouched if we don't support the required filter
|
66
|
-
return data if @filter.nil?
|
67
|
-
|
68
|
-
# decode the data
|
69
|
-
self.send(@filter, data)
|
70
|
-
end
|
71
|
-
################################################################################
|
72
|
-
# Decode the specified data using the Ascii85 algorithm. Relies on the AScii85
|
73
|
-
# rubygem.
|
74
|
-
#
|
75
|
-
def ascii85(data)
|
76
|
-
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
77
|
-
Ascii85::decode(data)
|
78
|
-
rescue Exception => e
|
79
|
-
# Oops, there was a problem decoding the stream
|
80
|
-
raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
|
81
|
-
end
|
82
|
-
################################################################################
|
83
|
-
# Decode the specified data using the AsciiHex algorithm.
|
84
|
-
#
|
85
|
-
def asciihex(data)
|
86
|
-
data.chop! if data[-1,1] == ">"
|
87
|
-
data = data[1,data.size] if data[0,1] == "<"
|
88
|
-
data.gsub!(/[^A-Fa-f0-9]/,"")
|
89
|
-
data << "0" if data.size % 2 == 1
|
90
|
-
data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
|
91
|
-
rescue Exception => e
|
92
|
-
# Oops, there was a problem decoding the stream
|
93
|
-
raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
|
94
|
-
end
|
95
|
-
################################################################################
|
96
|
-
# Decode the specified data with the Zlib compression algorithm
|
97
|
-
def flate (data)
|
98
|
-
deflated = nil
|
99
|
-
begin
|
100
|
-
deflated = Zlib::Inflate.new.inflate(data)
|
101
|
-
rescue Zlib::DataError => e
|
102
|
-
# by default, Ruby's Zlib assumes the data it's inflating
|
103
|
-
# is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
|
104
|
-
# If that fails, then use an undocumented 'feature' to attempt to inflate
|
105
|
-
# the data as a raw RFC1951 stream.
|
106
|
-
#
|
107
|
-
# See
|
108
|
-
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
109
|
-
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
110
|
-
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
111
|
-
end
|
112
|
-
depredict(deflated, @options)
|
113
|
-
rescue Exception => e
|
114
|
-
# Oops, there was a problem inflating the stream
|
115
|
-
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
116
|
-
end
|
117
|
-
################################################################################
|
118
|
-
# Decode the specified data with the LZW compression algorithm
|
119
|
-
def lzw(data)
|
120
|
-
data = PDF::Reader::LZW.decode(data)
|
121
|
-
depredict(data, @options)
|
122
|
-
end
|
123
|
-
################################################################################
|
124
|
-
# Decode the specified data with the RunLengthDecode compression algorithm
|
125
|
-
def runlength(data)
|
126
|
-
pos = 0
|
127
|
-
out = ""
|
128
|
-
|
129
|
-
while pos < data.length
|
130
|
-
if data.respond_to?(:getbyte)
|
131
|
-
length = data.getbyte(pos)
|
132
|
-
else
|
133
|
-
length = data[pos]
|
134
|
-
end
|
135
|
-
pos += 1
|
136
|
-
|
137
|
-
case
|
138
|
-
when length == 128
|
139
|
-
break
|
140
|
-
when length < 128
|
141
|
-
# When the length is < 128, we copy the following length+1 bytes
|
142
|
-
# literally.
|
143
|
-
out << data[pos, length + 1]
|
144
|
-
pos += length
|
145
|
-
else
|
146
|
-
# When the length is > 128, we copy the next byte (257 - length)
|
147
|
-
# times; i.e., "\xFA\x00" ([250, 0]) will expand to
|
148
|
-
# "\x00\x00\x00\x00\x00\x00\x00".
|
149
|
-
out << data[pos, 1] * (257 - length)
|
150
|
-
end
|
151
|
-
|
152
|
-
pos += 1
|
153
|
-
end
|
154
|
-
|
155
|
-
out
|
156
|
-
end
|
157
|
-
################################################################################
|
158
|
-
def depredict(data, opts = {})
|
159
|
-
predictor = (opts || {})[:Predictor].to_i
|
160
|
-
|
161
|
-
case predictor
|
162
|
-
when 0, 1 then
|
163
|
-
data
|
164
|
-
when 2 then
|
165
|
-
tiff_depredict(data, opts)
|
166
|
-
when 10, 11, 12, 13, 14, 15 then
|
167
|
-
png_depredict(data, opts)
|
168
|
-
else
|
169
|
-
raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
|
170
|
-
end
|
171
|
-
end
|
172
|
-
################################################################################
|
173
|
-
def tiff_depredict(data, opts = {})
|
174
|
-
data = data.unpack("C*")
|
175
|
-
unfiltered = []
|
176
|
-
bpc = opts[:BitsPerComponent] || 8
|
177
|
-
pixel_bits = bpc * opts[:Colors]
|
178
|
-
pixel_bytes = pixel_bits / 8
|
179
|
-
line_len = (pixel_bytes * opts[:Columns])
|
180
|
-
pos = 0
|
181
|
-
|
182
|
-
if bpc != 8
|
183
|
-
raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
|
184
|
-
end
|
185
|
-
|
186
|
-
until pos > data.size
|
187
|
-
row_data = data[pos, line_len]
|
188
|
-
row_data.each_with_index do |byte, index|
|
189
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
190
|
-
row_data[index] = (byte + left) % 256
|
191
|
-
end
|
192
|
-
unfiltered += row_data
|
193
|
-
pos += line_len
|
194
|
-
end
|
195
|
-
|
196
|
-
unfiltered.pack("C*")
|
197
|
-
end
|
198
|
-
################################################################################
|
199
|
-
def png_depredict(data, opts = {})
|
200
|
-
return data if opts.nil? || opts[:Predictor].to_i < 10
|
201
|
-
|
202
|
-
data = data.unpack("C*")
|
203
|
-
|
204
|
-
pixel_bytes = opts[:Colors] || 1
|
205
|
-
scanline_length = (pixel_bytes * opts[:Columns]) + 1
|
206
|
-
row = 0
|
207
|
-
pixels = []
|
208
|
-
paeth, pa, pb, pc = nil
|
209
|
-
until data.empty? do
|
210
|
-
row_data = data.slice! 0, scanline_length
|
211
|
-
filter = row_data.shift
|
212
|
-
case filter
|
213
|
-
when 0 # None
|
214
|
-
when 1 # Sub
|
215
|
-
row_data.each_with_index do |byte, index|
|
216
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
217
|
-
row_data[index] = (byte + left) % 256
|
218
|
-
#p [byte, left, row_data[index]]
|
219
|
-
end
|
220
|
-
when 2 # Up
|
221
|
-
row_data.each_with_index do |byte, index|
|
222
|
-
col = index / pixel_bytes
|
223
|
-
upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
|
224
|
-
row_data[index] = (upper + byte) % 256
|
225
|
-
end
|
226
|
-
when 3 # Average
|
227
|
-
row_data.each_with_index do |byte, index|
|
228
|
-
col = index / pixel_bytes
|
229
|
-
upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
|
230
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
231
|
-
|
232
|
-
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
233
|
-
end
|
234
|
-
when 4 # Paeth
|
235
|
-
left = upper = upper_left = nil
|
236
|
-
row_data.each_with_index do |byte, index|
|
237
|
-
col = index / pixel_bytes
|
238
|
-
|
239
|
-
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
240
|
-
if row.zero?
|
241
|
-
upper = upper_left = 0
|
242
|
-
else
|
243
|
-
upper = pixels[row-1][col][index % pixel_bytes]
|
244
|
-
upper_left = col.zero? ? 0 :
|
245
|
-
pixels[row-1][col-1][index % pixel_bytes]
|
246
|
-
end
|
247
|
-
|
248
|
-
p = left + upper - upper_left
|
249
|
-
pa = (p - left).abs
|
250
|
-
pb = (p - upper).abs
|
251
|
-
pc = (p - upper_left).abs
|
252
|
-
|
253
|
-
paeth = if pa <= pb && pa <= pc
|
254
|
-
left
|
255
|
-
elsif pb <= pc
|
256
|
-
upper
|
257
|
-
else
|
258
|
-
upper_left
|
259
|
-
end
|
260
|
-
|
261
|
-
row_data[index] = (byte + paeth) % 256
|
262
|
-
end
|
263
|
-
else
|
264
|
-
raise ArgumentError, "Invalid filter algorithm #{filter}"
|
265
|
-
end
|
266
|
-
|
267
|
-
s = []
|
268
|
-
row_data.each_slice pixel_bytes do |slice|
|
269
|
-
s << slice
|
270
|
-
end
|
271
|
-
pixels << s
|
272
|
-
row += 1
|
273
|
-
end
|
274
|
-
|
275
|
-
pixels.map { |bytes| bytes.flatten.pack("C*") }.join("")
|
276
|
-
end
|
277
55
|
end
|
278
56
|
end
|
279
|
-
################################################################################
|