pdf-reader 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +8 -0
- data/bin/pdf_text +0 -2
- data/examples/extract_images.rb +11 -6
- data/lib/pdf/reader.rb +11 -5
- data/lib/pdf/reader/buffer.rb +48 -42
- data/lib/pdf/reader/cmap.rb +26 -11
- data/lib/pdf/reader/filter.rb +11 -234
- data/lib/pdf/reader/filter/ascii85.rb +25 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +26 -0
- data/lib/pdf/reader/filter/depredict.rb +138 -0
- data/lib/pdf/reader/filter/flate.rb +38 -0
- data/lib/pdf/reader/filter/lzw.rb +18 -0
- data/lib/pdf/reader/filter/null.rb +15 -0
- data/lib/pdf/reader/filter/run_length.rb +46 -0
- data/lib/pdf/reader/font.rb +1 -1
- data/lib/pdf/reader/form_xobject.rb +25 -4
- data/lib/pdf/reader/glyph_hash.rb +3 -2
- data/lib/pdf/reader/object_cache.rb +39 -16
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page.rb +7 -1
- data/lib/pdf/reader/page_state.rb +2 -1
- data/lib/pdf/reader/stream.rb +1 -1
- data/lib/pdf/reader/xref.rb +23 -4
- metadata +99 -46
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'ascii85'
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
module Filter # :nodoc:
|
7
|
+
# Stream filter that decodes ASCII85 (base-85) encoded data.
class Ascii85
  # options - the decode parameters for the stream. They are stored but not
  #           consulted by this filter.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data using the Ascii85 algorithm. Relies on the Ascii85
  # rubygem.
  #
  # data - the encoded stream contents. The "<~" opening marker is prepended
  #        when it is missing, before handing the data to the gem.
  #
  # Returns the decoded data.
  #
  # Raises MalformedPDFError if the data cannot be decoded.
  #
  def filter(data)
    data = "<~#{data}" unless data.to_s[0,2] == "<~"
    ::Ascii85::decode(data)
  rescue StandardError => e
    # Only ordinary errors are converted; Interrupt, SystemExit and friends
    # must keep propagating instead of being reported as a broken PDF.
    raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
  end
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
#
|
3
|
+
class PDF::Reader
|
4
|
+
module Filter # :nodoc:
|
5
|
+
# Stream filter that decodes ASCIIHex encoded data (pairs of hexadecimal
# digits, optionally wrapped in "<" and ">").
class AsciiHex
  # options - the decode parameters for the stream. They are stored but not
  #           consulted by this filter.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data using the AsciiHex algorithm.
  #
  # data - the encoded stream contents. The string is never modified, so
  #        frozen or shared strings are safe to pass in.
  #
  # Any character that is not a hex digit is ignored, and an odd number of
  # digits is padded with a trailing "0".
  #
  # Returns the decoded data.
  #
  # Raises MalformedPDFError if the data cannot be decoded.
  #
  def filter(data)
    # work on a private copy so the caller's string is left untouched
    hex = data.dup
    hex.chop! if hex[-1,1] == ">"
    hex = hex[1, hex.size] if hex[0,1] == "<"
    hex = hex.gsub(/[^A-Fa-f0-9]/, "")
    hex << "0" if hex.size % 2 == 1
    hex.scan(/.{2}/).map { |pair| pair.hex.chr }.join("")
  rescue StandardError => e
    # Only ordinary errors are converted; Interrupt, SystemExit and friends
    # must keep propagating instead of being reported as a broken PDF.
    raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
  end
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
@@ -0,0 +1,138 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
module Filter # :nodoc:
|
5
|
+
# Reverses the predictor pre-processing that may have been applied to a
# stream before it was compressed.
#
# The decode parameters read from the options hash are :Predictor, :Colors,
# :Columns and :BitsPerComponent.
class Depredict
  # options - the decode parameters for the stream; nil is treated as an
  #           empty hash.
  def initialize(options = {})
    @options = options || {}
  end

  ################################################################################
  # Streams can be preprocessed to improve compression. This reverses the
  # preprocessing
  #
  # data - the stream contents after decompression.
  #
  # Returns data unchanged for predictor 0 or 1 (or when no predictor is
  # set), the TIFF-depredicted bytes for predictor 2 and the PNG-depredicted
  # bytes for predictors 10 to 15.
  #
  # Raises MalformedPDFError for any other predictor value.
  #
  def filter(data)
    predictor = @options[:Predictor].to_i

    case predictor
    when 0, 1 then
      data
    when 2 then
      tiff_depredict(data)
    when 10, 11, 12, 13, 14, 15 then
      png_depredict(data)
    else
      raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
    end
  end

  private

  ################################################################################
  # Reverse the TIFF predictor: within each row every byte is stored as the
  # difference to the byte one pixel to its left.
  #
  # :Colors and :Columns fall back to 1 when the parameters are absent.
  #
  # Raises UnsupportedFeatureError unless BitsPerComponent is 8, and
  # MalformedPDFError when the parameters describe an empty row.
  #
  def tiff_depredict(data)
    bpc = @options[:BitsPerComponent] || 8
    if bpc != 8
      raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
    end

    pixel_bytes = (bpc * (@options[:Colors] || 1)) / 8
    line_len    = pixel_bytes * (@options[:Columns] || 1)
    if line_len < 1
      # a zero-length row could never consume any input
      raise MalformedPDFError, "Invalid predictor parameters (Colors and Columns must be positive)"
    end

    unfiltered = []
    data.unpack("C*").each_slice(line_len) do |row_data|
      row_data.each_with_index do |byte, index|
        left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
        row_data[index] = (byte + left) % 256
      end
      unfiltered.concat(row_data)
    end

    unfiltered.pack("C*")
  end

  ################################################################################
  # Reverse the PNG predictors. Every row starts with one byte naming the
  # filter used for that row: 0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth.
  #
  # :Colors and :Columns fall back to 1 when the parameters are absent.
  #
  # NOTE(review): one byte per colour component is assumed here;
  # :BitsPerComponent is not consulted - confirm whether other depths need
  # to be supported.
  #
  # Raises ArgumentError when a row names an unknown filter, and
  # MalformedPDFError when :Colors is not positive.
  #
  def png_depredict(data)
    return data if @options[:Predictor].to_i < 10

    pixel_bytes = @options[:Colors] || 1
    if pixel_bytes < 1
      raise MalformedPDFError, "Invalid predictor parameters (Colors must be positive)"
    end
    scanline_length = (pixel_bytes * (@options[:Columns] || 1)) + 1

    decoded  = []
    previous = [] # the decoded row above; empty for the first row

    data.unpack("C*").each_slice(scanline_length) do |row_data|
      filter_type = row_data.shift

      case filter_type
      when 0 # None
      when 1 # Sub
        row_data.each_with_index do |byte, index|
          left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
          row_data[index] = (byte + left) % 256
        end
      when 2 # Up
        row_data.each_with_index do |byte, index|
          upper = previous[index] || 0
          row_data[index] = (upper + byte) % 256
        end
      when 3 # Average
        row_data.each_with_index do |byte, index|
          upper = previous[index] || 0
          left  = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
          row_data[index] = (byte + ((left + upper) / 2)) % 256
        end
      when 4 # Paeth
        row_data.each_with_index do |byte, index|
          first_pixel = index < pixel_bytes
          left        = first_pixel ? 0 : row_data[index - pixel_bytes]
          upper       = previous[index] || 0
          upper_left  = first_pixel ? 0 : (previous[index - pixel_bytes] || 0)

          row_data[index] = (byte + paeth_predictor(left, upper, upper_left)) % 256
        end
      else
        raise ArgumentError, "Invalid filter algorithm #{filter_type}"
      end

      decoded.concat(row_data)
      previous = row_data
    end

    decoded.pack("C*")
  end

  ################################################################################
  # Pick whichever neighbour (left, upper or upper left) is closest to
  # left + upper - upper_left. Ties are resolved in that order.
  #
  def paeth_predictor(left, upper, upper_left)
    estimate = left + upper - upper_left
    pa = (estimate - left).abs
    pb = (estimate - upper).abs
    pc = (estimate - upper_left).abs

    if pa <= pb && pa <= pc
      left
    elsif pb <= pc
      upper
    else
      upper_left
    end
  end
end
|
137
|
+
end
|
138
|
+
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
class PDF::Reader
|
7
|
+
module Filter # :nodoc:
|
8
|
+
# Stream filter that inflates zlib/deflate compressed data and then reverses
# any predictor named in the decode parameters.
class Flate
  # options - the decode parameters for the stream, handed on to Depredict.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data with the Zlib compression algorithm
  #
  # data - the compressed stream contents.
  #
  # Returns the inflated data after it has been passed through Depredict.
  #
  # Raises MalformedPDFError if inflating or depredicting fails.
  #
  def filter(data)
    inflated = begin
      inflate_with(Zlib::Inflate.new, data)
    rescue Zlib::DataError
      # by default, Ruby's Zlib assumes the data it's inflating
      # is RFC1951 deflated data, wrapped in a RFC1950 zlib container.
      # If that fails, then use an undocumented 'feature' to attempt to inflate
      # the data as a raw RFC1951 stream.
      #
      # See
      # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
      # - http://www.gzip.org/zlib/zlib_faq.html#faq38
      inflate_with(Zlib::Inflate.new(-Zlib::MAX_WBITS), data)
    end
    Depredict.new(@options).filter(inflated)
  rescue StandardError => e
    # Only ordinary errors are converted; Interrupt, SystemExit and friends
    # must keep propagating instead of being reported as a broken PDF.
    raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
  end

  private

  ################################################################################
  # Inflate data with the given stream object, always closing the stream
  # afterwards so its buffers are released straight away.
  #
  def inflate_with(zstream, data)
    zstream.inflate(data)
  ensure
    zstream.close
  end
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
#
|
3
|
+
class PDF::Reader
|
4
|
+
module Filter # :nodoc:
|
5
|
+
# Stream filter for LZW compressed data.
class Lzw
  # options - the decode parameters for the stream, handed on to Depredict.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Expand the LZW compressed data, then pass the result through Depredict
  # using the options given to the constructor.
  #
  def filter(data)
    Depredict.new(@options).filter(PDF::Reader::LZW.decode(data))
  end
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
#
|
3
|
+
class PDF::Reader
|
4
|
+
module Filter # :nodoc:
|
5
|
+
# Stream filter that expands RunLengthDecode data and then reverses any
# predictor named in the decode parameters.
class RunLength
  # options - the decode parameters for the stream, handed on to Depredict.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data with the RunLengthDecode compression algorithm
  #
  # data - the encoded stream contents. It is read byte by byte regardless
  #        of the encoding the string is tagged with, and is not modified.
  #
  # Returns the expanded data after it has been passed through Depredict.
  #
  def filter(data)
    # Lengths are read as bytes, so the payload must be sliced in bytes too;
    # a binary copy keeps both in step on rubies with string encodings.
    data = data.dup.force_encoding("BINARY") if data.respond_to?(:force_encoding)
    byte_reader = data.respond_to?(:getbyte)

    pos = 0
    out = String.new

    while pos < data.length
      length = byte_reader ? data.getbyte(pos) : data[pos]
      pos += 1

      # a length byte of 128 marks the end of the data
      break if length == 128

      if length < 128
        # When the length is < 128, we copy the following length+1 bytes
        # literally.
        out << data[pos, length + 1]
        pos += length + 1
      else
        # When the length is > 128, we copy the next byte (257 - length)
        # times; i.e., "\xFA\x00" ([250, 0]) will expand to
        # "\x00\x00\x00\x00\x00\x00\x00".
        out << data[pos, 1] * (257 - length)
        pos += 1
      end
    end

    Depredict.new(@options).filter(out)
  end
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -109,7 +109,7 @@ class PDF::Reader
|
|
109
109
|
if params.class == String
|
110
110
|
params.unpack(encoding.unpack).map { |c|
|
111
111
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
|
-
}.pack("U*")
|
112
|
+
}.flatten.pack("U*")
|
113
113
|
elsif params.class == Array
|
114
114
|
params.collect { |param| to_utf8_via_cmap(param) }
|
115
115
|
else
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
+
require 'digest/md5'
|
4
|
+
|
3
5
|
module PDF
|
4
6
|
class Reader
|
5
7
|
|
@@ -15,9 +17,10 @@ module PDF
|
|
15
17
|
|
16
18
|
attr_reader :xobject
|
17
19
|
|
18
|
-
def initialize(page, xobject)
|
20
|
+
def initialize(page, xobject, options = {})
|
19
21
|
@page = page
|
20
22
|
@objects = page.objects
|
23
|
+
@cache = options[:cache] || {}
|
21
24
|
@xobject = @objects.deref(xobject)
|
22
25
|
end
|
23
26
|
|
@@ -65,12 +68,30 @@ module PDF
|
|
65
68
|
end
|
66
69
|
end
|
67
70
|
|
71
|
+
def content_stream_md5
|
72
|
+
@content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
|
73
|
+
end
|
74
|
+
|
75
|
+
def cached_tokens_key
|
76
|
+
@cached_tokens_key ||= "tokens-#{content_stream_md5}"
|
77
|
+
end
|
78
|
+
|
79
|
+
def tokens
|
80
|
+
@cache[cached_tokens_key] ||= begin
|
81
|
+
buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
|
82
|
+
parser = Parser.new(buffer, @objects)
|
83
|
+
result = []
|
84
|
+
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
85
|
+
result << token
|
86
|
+
end
|
87
|
+
result
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
68
91
|
def content_stream(receivers, instructions)
|
69
|
-
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
70
|
-
parser = Parser.new(buffer, @objects)
|
71
92
|
params = []
|
72
93
|
|
73
|
-
|
94
|
+
tokens.each do |token|
|
74
95
|
if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
|
75
96
|
callback(receivers, PagesStrategy::OPERATORS[token], params)
|
76
97
|
params.clear
|
@@ -26,7 +26,8 @@
|
|
26
26
|
class PDF::Reader
|
27
27
|
class GlyphHash # :nodoc:
|
28
28
|
def initialize
|
29
|
-
|
29
|
+
# only parse the glyph list once, and cache the results (for performance)
|
30
|
+
@adobe = @@cache ||= load_adobe_glyph_mapping
|
30
31
|
end
|
31
32
|
|
32
33
|
# attempt to convert a PDF Name to a unicode codepoint. Returns nil
|
@@ -82,7 +83,7 @@ class PDF::Reader
|
|
82
83
|
end
|
83
84
|
end
|
84
85
|
|
85
|
-
glyphs
|
86
|
+
glyphs.freeze
|
86
87
|
end
|
87
88
|
|
88
89
|
end
|
@@ -1,10 +1,12 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
+
require 'hashery'
|
4
|
+
|
3
5
|
class PDF::Reader
|
4
6
|
|
5
7
|
# A Hash-like object for caching commonly used objects from a PDF file.
|
6
8
|
#
|
7
|
-
# This is an internal class
|
9
|
+
# This is an internal class, no promises about a stable API.
|
8
10
|
#
|
9
11
|
class ObjectCache # nodoc
|
10
12
|
|
@@ -13,53 +15,67 @@ class PDF::Reader
|
|
13
15
|
# avoid lots of repetitive (and expensive) tokenising
|
14
16
|
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
15
17
|
|
16
|
-
|
18
|
+
attr_reader :hits, :misses
|
19
|
+
|
20
|
+
def initialize(lru_size = 1000)
|
17
21
|
@objects = {}
|
22
|
+
@lru_cache = Hashery::LRUHash.new(lru_size.to_i)
|
23
|
+
@hits = 0
|
24
|
+
@misses = 0
|
18
25
|
end
|
19
26
|
|
20
27
|
def [](key)
|
21
|
-
|
28
|
+
update_stats(key)
|
29
|
+
@objects[key] || @lru_cache[key]
|
22
30
|
end
|
23
31
|
|
24
32
|
def []=(key, value)
|
25
|
-
|
33
|
+
if cacheable?(value)
|
34
|
+
@objects[key] = value
|
35
|
+
else
|
36
|
+
@lru_cache[key] = value
|
37
|
+
end
|
26
38
|
end
|
27
39
|
|
28
40
|
def fetch(key, local_default = nil)
|
29
|
-
|
41
|
+
update_stats(key)
|
42
|
+
@objects[key] || @lru_cache.fetch(key, local_default)
|
30
43
|
end
|
31
44
|
|
32
45
|
def each(&block)
|
33
46
|
@objects.each(&block)
|
47
|
+
@lru_cache.each(&block)
|
34
48
|
end
|
35
49
|
alias :each_pair :each
|
36
50
|
|
37
51
|
def each_key(&block)
|
38
52
|
@objects.each_key(&block)
|
53
|
+
@lru_cache.each_key(&block)
|
39
54
|
end
|
40
55
|
|
41
56
|
def each_value(&block)
|
42
57
|
@objects.each_value(&block)
|
58
|
+
@lru_cache.each_value(&block)
|
43
59
|
end
|
44
60
|
|
45
61
|
def size
|
46
|
-
@objects.size
|
62
|
+
@objects.size + @lru_cache.size
|
47
63
|
end
|
48
64
|
alias :length :size
|
49
65
|
|
50
66
|
def empty?
|
51
|
-
@objects.empty?
|
67
|
+
@objects.empty? && @lru_cache.empty?
|
52
68
|
end
|
53
69
|
|
54
|
-
def
|
55
|
-
@objects.
|
70
|
+
def include?(key)
|
71
|
+
@objects.include?(key) || @lru_cache.include?(key)
|
56
72
|
end
|
57
|
-
alias :
|
58
|
-
alias :key? :
|
59
|
-
alias :member? :
|
73
|
+
alias :has_key? :include?
|
74
|
+
alias :key? :include?
|
75
|
+
alias :member? :include?
|
60
76
|
|
61
77
|
def has_value?(value)
|
62
|
-
@objects.has_value?(value)
|
78
|
+
@objects.has_value?(value) || @lru_cache.has_value?(value)
|
63
79
|
end
|
64
80
|
|
65
81
|
def to_s
|
@@ -67,19 +83,26 @@ class PDF::Reader
|
|
67
83
|
end
|
68
84
|
|
69
85
|
def keys
|
70
|
-
@objects.keys
|
86
|
+
@objects.keys + @lru_cache.keys
|
71
87
|
end
|
72
88
|
|
73
89
|
def values
|
74
|
-
@objects.values
|
90
|
+
@objects.values + @lru_cache.values
|
75
91
|
end
|
76
92
|
|
77
93
|
private
|
78
94
|
|
95
|
+
def update_stats(key)
|
96
|
+
if has_key?(key)
|
97
|
+
@hits += 1
|
98
|
+
else
|
99
|
+
@misses += 1
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
79
103
|
def cacheable?(obj)
|
80
104
|
obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
|
81
105
|
end
|
82
106
|
|
83
|
-
|
84
107
|
end
|
85
108
|
end
|