pdf-reader 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -0
- data/bin/pdf_text +0 -2
- data/examples/extract_images.rb +11 -6
- data/lib/pdf/reader.rb +11 -5
- data/lib/pdf/reader/buffer.rb +48 -42
- data/lib/pdf/reader/cmap.rb +26 -11
- data/lib/pdf/reader/filter.rb +11 -234
- data/lib/pdf/reader/filter/ascii85.rb +25 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +26 -0
- data/lib/pdf/reader/filter/depredict.rb +138 -0
- data/lib/pdf/reader/filter/flate.rb +38 -0
- data/lib/pdf/reader/filter/lzw.rb +18 -0
- data/lib/pdf/reader/filter/null.rb +15 -0
- data/lib/pdf/reader/filter/run_length.rb +46 -0
- data/lib/pdf/reader/font.rb +1 -1
- data/lib/pdf/reader/form_xobject.rb +25 -4
- data/lib/pdf/reader/glyph_hash.rb +3 -2
- data/lib/pdf/reader/object_cache.rb +39 -16
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page.rb +7 -1
- data/lib/pdf/reader/page_state.rb +2 -1
- data/lib/pdf/reader/stream.rb +1 -1
- data/lib/pdf/reader/xref.rb +23 -4
- metadata +99 -46
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'ascii85'
|
4
|
+
|
5
|
+
module PDF
  class Reader
    module Filter # :nodoc:
      # Stream filter that decodes ASCII85 encoded data by delegating to the
      # Ascii85 rubygem.
      class Ascii85
        # options - decode parameters for the stream. They are stored but not
        #           consulted by this filter.
        def initialize(options = {})
          @options = options
        end

        ################################################################################
        # Decode the specified data using the Ascii85 algorithm. Relies on the Ascii85
        # rubygem.
        #
        # data - the encoded stream. The "<~" opening delimiter is prepended
        #        when the data does not already start with it.
        #
        # Returns the value produced by ::Ascii85.decode.
        # Raises MalformedPDFError when decoding fails with a StandardError.
        #
        def filter(data)
          data = "<~#{data}" unless data.to_s[0,2] == "<~"
          ::Ascii85::decode(data)
        rescue StandardError => e
          # Only StandardError is translated: Interrupt, SystemExit, NoMemoryError
          # and friends say nothing about the PDF and must reach the caller as-is.
          raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
#
|
3
|
+
module PDF
  class Reader
    module Filter # :nodoc:
      # Stream filter that decodes ASCIIHex encoded data (pairs of hexadecimal
      # digits, optionally wrapped in "<" and ">").
      class AsciiHex
        # options - decode parameters for the stream. They are stored but not
        #           consulted by this filter.
        def initialize(options = {})
          @options = options
        end

        ################################################################################
        # Decode the specified data using the AsciiHex algorithm.
        #
        # data - the encoded String. It is never modified, so frozen strings
        #        and strings shared with the caller are safe to pass in.
        #
        # Returns a String with one byte per pair of hex digits.
        # Raises MalformedPDFError when decoding fails with a StandardError.
        #
        def filter(data)
          hex = data
          # strip the optional ">" terminator and "<" opener without touching
          # the caller's string
          hex = hex[0, hex.size - 1] if hex[-1,1] == ">"
          hex = hex[1, hex.size]     if hex[0,1] == "<"
          # whitespace and any other non-hex characters are ignored
          hex = hex.gsub(/[^A-Fa-f0-9]/,"")
          # a dangling final digit is treated as if it were followed by "0"
          hex += "0" if hex.size % 2 == 1
          hex.scan(/.{2}/).map { |s| s.hex.chr }.join("")
        rescue StandardError => e
          # Oops, there was a problem decoding the stream
          raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
        end
      end
    end
  end
end
|
26
|
+
|
@@ -0,0 +1,138 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
  class Reader
    module Filter # :nodoc:
      # Reverses the predictor functions (TIFF or PNG) that may have been
      # applied to stream data before it was compressed.
      class Depredict
        # options - the decode parameters of the stream. The keys read are
        #           :Predictor, :Colors, :Columns and :BitsPerComponent.
        #           nil is treated as an empty Hash.
        def initialize(options = {})
          @options = options || {}
        end

        ################################################################################
        # Streams can be preprocessed to improve compression. This reverses the
        # preprocessing
        #
        # data - the (already decompressed) stream data as a String
        #
        # Returns the data unchanged for predictor 0 or 1, otherwise the
        # depredicted data.
        # Raises MalformedPDFError when the predictor value is not recognised.
        #
        def filter(data)
          predictor = @options[:Predictor].to_i

          case predictor
          when 0, 1 then
            data
          when 2 then
            tiff_depredict(data)
          when 10, 11, 12, 13, 14, 15 then
            png_depredict(data)
          else
            raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
          end
        end

        private

        # number of colour components per sample, 1 when not specified
        def colors
          @options[:Colors] || 1
        end

        # number of samples in each row, 1 when not specified
        def columns
          @options[:Columns] || 1
        end

        # number of bits per colour component, 8 when not specified
        def bits_per_component
          @options[:BitsPerComponent] || 8
        end

        ################################################################################
        # Reverse the TIFF predictor: within a row every byte is stored as the
        # difference to the byte one pixel to its left.
        #
        # Raises UnsupportedFeatureError unless BitsPerComponent is 8.
        # Raises MalformedPDFError when the parameters describe an empty row.
        #
        def tiff_depredict(data)
          bpc = bits_per_component
          if bpc != 8
            raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
          end

          pixel_bytes = (bpc * colors) / 8
          line_len    = pixel_bytes * columns
          if line_len < 1
            # an empty row can never consume the input
            raise MalformedPDFError, "Invalid TIFF predictor row length (#{line_len})"
          end

          unfiltered = []
          data.unpack("C*").each_slice(line_len) do |row_data|
            row_data.each_with_index do |byte, index|
              left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
              row_data[index] = (byte + left) % 256
            end
            unfiltered.concat(row_data)
          end

          unfiltered.pack("C*")
        end

        ################################################################################
        # Reverse the PNG predictors. Every row starts with one byte that
        # selects the filter used for that row (0 None, 1 Sub, 2 Up, 3 Average,
        # 4 Paeth).
        #
        # Raises ArgumentError when a row starts with an unknown filter byte.
        #
        def png_depredict(data)
          return data if @options[:Predictor].to_i < 10

          # PNG measures "left" in whole pixels rounded up to at least one byte,
          # and rows in whole bytes. With 8 bits per component this is simply
          # Colors bytes per pixel and Colors * Columns bytes per row.
          bits_per_pixel  = colors * bits_per_component
          pixel_bytes     = [(bits_per_pixel + 7) / 8, 1].max
          row_bytes       = (columns * bits_per_pixel + 7) / 8
          scanline_length = row_bytes + 1 # plus the leading filter byte

          prev_row   = [] # the row above, already decoded; empty for the first row
          unfiltered = []

          data.unpack("C*").each_slice(scanline_length) do |row_data|
            filter_type = row_data.shift
            case filter_type
            when 0 # None
            when 1 # Sub
              row_data.each_with_index do |byte, index|
                left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
                row_data[index] = (byte + left) % 256
              end
            when 2 # Up
              row_data.each_with_index do |byte, index|
                upper = prev_row[index] || 0
                row_data[index] = (upper + byte) % 256
              end
            when 3 # Average
              row_data.each_with_index do |byte, index|
                upper = prev_row[index] || 0
                left  = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
                row_data[index] = (byte + (left + upper) / 2) % 256
              end
            when 4 # Paeth
              row_data.each_with_index do |byte, index|
                first_pixel = index < pixel_bytes
                left        = first_pixel ? 0 : row_data[index - pixel_bytes]
                upper       = prev_row[index] || 0
                upper_left  = first_pixel ? 0 : (prev_row[index - pixel_bytes] || 0)
                row_data[index] = (byte + paeth_predictor(left, upper, upper_left)) % 256
              end
            else
              raise ArgumentError, "Invalid filter algorithm #{filter_type}"
            end

            unfiltered.concat(row_data)
            prev_row = row_data
          end

          unfiltered.pack("C*")
        end

        # Pick whichever neighbour is closest to left + upper - upper_left,
        # preferring left, then upper, then upper_left on ties.
        def paeth_predictor(left, upper, upper_left)
          estimate = left + upper - upper_left
          pa = (estimate - left).abs
          pb = (estimate - upper).abs
          pc = (estimate - upper_left).abs

          if pa <= pb && pa <= pc
            left
          elsif pb <= pc
            upper
          else
            upper_left
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
module PDF
  class Reader
    module Filter # :nodoc:
      # Stream filter that inflates Flate (zlib/deflate) compressed data and
      # then reverses any predictor named in the decode parameters.
      class Flate
        # options - decode parameters for the stream, handed on to Depredict
        def initialize(options = {})
          @options = options
        end

        ################################################################################
        # Decode the specified data with the Zlib compression algorithm
        #
        # data - the compressed stream as a String
        #
        # Returns the inflated data after it has been passed through Depredict.
        # Raises MalformedPDFError when inflating or depredicting fails with a
        # StandardError.
        #
        def filter(data)
          inflated = nil
          begin
            inflated = Zlib::Inflate.new.inflate(data)
          rescue Zlib::DataError
            # by default, Ruby's Zlib assumes the data it's inflating
            # is RFC1951 deflated data, wrapped in a RFC1950 zlib container.
            # If that fails, then use an undocumented 'feature' to attempt to inflate
            # the data as a raw RFC1951 stream.
            #
            # See
            # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
            # - http://www.gzip.org/zlib/zlib_faq.html#faq38
            inflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
          end
          Depredict.new(@options).filter(inflated)
        rescue StandardError => e
          # Oops, there was a problem inflating the stream. Only StandardError is
          # translated so that Interrupt, SystemExit etc. reach the caller as-is.
          raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
        end
      end
    end
  end
end
|
38
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
#
|
3
|
+
class PDF::Reader
  module Filter # :nodoc:
    # Stream filter for LZW compressed data.
    class Lzw
      # options - decode parameters for the stream, handed on to Depredict
      def initialize(options = {})
        @options = options
      end

      ################################################################################
      # Decode the specified data with the LZW compression algorithm
      #
      # The data is expanded by PDF::Reader::LZW.decode and the result is then
      # run through Depredict using the options given to the constructor.
      def filter(data)
        expanded    = PDF::Reader::LZW.decode(data)
        depredictor = Depredict.new(@options)
        depredictor.filter(expanded)
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
#
|
3
|
+
module PDF
  class Reader
    module Filter # :nodoc:
      # Stream filter that expands RunLength encoded data and then reverses any
      # predictor named in the decode parameters.
      class RunLength
        # options - decode parameters for the stream, handed on to Depredict
        def initialize(options = {})
          @options = options
        end

        ################################################################################
        # Decode the specified data with the RunLengthDecode compression algorithm
        #
        # data - the encoded stream as a String. It is not modified.
        #
        # Returns the expanded data after it has been passed through Depredict.
        #
        def filter(data)
          # Run lengths count bytes, so index a binary copy of the input.
          # Otherwise a string tagged with a multibyte encoding is read by
          # byte (getbyte) but sliced by character.
          bytes = data.dup
          bytes.force_encoding("BINARY") if bytes.respond_to?(:force_encoding)
          use_getbyte = bytes.respond_to?(:getbyte)

          pos = 0
          out = String.new

          while pos < bytes.length
            length = use_getbyte ? bytes.getbyte(pos) : bytes[pos]
            pos += 1

            case
            when length == 128
              # 128 marks the end of the data
              break
            when length < 128
              # When the length is < 128, we copy the following length+1 bytes
              # literally.
              out << bytes[pos, length + 1]
              pos += length
            else
              # When the length is > 128, we copy the next byte (257 - length)
              # times; i.e., "\xFA\x00" ([250, 0]) will expand to
              # "\x00\x00\x00\x00\x00\x00\x00".
              out << bytes[pos, 1] * (257 - length)
            end

            pos += 1
          end

          Depredict.new(@options).filter(out)
        end
      end
    end
  end
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -109,7 +109,7 @@ class PDF::Reader
|
|
109
109
|
if params.class == String
|
110
110
|
params.unpack(encoding.unpack).map { |c|
|
111
111
|
@tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
|
112
|
-
}.pack("U*")
|
112
|
+
}.flatten.pack("U*")
|
113
113
|
elsif params.class == Array
|
114
114
|
params.collect { |param| to_utf8_via_cmap(param) }
|
115
115
|
else
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
+
require 'digest/md5'
|
4
|
+
|
3
5
|
module PDF
|
4
6
|
class Reader
|
5
7
|
|
@@ -15,9 +17,10 @@ module PDF
|
|
15
17
|
|
16
18
|
attr_reader :xobject
|
17
19
|
|
18
|
-
def initialize(page, xobject)
|
20
|
+
def initialize(page, xobject, options = {})
|
19
21
|
@page = page
|
20
22
|
@objects = page.objects
|
23
|
+
@cache = options[:cache] || {}
|
21
24
|
@xobject = @objects.deref(xobject)
|
22
25
|
end
|
23
26
|
|
@@ -65,12 +68,30 @@ module PDF
|
|
65
68
|
end
|
66
69
|
end
|
67
70
|
|
71
|
+
def content_stream_md5
|
72
|
+
@content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
|
73
|
+
end
|
74
|
+
|
75
|
+
def cached_tokens_key
|
76
|
+
@cached_tokens_key ||= "tokens-#{content_stream_md5}"
|
77
|
+
end
|
78
|
+
|
79
|
+
def tokens
|
80
|
+
@cache[cached_tokens_key] ||= begin
|
81
|
+
buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
|
82
|
+
parser = Parser.new(buffer, @objects)
|
83
|
+
result = []
|
84
|
+
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
85
|
+
result << token
|
86
|
+
end
|
87
|
+
result
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
68
91
|
def content_stream(receivers, instructions)
|
69
|
-
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
70
|
-
parser = Parser.new(buffer, @objects)
|
71
92
|
params = []
|
72
93
|
|
73
|
-
|
94
|
+
tokens.each do |token|
|
74
95
|
if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
|
75
96
|
callback(receivers, PagesStrategy::OPERATORS[token], params)
|
76
97
|
params.clear
|
@@ -26,7 +26,8 @@
|
|
26
26
|
class PDF::Reader
|
27
27
|
class GlyphHash # :nodoc:
|
28
28
|
def initialize
|
29
|
-
|
29
|
+
# only parse the glyph list once, and cache the results (for performance)
|
30
|
+
@adobe = @@cache ||= load_adobe_glyph_mapping
|
30
31
|
end
|
31
32
|
|
32
33
|
# attempt to convert a PDF Name to a unicode codepoint. Returns nil
|
@@ -82,7 +83,7 @@ class PDF::Reader
|
|
82
83
|
end
|
83
84
|
end
|
84
85
|
|
85
|
-
glyphs
|
86
|
+
glyphs.freeze
|
86
87
|
end
|
87
88
|
|
88
89
|
end
|
@@ -1,10 +1,12 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
|
+
require 'hashery'
|
4
|
+
|
3
5
|
class PDF::Reader
|
4
6
|
|
5
7
|
# A Hash-like object for caching commonly used objects from a PDF file.
|
6
8
|
#
|
7
|
-
# This is an internal class
|
9
|
+
# This is an internal class, no promises about a stable API.
|
8
10
|
#
|
9
11
|
class ObjectCache # nodoc
|
10
12
|
|
@@ -13,53 +15,67 @@ class PDF::Reader
|
|
13
15
|
# avoid lots of repetitive (and expensive) tokenising
|
14
16
|
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
15
17
|
|
16
|
-
|
18
|
+
attr_reader :hits, :misses
|
19
|
+
|
20
|
+
def initialize(lru_size = 1000)
|
17
21
|
@objects = {}
|
22
|
+
@lru_cache = Hashery::LRUHash.new(lru_size.to_i)
|
23
|
+
@hits = 0
|
24
|
+
@misses = 0
|
18
25
|
end
|
19
26
|
|
20
27
|
def [](key)
|
21
|
-
|
28
|
+
update_stats(key)
|
29
|
+
@objects[key] || @lru_cache[key]
|
22
30
|
end
|
23
31
|
|
24
32
|
def []=(key, value)
|
25
|
-
|
33
|
+
if cacheable?(value)
|
34
|
+
@objects[key] = value
|
35
|
+
else
|
36
|
+
@lru_cache[key] = value
|
37
|
+
end
|
26
38
|
end
|
27
39
|
|
28
40
|
def fetch(key, local_default = nil)
|
29
|
-
|
41
|
+
update_stats(key)
|
42
|
+
@objects[key] || @lru_cache.fetch(key, local_default)
|
30
43
|
end
|
31
44
|
|
32
45
|
def each(&block)
|
33
46
|
@objects.each(&block)
|
47
|
+
@lru_cache.each(&block)
|
34
48
|
end
|
35
49
|
alias :each_pair :each
|
36
50
|
|
37
51
|
def each_key(&block)
|
38
52
|
@objects.each_key(&block)
|
53
|
+
@lru_cache.each_key(&block)
|
39
54
|
end
|
40
55
|
|
41
56
|
def each_value(&block)
|
42
57
|
@objects.each_value(&block)
|
58
|
+
@lru_cache.each_value(&block)
|
43
59
|
end
|
44
60
|
|
45
61
|
def size
|
46
|
-
@objects.size
|
62
|
+
@objects.size + @lru_cache.size
|
47
63
|
end
|
48
64
|
alias :length :size
|
49
65
|
|
50
66
|
def empty?
|
51
|
-
@objects.empty?
|
67
|
+
@objects.empty? && @lru_cache.empty?
|
52
68
|
end
|
53
69
|
|
54
|
-
def
|
55
|
-
@objects.
|
70
|
+
def include?(key)
|
71
|
+
@objects.include?(key) || @lru_cache.include?(key)
|
56
72
|
end
|
57
|
-
alias :
|
58
|
-
alias :key? :
|
59
|
-
alias :member? :
|
73
|
+
alias :has_key? :include?
|
74
|
+
alias :key? :include?
|
75
|
+
alias :member? :include?
|
60
76
|
|
61
77
|
def has_value?(value)
|
62
|
-
@objects.has_value?(value)
|
78
|
+
@objects.has_value?(value) || @lru_cache.has_value?(value)
|
63
79
|
end
|
64
80
|
|
65
81
|
def to_s
|
@@ -67,19 +83,26 @@ class PDF::Reader
|
|
67
83
|
end
|
68
84
|
|
69
85
|
def keys
|
70
|
-
@objects.keys
|
86
|
+
@objects.keys + @lru_cache.keys
|
71
87
|
end
|
72
88
|
|
73
89
|
def values
|
74
|
-
@objects.values
|
90
|
+
@objects.values + @lru_cache.values
|
75
91
|
end
|
76
92
|
|
77
93
|
private
|
78
94
|
|
95
|
+
def update_stats(key)
|
96
|
+
if has_key?(key)
|
97
|
+
@hits += 1
|
98
|
+
else
|
99
|
+
@misses += 1
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
79
103
|
def cacheable?(obj)
|
80
104
|
obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
|
81
105
|
end
|
82
106
|
|
83
|
-
|
84
107
|
end
|
85
108
|
end
|