pdf-reader 0.8.6 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -23,58 +23,28 @@
|
|
23
23
|
#
|
24
24
|
################################################################################
|
25
25
|
|
26
|
-
require 'enumerator'
|
27
|
-
|
28
26
|
class PDF::Reader
|
29
|
-
class Encoding
|
27
|
+
class Encoding # :nodoc:
|
30
28
|
CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
|
31
29
|
24,25,26,27,28,29,30,31]
|
32
30
|
UNKNOWN_CHAR = 0x25AF # ▯
|
33
31
|
|
34
|
-
attr_reader :
|
32
|
+
attr_reader :unpack
|
35
33
|
|
36
34
|
def initialize(enc)
|
37
|
-
@to_unicode_required = false
|
38
|
-
|
39
35
|
if enc.kind_of?(Hash)
|
40
|
-
self.differences=enc[:Differences] if enc[:Differences]
|
36
|
+
self.differences = enc[:Differences] if enc[:Differences]
|
41
37
|
enc = enc[:Encoding] || enc[:BaseEncoding]
|
42
38
|
elsif enc != nil
|
43
39
|
enc = enc.to_sym
|
40
|
+
else
|
41
|
+
enc = nil
|
44
42
|
end
|
45
43
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
when "Identity-H".to_sym then
|
51
|
-
@unpack = "n*"
|
52
|
-
@to_unicode_required = true
|
53
|
-
when :MacRomanEncoding then
|
54
|
-
load_mapping File.dirname(__FILE__) + "/encodings/mac_roman.txt"
|
55
|
-
@unpack = "C*"
|
56
|
-
when :MacExpertEncoding then
|
57
|
-
load_mapping File.dirname(__FILE__) + "/encodings/mac_expert.txt"
|
58
|
-
@unpack = "C*"
|
59
|
-
when :PDFDocEncoding then
|
60
|
-
load_mapping File.dirname(__FILE__) + "/encodings/pdf_doc.txt"
|
61
|
-
@unpack = "C*"
|
62
|
-
when :StandardEncoding then
|
63
|
-
load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
|
64
|
-
@unpack = "C*"
|
65
|
-
when :SymbolEncoding then
|
66
|
-
load_mapping File.dirname(__FILE__) + "/encodings/symbol.txt"
|
67
|
-
@unpack = "C*"
|
68
|
-
when :UTF16Encoding then
|
69
|
-
@unpack = "n*"
|
70
|
-
when :WinAnsiEncoding then
|
71
|
-
load_mapping File.dirname(__FILE__) + "/encodings/win_ansi.txt"
|
72
|
-
@unpack = "C*"
|
73
|
-
when :ZapfDingbatsEncoding then
|
74
|
-
load_mapping File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
|
75
|
-
@unpack = "C*"
|
76
|
-
else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
|
77
|
-
end
|
44
|
+
@to_unicode_required = unicode_required?(enc)
|
45
|
+
@unpack = get_unpack(enc)
|
46
|
+
@map_file = get_mapping_file(enc)
|
47
|
+
load_mapping(@map_file) if @map_file
|
78
48
|
end
|
79
49
|
|
80
50
|
def to_unicode_required?
|
@@ -85,9 +55,9 @@ class PDF::Reader
|
|
85
55
|
#
|
86
56
|
# [25, :A, 26, :B]
|
87
57
|
#
|
88
|
-
# The array alternates
|
58
|
+
# The array alternates between a decimal byte number and a glyph name to map to that byte
|
89
59
|
#
|
90
|
-
# To save space the following array is also valid and
|
60
|
+
# To save space the following array is also valid and equivalent to the previous one
|
91
61
|
#
|
92
62
|
# [25, :A, :B]
|
93
63
|
def differences=(diff)
|
@@ -106,45 +76,90 @@ class PDF::Reader
|
|
106
76
|
@differences
|
107
77
|
end
|
108
78
|
|
79
|
+
def differences
|
80
|
+
@differences ||= {}
|
81
|
+
end
|
82
|
+
|
109
83
|
# convert the specified string to utf8
|
84
|
+
#
|
85
|
+
# * unpack raw bytes into codepoints
|
86
|
+
# * replace any that have entries in the differences table with a glyph name
|
87
|
+
# * convert codepoints from source encoding to Unicode codepoints
|
88
|
+
# * convert any glyph names to Unicode codepoints
|
89
|
+
# * replace characters that didn't convert to Unicode nicely with something
|
90
|
+
# valid
|
91
|
+
# * pack the final array of Unicode codepoints into a utf-8 string
|
92
|
+
# * mark the string as UTF-8 if we're running on an M17N-aware VM
|
93
|
+
#
|
110
94
|
def to_utf8(str, tounicode = nil)
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
if tounicode && (code = tounicode.decode(num))
|
120
|
-
code
|
121
|
-
elsif tounicode || ( tounicode.nil? && to_unicode_required? )
|
122
|
-
PDF::Reader::Encoding::UNKNOWN_CHAR
|
123
|
-
elsif mapping[num]
|
124
|
-
mapping[num]
|
125
|
-
elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(num)
|
95
|
+
ret = str.unpack(unpack).map { |c|
|
96
|
+
differences[c] || c
|
97
|
+
}.map { |num|
|
98
|
+
original_codepoint_to_unicode(num, tounicode)
|
99
|
+
}.map { |c|
|
100
|
+
glyphnames[c] || c
|
101
|
+
}.map { |c|
|
102
|
+
if c.nil? || !c.is_a?(Fixnum)
|
126
103
|
PDF::Reader::Encoding::UNKNOWN_CHAR
|
127
104
|
else
|
128
|
-
|
105
|
+
c
|
129
106
|
end
|
130
|
-
|
107
|
+
}.pack("U*")
|
108
|
+
|
109
|
+
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
131
110
|
|
132
|
-
|
133
|
-
|
111
|
+
ret
|
112
|
+
end
|
134
113
|
|
135
|
-
|
136
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
114
|
+
private
|
137
115
|
|
138
|
-
|
139
|
-
|
116
|
+
def original_codepoint_to_unicode(cp, tounicode = nil)
|
117
|
+
if tounicode && (code = tounicode.decode(cp))
|
118
|
+
code
|
119
|
+
elsif tounicode || ( tounicode.nil? && to_unicode_required? )
|
120
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
121
|
+
elsif mapping[cp]
|
122
|
+
mapping[cp]
|
123
|
+
elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(cp)
|
124
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
125
|
+
else
|
126
|
+
cp
|
127
|
+
end
|
128
|
+
end
|
140
129
|
|
141
|
-
|
142
|
-
|
130
|
+
def get_unpack(enc)
|
131
|
+
case enc
|
132
|
+
when :"Identity-H", :UTF16Encoding
|
133
|
+
"n*"
|
134
|
+
else
|
135
|
+
"C*"
|
136
|
+
end
|
137
|
+
end
|
143
138
|
|
144
|
-
|
139
|
+
def get_mapping_file(enc)
|
140
|
+
return File.dirname(__FILE__) + "/encodings/standard.txt" if enc.nil?
|
141
|
+
files = {
|
142
|
+
:"Identity-H" => nil,
|
143
|
+
:MacRomanEncoding => File.dirname(__FILE__) + "/encodings/mac_roman.txt",
|
144
|
+
:MacExpertEncoding => File.dirname(__FILE__) + "/encodings/mac_expert.txt",
|
145
|
+
:PDFDocEncoding => File.dirname(__FILE__) + "/encodings/pdf_doc.txt",
|
146
|
+
:StandardEncoding => File.dirname(__FILE__) + "/encodings/standard.txt",
|
147
|
+
:SymbolEncoding => File.dirname(__FILE__) + "/encodings/symbol.txt",
|
148
|
+
:UTF16Encoding => nil,
|
149
|
+
:WinAnsiEncoding => File.dirname(__FILE__) + "/encodings/win_ansi.txt",
|
150
|
+
:ZapfDingbatsEncoding => File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
|
151
|
+
}
|
152
|
+
|
153
|
+
if files.has_key?(enc)
|
154
|
+
files[enc]
|
155
|
+
else
|
156
|
+
raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
|
157
|
+
end
|
145
158
|
end
|
146
159
|
|
147
|
-
|
160
|
+
def unicode_required?(enc)
|
161
|
+
enc == :"Identity-H"
|
162
|
+
end
|
148
163
|
|
149
164
|
def mapping
|
150
165
|
@mapping ||= {}
|
@@ -154,17 +169,8 @@ class PDF::Reader
|
|
154
169
|
mapping.size > 0
|
155
170
|
end
|
156
171
|
|
157
|
-
|
158
|
-
|
159
|
-
def process_differences(arr)
|
160
|
-
@differences ||= {}
|
161
|
-
arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
|
162
|
-
end
|
163
|
-
|
164
|
-
# accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
|
165
|
-
def process_glyphnames(arr)
|
166
|
-
@differences ||= {}
|
167
|
-
arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
|
172
|
+
def glyphnames
|
173
|
+
@glyphnames ||= PDF::Reader::Font.glyphnames
|
168
174
|
end
|
169
175
|
|
170
176
|
def load_mapping(file)
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -22,12 +22,11 @@
|
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
#
|
24
24
|
|
25
|
-
|
26
25
|
class PDF::Reader
|
27
26
|
################################################################################
|
28
27
|
# An internal PDF::Reader class that helps to verify various parts of the PDF file
|
29
28
|
# are valid
|
30
|
-
class Error
|
29
|
+
class Error # :nodoc:
|
31
30
|
################################################################################
|
32
31
|
def self.str_assert (lvalue, rvalue, chars=nil)
|
33
32
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -30,9 +30,7 @@ class PDF::Reader
|
|
30
30
|
# support for features like compression and encryption. This class is for decoding that
|
31
31
|
# content.
|
32
32
|
#
|
33
|
-
|
34
|
-
# in the future.
|
35
|
-
class Filter
|
33
|
+
class Filter # :nodoc:
|
36
34
|
################################################################################
|
37
35
|
# creates a new filter for decoding content.
|
38
36
|
#
|
@@ -49,6 +47,7 @@ class PDF::Reader
|
|
49
47
|
when :DCTDecode then @filter = nil
|
50
48
|
when :FlateDecode then @filter = :flate
|
51
49
|
when :JBIG2Decode then @filter = nil
|
50
|
+
when :LZWDecode then @filter = :lzw
|
52
51
|
else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
53
52
|
end
|
54
53
|
end
|
@@ -92,8 +91,9 @@ class PDF::Reader
|
|
92
91
|
################################################################################
|
93
92
|
# Decode the specified data with the Zlib compression algorithm
|
94
93
|
def flate (data)
|
94
|
+
deflated = nil
|
95
95
|
begin
|
96
|
-
Zlib::Inflate.new.inflate(data)
|
96
|
+
deflated = Zlib::Inflate.new.inflate(data)
|
97
97
|
rescue Zlib::DataError => e
|
98
98
|
# by default, Ruby's Zlib assumes the data it's inflating
|
99
99
|
# is RFC1951 deflated data, wrapped in an RFC1950 zlib container.
|
@@ -103,14 +103,117 @@ class PDF::Reader
|
|
103
103
|
# See
|
104
104
|
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
105
105
|
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
106
|
-
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
106
|
+
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
107
107
|
end
|
108
|
+
depredict(deflated, @options)
|
108
109
|
rescue Exception => e
|
109
110
|
# Oops, there was a problem inflating the stream
|
110
111
|
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
111
112
|
end
|
112
113
|
################################################################################
|
114
|
+
# Decode the specified data with the LZW compression algorithm
|
115
|
+
def lzw(data)
|
116
|
+
data = PDF::Reader::LZW.decode(data)
|
117
|
+
depredict(data, @options)
|
118
|
+
end
|
119
|
+
################################################################################
|
120
|
+
def depredict(data, opts = {})
|
121
|
+
predictor = (opts || {})[:Predictor].to_i
|
122
|
+
|
123
|
+
case predictor
|
124
|
+
when 0, 1 then
|
125
|
+
data
|
126
|
+
when 2 then
|
127
|
+
tiff_depredict(data, opts)
|
128
|
+
when 10, 11, 12, 13, 14, 15 then
|
129
|
+
png_depredict(data, opts)
|
130
|
+
else
|
131
|
+
raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
|
132
|
+
end
|
133
|
+
end
|
134
|
+
################################################################################
|
135
|
+
def tiff_depredict(data, opts = {})
|
136
|
+
raise UnsupportedFeatureError, "TIFF predictor not supported"
|
137
|
+
end
|
138
|
+
################################################################################
|
139
|
+
def png_depredict(data, opts = {})
|
140
|
+
return data if opts.nil? || opts[:Predictor].to_i < 10
|
141
|
+
|
142
|
+
data = data.unpack("C*")
|
143
|
+
|
144
|
+
pixel_bytes = 1 #pixel_bitlength / 8
|
145
|
+
scanline_length = (pixel_bytes * opts[:Columns]) + 1
|
146
|
+
row = 0
|
147
|
+
pixels = []
|
148
|
+
paeth, pa, pb, pc = nil
|
149
|
+
until data.empty? do
|
150
|
+
row_data = data.slice! 0, scanline_length
|
151
|
+
filter = row_data.shift
|
152
|
+
case filter
|
153
|
+
when 0 # None
|
154
|
+
when 1 # Sub
|
155
|
+
row_data.each_with_index do |byte, index|
|
156
|
+
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
157
|
+
row_data[index] = (byte + left) % 256
|
158
|
+
#p [byte, left, row_data[index]]
|
159
|
+
end
|
160
|
+
when 2 # Up
|
161
|
+
row_data.each_with_index do |byte, index|
|
162
|
+
col = index / pixel_bytes
|
163
|
+
upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
|
164
|
+
row_data[index] = (upper + byte) % 256
|
165
|
+
end
|
166
|
+
when 3 # Average
|
167
|
+
row_data.each_with_index do |byte, index|
|
168
|
+
col = index / pixel_bytes
|
169
|
+
upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
|
170
|
+
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
171
|
+
|
172
|
+
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
173
|
+
end
|
174
|
+
when 4 # Paeth
|
175
|
+
left = upper = upper_left = nil
|
176
|
+
row_data.each_with_index do |byte, index|
|
177
|
+
col = index / pixel_bytes
|
178
|
+
|
179
|
+
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
180
|
+
if row.zero?
|
181
|
+
upper = upper_left = 0
|
182
|
+
else
|
183
|
+
upper = pixels[row-1][col][index % pixel_bytes]
|
184
|
+
upper_left = col.zero? ? 0 :
|
185
|
+
pixels[row-1][col-1][index % pixel_bytes]
|
186
|
+
end
|
187
|
+
|
188
|
+
p = left + upper - upper_left
|
189
|
+
pa = (p - left).abs
|
190
|
+
pb = (p - upper).abs
|
191
|
+
pc = (p - upper_left).abs
|
192
|
+
|
193
|
+
paeth = if pa <= pb && pa <= pc
|
194
|
+
left
|
195
|
+
elsif pb <= pc
|
196
|
+
upper
|
197
|
+
else
|
198
|
+
upper_left
|
199
|
+
end
|
200
|
+
|
201
|
+
row_data[index] = (byte + paeth) % 256
|
202
|
+
end
|
203
|
+
else
|
204
|
+
raise ArgumentError, "Invalid filter algorithm #{filter}"
|
205
|
+
end
|
206
|
+
|
207
|
+
s = []
|
208
|
+
row_data.each_slice pixel_bytes do |slice|
|
209
|
+
s << slice
|
210
|
+
end
|
211
|
+
pixels << s
|
212
|
+
row += 1
|
213
|
+
end
|
214
|
+
|
215
|
+
pixels.map { |row| row.flatten.pack("C*") }.join("")
|
216
|
+
end
|
113
217
|
end
|
114
|
-
################################################################################
|
115
218
|
end
|
116
219
|
################################################################################
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -32,19 +32,17 @@ class PDF::Reader
|
|
32
32
|
# a text file supplied by Adobe at:
|
33
33
|
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
34
34
|
def self.glyphnames
|
35
|
-
|
35
|
+
glyphs = {}
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@@glyphs[name.to_sym] = "0x#{code}".hex if name
|
43
|
-
end
|
37
|
+
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
38
|
+
File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
|
39
|
+
f.each do |l|
|
40
|
+
m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
41
|
+
glyphs[name.to_sym] = "0x#{code}".hex if name
|
44
42
|
end
|
45
43
|
end
|
46
44
|
|
47
|
-
|
45
|
+
glyphs
|
48
46
|
end
|
49
47
|
|
50
48
|
def basefont=(font)
|
@@ -52,9 +50,11 @@ class PDF::Reader
|
|
52
50
|
# with encoding= if required
|
53
51
|
case font
|
54
52
|
when "Symbol" then
|
55
|
-
|
53
|
+
@encoding = PDF::Reader::Encoding.new("SymbolEncoding")
|
56
54
|
when "ZapfDingbats" then
|
57
|
-
|
55
|
+
@encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
|
56
|
+
else
|
57
|
+
@encoding = nil
|
58
58
|
end
|
59
59
|
@basefont = font
|
60
60
|
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
|
5
|
+
class Reader
|
6
|
+
|
7
|
+
# A general class for decoding LZW compressed data. LZW can be
|
8
|
+
# used in PDF files to compress streams, usually for image data sourced
|
9
|
+
# from a TIFF file.
|
10
|
+
#
|
11
|
+
# See the following links for more information:
|
12
|
+
#
|
13
|
+
# ref http://www.fileformat.info/format/tiff/corion-lzw.htm
|
14
|
+
# ref http://marknelson.us/1989/10/01/lzw-data-compression/
|
15
|
+
#
|
16
|
+
# The PDF spec also has some data on the algorithm.
|
17
|
+
#
|
18
|
+
class LZW # :nodoc:
|
19
|
+
|
20
|
+
class BitStream # :nodoc:
|
21
|
+
|
22
|
+
def initialize(data, bits_in_chunk)
|
23
|
+
@data = data
|
24
|
+
@data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
|
25
|
+
@bits_in_chunk = bits_in_chunk
|
26
|
+
@current_pos = 0
|
27
|
+
@bits_left_in_byte = 8
|
28
|
+
end
|
29
|
+
|
30
|
+
def set_bits_in_chunk(bits_in_chunk)
|
31
|
+
@bits_in_chunk = bits_in_chunk
|
32
|
+
end
|
33
|
+
|
34
|
+
def read
|
35
|
+
bits_left_in_chunk = @bits_in_chunk
|
36
|
+
chunk = nil
|
37
|
+
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
38
|
+
chunk = 0 if chunk.nil?
|
39
|
+
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
40
|
+
current_byte = codepoint & (2**@bits_left_in_byte -1) #clear consumed bits
|
41
|
+
dif = bits_left_in_chunk - @bits_left_in_byte
|
42
|
+
if dif > 0 then current_byte <<= dif
|
43
|
+
elsif dif < 0 then current_byte >>= dif.abs
|
44
|
+
end
|
45
|
+
chunk |= current_byte #add bits to result
|
46
|
+
bits_left_in_chunk = if dif >= 0 then dif else 0 end
|
47
|
+
@bits_left_in_byte = if dif < 0 then dif.abs else 0 end
|
48
|
+
if @bits_left_in_byte.zero? #next byte
|
49
|
+
@current_pos += 1
|
50
|
+
@bits_left_in_byte = 8
|
51
|
+
end
|
52
|
+
end
|
53
|
+
chunk
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
CODE_EOD = 257 #end of data
|
58
|
+
CODE_CLEAR_TABLE = 256 #clear table
|
59
|
+
|
60
|
+
# stores the code => string pairs
|
61
|
+
class StringTable < Hash # :nodoc:
|
62
|
+
attr_reader :string_table_pos
|
63
|
+
|
64
|
+
def initialize
|
65
|
+
super
|
66
|
+
@string_table_pos = 258 #initial code
|
67
|
+
end
|
68
|
+
|
69
|
+
# if the code is less than 258, return its fixed string
|
70
|
+
def [](key)
|
71
|
+
if key > 257 then super else key.chr end
|
72
|
+
end
|
73
|
+
|
74
|
+
def add(string)
|
75
|
+
store(@string_table_pos, string)
|
76
|
+
@string_table_pos += 1
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
# Decompresses a LZW compressed string.
|
81
|
+
#
|
82
|
+
def self.decode(data)
|
83
|
+
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
84
|
+
result = ''
|
85
|
+
while not (code = stream.read) == CODE_EOD
|
86
|
+
if code == CODE_CLEAR_TABLE
|
87
|
+
string_table = StringTable.new
|
88
|
+
code = stream.read
|
89
|
+
break if code == CODE_EOD
|
90
|
+
result << string_table[code]
|
91
|
+
old_code = code
|
92
|
+
else
|
93
|
+
string = string_table[code]
|
94
|
+
if string
|
95
|
+
result << string
|
96
|
+
string_table.add create_new_string(string_table, old_code, code)
|
97
|
+
old_code = code
|
98
|
+
else
|
99
|
+
new_string = create_new_string(string_table, old_code, old_code)
|
100
|
+
result << new_string
|
101
|
+
string_table.add new_string
|
102
|
+
old_code = code
|
103
|
+
end
|
104
|
+
# increase the size of the codes when the limit is reached
|
105
|
+
case string_table.string_table_pos
|
106
|
+
when 511 then stream.set_bits_in_chunk(10)
|
107
|
+
when 1023 then stream.set_bits_in_chunk(11)
|
108
|
+
when 2047 then stream.set_bits_in_chunk(12)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
result
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def self.create_new_string(string_table,some_code, other_code)
|
118
|
+
string_table[some_code] + string_table[other_code][0].chr
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|