pdf-reader 0.8.6 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -23,58 +23,28 @@
|
|
23
23
|
#
|
24
24
|
################################################################################
|
25
25
|
|
26
|
-
require 'enumerator'
|
27
|
-
|
28
26
|
class PDF::Reader
|
29
|
-
class Encoding
|
27
|
+
class Encoding # :nodoc:
|
30
28
|
CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
|
31
29
|
24,25,26,27,28,29,30,31]
|
32
30
|
UNKNOWN_CHAR = 0x25AF # ▯
|
33
31
|
|
34
|
-
attr_reader :
|
32
|
+
attr_reader :unpack
|
35
33
|
|
36
34
|
def initialize(enc)
|
37
|
-
@to_unicode_required = false
|
38
|
-
|
39
35
|
if enc.kind_of?(Hash)
|
40
|
-
self.differences=enc[:Differences] if enc[:Differences]
|
36
|
+
self.differences = enc[:Differences] if enc[:Differences]
|
41
37
|
enc = enc[:Encoding] || enc[:BaseEncoding]
|
42
38
|
elsif enc != nil
|
43
39
|
enc = enc.to_sym
|
40
|
+
else
|
41
|
+
enc = nil
|
44
42
|
end
|
45
43
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
when "Identity-H".to_sym then
|
51
|
-
@unpack = "n*"
|
52
|
-
@to_unicode_required = true
|
53
|
-
when :MacRomanEncoding then
|
54
|
-
load_mapping File.dirname(__FILE__) + "/encodings/mac_roman.txt"
|
55
|
-
@unpack = "C*"
|
56
|
-
when :MacExpertEncoding then
|
57
|
-
load_mapping File.dirname(__FILE__) + "/encodings/mac_expert.txt"
|
58
|
-
@unpack = "C*"
|
59
|
-
when :PDFDocEncoding then
|
60
|
-
load_mapping File.dirname(__FILE__) + "/encodings/pdf_doc.txt"
|
61
|
-
@unpack = "C*"
|
62
|
-
when :StandardEncoding then
|
63
|
-
load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
|
64
|
-
@unpack = "C*"
|
65
|
-
when :SymbolEncoding then
|
66
|
-
load_mapping File.dirname(__FILE__) + "/encodings/symbol.txt"
|
67
|
-
@unpack = "C*"
|
68
|
-
when :UTF16Encoding then
|
69
|
-
@unpack = "n*"
|
70
|
-
when :WinAnsiEncoding then
|
71
|
-
load_mapping File.dirname(__FILE__) + "/encodings/win_ansi.txt"
|
72
|
-
@unpack = "C*"
|
73
|
-
when :ZapfDingbatsEncoding then
|
74
|
-
load_mapping File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
|
75
|
-
@unpack = "C*"
|
76
|
-
else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
|
77
|
-
end
|
44
|
+
@to_unicode_required = unicode_required?(enc)
|
45
|
+
@unpack = get_unpack(enc)
|
46
|
+
@map_file = get_mapping_file(enc)
|
47
|
+
load_mapping(@map_file) if @map_file
|
78
48
|
end
|
79
49
|
|
80
50
|
def to_unicode_required?
|
@@ -85,9 +55,9 @@ class PDF::Reader
|
|
85
55
|
#
|
86
56
|
# [25, :A, 26, :B]
|
87
57
|
#
|
88
|
-
# The array alternates
|
58
|
+
# The array alternates between a decimal byte number and a glyph name to map to that byte
|
89
59
|
#
|
90
|
-
# To save space the following array is also valid and
|
60
|
+
# To save space the following array is also valid and equivalent to the previous one
|
91
61
|
#
|
92
62
|
# [25, :A, :B]
|
93
63
|
def differences=(diff)
|
@@ -106,45 +76,90 @@ class PDF::Reader
|
|
106
76
|
@differences
|
107
77
|
end
|
108
78
|
|
79
|
+
def differences
|
80
|
+
@differences ||= {}
|
81
|
+
end
|
82
|
+
|
109
83
|
# convert the specified string to utf8
|
84
|
+
#
|
85
|
+
# * unpack raw bytes into codepoints
|
86
|
+
# * replace any that have entries in the differences table with a glyph name
|
87
|
+
# * convert codepoints from source encoding to Unicode codepoints
|
88
|
+
# * convert any glyph names to Unicode codepoints
|
89
|
+
# * replace characters that didn't convert to Unicode nicely with something
|
90
|
+
# valid
|
91
|
+
# * pack the final array of Unicode codepoints into a utf-8 string
|
92
|
+
# * mark the string as utf-8 if we're running on an M17N-aware VM
|
93
|
+
#
|
110
94
|
def to_utf8(str, tounicode = nil)
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
if tounicode && (code = tounicode.decode(num))
|
120
|
-
code
|
121
|
-
elsif tounicode || ( tounicode.nil? && to_unicode_required? )
|
122
|
-
PDF::Reader::Encoding::UNKNOWN_CHAR
|
123
|
-
elsif mapping[num]
|
124
|
-
mapping[num]
|
125
|
-
elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(num)
|
95
|
+
ret = str.unpack(unpack).map { |c|
|
96
|
+
differences[c] || c
|
97
|
+
}.map { |num|
|
98
|
+
original_codepoint_to_unicode(num, tounicode)
|
99
|
+
}.map { |c|
|
100
|
+
glyphnames[c] || c
|
101
|
+
}.map { |c|
|
102
|
+
if c.nil? || !c.is_a?(Fixnum)
|
126
103
|
PDF::Reader::Encoding::UNKNOWN_CHAR
|
127
104
|
else
|
128
|
-
|
105
|
+
c
|
129
106
|
end
|
130
|
-
|
107
|
+
}.pack("U*")
|
108
|
+
|
109
|
+
ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
|
131
110
|
|
132
|
-
|
133
|
-
|
111
|
+
ret
|
112
|
+
end
|
134
113
|
|
135
|
-
|
136
|
-
array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
|
114
|
+
private
|
137
115
|
|
138
|
-
|
139
|
-
|
116
|
+
def original_codepoint_to_unicode(cp, tounicode = nil)
|
117
|
+
if tounicode && (code = tounicode.decode(cp))
|
118
|
+
code
|
119
|
+
elsif tounicode || ( tounicode.nil? && to_unicode_required? )
|
120
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
121
|
+
elsif mapping[cp]
|
122
|
+
mapping[cp]
|
123
|
+
elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(cp)
|
124
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
125
|
+
else
|
126
|
+
cp
|
127
|
+
end
|
128
|
+
end
|
140
129
|
|
141
|
-
|
142
|
-
|
130
|
+
def get_unpack(enc)
|
131
|
+
case enc
|
132
|
+
when :"Identity-H", :UTF16Encoding
|
133
|
+
"n*"
|
134
|
+
else
|
135
|
+
"C*"
|
136
|
+
end
|
137
|
+
end
|
143
138
|
|
144
|
-
|
139
|
+
def get_mapping_file(enc)
|
140
|
+
return File.dirname(__FILE__) + "/encodings/standard.txt" if enc.nil?
|
141
|
+
files = {
|
142
|
+
:"Identity-H" => nil,
|
143
|
+
:MacRomanEncoding => File.dirname(__FILE__) + "/encodings/mac_roman.txt",
|
144
|
+
:MacExpertEncoding => File.dirname(__FILE__) + "/encodings/mac_expert.txt",
|
145
|
+
:PDFDocEncoding => File.dirname(__FILE__) + "/encodings/pdf_doc.txt",
|
146
|
+
:StandardEncoding => File.dirname(__FILE__) + "/encodings/standard.txt",
|
147
|
+
:SymbolEncoding => File.dirname(__FILE__) + "/encodings/symbol.txt",
|
148
|
+
:UTF16Encoding => nil,
|
149
|
+
:WinAnsiEncoding => File.dirname(__FILE__) + "/encodings/win_ansi.txt",
|
150
|
+
:ZapfDingbatsEncoding => File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
|
151
|
+
}
|
152
|
+
|
153
|
+
if files.has_key?(enc)
|
154
|
+
files[enc]
|
155
|
+
else
|
156
|
+
raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
|
157
|
+
end
|
145
158
|
end
|
146
159
|
|
147
|
-
|
160
|
+
def unicode_required?(enc)
|
161
|
+
enc == :"Identity-H"
|
162
|
+
end
|
148
163
|
|
149
164
|
def mapping
|
150
165
|
@mapping ||= {}
|
@@ -154,17 +169,8 @@ class PDF::Reader
|
|
154
169
|
mapping.size > 0
|
155
170
|
end
|
156
171
|
|
157
|
-
|
158
|
-
|
159
|
-
def process_differences(arr)
|
160
|
-
@differences ||= {}
|
161
|
-
arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
|
162
|
-
end
|
163
|
-
|
164
|
-
# accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
|
165
|
-
def process_glyphnames(arr)
|
166
|
-
@differences ||= {}
|
167
|
-
arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
|
172
|
+
def glyphnames
|
173
|
+
@glyphnames ||= PDF::Reader::Font.glyphnames
|
168
174
|
end
|
169
175
|
|
170
176
|
def load_mapping(file)
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -22,12 +22,11 @@
|
|
22
22
|
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
23
|
#
|
24
24
|
|
25
|
-
|
26
25
|
class PDF::Reader
|
27
26
|
################################################################################
|
28
27
|
# An internal PDF::Reader class that helps to verify various parts of the PDF file
|
29
28
|
# are valid
|
30
|
-
class Error
|
29
|
+
class Error # :nodoc:
|
31
30
|
################################################################################
|
32
31
|
def self.str_assert (lvalue, rvalue, chars=nil)
|
33
32
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
data/lib/pdf/reader/filter.rb
CHANGED
@@ -30,9 +30,7 @@ class PDF::Reader
|
|
30
30
|
# support for features like compression and encryption. This class is for decoding that
|
31
31
|
# content.
|
32
32
|
#
|
33
|
-
|
34
|
-
# in the future.
|
35
|
-
class Filter
|
33
|
+
class Filter # :nodoc:
|
36
34
|
################################################################################
|
37
35
|
# creates a new filter for decoding content.
|
38
36
|
#
|
@@ -49,6 +47,7 @@ class PDF::Reader
|
|
49
47
|
when :DCTDecode then @filter = nil
|
50
48
|
when :FlateDecode then @filter = :flate
|
51
49
|
when :JBIG2Decode then @filter = nil
|
50
|
+
when :LZWDecode then @filter = :lzw
|
52
51
|
else raise UnsupportedFeatureError, "Unknown filter: #{name}"
|
53
52
|
end
|
54
53
|
end
|
@@ -92,8 +91,9 @@ class PDF::Reader
|
|
92
91
|
################################################################################
|
93
92
|
# Decode the specified data with the Zlib compression algorithm
|
94
93
|
def flate (data)
|
94
|
+
deflated = nil
|
95
95
|
begin
|
96
|
-
Zlib::Inflate.new.inflate(data)
|
96
|
+
deflated = Zlib::Inflate.new.inflate(data)
|
97
97
|
rescue Zlib::DataError => e
|
98
98
|
# by default, Ruby's Zlib assumes the data it's inflating
|
99
99
|
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container.
|
@@ -103,14 +103,117 @@ class PDF::Reader
|
|
103
103
|
# See
|
104
104
|
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
105
105
|
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
106
|
-
Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
106
|
+
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
107
107
|
end
|
108
|
+
depredict(deflated, @options)
|
108
109
|
rescue Exception => e
|
109
110
|
# Oops, there was a problem inflating the stream
|
110
111
|
raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
|
111
112
|
end
|
112
113
|
################################################################################
|
114
|
+
# Decode the specified data with the LZW compression algorithm
|
115
|
+
def lzw(data)
|
116
|
+
data = PDF::Reader::LZW.decode(data)
|
117
|
+
depredict(data, @options)
|
118
|
+
end
|
119
|
+
################################################################################
|
120
|
+
def depredict(data, opts = {})
|
121
|
+
predictor = (opts || {})[:Predictor].to_i
|
122
|
+
|
123
|
+
case predictor
|
124
|
+
when 0, 1 then
|
125
|
+
data
|
126
|
+
when 2 then
|
127
|
+
tiff_depredict(data, opts)
|
128
|
+
when 10, 11, 12, 13, 14, 15 then
|
129
|
+
png_depredict(data, opts)
|
130
|
+
else
|
131
|
+
raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
|
132
|
+
end
|
133
|
+
end
|
134
|
+
################################################################################
|
135
|
+
def tiff_depredict(data, opts = {})
|
136
|
+
raise UnsupportedFeatureError, "TIFF predictor not supported"
|
137
|
+
end
|
138
|
+
################################################################################
|
139
|
+
def png_depredict(data, opts = {})
|
140
|
+
return data if opts.nil? || opts[:Predictor].to_i < 10
|
141
|
+
|
142
|
+
data = data.unpack("C*")
|
143
|
+
|
144
|
+
pixel_bytes = 1 #pixel_bitlength / 8
|
145
|
+
scanline_length = (pixel_bytes * opts[:Columns]) + 1
|
146
|
+
row = 0
|
147
|
+
pixels = []
|
148
|
+
paeth, pa, pb, pc = nil
|
149
|
+
until data.empty? do
|
150
|
+
row_data = data.slice! 0, scanline_length
|
151
|
+
filter = row_data.shift
|
152
|
+
case filter
|
153
|
+
when 0 # None
|
154
|
+
when 1 # Sub
|
155
|
+
row_data.each_with_index do |byte, index|
|
156
|
+
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
157
|
+
row_data[index] = (byte + left) % 256
|
158
|
+
#p [byte, left, row_data[index]]
|
159
|
+
end
|
160
|
+
when 2 # Up
|
161
|
+
row_data.each_with_index do |byte, index|
|
162
|
+
col = index / pixel_bytes
|
163
|
+
upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
|
164
|
+
row_data[index] = (upper + byte) % 256
|
165
|
+
end
|
166
|
+
when 3 # Average
|
167
|
+
row_data.each_with_index do |byte, index|
|
168
|
+
col = index / pixel_bytes
|
169
|
+
upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
|
170
|
+
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
171
|
+
|
172
|
+
row_data[index] = (byte + ((left + upper)/2).floor) % 256
|
173
|
+
end
|
174
|
+
when 4 # Paeth
|
175
|
+
left = upper = upper_left = nil
|
176
|
+
row_data.each_with_index do |byte, index|
|
177
|
+
col = index / pixel_bytes
|
178
|
+
|
179
|
+
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
180
|
+
if row.zero?
|
181
|
+
upper = upper_left = 0
|
182
|
+
else
|
183
|
+
upper = pixels[row-1][col][index % pixel_bytes]
|
184
|
+
upper_left = col.zero? ? 0 :
|
185
|
+
pixels[row-1][col-1][index % pixel_bytes]
|
186
|
+
end
|
187
|
+
|
188
|
+
p = left + upper - upper_left
|
189
|
+
pa = (p - left).abs
|
190
|
+
pb = (p - upper).abs
|
191
|
+
pc = (p - upper_left).abs
|
192
|
+
|
193
|
+
paeth = if pa <= pb && pa <= pc
|
194
|
+
left
|
195
|
+
elsif pb <= pc
|
196
|
+
upper
|
197
|
+
else
|
198
|
+
upper_left
|
199
|
+
end
|
200
|
+
|
201
|
+
row_data[index] = (byte + paeth) % 256
|
202
|
+
end
|
203
|
+
else
|
204
|
+
raise ArgumentError, "Invalid filter algorithm #{filter}"
|
205
|
+
end
|
206
|
+
|
207
|
+
s = []
|
208
|
+
row_data.each_slice pixel_bytes do |slice|
|
209
|
+
s << slice
|
210
|
+
end
|
211
|
+
pixels << s
|
212
|
+
row += 1
|
213
|
+
end
|
214
|
+
|
215
|
+
pixels.map { |row| row.flatten.pack("C*") }.join("")
|
216
|
+
end
|
113
217
|
end
|
114
|
-
################################################################################
|
115
218
|
end
|
116
219
|
################################################################################
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -32,19 +32,17 @@ class PDF::Reader
|
|
32
32
|
# a text file supplied by Adobe at:
|
33
33
|
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
34
34
|
def self.glyphnames
|
35
|
-
|
35
|
+
glyphs = {}
|
36
36
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
@@glyphs[name.to_sym] = "0x#{code}".hex if name
|
43
|
-
end
|
37
|
+
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
38
|
+
File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
|
39
|
+
f.each do |l|
|
40
|
+
m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
41
|
+
glyphs[name.to_sym] = "0x#{code}".hex if name
|
44
42
|
end
|
45
43
|
end
|
46
44
|
|
47
|
-
|
45
|
+
glyphs
|
48
46
|
end
|
49
47
|
|
50
48
|
def basefont=(font)
|
@@ -52,9 +50,11 @@ class PDF::Reader
|
|
52
50
|
# with encoding= if required
|
53
51
|
case font
|
54
52
|
when "Symbol" then
|
55
|
-
|
53
|
+
@encoding = PDF::Reader::Encoding.new("SymbolEncoding")
|
56
54
|
when "ZapfDingbats" then
|
57
|
-
|
55
|
+
@encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
|
56
|
+
else
|
57
|
+
@encoding = nil
|
58
58
|
end
|
59
59
|
@basefont = font
|
60
60
|
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
|
5
|
+
class Reader
|
6
|
+
|
7
|
+
# A general class for decoding LZW compressed data. LZW can be
|
8
|
+
# used in PDF files to compress streams, usually for image data sourced
|
9
|
+
# from a TIFF file.
|
10
|
+
#
|
11
|
+
# See the following links for more information:
|
12
|
+
#
|
13
|
+
# ref http://www.fileformat.info/format/tiff/corion-lzw.htm
|
14
|
+
# ref http://marknelson.us/1989/10/01/lzw-data-compression/
|
15
|
+
#
|
16
|
+
# The PDF spec also has some data on the algorithm.
|
17
|
+
#
|
18
|
+
class LZW # :nodoc:
|
19
|
+
|
20
|
+
class BitStream # :nodoc:
|
21
|
+
|
22
|
+
def initialize(data, bits_in_chunk)
|
23
|
+
@data = data
|
24
|
+
@data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
|
25
|
+
@bits_in_chunk = bits_in_chunk
|
26
|
+
@current_pos = 0
|
27
|
+
@bits_left_in_byte = 8
|
28
|
+
end
|
29
|
+
|
30
|
+
def set_bits_in_chunk(bits_in_chunk)
|
31
|
+
@bits_in_chunk = bits_in_chunk
|
32
|
+
end
|
33
|
+
|
34
|
+
def read
|
35
|
+
bits_left_in_chunk = @bits_in_chunk
|
36
|
+
chunk = nil
|
37
|
+
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
38
|
+
chunk = 0 if chunk.nil?
|
39
|
+
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
40
|
+
current_byte = codepoint & (2**@bits_left_in_byte -1) #clear consumed bits
|
41
|
+
dif = bits_left_in_chunk - @bits_left_in_byte
|
42
|
+
if dif > 0 then current_byte <<= dif
|
43
|
+
elsif dif < 0 then current_byte >>= dif.abs
|
44
|
+
end
|
45
|
+
chunk |= current_byte #add bits to result
|
46
|
+
bits_left_in_chunk = if dif >= 0 then dif else 0 end
|
47
|
+
@bits_left_in_byte = if dif < 0 then dif.abs else 0 end
|
48
|
+
if @bits_left_in_byte.zero? #next byte
|
49
|
+
@current_pos += 1
|
50
|
+
@bits_left_in_byte = 8
|
51
|
+
end
|
52
|
+
end
|
53
|
+
chunk
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
CODE_EOD = 257 #end of data
|
58
|
+
CODE_CLEAR_TABLE = 256 #clear table
|
59
|
+
|
60
|
+
# stores the code => string pairs
|
61
|
+
class StringTable < Hash # :nodoc:
|
62
|
+
attr_reader :string_table_pos
|
63
|
+
|
64
|
+
def initialize
|
65
|
+
super
|
66
|
+
@string_table_pos = 258 #initial code
|
67
|
+
end
|
68
|
+
|
69
|
+
#if code less than 258 return fixed string
|
70
|
+
def [](key)
|
71
|
+
if key > 257 then super else key.chr end
|
72
|
+
end
|
73
|
+
|
74
|
+
def add(string)
|
75
|
+
store(@string_table_pos, string)
|
76
|
+
@string_table_pos += 1
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
# Decompresses a LZW compressed string.
|
81
|
+
#
|
82
|
+
def self.decode(data)
|
83
|
+
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
84
|
+
result = ''
|
85
|
+
while not (code = stream.read) == CODE_EOD
|
86
|
+
if code == CODE_CLEAR_TABLE
|
87
|
+
string_table = StringTable.new
|
88
|
+
code = stream.read
|
89
|
+
break if code == CODE_EOD
|
90
|
+
result << string_table[code]
|
91
|
+
old_code = code
|
92
|
+
else
|
93
|
+
string = string_table[code]
|
94
|
+
if string
|
95
|
+
result << string
|
96
|
+
string_table.add create_new_string(string_table, old_code, code)
|
97
|
+
old_code = code
|
98
|
+
else
|
99
|
+
new_string = create_new_string(string_table, old_code, old_code)
|
100
|
+
result << new_string
|
101
|
+
string_table.add new_string
|
102
|
+
old_code = code
|
103
|
+
end
|
104
|
+
# increase the size of the codes when the limit is reached
|
105
|
+
case string_table.string_table_pos
|
106
|
+
when 511 then stream.set_bits_in_chunk(10)
|
107
|
+
when 1023 then stream.set_bits_in_chunk(11)
|
108
|
+
when 2047 then stream.set_bits_in_chunk(12)
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
result
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def self.create_new_string(string_table,some_code, other_code)
|
118
|
+
string_table[some_code] + string_table[other_code][0].chr
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|