lzutf8 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +3 -0
- data/README.md +47 -0
- data/lib/lzutf8.rb +208 -0
- data/lib/lzutf8/version.rb +3 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8cd5f3fa2daec203af83a69d6e4ec405365755e3c6ce5065e089a4ad97dbc49b
|
4
|
+
data.tar.gz: a1b0046f6df140c6a19ed723f57317eb370706168eaba396d04d988fa5a23eac
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 816c9a697db4a67e4c8216a84a1841598c7b7898991b8a9dbe80c13d9854a34b7f9e3636b393da09d8c7b6a560c870dc997685e00ad78333679537de15b08e46
|
7
|
+
data.tar.gz: c6bc288cf364ceecc14365bf5c3c9308c26341902fc583dfb8af83ed83fea9c1c778591d2609f6176a265857ad1dd469cd2ebc1b313d786d896037aa62dc806f
|
data/.yardopts
ADDED
data/README.md
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# LZUTF8 Gem - providing `lzutf8`
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/lzutf8.svg)](https://rubygems.org/gems/lzutf8)
|
3
|
+
[![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](http://www.rubydoc.info/gems/lzutf8/0.0.1)
|
4
|
+
A ruby gem containing an implementation of LZUTF-8 compression and decompression
|
5
|
+
|
6
|
+
* `LZUTF8.compress("some input")`
|
7
|
+
* `LZUTF8.decompress("some compressed input")`
|
8
|
+
|
9
|
+
## Algorithm
|
10
|
+
|
11
|
+
This is a port of https://github.com/rotemdan/lzutf8.js/
|
12
|
+
|
13
|
+
The trick is to create a set of pointer sequences that coexist with UTF-8 encoding. The pointers include fields for a matched sequence length (bits indicated with `l`) and a distance (backward) that the sequence can be found in the encoded string (bits indicated with `d`).
|
14
|
+
|
15
|
+
|Bytes |Sized pointer sequence | UTF-8 Codepoint sequence
|
16
|
+
|------|----------------------------|-------------------------
|
17
|
+
| 1 |n/a |`0xxxxxxx`
|
18
|
+
| 2 |`110lllll 0ddddddd` |`110xxxxx 10xxxxxx`
|
19
|
+
| 3 |`111lllll 0ddddddd dddddddd`|`1110xxxx 10xxxxxx 10xxxxxx`
|
20
|
+
| 4 |n/a |`11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
|
21
|
+
|
22
|
+
Note that this allows for 5 bits (representing maximum 31 bytes) of matched sequence and up to 15 bits (representing maximum 32767 bytes) of distance to the matched sequence.
|
23
|
+
|
24
|
+
### Compressing
|
25
|
+
|
26
|
+
The text is converted to bytes of UTF-8. Byte sequences of `4 <= length` and `length <= 31` are replaced where possible with sized pointer sequences -- pointing to a relative location up to 32767 bytes where the sequence can be found.
|
27
|
+
|
28
|
+
Hash every 4-byte sequence in the input string and use it to store the position of that sequence. Each hash bucket will then contain an array of locations where that starting position can be found.
|
29
|
+
|
30
|
+
For example, the string `abcdefabcd` would produce the following table:
|
31
|
+
|
32
|
+
|hash |pointers
|
33
|
+
|------|-------
|
34
|
+
|`abcd`|0, 6
|
35
|
+
|`bcde`|1
|
36
|
+
|`cdef`|2
|
37
|
+
|`defa`|3
|
38
|
+
|`efab`|4
|
39
|
+
|`fabc`|5
|
40
|
+
|
41
|
+
The output string would be identical to the input as far as `abcdef` after which a sized pointer representing a distance of 6 and a length of 4 would be appended: `0b11000100_00000110`
|
42
|
+
|
43
|
+
### Decompressing
|
44
|
+
|
45
|
+
Scan for sized pointers based on the bit sequences listed above. When encountered, replace them with the text at the pointed-to location of the desired length.
|
46
|
+
|
47
|
+
> **Note:** it is legal for the desired length to be _longer_ than the requested distance (e.g. a repeated individual character might produce a sized pointer that requests length 31 from distance 1). The text extracted from the pointer should be repeated to fill the desired length.
|
data/lib/lzutf8.rb
ADDED
@@ -0,0 +1,208 @@
|
|
1
|
+
require "lzutf8/version"
|
2
|
+
|
3
|
+
# LZUTF8 contains functions to compress and decompress UTF-8 text with the LZUTF-8 algorithm
|
4
|
+
# @author Ian Katz <ianfixes@gmail.com>
|
5
|
+
module LZUTF8

  # Shortest byte run worth replacing with a sized pointer (also the hash-prefix width)
  MINIMUM_SEQUENCE_LENGTH = 4
  # Longest byte run a sized pointer can describe (5 length bits)
  MAXIMUM_SEQUENCE_LENGTH = 31
  # Farthest backward distance a sized pointer can describe (15 distance bits)
  MAXIMUM_MATCH_DISTANCE = 32767

  # Extract the information contained in an LZUTF8 Sized Pointer
  #
  # The caller is expected to have already established that the bytes are a pointer;
  # no validation is performed here.
  #
  # @param bytes [Array<Integer>] an array of character codes (2 or 3 bytes of pointer)
  # @return [Integer, Integer] the pointer length and distance
  def self.pointer_info(bytes)
    c1, c2, c3 = bytes
    length = c1 & 0b00011111
    # either 0b110lllll_0ddddddd or 0b111lllll_0ddddddd_dddddddd;
    # bit 5 of the first byte selects the 3-byte form
    distance = (c1 & 0b00100000).zero? ? c2 : (c2 << 8) + c3
    [length, distance]
  end

  # Explain a compressed (or uncompressed) UTF-8 string at the sequence-of-bits level by printing to STDOUT
  #
  # One line is printed per literal codepoint, sized pointer, or unrecognized byte. Each line shows
  # the byte offset in the input, the bits, the interpretation, and the running decompressed size (OS).
  #
  # @param string [String] the input string
  # @return [nil]
  # @raise [ArgumentError] if the input is not a String
  def self.explain(string)
    raise ArgumentError unless string.is_a? String

    input = string.bytes
    outsize = 0

    continuation = ->(byte) { !byte.nil? && (byte & 0b11000000) == 0b10000000 }
    low_byte     = ->(byte) { !byte.nil? && (byte & 0b10000000).zero? }
    binarize     = ->(bytes) { "0b" + bytes.map { |b| b.to_s(2).rjust(8, '0') }.join("_") }

    # Decide what starts at the current byte; literals are tested before pointers.
    # Returns [kind, number_of_input_bytes_consumed]
    classify = lambda do |c1, c2, c3, c4|
      if (c1 & 0b10000000).zero?
        [:literal, 1]
      elsif (c1 & 0b11100000) == 0b11000000 && continuation.call(c2)
        [:literal, 2]
      elsif (c1 & 0b11110000) == 0b11100000 && continuation.call(c2) && continuation.call(c3)
        [:literal, 3]
      elsif (c1 & 0b11111000) == 0b11110000 &&
            continuation.call(c2) && continuation.call(c3) && continuation.call(c4)
        [:literal, 4]
      elsif (c1 & 0b11100000) == 0b11000000 && low_byte.call(c2)
        [:pointer, 2]
      elsif (c1 & 0b11100000) == 0b11100000 && low_byte.call(c2) && !c3.nil?
        [:pointer, 3]
      else
        [:unknown, 1]
      end
    end

    offset = 0
    while offset < input.size
      c1, c2, c3, c4 = input[offset, 4] # missing trailing bytes come back as nil
      kind, size = classify.call(c1, c2, c3, c4)
      chunk = input[offset, size]

      case kind
      when :literal
        outsize += size
        meaning = "literal - #{chunk.pack('C*')}"
      when :pointer
        length, distance = pointer_info(chunk)
        outsize += length # a pointer expands to `length` bytes of output
        meaning = "pointer l=#{length} d=#{distance}"
      else
        outsize += 1
        meaning = "Assumed part of a sequence"
      end

      puts "#{offset.to_s.rjust(4, '0')} #{binarize.call(chunk)} #{meaning} OS=#{outsize}"
      offset += size
    end
  end

  # Decompress an LZUTF8-compressed string.
  #
  # Due to the nature of this decompression algorithm, any uncompressed UTF-8 string can be
  # passed to this function and will be returned unmodified
  #
  # @param string [String] The input LZUTF8-compressed string
  # @return [String] Uncompressed UTF-8 string
  # @raise [ArgumentError] if the input is not a String, or contains a sized pointer that is
  #   truncated or refers to data outside of what has been decompressed so far
  def self.decompress(string)
    raise ArgumentError unless string.is_a? String

    input = string.bytes
    output = []
    until input.empty?
      c1 = input.shift # consume
      c2 = input.first # peek; nil when c1 was the final byte

      # A sized pointer is 11xxxxxx followed by 0xxxxxxx. Anything else (including a lead
      # byte with nothing after it) is passed through as a literal.
      is_pointer = (c1 & 0b11000000) == 0b11000000 && !c2.nil? && (c2 & 0b10000000).zero?
      unless is_pointer
        output << c1
        next
      end

      # By this point we know it's not a literal char and we must actually consume the 2nd byte
      # either 0b110lllll_0ddddddd or 0b111lllll_0ddddddd_dddddddd
      input.shift
      length = c1 & 0b00011111
      distance = c2
      unless (c1 & 0b00100000).zero?
        c3 = input.shift # consume 3rd byte
        raise ArgumentError, "Truncated 3-byte sized pointer at end of input" if c3.nil?

        distance = (c2 << 8) + c3
      end

      next if length.zero? # nothing to copy

      unless distance.between?(1, output.size)
        raise ArgumentError, "Sized pointer distance #{distance} is outside the #{output.size} bytes decoded so far"
      end

      # The length may exceed the distance (e.g. a run of one repeated character); in that case
      # the available bytes are repeated cyclically until the length is satisfied.
      source = output[-distance, length]
      length.times { |i| output << source[i % source.size] }
    end

    output.pack('C*').force_encoding(Encoding::UTF_8)
  end

  # Compress a string using the LZUTF8 algorithm
  #
  # Due to the nature of this compression algorithm, only valid UTF-8 codepoints can be compressed;
  # arbitrary binary data will fail.
  #
  # @param string [String] The input string
  # @return [String] compressed string
  # @raise [ArgumentError] if the input is not a String
  def self.compress(string)
    raise ArgumentError unless string.is_a? String

    input = string.bytes
    # 4-byte prefix => positions (ascending) where that prefix has been seen
    buckets = Hash.new { |table, prefix| table[prefix] = [] }
    output = []
    position = 0

    while position < input.size
      remaining = input.size - position
      if remaining < MINIMUM_SEQUENCE_LENGTH # near end of input, just emit literals until it's consumed
        output << input[position]
        position += 1
        next
      end

      key = input[position, MINIMUM_SEQUENCE_LENGTH].pack('C*')
      candidates = buckets[key]
      max_len = [remaining, MAXIMUM_SEQUENCE_LENGTH].min # max length of a match
      distance, length = find_best_match(input, position, candidates, max_len)

      # Record this position only after searching, so a sequence never matches itself.
      # An entry already at the maximum distance can never be used by a later position, so prune it.
      candidates << position
      candidates.reject! { |start| position - start >= MAXIMUM_MATCH_DISTANCE }

      if distance.nil?
        output << input[position]
        position += 1
        next
      end

      # Output a pointer to the match and advance the input position by the amount matched
      if distance < 0b10000000
        output << (0b11000000 | length) << distance
      else
        output << (0b11100000 | length) << (distance >> 8) << (distance & 0b11111111)
      end
      position += length
    end

    output.pack('C*').force_encoding(Encoding::UTF_8)
  end

  # Find the most valuable earlier occurrence of the sequence starting at `position`
  #
  # @param input [Array<Integer>] all input bytes
  # @param position [Integer] index of the sequence being encoded
  # @param candidates [Array<Integer>] earlier indexes sharing the same 4-byte prefix
  # @param max_len [Integer] the longest match that may be reported
  # @return [Array(Integer, Integer), nil] distance and length of the best match, or nil if none
  def self.find_best_match(input, position, candidates, max_len)
    best = nil
    best_score = 0
    candidates.each do |start|
      distance = position - start
      next if distance > MAXIMUM_MATCH_DISTANCE # a bucket entry that has not been pruned yet

      # linear comparison to find the length of the longest common prefix
      length = 0
      length += 1 while length < max_len && input[start + length] == input[position + length]
      next if length < MINIMUM_SEQUENCE_LENGTH

      # a short distance fits in a 2-byte pointer instead of 3, so it is worth more per byte matched
      score = distance < 0b10000000 ? length * 1.5 : length
      next unless best.nil? || score > best_score

      best = [distance, length]
      best_score = score
    end
    best
  end
  private_class_method :find_best_match
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: lzutf8
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ian Katz
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-03-01 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: ''
|
14
|
+
email:
|
15
|
+
- ianfixes@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- ".yardopts"
|
21
|
+
- README.md
|
22
|
+
- lib/lzutf8.rb
|
23
|
+
- lib/lzutf8/version.rb
|
24
|
+
homepage: http://github.com/ianfixes/lzutf8_gem
|
25
|
+
licenses:
|
26
|
+
- Apache-2.0
|
27
|
+
metadata: {}
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: '0'
|
37
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubygems_version: 3.0.3
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: Compression and decompression implementation of LZUTF-8 Algorithm
|
47
|
+
test_files: []
|