lzutf8 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8cd5f3fa2daec203af83a69d6e4ec405365755e3c6ce5065e089a4ad97dbc49b
4
+ data.tar.gz: a1b0046f6df140c6a19ed723f57317eb370706168eaba396d04d988fa5a23eac
5
+ SHA512:
6
+ metadata.gz: 816c9a697db4a67e4c8216a84a1841598c7b7898991b8a9dbe80c13d9854a34b7f9e3636b393da09d8c7b6a560c870dc997685e00ad78333679537de15b08e46
7
+ data.tar.gz: c6bc288cf364ceecc14365bf5c3c9308c26341902fc583dfb8af83ed83fea9c1c778591d2609f6176a265857ad1dd469cd2ebc1b313d786d896037aa62dc806f
data/.yardopts ADDED
@@ -0,0 +1,3 @@
1
+ -M redcarpet -m markdown
2
+
3
+ lib/lzutf8.rb lib/**/*.rb
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # LZUTF8 Gem - providing `lzutf8`
2
+ [![Gem Version](https://badge.fury.io/rb/lzutf8.svg)](https://rubygems.org/gems/lzutf8)
3
+ [![Documentation](http://img.shields.io/badge/docs-rdoc.info-blue.svg)](http://www.rubydoc.info/gems/lzutf8/0.0.1)
4
+ A ruby gem containing an implementation of LZUTF-8 compression and decompression
5
+
6
+ * `LZUTF8.compress("some input")`
7
+ * `LZUTF8.decompress("some compressed input")`
8
+
9
+ ## Algorithm
10
+
11
+ This is a port of https://github.com/rotemdan/lzutf8.js/
12
+
13
+ The trick is to create a set of pointer sequences that coexist with UTF-8 encoding. The pointers include fields for a matched sequence length (bits indicated with `l`) and a distance (backward) that the sequence can be found in the encoded string (bits indicated with `d`).
14
+
15
+ |Bytes |Sized pointer sequence | UTF-8 Codepoint sequence
16
+ |------|----------------------------|-------------------------
17
+ | 1 |n/a |`0xxxxxxx`
18
+ | 2 |`110lllll 0ddddddd` |`110xxxxx 10xxxxxx`
19
+ | 3 |`111lllll 0ddddddd dddddddd`|`1110xxxx 10xxxxxx 10xxxxxx`
20
+ | 4 |n/a |`11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
21
+
22
+ Note that this allows for 5 bits (representing a maximum of 31 bytes) of matched sequence and up to 15 bits (representing a maximum of 32767 bytes) of distance to the matched sequence.
23
+
24
+ ### Compressing
25
+
26
+ The text is converted to bytes of UTF-8. Byte sequences of `4 <= length` and `length <= 31` are replaced where possible with sized pointer sequences -- pointing to a relative location up to 32767 bytes where the sequence can be found.
27
+
28
+ Hash every 4-byte sequence in the input string and use that hash to store the position of the sequence. Each hash bucket will then contain an array of the positions at which that 4-byte sequence starts.
29
+
30
+ For example, the string `abcdefabcd` would produce the following table:
31
+
32
+ |hash |pointers
33
+ |------|-------
34
+ |`abcd`|0, 6
35
+ |`bcde`|1
36
+ |`cdef`|2
37
+ |`defa`|3
38
+ |`efab`|4
39
+ |`fabc`|5
40
+
41
+ The output string would be identical to the input as far as `abcdef` after which a sized pointer representing a distance of 6 and a length of 4 would be appended: `0b11000100_00000110`
42
+
43
+ ### Decompressing
44
+
45
+ Scan for sized pointers based on the bit sequences listed above. When encountered, replace them with the text at the pointed-to location of the desired length.
46
+
47
+ > **Note:** it is legal for the desired length to be _longer_ than the requested distance (e.g. a repeated individual character might produce a sized pointer that requests length 31 from distance 1). The text extracted from the pointer should be repeated to fill the desired length.
data/lib/lzutf8.rb ADDED
@@ -0,0 +1,208 @@
1
+ require "lzutf8/version"
2
+
3
# LZUTF8 contains functions to compress and decompress UTF-8 text with the LZUTF-8 algorithm
# @author Ian Katz <ianfixes@gmail.com>
module LZUTF8
  # Shortest run of bytes worth replacing with a sized pointer (also the hash key width)
  MINIMUM_SEQUENCE_LENGTH = 4
  # Longest run a sized pointer can describe (5 length bits)
  MAXIMUM_SEQUENCE_LENGTH = 31
  # Furthest backward distance a sized pointer can describe (15 distance bits)
  MAXIMUM_MATCH_DISTANCE = 32767

  # Extract the information contained in an LZUTF8 Sized Pointer
  #
  # The pointer is either 0b110lllll_0ddddddd or 0b111lllll_0ddddddd_dddddddd
  #
  # @param bytes [Array<Integer>] an array of character codes (2 or 3 of them, per the layout above)
  # @return [Integer, Integer] the pointer length and distance
  def self.pointer_info(bytes)
    c1, c2, c3 = bytes
    length = c1 & 0b00011111
    # bit 5 of the first byte selects the 3-byte form, which carries 8 extra distance bits
    distance = (c1 & 0b00100000).zero? ? c2 : (c2 << 8) + c3
    [length, distance]
  end

  # Explain a compressed (or uncompressed) UTF-8 string at the sequence-of-bits level by printing to STDOUT
  #
  # One line is printed per literal codepoint, sized pointer, or unrecognized byte. Each line shows
  # the position in the input, the bits, the interpretation, and the running decompressed size (OS).
  #
  # @param string [String] the input string
  # @raise [ArgumentError] if the input is not a String
  # @return [nil]
  def self.explain(string)
    raise ArgumentError unless string.is_a? String

    input = string.bytes
    position = 0
    outsize = 0

    while position < input.size
      window = input[position, 4]
      kind, width = classify_sequence(window)
      bytes = window.first(width)

      meaning = case kind
                when :literal
                  outsize += width
                  "literal - #{bytes.pack('C*').force_encoding(Encoding::UTF_8)}"
                when :pointer
                  length, distance = pointer_info(bytes)
                  outsize += length # a pointer expands to `length` bytes of output
                  "pointer l=#{length} d=#{distance}"
                else
                  outsize += 1
                  "Assumed part of a sequence"
                end

      bits = "0b" + bytes.map { |b| b.to_s(2).rjust(8, '0') }.join("_")
      puts "#{position.to_s.rjust(4, '0')} #{bits} #{meaning} OS=#{outsize}"
      position += width
    end
  end

  # Decompress an LZUTF8-compressed string.
  #
  # Due to the nature of this decompression algorithm, any uncompressed UTF-8 string can be
  # passed to this function and will be returned unmodified
  #
  # @param string [String] The input LZUTF8-compressed string
  # @raise [ArgumentError] if the input is not a String, or contains a sized pointer that is
  #   truncated or that points outside of the text decompressed so far
  # @return [String] Uncompressed UTF-8 string
  def self.decompress(string)
    raise ArgumentError unless string.is_a? String

    input = string.bytes
    output = []
    index = 0
    while index < input.size
      c1 = input[index]
      c2 = input[index + 1] # peek; nil when c1 is the final byte

      # A sized pointer starts with 0b11...... and is followed by a byte whose top bit is clear.
      # A lead byte with nothing after it cannot be a pointer, so it is passed through as a literal.
      is_pointer = (c1 & 0b11000000) == 0b11000000 && !c2.nil? && (c2 & 0b10000000).zero?
      unless is_pointer
        output << c1
        index += 1
        next
      end

      width = (c1 & 0b00100000).zero? ? 2 : 3
      bytes = input[index, width]
      raise ArgumentError, "truncated sized pointer at byte #{index}" if bytes.size < width

      length, distance = pointer_info(bytes)
      if distance.zero? || distance > output.size
        raise ArgumentError, "sized pointer at byte #{index} has invalid distance #{distance}"
      end

      # Get the text from the pointer. When the length exceeds the distance, the slice is shorter
      # than `length` and is repeated (cycled) until the requested length is satisfied.
      output.concat(output[-distance, length].cycle.first(length))
      index += width
    end

    output.pack('C*').force_encoding(Encoding::UTF_8)
  end

  # Compress a string using the LZUTF8 algorithm
  #
  # Due to the nature of this compression algorithm, only valid UTF-8 codepoints can be compressed;
  # arbitrary binary data will fail.
  #
  # @param string [String] The input string
  # @raise [ArgumentError] if the input is not a String
  # @return [String] compressed string
  def self.compress(string)
    raise ArgumentError unless string.is_a? String

    input = string.bytes
    positions = Hash.new { |h, k| h[k] = [] } # 4-byte sequence => earlier offsets where it starts
    output = []
    pointer = 0

    while pointer < input.size
      remaining = input.size - pointer
      if remaining < MINIMUM_SEQUENCE_LENGTH # near end of input, just emit literals until it's consumed
        output << input[pointer]
        pointer += 1
        next
      end

      key = input[pointer, MINIMUM_SEQUENCE_LENGTH].pack('C*')
      max_len = [remaining, MAXIMUM_SEQUENCE_LENGTH].min # max length of a match
      bucket = positions[key]
      best = best_match(input, bucket, pointer, max_len)

      # Record this position for later matches, then prune entries that are now too distant
      bucket << pointer
      bucket.reject! { |start| pointer - start >= MAXIMUM_MATCH_DISTANCE }

      if best.nil?
        output << input[pointer]
        pointer += 1
        next
      end

      # Output a pointer to the match and advance the input pointer by the amount matched
      match_distance, match_length = best
      if match_distance < 0b10000000
        output.push(0b11000000 | match_length, match_distance)
      else
        output.push(0b11100000 | match_length, match_distance >> 8, match_distance & 0b11111111)
      end
      pointer += match_length
    end

    output.pack('C*').force_encoding(Encoding::UTF_8)
  end

  # Classify the byte sequence at the head of a (possibly compressed) input
  #
  # UTF-8 codepoint shapes are tested before sized-pointer shapes; they differ in the 2nd byte,
  # which is 0b10xxxxxx for a codepoint and 0b0ddddddd for a pointer.
  #
  # @api private
  # @param seq [Array<Integer>] the next 1 to 4 bytes of input
  # @return [Symbol, Integer] :literal, :pointer or :unknown, and the number of bytes it occupies
  def self.classify_sequence(seq)
    c1, c2, c3, c4 = seq
    continuation = ->(byte) { !byte.nil? && (byte & 0b11000000) == 0b10000000 }
    pointer_tail = !c2.nil? && (c2 & 0b10000000).zero?

    if (c1 & 0b10000000).zero?
      [:literal, 1]
    elsif (c1 & 0b11100000) == 0b11000000 && continuation.call(c2)
      [:literal, 2]
    elsif (c1 & 0b11110000) == 0b11100000 && continuation.call(c2) && continuation.call(c3)
      [:literal, 3]
    elsif (c1 & 0b11111000) == 0b11110000 && [c2, c3, c4].all?(&continuation)
      [:literal, 4]
    elsif (c1 & 0b11100000) == 0b11000000 && pointer_tail
      [:pointer, 2]
    elsif (c1 & 0b11100000) == 0b11100000 && pointer_tail && !c3.nil?
      [:pointer, 3]
    else
      [:unknown, 1]
    end
  end

  # Find the most profitable earlier occurrence of the sequence starting at `pointer`
  #
  # @api private
  # @param input [Array<Integer>] all input bytes
  # @param bucket [Array<Integer>] earlier offsets that share the same first 4 bytes
  # @param pointer [Integer] offset of the sequence being matched
  # @param max_len [Integer] the longest match that may be reported
  # @return [Array(Integer, Integer), nil] the distance and length of the best match, if any
  def self.best_match(input, bucket, pointer, max_len)
    best = nil
    best_score = nil
    bucket.each do |start|
      distance = pointer - start
      next if distance > MAXIMUM_MATCH_DISTANCE # a bucket entry that has not been pruned yet

      length = common_prefix_length(input, start, pointer, max_len)
      next if length < MINIMUM_SEQUENCE_LENGTH

      # a 2 byte pointer (distance < 128) is cheaper than a 3 byte one, so near matches weigh more
      score = distance < 128 ? length * 1.5 : length
      next unless best_score.nil? || score > best_score

      best = [distance, length]
      best_score = score
    end
    best
  end

  # Count how many bytes (up to a limit) are identical starting from two offsets
  #
  # @api private
  # @param input [Array<Integer>] all input bytes
  # @param earlier [Integer] offset of the earlier sequence
  # @param current [Integer] offset of the later sequence; current + limit must not exceed input.size
  # @param limit [Integer] the maximum length to report
  # @return [Integer] length of the common prefix
  def self.common_prefix_length(input, earlier, current, limit)
    length = 0
    length += 1 while length < limit && input[earlier + length] == input[current + length]
    length
  end

  private_class_method :classify_sequence, :best_match, :common_prefix_length
end
@@ -0,0 +1,3 @@
1
+ module LZUTF8 # same namespace as lib/lzutf8.rb, which requires this file
2
+ VERSION = "0.0.1".freeze # release number of the gem, as a frozen (immutable) string
3
+ end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lzutf8
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ian Katz
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-03-01 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: ''
14
+ email:
15
+ - ianfixes@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - ".yardopts"
21
+ - README.md
22
+ - lib/lzutf8.rb
23
+ - lib/lzutf8/version.rb
24
+ homepage: http://github.com/ianfixes/lzutf8_gem
25
+ licenses:
26
+ - Apache-2.0
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubygems_version: 3.0.3
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: Compression and decompression implementation of LZUTF-8 Algorithm
47
+ test_files: []