hyperll 0.2.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/Rakefile +7 -0
- data/ext/hyperll/delta_bytes.c +97 -0
- data/ext/hyperll/delta_bytes.h +24 -0
- data/ext/hyperll/extconf.rb +12 -0
- data/ext/hyperll/hyper_log_log_plus.c +473 -0
- data/ext/hyperll/hyperll.c +12 -0
- data/ext/hyperll/hyperll.h +17 -0
- data/ext/hyperll/register_set.c +196 -0
- data/ext/hyperll/register_set.h +19 -0
- data/ext/hyperll/sparse_set.c +76 -0
- data/ext/hyperll/sparse_set.h +24 -0
- data/ext/hyperll/varint.c +83 -0
- data/ext/hyperll/varint.h +21 -0
- data/hyperll.gemspec +2 -0
- data/lib/hyperll.rb +1 -0
- data/lib/hyperll/hyper_log_log.rb +0 -1
- data/lib/hyperll/hyper_log_log_plus.rb +10 -310
- data/lib/hyperll/version.rb +1 -1
- data/spec/fixtures/merge-many-sets.json +192 -0
- data/spec/hyperll/delta_bytes_spec.rb +5 -3
- data/spec/hyperll/hyper_log_log_plus_spec.rb +30 -1
- data/spec/hyperll/hyper_log_log_spec.rb +1 -1
- data/spec/hyperll/register_set_spec.rb +18 -1
- data/spec/hyperll/varint_spec.rb +5 -1
- metadata +32 -7
- data/lib/hyperll/delta_bytes.rb +0 -32
- data/lib/hyperll/register_set.rb +0 -90
- data/lib/hyperll/util.rb +0 -39
- data/lib/hyperll/varint.rb +0 -26
data/spec/hyperll/delta_bytes_spec.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
require 'hyperll
|
2
|
+
require 'hyperll'
|
3
3
|
|
4
4
|
module Hyperll
|
5
5
|
describe DeltaBytes do
|
6
6
|
it 'uncompresses bytes' do
|
7
|
-
expect(DeltaBytes.uncompress([
|
7
|
+
expect(DeltaBytes.uncompress([-46, 5, -64, 4])).to eq([722, 1298])
|
8
|
+
expect(DeltaBytes.uncompress([210, 5, 192, 4, 254, 67])).to eq([722, 1298, 10000])
|
8
9
|
end
|
9
10
|
|
10
11
|
it 'compresses bytes' do
|
11
|
-
expect(DeltaBytes.compress([722, 1298])).to eq([
|
12
|
+
expect(DeltaBytes.compress([722, 1298])).to eq([256 - 46, 5, 256 - 64, 4])
|
13
|
+
expect(DeltaBytes.compress([722, 1298, 10000])).to eq([210, 5, 192, 4, 254, 67])
|
12
14
|
end
|
13
15
|
end
|
14
16
|
end
|
data/spec/hyperll/hyper_log_log_plus_spec.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
require 'base64'
|
3
|
-
require 'hyperll
|
3
|
+
require 'hyperll'
|
4
|
+
require 'json'
|
4
5
|
|
5
6
|
module Hyperll
|
6
7
|
describe HyperLogLogPlus do
|
@@ -206,6 +207,22 @@ module Hyperll
|
|
206
207
|
expect(hllp.cardinality).to eq(6)
|
207
208
|
end
|
208
209
|
|
210
|
+
it 'merges and keeps the cardinality exact, handling elements that are common to both sets' do
|
211
|
+
# Serialization after offering [12, 13, 22, 34, 38, 40, 41, 46, 49]
|
212
|
+
hllp = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQnMYsoMtgak9AGMiwK8VbKiAYmU0wPVwK38Dw=="))
|
213
|
+
|
214
|
+
# Serialization after offering [2, 6, 19, 29, 41, 48]
|
215
|
+
hllp2 = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQbwdJz0Afq4AbSZAqxX4i4="))
|
216
|
+
|
217
|
+
expect(hllp.cardinality).to eq(9)
|
218
|
+
expect(hllp2.cardinality).to eq(6)
|
219
|
+
|
220
|
+
# The set intersection of hllp and hllp2 has one element, 41, so after
|
221
|
+
# merging the cardinality should be 14.
|
222
|
+
hllp.merge(hllp2)
|
223
|
+
expect(hllp.cardinality).to eq(14)
|
224
|
+
end
|
225
|
+
|
209
226
|
it 'merges and keeps the cardinality exact' do
|
210
227
|
hllp = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQOwX+yBA7TzAw=="))
|
211
228
|
hllp2 = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQ7SKbociFqGigLUL9oagCWmC+IdlBqkE8g7jFiCnwE="))
|
@@ -270,5 +287,17 @@ module Hyperll
|
|
270
287
|
expect(hllp.cardinality).to eq(8) # 3 + 3 = 8; that's how it goes with hll
|
271
288
|
end
|
272
289
|
end
|
290
|
+
|
291
|
+
context 'merging multiple at a time' do
|
292
|
+
it 'merges' do
|
293
|
+
hllp = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -110, 10].pack("C*"))
|
294
|
+
hllp2 = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -46, 5].pack("C*"))
|
295
|
+
hllp3 = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -124, 6].pack("C*"))
|
296
|
+
|
297
|
+
hllp.merge(hllp2, hllp3)
|
298
|
+
expect(hllp.format).to eq(:sparse)
|
299
|
+
expect(hllp.cardinality).to eq(3) # 1 + 1 + 1 = 3
|
300
|
+
end
|
301
|
+
end
|
273
302
|
end
|
274
303
|
end
|
data/spec/hyperll/register_set_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
require 'hyperll
|
2
|
+
require 'hyperll'
|
3
3
|
|
4
4
|
module Hyperll
|
5
5
|
describe RegisterSet do
|
@@ -68,5 +68,22 @@ module Hyperll
|
|
68
68
|
expect(value).to eq(merged[index])
|
69
69
|
end
|
70
70
|
end
|
71
|
+
|
72
|
+
it "serializes to a string" do
|
73
|
+
rs = RegisterSet.new(10)
|
74
|
+
rs[0] = 2
|
75
|
+
rs[1] = 3
|
76
|
+
rs[2] = 4
|
77
|
+
|
78
|
+
expect(rs.serialize).to eq("\x00\x00\x10b\x00\x00\x00\x00".force_encoding("ASCII-8BIT"))
|
79
|
+
end
|
80
|
+
|
81
|
+
it "unserializes from a string" do
|
82
|
+
rs = RegisterSet.new(10, "\x00\x00\x10b\x00\x00\x00\x00".unpack("N*"))
|
83
|
+
|
84
|
+
expect(rs[0]).to eq(2)
|
85
|
+
expect(rs[1]).to eq(3)
|
86
|
+
expect(rs[2]).to eq(4)
|
87
|
+
end
|
71
88
|
end
|
72
89
|
end
|
data/spec/hyperll/varint_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'spec_helper'
|
2
|
-
require 'hyperll
|
2
|
+
require 'hyperll'
|
3
3
|
|
4
4
|
module Hyperll
|
5
5
|
describe Varint do
|
@@ -14,5 +14,9 @@ module Hyperll
|
|
14
14
|
expect(Varint.write_unsigned_var_int(0x81)).to eq([0x81, 0x01])
|
15
15
|
expect(Varint.write_unsigned_var_int(0x4081)).to eq([0x81, 0x81, 0x01])
|
16
16
|
end
|
17
|
+
|
18
|
+
it 'treats negative integers as their twos complement unsigned representation' do
|
19
|
+
expect(Varint.write_unsigned_var_int(-4)).to eq([252, 255, 255, 255, 15])
|
20
|
+
end
|
17
21
|
end
|
18
22
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hyperll
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.6
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andy Lindeman
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-10-
|
11
|
+
date: 2013-10-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,11 +52,26 @@ dependencies:
|
|
52
52
|
- - ~>
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.14'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake-compiler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.9.1
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ~>
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.9.1
|
55
69
|
description: HyperLogLog implementation in pure Ruby
|
56
70
|
email:
|
57
71
|
- andy@andylindeman.com
|
58
72
|
executables: []
|
59
|
-
extensions:
|
73
|
+
extensions:
|
74
|
+
- ext/hyperll/extconf.rb
|
60
75
|
extra_rdoc_files: []
|
61
76
|
files:
|
62
77
|
- .gitignore
|
@@ -66,18 +81,27 @@ files:
|
|
66
81
|
- LICENSE.txt
|
67
82
|
- README.md
|
68
83
|
- Rakefile
|
84
|
+
- ext/hyperll/delta_bytes.c
|
85
|
+
- ext/hyperll/delta_bytes.h
|
86
|
+
- ext/hyperll/extconf.rb
|
87
|
+
- ext/hyperll/hyper_log_log_plus.c
|
88
|
+
- ext/hyperll/hyperll.c
|
89
|
+
- ext/hyperll/hyperll.h
|
90
|
+
- ext/hyperll/register_set.c
|
91
|
+
- ext/hyperll/register_set.h
|
92
|
+
- ext/hyperll/sparse_set.c
|
93
|
+
- ext/hyperll/sparse_set.h
|
94
|
+
- ext/hyperll/varint.c
|
95
|
+
- ext/hyperll/varint.h
|
69
96
|
- hyperll.gemspec
|
70
97
|
- lib/hyperll.rb
|
71
|
-
- lib/hyperll/delta_bytes.rb
|
72
98
|
- lib/hyperll/hyper_log_log.rb
|
73
99
|
- lib/hyperll/hyper_log_log_plus.rb
|
74
100
|
- lib/hyperll/murmur_hash.rb
|
75
|
-
- lib/hyperll/register_set.rb
|
76
|
-
- lib/hyperll/util.rb
|
77
|
-
- lib/hyperll/varint.rb
|
78
101
|
- lib/hyperll/version.rb
|
79
102
|
- spec/fixtures/10000.txt
|
80
103
|
- spec/fixtures/mega.json
|
104
|
+
- spec/fixtures/merge-many-sets.json
|
81
105
|
- spec/hyperll/delta_bytes_spec.rb
|
82
106
|
- spec/hyperll/hyper_log_log_plus_spec.rb
|
83
107
|
- spec/hyperll/hyper_log_log_spec.rb
|
@@ -114,6 +138,7 @@ summary: HyperLogLog implementation in pure Ruby
|
|
114
138
|
test_files:
|
115
139
|
- spec/fixtures/10000.txt
|
116
140
|
- spec/fixtures/mega.json
|
141
|
+
- spec/fixtures/merge-many-sets.json
|
117
142
|
- spec/hyperll/delta_bytes_spec.rb
|
118
143
|
- spec/hyperll/hyper_log_log_plus_spec.rb
|
119
144
|
- spec/hyperll/hyper_log_log_spec.rb
|
data/lib/hyperll/delta_bytes.rb
DELETED
@@ -1,32 +0,0 @@
|
|
1
|
-
require_relative 'varint'
|
2
|
-
|
3
|
-
module Hyperll
|
4
|
-
class DeltaBytes
|
5
|
-
def self.compress(bytes)
|
6
|
-
compressed = Varint.write_unsigned_var_int(bytes.length)
|
7
|
-
previous_value = 0
|
8
|
-
|
9
|
-
bytes.each do |b|
|
10
|
-
compressed.concat(Varint.write_unsigned_var_int(b - previous_value))
|
11
|
-
previous_value = b
|
12
|
-
end
|
13
|
-
|
14
|
-
compressed
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.uncompress(bytes)
|
18
|
-
uncompressed = []
|
19
|
-
previous_value = 0
|
20
|
-
|
21
|
-
length = Varint.read_unsigned_var_int(bytes)
|
22
|
-
length.times do
|
23
|
-
next_value = Varint.read_unsigned_var_int(bytes)
|
24
|
-
|
25
|
-
uncompressed << next_value + previous_value
|
26
|
-
previous_value = uncompressed.last
|
27
|
-
end
|
28
|
-
|
29
|
-
uncompressed
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
data/lib/hyperll/register_set.rb
DELETED
@@ -1,90 +0,0 @@
|
|
1
|
-
require_relative 'util'
|
2
|
-
|
3
|
-
module Hyperll
|
4
|
-
class RegisterSet
|
5
|
-
include Enumerable
|
6
|
-
include Util
|
7
|
-
|
8
|
-
LOG2_BITS_PER_WORD = 6
|
9
|
-
REGISTER_SIZE = 5
|
10
|
-
|
11
|
-
attr_reader :count, :size
|
12
|
-
|
13
|
-
def initialize(count, values = nil)
|
14
|
-
@count = count
|
15
|
-
|
16
|
-
@bits = count / LOG2_BITS_PER_WORD
|
17
|
-
if @bits.zero?
|
18
|
-
@size = 1
|
19
|
-
elsif (@bits % INTEGER_SIZE).zero?
|
20
|
-
@size = @bits
|
21
|
-
else
|
22
|
-
@size = @bits + 1
|
23
|
-
end
|
24
|
-
|
25
|
-
@values = values || Array.new(@size, 0)
|
26
|
-
end
|
27
|
-
|
28
|
-
def []=(position, value)
|
29
|
-
bucket = position / LOG2_BITS_PER_WORD
|
30
|
-
shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD))
|
31
|
-
|
32
|
-
@values[bucket] = ((@values[bucket] & ~(0x1f * POWERS_OF_TWO[shift])) | (value * POWERS_OF_TWO[shift]))
|
33
|
-
@values[bucket] &= INT_MASK if @values[bucket] > INT_MASK
|
34
|
-
end
|
35
|
-
|
36
|
-
def [](position)
|
37
|
-
bucket = position / LOG2_BITS_PER_WORD
|
38
|
-
shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD))
|
39
|
-
|
40
|
-
return (@values[bucket] & (0x1f * POWERS_OF_TWO[shift])) / POWERS_OF_TWO[shift]
|
41
|
-
end
|
42
|
-
|
43
|
-
def each
|
44
|
-
return enum_for(:each) unless block_given?
|
45
|
-
@count.times do |i|
|
46
|
-
yield self[i]
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
def update_if_greater(position, value)
|
51
|
-
bucket = position / LOG2_BITS_PER_WORD
|
52
|
-
shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD));
|
53
|
-
mask = (0x1f * POWERS_OF_TWO[shift])
|
54
|
-
|
55
|
-
current_value = @values[bucket] & mask
|
56
|
-
new_value = value * POWERS_OF_TWO[shift]
|
57
|
-
if current_value < new_value
|
58
|
-
@values[bucket] = ((@values[bucket] & ~mask) | new_value)
|
59
|
-
@values[bucket] &= INT_MASK if @values[bucket] > INT_MASK
|
60
|
-
true
|
61
|
-
else
|
62
|
-
false
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def merge(other)
|
67
|
-
@size.times do |bucket|
|
68
|
-
word = 0
|
69
|
-
LOG2_BITS_PER_WORD.times do |j|
|
70
|
-
mask = 0x1f << (REGISTER_SIZE * j);
|
71
|
-
|
72
|
-
this_val = self.values[bucket] & mask
|
73
|
-
other_val = other.values[bucket] & mask
|
74
|
-
word |= [this_val, other_val].max
|
75
|
-
end
|
76
|
-
|
77
|
-
@values[bucket] = word
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
def serialize
|
82
|
-
@values.pack("N*")
|
83
|
-
end
|
84
|
-
|
85
|
-
protected
|
86
|
-
def values
|
87
|
-
@values
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
data/lib/hyperll/util.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
module Hyperll
|
2
|
-
module Util
|
3
|
-
INT_MASK = 0xFFFFFFFF
|
4
|
-
INTEGER_SIZE = 32
|
5
|
-
|
6
|
-
POWERS_OF_TWO = 0.upto(32).map { |i| 2**i }.freeze
|
7
|
-
|
8
|
-
def number_of_leading_zeros(x)
|
9
|
-
return 32 if x == 0
|
10
|
-
|
11
|
-
n = 0
|
12
|
-
if x <= 0x0000FFFF
|
13
|
-
n += 16
|
14
|
-
x *= POWERS_OF_TWO[16]
|
15
|
-
end
|
16
|
-
|
17
|
-
if x <= 0x00FFFFFF
|
18
|
-
n += 8;
|
19
|
-
x *= POWERS_OF_TWO[8]
|
20
|
-
end
|
21
|
-
|
22
|
-
if x <= 0x0FFFFFFF
|
23
|
-
n += 4
|
24
|
-
x *= POWERS_OF_TWO[4]
|
25
|
-
end
|
26
|
-
|
27
|
-
if x <= 0x3FFFFFFF
|
28
|
-
n += 2
|
29
|
-
x *= POWERS_OF_TWO[2]
|
30
|
-
end
|
31
|
-
|
32
|
-
if x <= 0x7FFFFFFF
|
33
|
-
n += 1
|
34
|
-
end
|
35
|
-
|
36
|
-
n
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
data/lib/hyperll/varint.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Hyperll
|
2
|
-
class Varint
|
3
|
-
def self.read_unsigned_var_int(bytes)
|
4
|
-
value, i, b = 0, 0, 0
|
5
|
-
while (b = bytes.shift) & 0x80 != 0
|
6
|
-
value |= (b & 0x7F) << i
|
7
|
-
|
8
|
-
i += 7
|
9
|
-
raise "Variable length quantity is too long" if i > 35
|
10
|
-
end
|
11
|
-
|
12
|
-
value | (b << i)
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.write_unsigned_var_int(value)
|
16
|
-
bytes = []
|
17
|
-
while (value & 0xFFFFFF80) != 0
|
18
|
-
bytes << ((value & 0x7F) | 0x80)
|
19
|
-
value >>= 7
|
20
|
-
end
|
21
|
-
|
22
|
-
bytes << (value & 0x7F)
|
23
|
-
bytes
|
24
|
-
end
|
25
|
-
end
|
26
|
-
end
|