hyperll 0.2.6 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,14 +1,16 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/delta_bytes'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe DeltaBytes do
6
6
  it 'uncompresses bytes' do
7
- expect(DeltaBytes.uncompress([2, -46, 5, -64, 4])).to eq([722, 1298])
7
+ expect(DeltaBytes.uncompress([-46, 5, -64, 4])).to eq([722, 1298])
8
+ expect(DeltaBytes.uncompress([210, 5, 192, 4, 254, 67])).to eq([722, 1298, 10000])
8
9
  end
9
10
 
10
11
  it 'compresses bytes' do
11
- expect(DeltaBytes.compress([722, 1298])).to eq([2, 256 - 46, 5, 256 - 64, 4])
12
+ expect(DeltaBytes.compress([722, 1298])).to eq([256 - 46, 5, 256 - 64, 4])
13
+ expect(DeltaBytes.compress([722, 1298, 10000])).to eq([210, 5, 192, 4, 254, 67])
12
14
  end
13
15
  end
14
16
  end
@@ -1,6 +1,7 @@
1
1
  require 'spec_helper'
2
2
  require 'base64'
3
- require 'hyperll/hyper_log_log_plus'
3
+ require 'hyperll'
4
+ require 'json'
4
5
 
5
6
  module Hyperll
6
7
  describe HyperLogLogPlus do
@@ -206,6 +207,22 @@ module Hyperll
206
207
  expect(hllp.cardinality).to eq(6)
207
208
  end
208
209
 
210
+ it 'merges and keeps the cardinality exact, handling elements that are common to both sets' do
211
+ # Serialization after offering [12, 13, 22, 34, 38, 40, 41, 46, 49]
212
+ hllp = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQnMYsoMtgak9AGMiwK8VbKiAYmU0wPVwK38Dw=="))
213
+
214
+ # Serialization after offering [2, 6, 19, 29, 41, 48]
215
+ hllp2 = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQbwdJz0Afq4AbSZAqxX4i4="))
216
+
217
+ expect(hllp.cardinality).to eq(9)
218
+ expect(hllp2.cardinality).to eq(6)
219
+
220
+ # The set intersection of hllp and hllp2 has one element, 41, so after
221
+ # merging the cardinality should be 14.
222
+ hllp.merge(hllp2)
223
+ expect(hllp.cardinality).to eq(14)
224
+ end
225
+
209
226
  it 'merges and keeps the cardinality exact' do
210
227
  hllp = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQOwX+yBA7TzAw=="))
211
228
  hllp2 = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQ7SKbociFqGigLUL9oagCWmC+IdlBqkE8g7jFiCnwE="))
@@ -270,5 +287,17 @@ module Hyperll
270
287
  expect(hllp.cardinality).to eq(8) # 3 + 3 = 8; that's how it goes with hll
271
288
  end
272
289
  end
290
+
291
+ context 'merging multiple at a time' do
292
+ it 'merges' do
293
+ hllp = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -110, 10].pack("C*"))
294
+ hllp2 = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -46, 5].pack("C*"))
295
+ hllp3 = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -124, 6].pack("C*"))
296
+
297
+ hllp.merge(hllp2, hllp3)
298
+ expect(hllp.format).to eq(:sparse)
299
+ expect(hllp.cardinality).to eq(3) # 1 + 1 + 1 = 3
300
+ end
301
+ end
273
302
  end
274
303
  end
@@ -1,5 +1,5 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/hyper_log_log'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe HyperLogLog do
@@ -1,5 +1,5 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/register_set'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe RegisterSet do
@@ -68,5 +68,22 @@ module Hyperll
68
68
  expect(value).to eq(merged[index])
69
69
  end
70
70
  end
71
+
72
+ it "serializes to a string" do
73
+ rs = RegisterSet.new(10)
74
+ rs[0] = 2
75
+ rs[1] = 3
76
+ rs[2] = 4
77
+
78
+ expect(rs.serialize).to eq("\x00\x00\x10b\x00\x00\x00\x00".force_encoding("ASCII-8BIT"))
79
+ end
80
+
81
+ it "unserializes from a string" do
82
+ rs = RegisterSet.new(10, "\x00\x00\x10b\x00\x00\x00\x00".unpack("N*"))
83
+
84
+ expect(rs[0]).to eq(2)
85
+ expect(rs[1]).to eq(3)
86
+ expect(rs[2]).to eq(4)
87
+ end
71
88
  end
72
89
  end
@@ -1,5 +1,5 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/varint'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe Varint do
@@ -14,5 +14,9 @@ module Hyperll
14
14
  expect(Varint.write_unsigned_var_int(0x81)).to eq([0x81, 0x01])
15
15
  expect(Varint.write_unsigned_var_int(0x4081)).to eq([0x81, 0x81, 0x01])
16
16
  end
17
+
18
+ it 'treats negative integers as their twos complement unsigned representation' do
19
+ expect(Varint.write_unsigned_var_int(-4)).to eq([252, 255, 255, 255, 15])
20
+ end
17
21
  end
18
22
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperll
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andy Lindeman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-29 00:00:00.000000000 Z
11
+ date: 2013-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,11 +52,26 @@ dependencies:
52
52
  - - ~>
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.14'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.9.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.9.1
55
69
  description: HyperLogLog implementation in pure Ruby
56
70
  email:
57
71
  - andy@andylindeman.com
58
72
  executables: []
59
- extensions: []
73
+ extensions:
74
+ - ext/hyperll/extconf.rb
60
75
  extra_rdoc_files: []
61
76
  files:
62
77
  - .gitignore
@@ -66,18 +81,27 @@ files:
66
81
  - LICENSE.txt
67
82
  - README.md
68
83
  - Rakefile
84
+ - ext/hyperll/delta_bytes.c
85
+ - ext/hyperll/delta_bytes.h
86
+ - ext/hyperll/extconf.rb
87
+ - ext/hyperll/hyper_log_log_plus.c
88
+ - ext/hyperll/hyperll.c
89
+ - ext/hyperll/hyperll.h
90
+ - ext/hyperll/register_set.c
91
+ - ext/hyperll/register_set.h
92
+ - ext/hyperll/sparse_set.c
93
+ - ext/hyperll/sparse_set.h
94
+ - ext/hyperll/varint.c
95
+ - ext/hyperll/varint.h
69
96
  - hyperll.gemspec
70
97
  - lib/hyperll.rb
71
- - lib/hyperll/delta_bytes.rb
72
98
  - lib/hyperll/hyper_log_log.rb
73
99
  - lib/hyperll/hyper_log_log_plus.rb
74
100
  - lib/hyperll/murmur_hash.rb
75
- - lib/hyperll/register_set.rb
76
- - lib/hyperll/util.rb
77
- - lib/hyperll/varint.rb
78
101
  - lib/hyperll/version.rb
79
102
  - spec/fixtures/10000.txt
80
103
  - spec/fixtures/mega.json
104
+ - spec/fixtures/merge-many-sets.json
81
105
  - spec/hyperll/delta_bytes_spec.rb
82
106
  - spec/hyperll/hyper_log_log_plus_spec.rb
83
107
  - spec/hyperll/hyper_log_log_spec.rb
@@ -114,6 +138,7 @@ summary: HyperLogLog implementation in pure Ruby
114
138
  test_files:
115
139
  - spec/fixtures/10000.txt
116
140
  - spec/fixtures/mega.json
141
+ - spec/fixtures/merge-many-sets.json
117
142
  - spec/hyperll/delta_bytes_spec.rb
118
143
  - spec/hyperll/hyper_log_log_plus_spec.rb
119
144
  - spec/hyperll/hyper_log_log_spec.rb
@@ -1,32 +0,0 @@
1
- require_relative 'varint'
2
-
3
module Hyperll
  # Delta + varint codec for a list of integers.
  #
  # The encoded form is a varint holding the element count, followed by one
  # varint per element holding the difference from the previous element
  # (the first element is diffed against 0).
  class DeltaBytes
    # Encodes +bytes+ (an array of integers) into an array of varint bytes.
    #
    # @param bytes [Array<Integer>] values to encode
    # @return [Array<Integer>] length prefix followed by the encoded deltas
    def self.compress(bytes)
      last_seen = 0
      header = Varint.write_unsigned_var_int(bytes.length)

      bytes.each_with_object(header) do |current, encoded|
        encoded.concat(Varint.write_unsigned_var_int(current - last_seen))
        last_seen = current
      end
    end

    # Decodes an array produced by +compress+ back into the original values.
    #
    # Varint.read_unsigned_var_int is handed +bytes+ itself for every read,
    # so whatever that reader does to the array is visible to the caller.
    #
    # @param bytes [Array<Integer>] length prefix followed by encoded deltas
    # @return [Array<Integer>] the decoded values (running sum of the deltas)
    def self.uncompress(bytes)
      running_total = 0
      element_count = Varint.read_unsigned_var_int(bytes)

      element_count.times.map do
        running_total += Varint.read_unsigned_var_int(bytes)
      end
    end
  end
end
@@ -1,90 +0,0 @@
1
- require_relative 'util'
2
-
3
module Hyperll
  # A set of +count+ small registers packed into an array of integer words.
  #
  # Each register occupies REGISTER_SIZE (5) bits and LOG2_BITS_PER_WORD (6)
  # registers share one word, so register +position+ lives in word
  # +position / 6+ at bit offset +5 * (position % 6)+.
  class RegisterSet
    include Enumerable
    include Util

    LOG2_BITS_PER_WORD = 6
    REGISTER_SIZE = 5

    attr_reader :count, :size

    # @param count [Integer] number of registers
    # @param values [Array<Integer>, nil] pre-populated words; when nil a
    #   zero-filled array of +size+ words is allocated
    def initialize(count, values = nil)
      @count = count
      @bits = count / LOG2_BITS_PER_WORD

      @size =
        if @bits.zero?
          1
        elsif (@bits % INTEGER_SIZE).zero?
          @bits
        else
          @bits + 1
        end

      @values = values || Array.new(@size, 0)
    end

    # Overwrites the register at +position+ with +value+.
    def []=(position, value)
      word_index, field_mask, scale = locate(position)

      @values[word_index] = ((@values[word_index] & ~field_mask) | (value * scale))
      @values[word_index] &= INT_MASK if @values[word_index] > INT_MASK
    end

    # Reads the register at +position+.
    def [](position)
      word_index, field_mask, scale = locate(position)

      (@values[word_index] & field_mask) / scale
    end

    # Yields every register value in position order; returns an Enumerator
    # when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @count.times { |position| yield self[position] }
    end

    # Stores +value+ at +position+ only when it exceeds the current value.
    #
    # @return [Boolean] true when the register was updated, false otherwise
    def update_if_greater(position, value)
      word_index, field_mask, scale = locate(position)
      candidate = value * scale

      return false unless (@values[word_index] & field_mask) < candidate

      @values[word_index] = ((@values[word_index] & ~field_mask) | candidate)
      @values[word_index] &= INT_MASK if @values[word_index] > INT_MASK
      true
    end

    # Replaces every register with the larger of this set's and +other+'s
    # value for the same slot.
    def merge(other)
      @size.times do |word_index|
        mine = values[word_index]
        theirs = other.values[word_index]

        @values[word_index] = (0...LOG2_BITS_PER_WORD).reduce(0) do |word, slot|
          field_mask = 0x1f << (REGISTER_SIZE * slot)
          word | [mine & field_mask, theirs & field_mask].max
        end
      end
    end

    # Packs the words as 32-bit big-endian unsigned integers ("N*").
    def serialize
      @values.pack("N*")
    end

    protected

    def values
      @values
    end

    private

    # Resolves +position+ to the word index holding the register, the mask
    # that selects its 5 bits within that word, and the multiplier that moves
    # a value into (or, by division, out of) that field.
    def locate(position)
      word_index, slot = position.divmod(LOG2_BITS_PER_WORD)
      scale = POWERS_OF_TWO[REGISTER_SIZE * slot]

      [word_index, 0x1f * scale, scale]
    end
  end
end
@@ -1,39 +0,0 @@
1
module Hyperll
  # Constants and helpers for treating Ruby integers as 32-bit words.
  module Util
    INT_MASK = 0xFFFFFFFF
    INTEGER_SIZE = 32

    POWERS_OF_TWO = 0.upto(32).map { |i| 2**i }.freeze

    # [upper limit, zeros to add] pairs for the binary search performed by
    # number_of_leading_zeros: when the value fits under the limit, that many
    # leading bits are zero and the value is shifted left by the same amount.
    LEADING_ZERO_STEPS = [
      [0x0000FFFF, 16],
      [0x00FFFFFF, 8],
      [0x0FFFFFFF, 4],
      [0x3FFFFFFF, 2]
    ].freeze
    private_constant :LEADING_ZERO_STEPS

    # Counts the zero bits above the highest set bit of +x+ viewed as a
    # 32-bit word.
    #
    # Negative integers are first reduced to their 32-bit two's-complement
    # representation; previously every negative input fell through all the
    # comparisons and produced 31.
    #
    # @param x [Integer] value to inspect
    # @return [Integer] 32 for zero, otherwise the number of leading zeros
    def number_of_leading_zeros(x)
      x &= INT_MASK if x < 0
      return 32 if x == 0

      n = 0
      LEADING_ZERO_STEPS.each do |limit, step|
        next if x > limit

        n += step
        x *= POWERS_OF_TWO[step]
      end

      n += 1 if x <= 0x7FFFFFFF
      n
    end
  end
end
@@ -1,26 +0,0 @@
1
module Hyperll
  # Variable-length integer codec: 7 payload bits per byte, least
  # significant group first, with bit 0x80 set on every byte except the last.
  class Varint
    # Reads one varint from the front of +bytes+.
    #
    # The consumed bytes are removed from +bytes+ (via Array#shift), so
    # successive calls walk through the array like a stream. Bytes may be
    # given as signed values; only their low 8 bits are inspected for the
    # continuation flag and payload of non-final bytes.
    #
    # @param bytes [Array<Integer>] byte values; mutated by this call
    # @return [Integer] the decoded value
    # @raise [RuntimeError] when the input ends before the final byte, or
    #   when more than five continuation bytes are present
    def self.read_unsigned_var_int(bytes)
      value = 0
      shift = 0

      loop do
        byte = bytes.shift
        # Fail with a clear message instead of NoMethodError on nil.
        raise "Unexpected end of variable length quantity" if byte.nil?

        return value | (byte << shift) if (byte & 0x80).zero?

        value |= (byte & 0x7F) << shift
        shift += 7
        raise "Variable length quantity is too long" if shift > 35
      end
    end

    # Encodes +value+ as a varint.
    #
    # Negative integers are encoded as their 32-bit two's-complement
    # unsigned representation. Without this reduction the loop below never
    # terminates for a negative input, because an arithmetic right shift of
    # a negative Integer settles at -1 rather than 0.
    #
    # @param value [Integer] value to encode
    # @return [Array<Integer>] encoded bytes, each in 0..255
    def self.write_unsigned_var_int(value)
      value &= 0xFFFFFFFF if value < 0

      bytes = []
      while (value & 0xFFFFFF80) != 0
        bytes << ((value & 0x7F) | 0x80)
        value >>= 7
      end

      bytes << (value & 0x7F)
    end
  end
end