hyperll 0.2.6 → 0.3.0

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -1,14 +1,16 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/delta_bytes'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe DeltaBytes do
6
6
  it 'uncompresses bytes' do
7
- expect(DeltaBytes.uncompress([2, -46, 5, -64, 4])).to eq([722, 1298])
7
+ expect(DeltaBytes.uncompress([-46, 5, -64, 4])).to eq([722, 1298])
8
+ expect(DeltaBytes.uncompress([210, 5, 192, 4, 254, 67])).to eq([722, 1298, 10000])
8
9
  end
9
10
 
10
11
  it 'compresses bytes' do
11
- expect(DeltaBytes.compress([722, 1298])).to eq([2, 256 - 46, 5, 256 - 64, 4])
12
+ expect(DeltaBytes.compress([722, 1298])).to eq([256 - 46, 5, 256 - 64, 4])
13
+ expect(DeltaBytes.compress([722, 1298, 10000])).to eq([210, 5, 192, 4, 254, 67])
12
14
  end
13
15
  end
14
16
  end
@@ -1,6 +1,7 @@
1
1
  require 'spec_helper'
2
2
  require 'base64'
3
- require 'hyperll/hyper_log_log_plus'
3
+ require 'hyperll'
4
+ require 'json'
4
5
 
5
6
  module Hyperll
6
7
  describe HyperLogLogPlus do
@@ -206,6 +207,22 @@ module Hyperll
206
207
  expect(hllp.cardinality).to eq(6)
207
208
  end
208
209
 
210
+ it 'merges and keeps the cardinality exact, handling elements that are common to both sets' do
211
+ # Serialization after offering [12, 13, 22, 34, 38, 40, 41, 46, 49]
212
+ hllp = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQnMYsoMtgak9AGMiwK8VbKiAYmU0wPVwK38Dw=="))
213
+
214
+ # Serialization after offering [2, 6, 19, 29, 41, 48]
215
+ hllp2 = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQbwdJz0Afq4AbSZAqxX4i4="))
216
+
217
+ expect(hllp.cardinality).to eq(9)
218
+ expect(hllp2.cardinality).to eq(6)
219
+
220
+ # The set intersection of hllp and hllp2 has one element, 41, so after
221
+ # merging the cardinality should be 14.
222
+ hllp.merge(hllp2)
223
+ expect(hllp.cardinality).to eq(14)
224
+ end
225
+
209
226
  it 'merges and keeps the cardinality exact' do
210
227
  hllp = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQOwX+yBA7TzAw=="))
211
228
  hllp2 = HyperLogLogPlus.unserialize(Base64.decode64("/////gsQAQ7SKbociFqGigLUL9oagCWmC+IdlBqkE8g7jFiCnwE="))
@@ -270,5 +287,17 @@ module Hyperll
270
287
  expect(hllp.cardinality).to eq(8) # 3 + 3 = 8; that's how it goes with hll
271
288
  end
272
289
  end
290
+
291
+ context 'merging multiple at a time' do
292
+ it 'merges' do
293
+ hllp = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -110, 10].pack("C*"))
294
+ hllp2 = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -46, 5].pack("C*"))
295
+ hllp3 = HyperLogLogPlus.unserialize([-1, -1, -1, -2, 4, 10, 1, 1, -124, 6].pack("C*"))
296
+
297
+ hllp.merge(hllp2, hllp3)
298
+ expect(hllp.format).to eq(:sparse)
299
+ expect(hllp.cardinality).to eq(3) # 1 + 1 + 1 = 3
300
+ end
301
+ end
273
302
  end
274
303
  end
@@ -1,5 +1,5 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/hyper_log_log'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe HyperLogLog do
@@ -1,5 +1,5 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/register_set'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe RegisterSet do
@@ -68,5 +68,22 @@ module Hyperll
68
68
  expect(value).to eq(merged[index])
69
69
  end
70
70
  end
71
+
72
+ it "serializes to a string" do
73
+ rs = RegisterSet.new(10)
74
+ rs[0] = 2
75
+ rs[1] = 3
76
+ rs[2] = 4
77
+
78
+ expect(rs.serialize).to eq("\x00\x00\x10b\x00\x00\x00\x00".force_encoding("ASCII-8BIT"))
79
+ end
80
+
81
+ it "unserializes from a string" do
82
+ rs = RegisterSet.new(10, "\x00\x00\x10b\x00\x00\x00\x00".unpack("N*"))
83
+
84
+ expect(rs[0]).to eq(2)
85
+ expect(rs[1]).to eq(3)
86
+ expect(rs[2]).to eq(4)
87
+ end
71
88
  end
72
89
  end
@@ -1,5 +1,5 @@
1
1
  require 'spec_helper'
2
- require 'hyperll/varint'
2
+ require 'hyperll'
3
3
 
4
4
  module Hyperll
5
5
  describe Varint do
@@ -14,5 +14,9 @@ module Hyperll
14
14
  expect(Varint.write_unsigned_var_int(0x81)).to eq([0x81, 0x01])
15
15
  expect(Varint.write_unsigned_var_int(0x4081)).to eq([0x81, 0x81, 0x01])
16
16
  end
17
+
18
+ it 'treats negative integers as their twos complement unsigned representation' do
19
+ expect(Varint.write_unsigned_var_int(-4)).to eq([252, 255, 255, 255, 15])
20
+ end
17
21
  end
18
22
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hyperll
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andy Lindeman
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-10-29 00:00:00.000000000 Z
11
+ date: 2013-10-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,11 +52,26 @@ dependencies:
52
52
  - - ~>
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.14'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 0.9.1
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: 0.9.1
55
69
  description: HyperLogLog implementation in pure Ruby
56
70
  email:
57
71
  - andy@andylindeman.com
58
72
  executables: []
59
- extensions: []
73
+ extensions:
74
+ - ext/hyperll/extconf.rb
60
75
  extra_rdoc_files: []
61
76
  files:
62
77
  - .gitignore
@@ -66,18 +81,27 @@ files:
66
81
  - LICENSE.txt
67
82
  - README.md
68
83
  - Rakefile
84
+ - ext/hyperll/delta_bytes.c
85
+ - ext/hyperll/delta_bytes.h
86
+ - ext/hyperll/extconf.rb
87
+ - ext/hyperll/hyper_log_log_plus.c
88
+ - ext/hyperll/hyperll.c
89
+ - ext/hyperll/hyperll.h
90
+ - ext/hyperll/register_set.c
91
+ - ext/hyperll/register_set.h
92
+ - ext/hyperll/sparse_set.c
93
+ - ext/hyperll/sparse_set.h
94
+ - ext/hyperll/varint.c
95
+ - ext/hyperll/varint.h
69
96
  - hyperll.gemspec
70
97
  - lib/hyperll.rb
71
- - lib/hyperll/delta_bytes.rb
72
98
  - lib/hyperll/hyper_log_log.rb
73
99
  - lib/hyperll/hyper_log_log_plus.rb
74
100
  - lib/hyperll/murmur_hash.rb
75
- - lib/hyperll/register_set.rb
76
- - lib/hyperll/util.rb
77
- - lib/hyperll/varint.rb
78
101
  - lib/hyperll/version.rb
79
102
  - spec/fixtures/10000.txt
80
103
  - spec/fixtures/mega.json
104
+ - spec/fixtures/merge-many-sets.json
81
105
  - spec/hyperll/delta_bytes_spec.rb
82
106
  - spec/hyperll/hyper_log_log_plus_spec.rb
83
107
  - spec/hyperll/hyper_log_log_spec.rb
@@ -114,6 +138,7 @@ summary: HyperLogLog implementation in pure Ruby
114
138
  test_files:
115
139
  - spec/fixtures/10000.txt
116
140
  - spec/fixtures/mega.json
141
+ - spec/fixtures/merge-many-sets.json
117
142
  - spec/hyperll/delta_bytes_spec.rb
118
143
  - spec/hyperll/hyper_log_log_plus_spec.rb
119
144
  - spec/hyperll/hyper_log_log_spec.rb
@@ -1,32 +0,0 @@
1
- require_relative 'varint'
2
-
3
- module Hyperll
4
- class DeltaBytes
5
- def self.compress(bytes)
6
- compressed = Varint.write_unsigned_var_int(bytes.length)
7
- previous_value = 0
8
-
9
- bytes.each do |b|
10
- compressed.concat(Varint.write_unsigned_var_int(b - previous_value))
11
- previous_value = b
12
- end
13
-
14
- compressed
15
- end
16
-
17
- def self.uncompress(bytes)
18
- uncompressed = []
19
- previous_value = 0
20
-
21
- length = Varint.read_unsigned_var_int(bytes)
22
- length.times do
23
- next_value = Varint.read_unsigned_var_int(bytes)
24
-
25
- uncompressed << next_value + previous_value
26
- previous_value = uncompressed.last
27
- end
28
-
29
- uncompressed
30
- end
31
- end
32
- end
@@ -1,90 +0,0 @@
1
- require_relative 'util'
2
-
3
- module Hyperll
4
- class RegisterSet
5
- include Enumerable
6
- include Util
7
-
8
- LOG2_BITS_PER_WORD = 6
9
- REGISTER_SIZE = 5
10
-
11
- attr_reader :count, :size
12
-
13
- def initialize(count, values = nil)
14
- @count = count
15
-
16
- @bits = count / LOG2_BITS_PER_WORD
17
- if @bits.zero?
18
- @size = 1
19
- elsif (@bits % INTEGER_SIZE).zero?
20
- @size = @bits
21
- else
22
- @size = @bits + 1
23
- end
24
-
25
- @values = values || Array.new(@size, 0)
26
- end
27
-
28
- def []=(position, value)
29
- bucket = position / LOG2_BITS_PER_WORD
30
- shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD))
31
-
32
- @values[bucket] = ((@values[bucket] & ~(0x1f * POWERS_OF_TWO[shift])) | (value * POWERS_OF_TWO[shift]))
33
- @values[bucket] &= INT_MASK if @values[bucket] > INT_MASK
34
- end
35
-
36
- def [](position)
37
- bucket = position / LOG2_BITS_PER_WORD
38
- shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD))
39
-
40
- return (@values[bucket] & (0x1f * POWERS_OF_TWO[shift])) / POWERS_OF_TWO[shift]
41
- end
42
-
43
- def each
44
- return enum_for(:each) unless block_given?
45
- @count.times do |i|
46
- yield self[i]
47
- end
48
- end
49
-
50
- def update_if_greater(position, value)
51
- bucket = position / LOG2_BITS_PER_WORD
52
- shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD));
53
- mask = (0x1f * POWERS_OF_TWO[shift])
54
-
55
- current_value = @values[bucket] & mask
56
- new_value = value * POWERS_OF_TWO[shift]
57
- if current_value < new_value
58
- @values[bucket] = ((@values[bucket] & ~mask) | new_value)
59
- @values[bucket] &= INT_MASK if @values[bucket] > INT_MASK
60
- true
61
- else
62
- false
63
- end
64
- end
65
-
66
- def merge(other)
67
- @size.times do |bucket|
68
- word = 0
69
- LOG2_BITS_PER_WORD.times do |j|
70
- mask = 0x1f << (REGISTER_SIZE * j);
71
-
72
- this_val = self.values[bucket] & mask
73
- other_val = other.values[bucket] & mask
74
- word |= [this_val, other_val].max
75
- end
76
-
77
- @values[bucket] = word
78
- end
79
- end
80
-
81
- def serialize
82
- @values.pack("N*")
83
- end
84
-
85
- protected
86
- def values
87
- @values
88
- end
89
- end
90
- end
@@ -1,39 +0,0 @@
1
- module Hyperll
2
- module Util
3
- INT_MASK = 0xFFFFFFFF
4
- INTEGER_SIZE = 32
5
-
6
- POWERS_OF_TWO = 0.upto(32).map { |i| 2**i }.freeze
7
-
8
- def number_of_leading_zeros(x)
9
- return 32 if x == 0
10
-
11
- n = 0
12
- if x <= 0x0000FFFF
13
- n += 16
14
- x *= POWERS_OF_TWO[16]
15
- end
16
-
17
- if x <= 0x00FFFFFF
18
- n += 8;
19
- x *= POWERS_OF_TWO[8]
20
- end
21
-
22
- if x <= 0x0FFFFFFF
23
- n += 4
24
- x *= POWERS_OF_TWO[4]
25
- end
26
-
27
- if x <= 0x3FFFFFFF
28
- n += 2
29
- x *= POWERS_OF_TWO[2]
30
- end
31
-
32
- if x <= 0x7FFFFFFF
33
- n += 1
34
- end
35
-
36
- n
37
- end
38
- end
39
- end
@@ -1,26 +0,0 @@
1
- module Hyperll
2
- class Varint
3
- def self.read_unsigned_var_int(bytes)
4
- value, i, b = 0, 0, 0
5
- while (b = bytes.shift) & 0x80 != 0
6
- value |= (b & 0x7F) << i
7
-
8
- i += 7
9
- raise "Variable length quantity is too long" if i > 35
10
- end
11
-
12
- value | (b << i)
13
- end
14
-
15
- def self.write_unsigned_var_int(value)
16
- bytes = []
17
- while (value & 0xFFFFFF80) != 0
18
- bytes << ((value & 0x7F) | 0x80)
19
- value >>= 7
20
- end
21
-
22
- bytes << (value & 0x7F)
23
- bytes
24
- end
25
- end
26
- end