hyperll 0.2.6 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/Rakefile +7 -0
- data/ext/hyperll/delta_bytes.c +97 -0
- data/ext/hyperll/delta_bytes.h +24 -0
- data/ext/hyperll/extconf.rb +12 -0
- data/ext/hyperll/hyper_log_log_plus.c +473 -0
- data/ext/hyperll/hyperll.c +12 -0
- data/ext/hyperll/hyperll.h +17 -0
- data/ext/hyperll/register_set.c +196 -0
- data/ext/hyperll/register_set.h +19 -0
- data/ext/hyperll/sparse_set.c +76 -0
- data/ext/hyperll/sparse_set.h +24 -0
- data/ext/hyperll/varint.c +83 -0
- data/ext/hyperll/varint.h +21 -0
- data/hyperll.gemspec +2 -0
- data/lib/hyperll.rb +1 -0
- data/lib/hyperll/hyper_log_log.rb +0 -1
- data/lib/hyperll/hyper_log_log_plus.rb +10 -310
- data/lib/hyperll/version.rb +1 -1
- data/spec/fixtures/merge-many-sets.json +192 -0
- data/spec/hyperll/delta_bytes_spec.rb +5 -3
- data/spec/hyperll/hyper_log_log_plus_spec.rb +30 -1
- data/spec/hyperll/hyper_log_log_spec.rb +1 -1
- data/spec/hyperll/register_set_spec.rb +18 -1
- data/spec/hyperll/varint_spec.rb +5 -1
- metadata +32 -7
- data/lib/hyperll/delta_bytes.rb +0 -32
- data/lib/hyperll/register_set.rb +0 -90
- data/lib/hyperll/util.rb +0 -39
- data/lib/hyperll/varint.rb +0 -26
@@ -0,0 +1,17 @@
|
|
1
|
+
#ifndef __H__HYPERLL
|
2
|
+
#define __H__HYPERLL
|
3
|
+
|
4
|
+
#include <ruby.h>
|
5
|
+
#ifndef HAVE_RUBY_ENCODING_H
|
6
|
+
#error "Hyperll requires Ruby 1.9+ to build"
|
7
|
+
#else
|
8
|
+
#include <ruby/encoding.h>
|
9
|
+
#endif
|
10
|
+
|
11
|
+
// Initialization functions
|
12
|
+
void Init_hyperll_register_set(void);
|
13
|
+
void Init_hyperll_hyper_log_log_plus(void);
|
14
|
+
void Init_hyperll_varint(void);
|
15
|
+
void Init_hyperll_delta_bytes(void);
|
16
|
+
|
17
|
+
#endif
|
@@ -0,0 +1,196 @@
|
|
1
|
+
#include "hyperll.h"
|
2
|
+
#include "register_set.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mHyperll;
|
5
|
+
VALUE rb_cRegisterSet;
|
6
|
+
|
7
|
+
// Layout constants shared by every register_set routine in this file.
// Divisor that maps a register position to the index of the uint32_t word
// holding it; the merge loop also uses it as the number of registers per word.
const int LOG2_BITS_PER_WORD = 6;
|
8
|
+
// Bit width of one register; shifts are multiples of this and pair with the
// 0x1f (five-bit) masks used by the accessors.
const int REGISTER_SIZE = 5;
|
9
|
+
|
10
|
+
// Initializes `set` so that it can hold `count` registers, all zeroed.
//
// set:   caller-allocated structure to fill in
// count: number of registers the set must be able to address
//
// Registers are packed LOG2_BITS_PER_WORD to a uint32_t word. The word count
// keeps the historical sizing rule (one spare word unless the quotient is a
// multiple of sizeof(int)), except that the spare word is now also kept when
// `count` is not an exact multiple of LOG2_BITS_PER_WORD. Previously such a
// count (e.g. 25) produced an array one word too short, so the highest
// positions addressed memory past the end of `values`.
//
// The calloc result is stored unchecked; `set->values` is NULL if the
// allocation fails.
void register_set_init(register_set *set, int count) {
  set->count = count;

  int bits = count / LOG2_BITS_PER_WORD;
  int leftover = count % LOG2_BITS_PER_WORD;
  if (bits == 0) {
    set->size = 1;
  } else if ((bits % sizeof(int)) == 0 && leftover == 0) {
    // Every word is completely filled, so no spare word is required.
    set->size = bits;
  } else {
    set->size = bits + 1;
  }

  set->values = (uint32_t*)calloc(set->size, sizeof(uint32_t));
}
|
24
|
+
|
25
|
+
// Overwrites the register at `position` with `value`.
//
// The five-bit field for the register is cleared and `value`, shifted into
// place, is OR-ed in. `value` is not masked and `position` is not range
// checked here.
void register_set_set(register_set *set, int position, uint32_t value) {
  const int word = position / LOG2_BITS_PER_WORD;
  const int offset = REGISTER_SIZE * (position - (word * LOG2_BITS_PER_WORD));

  const uint32_t cleared = set->values[word] & ~(0x1f << offset);
  set->values[word] = cleared | (value << offset);
}
|
31
|
+
|
32
|
+
// Returns the five-bit register stored at `position`.
//
// `position` is not range checked here.
uint32_t register_set_get(register_set *set, int position) {
  const int word = position / LOG2_BITS_PER_WORD;
  const int offset = REGISTER_SIZE * (position - (word * LOG2_BITS_PER_WORD));

  const uint32_t field = set->values[word] & (0x1f << offset);
  return (uint32_t)(field >> offset);
}
|
37
|
+
|
38
|
+
// Stores `value` at `position` only when it exceeds the register's current
// contents.
//
// Both values are compared in their shifted (in-word) form.
//
// Returns 1 if the register was replaced, 0 if it was left untouched.
int register_set_update_if_greater(register_set *set, int position, uint32_t value) {
  const int word = position / LOG2_BITS_PER_WORD;
  const int offset = REGISTER_SIZE * (position - (word * LOG2_BITS_PER_WORD));
  const uint32_t field = 0x1f << offset;

  const uint64_t existing = set->values[word] & field;
  const uint64_t candidate = value << offset;
  if (existing >= candidate) {
    return 0;
  }

  set->values[word] = (uint32_t)((set->values[word] & ~field) | candidate);
  return 1;
}
|
52
|
+
|
53
|
+
// Merges `other` into `set`, keeping the larger of the two registers at every
// position.
//
// Iterates over `set->size` words and reads the same word indexes from
// `other`; the sizes of the two sets are not compared here.
void register_set_merge(register_set *set, register_set *other) {
  const int words = set->size;

  for (int w = 0; w < words; w++) {
    const uint32_t mine = set->values[w];
    const uint32_t theirs = other->values[w];
    uint32_t merged = 0;

    // Compare field by field; masked values keep their in-word position, so
    // the winner can be OR-ed straight into the result.
    for (int slot = 0; slot < LOG2_BITS_PER_WORD; slot++) {
      const uint32_t field = 0x1f << (REGISTER_SIZE * slot);
      const uint32_t a = mine & field;
      const uint32_t b = theirs & field;

      if (a < b) {
        merged |= b;
      } else {
        merged |= a;
      }
    }

    set->values[w] = merged;
  }
}
|
70
|
+
|
71
|
+
// Releases the word array owned by `set` and then the structure itself.
void register_set_free(register_set *set) {
  uint32_t *words = set->values;

  free(words);
  free(set);
}
|
75
|
+
|
76
|
+
// RegisterSet.new(count, values = nil)
//
// count:  number of registers; must be a non-negative Integer
// values: optional Array holding one Integer per internal word, used to
//         restore a previously stored set
//
// Raises ArgumentError when `count` is negative or when `values` does not
// contain exactly `size` entries, TypeError when `values` is not an Array, and
// NoMemoryError when the word array cannot be allocated.
static VALUE rb_register_set_new(int argc, VALUE *argv, VALUE klass) {
  VALUE count, values;
  rb_scan_args(argc, argv, "11", &count, &values);

  // Convert and validate the arguments before allocating anything: these
  // calls can raise, and an exception raised between ALLOC and
  // Data_Wrap_Struct would leak the structure.
  int register_count = NUM2INT(count);
  if (register_count < 0) {
    rb_raise(rb_eArgError, "count must not be negative");
  }
  if (!NIL_P(values)) {
    Check_Type(values, T_ARRAY);
  }

  register_set *set = ALLOC(register_set);
  register_set_init(set, register_count);
  // Once wrapped, the garbage collector releases the set through
  // register_set_free, including when an exception is raised below.
  VALUE setv = Data_Wrap_Struct(klass, 0, register_set_free, set);

  if (set->values == NULL) {
    rb_memerror();
  }

  if (!NIL_P(values)) {
    if (RARRAY_LEN(values) != set->size) {
      rb_raise(rb_eArgError, "initial set of values is not of the correct size");
    }

    for (int i = 0; i < set->size; i++) {
      set->values[i] = (uint32_t)NUM2ULONG(rb_ary_entry(values, i));
    }
  }

  return setv;
}
|
97
|
+
|
98
|
+
// RegisterSet#[]=(position, value)
//
// Stores `value` in the register at `position`. Both arguments must be
// Fixnums.
//
// Raises IndexError when `position` does not fall inside the allocated words;
// without this check a caller-supplied index could write outside the array.
// Always returns nil.
static VALUE rb_register_set_index_set(VALUE self, VALUE position, VALUE value) {
  Check_Type(position, T_FIXNUM);
  Check_Type(value, T_FIXNUM);

  register_set *set;
  Data_Get_Struct(self, register_set, set);

  // Bound by the number of registers the allocation can actually hold.
  int index = NUM2INT(position);
  if (index < 0 || index >= set->size * LOG2_BITS_PER_WORD) {
    rb_raise(rb_eIndexError, "position %d is outside of the register set", index);
  }

  register_set_set(set, index, NUM2ULONG(value));

  return Qnil;
}
|
108
|
+
|
109
|
+
// RegisterSet#[](position)
//
// Returns the register stored at `position` as an Integer. `position` must be
// a Fixnum.
//
// Raises IndexError when `position` does not fall inside the allocated words;
// without this check a caller-supplied index could read outside the array.
static VALUE rb_register_set_index_get(VALUE self, VALUE position) {
  Check_Type(position, T_FIXNUM);

  register_set *set;
  Data_Get_Struct(self, register_set, set);

  // Bound by the number of registers the allocation can actually hold.
  int index = NUM2INT(position);
  if (index < 0 || index >= set->size * LOG2_BITS_PER_WORD) {
    rb_raise(rb_eIndexError, "position %d is outside of the register set", index);
  }

  return UINT2NUM(register_set_get(set, index));
}
|
116
|
+
|
117
|
+
// RegisterSet#update_if_greater(position, value)
//
// Replaces the register at `position` with `value` when `value` is larger.
// Both arguments must be Fixnums.
//
// Raises IndexError when `position` does not fall inside the allocated words;
// without this check a caller-supplied index could touch memory outside the
// array. Returns true if the register was replaced, false otherwise.
static VALUE rb_register_set_update_if_greater(VALUE self, VALUE position, VALUE value) {
  Check_Type(position, T_FIXNUM);
  Check_Type(value, T_FIXNUM);

  register_set *set;
  Data_Get_Struct(self, register_set, set);

  // Bound by the number of registers the allocation can actually hold.
  int index = NUM2INT(position);
  if (index < 0 || index >= set->size * LOG2_BITS_PER_WORD) {
    rb_raise(rb_eIndexError, "position %d is outside of the register set", index);
  }

  int rv = register_set_update_if_greater(set, index, NUM2ULONG(value));

  return rv ? Qtrue : Qfalse;
}
|
127
|
+
|
128
|
+
// RegisterSet#merge(other)
//
// Folds `other` into the receiver, keeping the larger register at each
// position, and returns the receiver.
//
// Raises TypeError unless `other` is a RegisterSet (or subclass); checking the
// class first prevents an unrelated wrapped object from being reinterpreted as
// a register_set. Raises ArgumentError when `other` has fewer words than the
// receiver, because the merge reads one word from `other` for every word of
// the receiver.
static VALUE rb_register_set_merge(VALUE self, VALUE other) {
  register_set *set;
  Data_Get_Struct(self, register_set, set);

  if (!RTEST(rb_obj_is_kind_of(other, rb_cRegisterSet))) {
    rb_raise(rb_eTypeError, "other must be another register set");
  }

  register_set *other_set;
  Data_Get_Struct(other, register_set, other_set);
  if (other_set == NULL) {
    rb_raise(rb_eTypeError, "other must be another register set");
  }

  if (other_set->size < set->size) {
    rb_raise(rb_eArgError, "other register set is smaller than the receiver");
  }

  register_set_merge(set, other_set);

  return self;
}
|
143
|
+
|
144
|
+
// RegisterSet#each
//
// Yields the value of every register from position 0 up to (but excluding)
// the set's `count`, then returns the receiver.
static VALUE rb_register_set_each(VALUE self) {
  register_set *set;
  Data_Get_Struct(self, register_set, set);

  int position = 0;
  while (position < set->count) {
    rb_yield(UINT2NUM(register_set_get(set, position)));
    position++;
  }

  return self;
}
|
154
|
+
|
155
|
+
// RegisterSet#serialize
//
// Returns a String of `size * 4` bytes containing each internal word in
// big-endian byte order.
//
// The bytes are written straight into the Ruby string's own buffer. The
// earlier version filled a malloc'd scratch buffer that rb_str_new copied
// from and that was never freed, leaking it on every call.
static VALUE rb_register_set_serialize(VALUE self) {
  register_set *set;
  Data_Get_Struct(self, register_set, set);

  // Four bytes per uint32_t word, matching the offset step below.
  long strsize = (long)set->size * 4;
  VALUE result = rb_str_new(NULL, strsize);
  char *str = RSTRING_PTR(result);

  for (int i = 0; i < set->size; i++) {
    uint32_t value = set->values[i];
    long offset = (long)i * 4;

    str[offset]     = (char)(value >> 24);
    str[offset + 1] = (char)(value >> 16);
    str[offset + 2] = (char)(value >> 8);
    str[offset + 3] = (char)value;
  }

  return result;
}
|
175
|
+
|
176
|
+
// RegisterSet#size
//
// Returns the number of uint32_t words backing the set (not the number of
// registers).
static VALUE rb_register_set_size(VALUE self) {
  register_set *wrapped;
  Data_Get_Struct(self, register_set, wrapped);

  int words = wrapped->size;
  return INT2NUM(words);
}
|
182
|
+
|
183
|
+
void Init_hyperll_register_set(void) {
|
184
|
+
rb_cRegisterSet = rb_define_class_under(rb_mHyperll, "RegisterSet", rb_cObject);
|
185
|
+
rb_include_module(rb_cRegisterSet, rb_mEnumerable);
|
186
|
+
|
187
|
+
rb_define_singleton_method(rb_cRegisterSet, "new", rb_register_set_new, -1);
|
188
|
+
|
189
|
+
rb_define_method(rb_cRegisterSet, "[]=", rb_register_set_index_set, 2);
|
190
|
+
rb_define_method(rb_cRegisterSet, "[]", rb_register_set_index_get, 1);
|
191
|
+
rb_define_method(rb_cRegisterSet, "update_if_greater", rb_register_set_update_if_greater, 2);
|
192
|
+
rb_define_method(rb_cRegisterSet, "merge", rb_register_set_merge, 1);
|
193
|
+
rb_define_method(rb_cRegisterSet, "each", rb_register_set_each, 0);
|
194
|
+
rb_define_method(rb_cRegisterSet, "serialize", rb_register_set_serialize, 0);
|
195
|
+
rb_define_method(rb_cRegisterSet, "size", rb_register_set_size, 0);
|
196
|
+
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef __H_REGISTER_SET
|
2
|
+
#define __H_REGISTER_SET
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
// A packed array of small registers stored in 32-bit words.
typedef struct {
|
7
|
+
int count; // number of registers requested when the set was initialized
|
8
|
+
int size; // number of uint32_t words allocated in `values`
|
9
|
+
uint32_t *values; // heap-allocated word array; released by register_set_free
|
10
|
+
} register_set;
|
11
|
+
|
12
|
+
void register_set_init(register_set *set, int count);
|
13
|
+
void register_set_set(register_set *set, int position, uint32_t value);
|
14
|
+
uint32_t register_set_get(register_set *set, int position);
|
15
|
+
int register_set_update_if_greater(register_set *set, int position, uint32_t value);
|
16
|
+
void register_set_merge(register_set *set, register_set *other);
|
17
|
+
void register_set_free(register_set *set);
|
18
|
+
|
19
|
+
#endif
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#include <math.h>
|
2
|
+
#include <stdio.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include "sparse_set.h"
|
5
|
+
|
6
|
+
// Prepares an empty sparse set.
//
// set:      caller-allocated structure to fill in
// sm:       stored as-is; used by sparse_set_cardinality
// capacity: number of uint32_t slots to allocate (zero-filled)
//
// The calloc result is stored unchecked.
void sparse_set_init(sparse_set *set, int sm, int capacity) {
  set->sm = sm;
  set->capacity = capacity;
  set->values = (uint32_t*)calloc(capacity, sizeof(uint32_t));
  set->size = 0;
}
|
12
|
+
|
13
|
+
// Estimates the cardinality of the set as sm * ln(sm / (sm - size)), rounded
// to the nearest integer.
int sparse_set_cardinality(sparse_set *set) {
  const int unused = set->sm - set->size;
  const double ratio = ((double)set->sm) / unused;
  const double estimate = set->sm * log(ratio);

  return (int)round(estimate);
}
|
16
|
+
|
17
|
+
// Extracts the index portion of an encoded sparse value.
//
// The lowest bit selects the layout: when it is set the index is everything
// above the low 7 bits, otherwise it is everything above the low 1 bit.
uint32_t sparse_set_sparse_index(uint32_t k) {
  return (k & 1) ? (k >> 7) : (k >> 1);
}
|
24
|
+
|
25
|
+
// Merges `other` into `set`.
//
// Both value arrays are walked in step, ordered by sparse index. When the two
// current entries share an index, the numerically smaller encoded value is
// kept and both cursors advance. The merged result replaces `set->values`;
// `other` is left untouched.
//
// Returns 0 on success, or -1 (leaving `set` unchanged) if the result would
// need more than `set->capacity` entries.
int sparse_set_merge(sparse_set *set, sparse_set *other) {
  uint32_t *left = set->values;
  uint32_t *right = other->values;
  const int left_len = set->size;
  const int right_len = other->size;
  const int limit = set->capacity;

  uint32_t *merged = (uint32_t*)calloc(limit, sizeof(uint32_t));

  // l walks left, r walks right, count is the number of merged entries
  int l = 0, r = 0, count = 0;
  while (l < left_len || r < right_len) {
    // Another entry is about to be appended; give up if there is no room.
    if (count >= limit) {
      free(merged);
      return -1;
    }

    uint32_t next;
    if (r >= right_len) {
      next = left[l++];
    } else if (l >= left_len) {
      next = right[r++];
    } else {
      const uint32_t lval = left[l];
      const uint32_t lidx = sparse_set_sparse_index(lval);
      const uint32_t rval = right[r];
      const uint32_t ridx = sparse_set_sparse_index(rval);

      if (lidx < ridx) {
        next = lval;
        l++;
      } else if (lidx > ridx) {
        next = rval;
        r++;
      } else {
        next = (lval < rval) ? lval : rval;
        l++;
        r++;
      }
    }

    merged[count++] = next;
  }

  free(set->values);
  set->values = merged;
  set->size = count;
  return 0;
}
|
72
|
+
|
73
|
+
// Releases the value array owned by `set` and then the structure itself.
void sparse_set_free(sparse_set *set) {
  uint32_t *entries = set->values;

  free(entries);
  free(set);
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef __H_SPARSE_SET
|
2
|
+
#define __H_SPARSE_SET
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
// A bounded array of encoded sparse values.
typedef struct {
|
7
|
+
int sm; // stored by sparse_set_init; the scale term in sparse_set_cardinality
|
8
|
+
int capacity; // number of uint32_t slots allocated in `values`
|
9
|
+
int size; // number of slots currently in use
|
10
|
+
uint32_t *values; // heap-allocated entries; released by sparse_set_free
|
11
|
+
} sparse_set;
|
12
|
+
|
13
|
+
void sparse_set_init(sparse_set *set, int sm, int capacity);
|
14
|
+
int sparse_set_cardinality(sparse_set *set);
|
15
|
+
|
16
|
+
// Merges two sparse sets together.
|
17
|
+
//
|
18
|
+
// Returns 0 on success; -1 if the sparse set would grow too large
|
19
|
+
int sparse_set_merge(sparse_set *set, sparse_set *other);
|
20
|
+
|
21
|
+
uint32_t sparse_set_sparse_index(uint32_t k);
|
22
|
+
void sparse_set_free(sparse_set *set);
|
23
|
+
|
24
|
+
#endif
|
@@ -0,0 +1,83 @@
|
|
1
|
+
#include "hyperll.h"
|
2
|
+
#include "varint.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mHyperll;
|
5
|
+
VALUE rb_cVarint;
|
6
|
+
|
7
|
+
// Encodes `value` as an unsigned varint: seven payload bits per byte, least
// significant group first, with the high bit set on every byte but the last.
//
// bytes: destination buffer supplied by the caller; up to 5 bytes are written
//
// Returns the number of bytes written.
int varint_write_unsigned(uint32_t value, uint8_t bytes[]) {
  int written = 0;

  for (; value > 0x7F; value >>= 7) {
    bytes[written++] = (uint8_t)(0x80 | (value & 0x7F));
  }

  bytes[written] = (uint8_t)(value & 0x7F);
  return written + 1;
}
|
17
|
+
|
18
|
+
// Decodes an unsigned varint from `bytes`.
//
// bytes:  encoded input
// maxlen: number of bytes available in `bytes`
// len:    receives the number of bytes consumed, or -1 on error
//
// Decoding fails (len = -1, return value 0) when `maxlen` is not positive,
// when the input ends before a terminating byte is found, or when the encoding
// runs past 5 bytes. A uint32_t never needs more than 5 groups of 7 bits;
// without that limit a longer run of continuation bytes would shift by 35 or
// more bits, which is undefined for a 32-bit operand.
uint32_t varint_read_unsigned(uint8_t bytes[], int maxlen, int *len) {
  if (maxlen <= 0) {
    *len = -1;
    return 0;
  }

  const int max_encoded_bytes = 5;
  const int limit = (maxlen < max_encoded_bytes) ? maxlen : max_encoded_bytes;

  uint32_t value = 0;
  uint32_t b = 0;
  int i = 0;

  while (((b = bytes[i]) & 0x80) != 0) {
    value |= (b & 0x7F) << (i * 7);
    i++;

    // The byte at index i would be out of bounds or beyond a valid encoding.
    if (i >= limit) {
      *len = -1;
      return 0;
    }
  }

  value |= (b << (i * 7));
  *len = (i + 1);
  return value;
}
|
41
|
+
|
42
|
+
// Varint.read_unsigned_var_int(bytes)
//
// Decodes one unsigned varint from the front of the Array `bytes` and removes
// the consumed elements from it.
//
// Raises TypeError when `bytes` is not an Array (RARRAY_LEN must not be
// applied to other objects) and RuntimeError when no complete varint is found
// within the first 5 elements. Returns the decoded value as an Integer.
static VALUE rb_varint_read_unsigned_var_int(VALUE klass, VALUE rbytes) {
  Check_Type(rbytes, T_ARRAY);

  // Clamp before narrowing so a very long Array cannot overflow the int.
  long rlen = RARRAY_LEN(rbytes);
  int maxlen = (rlen > 5) ? 5 : (int)rlen;

  uint8_t bytes[5];
  for (int i = 0; i < maxlen; i++) {
    bytes[i] = (uint8_t)NUM2INT(rb_ary_entry(rbytes, i));
  }

  int len = 0;
  uint32_t value = varint_read_unsigned(bytes, maxlen, &len);

  if (len == -1) {
    rb_raise(rb_eRuntimeError, "Variable length quantity is too long");
  }

  // Discard elements that were used to retrieve the value
  for (int i = 0; i < len; i++) {
    rb_ary_shift(rbytes);
  }
  return ULONG2NUM(value);
}
|
65
|
+
|
66
|
+
// Varint.write_unsigned_var_int(value)
//
// Encodes `value` as an unsigned varint and returns the encoded bytes as an
// Array of Integers.
static VALUE rb_varint_write_unsigned_var_int(VALUE klass, VALUE value) {
  VALUE result = rb_ary_new2(5);

  uint8_t encoded[5];
  int used = varint_write_unsigned(NUM2ULONG(value), encoded);

  int idx = 0;
  while (idx < used) {
    rb_ary_push(result, INT2NUM(encoded[idx]));
    idx++;
  }

  return result;
}
|
77
|
+
|
78
|
+
void Init_hyperll_varint(void) {
|
79
|
+
rb_cVarint = rb_define_class_under(rb_mHyperll, "Varint", rb_cObject);
|
80
|
+
|
81
|
+
rb_define_singleton_method(rb_cVarint, "read_unsigned_var_int", rb_varint_read_unsigned_var_int, 1);
|
82
|
+
rb_define_singleton_method(rb_cVarint, "write_unsigned_var_int", rb_varint_write_unsigned_var_int, 1);
|
83
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#ifndef __H_VARINT
|
2
|
+
#define __H_VARINT
|
3
|
+
|
4
|
+
// Writes an unsigned varint.
|
5
|
+
//
|
6
|
+
// value: value to encode
|
7
|
+
// bytes: destination buffer. Caller is responsible for the allocation.
|
8
|
+
// Must be of size 5.
|
9
|
+
//
|
10
|
+
// Returns the count of the bytes that were actually needed to store the varint
|
11
|
+
int varint_write_unsigned(uint32_t value, uint8_t bytes[]);
|
12
|
+
|
13
|
+
// Reads an unsigned varint.
|
14
|
+
//
|
15
|
+
// bytes: encoded value
|
16
|
+
// maxlen: the maximum number of bytes to read
|
17
|
+
// len: the number of bytes read to reconstruct the varint; -1 if an error
|
18
|
+
// occurred
|
19
|
+
uint32_t varint_read_unsigned(uint8_t bytes[], int maxlen, int *len);
|
20
|
+
|
21
|
+
#endif
|