hyperll 0.2.6 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ #include "hyperll.h"
2
+
3
+ VALUE rb_mHyperll; // Handle for the top-level Hyperll module; assigned in Init_hyperll.
4
+
5
+ void Init_hyperll(void) {
6
+ rb_mHyperll = rb_define_module("Hyperll");
7
+
8
+ Init_hyperll_register_set();
9
+ Init_hyperll_hyper_log_log_plus();
10
+ Init_hyperll_varint();
11
+ Init_hyperll_delta_bytes();
12
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef __H__HYPERLL
2
+ #define __H__HYPERLL
3
+
4
+ #include <ruby.h>
5
+ #ifndef HAVE_RUBY_ENCODING_H
6
+ #error "Hyperll requires Ruby 1.9+ to build"
7
+ #else
8
+ #include <ruby/encoding.h>
9
+ #endif
10
+
11
+ // Initialization functions
12
+ void Init_hyperll_register_set(void);
13
+ void Init_hyperll_hyper_log_log_plus(void);
14
+ void Init_hyperll_varint(void);
15
+ void Init_hyperll_delta_bytes(void);
16
+
17
+ #endif
@@ -0,0 +1,196 @@
1
+ #include "hyperll.h"
2
+ #include "register_set.h"
3
+
4
+ extern VALUE rb_mHyperll; // Hyperll module handle; RegisterSet is defined under it below.
4
+ VALUE rb_cRegisterSet; // Hyperll::RegisterSet class handle; assigned in Init_hyperll_register_set.
5
+
6
+ const int LOG2_BITS_PER_WORD = 6; // Used in this file as the number of registers packed into each 32-bit word.
7
+ const int REGISTER_SIZE = 5; // Width of one register in bits (matches the 0x1f masks used below).
9
+
10
+ void register_set_init(register_set *set, int count) {
11
+ set->count = count;
12
+
13
+ int bits = count / LOG2_BITS_PER_WORD;
14
+ if (bits == 0) {
15
+ set->size = 1;
16
+ } else if ((bits % sizeof(int)) == 0) {
17
+ set->size = bits;
18
+ } else {
19
+ set->size = bits + 1;
20
+ }
21
+
22
+ set->values = (uint32_t*)calloc(set->size, sizeof(uint32_t));
23
+ }
24
+
25
+ void register_set_set(register_set *set, int position, uint32_t value) {
26
+ int bucket = position / LOG2_BITS_PER_WORD;
27
+ int shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD));
28
+
29
+ set->values[bucket] = (set->values[bucket] & ~(0x1f << shift)) | (value << shift);
30
+ }
31
+
32
+ uint32_t register_set_get(register_set *set, int position) {
33
+ int bucket = position / LOG2_BITS_PER_WORD;
34
+ int shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD));
35
+ return (uint32_t)(((set->values[bucket] & (0x1f << shift))) >> shift);
36
+ }
37
+
38
+ int register_set_update_if_greater(register_set *set, int position, uint32_t value) {
39
+ int bucket = position / LOG2_BITS_PER_WORD;
40
+ int shift = REGISTER_SIZE * (position - (bucket * LOG2_BITS_PER_WORD));
41
+ uint32_t mask = 0x1f << shift;
42
+
43
+ uint64_t cur = set->values[bucket] & mask;
44
+ uint64_t new = value << shift;
45
+ if (cur < new) {
46
+ set->values[bucket] = (uint32_t)((set->values[bucket] & ~mask) | new);
47
+ return 1;
48
+ } else {
49
+ return 0;
50
+ }
51
+ }
52
+
53
+ void register_set_merge(register_set *set, register_set *other) {
54
+ int size = set->size;
55
+ for (int bucket = 0; bucket < size; bucket++) {
56
+ uint32_t sval = set->values[bucket];
57
+ uint32_t oval = other->values[bucket];
58
+ uint32_t word = 0;
59
+
60
+ for (int j = 0; j < LOG2_BITS_PER_WORD; j++) {
61
+ uint32_t mask = 0x1f << (REGISTER_SIZE * j);
62
+
63
+ uint32_t thisval = (sval & mask);
64
+ uint32_t thatval = (oval & mask);
65
+ word |= (thisval < thatval) ? thatval : thisval;
66
+ }
67
+ set->values[bucket] = word;
68
+ }
69
+ }
70
+
71
+ void register_set_free(register_set *set) {
72
+ free(set->values);
73
+ free(set);
74
+ }
75
+
76
+ static VALUE rb_register_set_new(int argc, VALUE *argv, VALUE klass) {
77
+ VALUE count, values;
78
+ rb_scan_args(argc, argv, "11", &count, &values);
79
+
80
+ register_set *set = ALLOC(register_set);
81
+ register_set_init(set, NUM2INT(count));
82
+ VALUE setv = Data_Wrap_Struct(klass, 0, register_set_free, set);
83
+
84
+ if (!NIL_P(values)) {
85
+ Check_Type(values, T_ARRAY);
86
+ if (RARRAY_LEN(values) == set->size) {
87
+ for (int i = 0; i < set->size; i++) {
88
+ set->values[i] = NUM2ULONG(rb_ary_entry(values, i));
89
+ }
90
+ } else {
91
+ rb_raise(rb_eArgError, "initial set of values is not of the correct size");
92
+ }
93
+ }
94
+
95
+ return setv;
96
+ }
97
+
98
+ static VALUE rb_register_set_index_set(VALUE self, VALUE position, VALUE value) {
99
+ Check_Type(position, T_FIXNUM);
100
+ Check_Type(value, T_FIXNUM);
101
+
102
+ register_set *set;
103
+ Data_Get_Struct(self, register_set, set);
104
+ register_set_set(set, NUM2INT(position), NUM2ULONG(value));
105
+
106
+ return Qnil;
107
+ }
108
+
109
+ static VALUE rb_register_set_index_get(VALUE self, VALUE position) {
110
+ Check_Type(position, T_FIXNUM);
111
+
112
+ register_set *set;
113
+ Data_Get_Struct(self, register_set, set);
114
+ return UINT2NUM(register_set_get(set, NUM2INT(position)));
115
+ }
116
+
117
+ static VALUE rb_register_set_update_if_greater(VALUE self, VALUE position, VALUE value) {
118
+ Check_Type(position, T_FIXNUM);
119
+ Check_Type(value, T_FIXNUM);
120
+
121
+ register_set *set;
122
+ Data_Get_Struct(self, register_set, set);
123
+ int rv = register_set_update_if_greater(set, NUM2INT(position), NUM2ULONG(value));
124
+
125
+ return rv ? Qtrue : Qfalse;
126
+ }
127
+
128
+ static VALUE rb_register_set_merge(VALUE self, VALUE other) {
129
+ register_set *set;
130
+ Data_Get_Struct(self, register_set, set);
131
+
132
+ register_set *other_set;
133
+ Data_Get_Struct(other, register_set, other_set);
134
+ if (other_set == NULL) {
135
+ rb_raise(rb_eTypeError, "other must be another register set");
136
+ return Qnil;
137
+ }
138
+
139
+ register_set_merge(set, other_set);
140
+
141
+ return self;
142
+ }
143
+
144
+ static VALUE rb_register_set_each(VALUE self) {
145
+ register_set *set;
146
+ Data_Get_Struct(self, register_set, set);
147
+
148
+ for (int i = 0; i < set->count; i++) {
149
+ rb_yield(UINT2NUM(register_set_get(set, i)));
150
+ }
151
+
152
+ return self;
153
+ }
154
+
155
+ static VALUE rb_register_set_serialize(VALUE self) {
156
+ register_set *set;
157
+ Data_Get_Struct(self, register_set, set);
158
+
159
+ int strsize = set->size * sizeof(int);
160
+ char *str = (char*)malloc(strsize + 1);
161
+ str[strsize] = 0;
162
+
163
+ for (int i = 0; i < set->size; i++) {
164
+ int value = set->values[i];
165
+ int offset = i * 4;
166
+
167
+ str[offset] = (char)(value >> 24);
168
+ str[offset + 1] = (char)(value >> 16);
169
+ str[offset + 2] = (char)(value >> 8);
170
+ str[offset + 3] = (char)value;
171
+ }
172
+
173
+ return rb_str_new(str, strsize);
174
+ }
175
+
176
+ static VALUE rb_register_set_size(VALUE self) {
177
+ register_set *set;
178
+ Data_Get_Struct(self, register_set, set);
179
+
180
+ return INT2NUM(set->size);
181
+ }
182
+
183
+ void Init_hyperll_register_set(void) {
184
+ rb_cRegisterSet = rb_define_class_under(rb_mHyperll, "RegisterSet", rb_cObject);
185
+ rb_include_module(rb_cRegisterSet, rb_mEnumerable);
186
+
187
+ rb_define_singleton_method(rb_cRegisterSet, "new", rb_register_set_new, -1);
188
+
189
+ rb_define_method(rb_cRegisterSet, "[]=", rb_register_set_index_set, 2);
190
+ rb_define_method(rb_cRegisterSet, "[]", rb_register_set_index_get, 1);
191
+ rb_define_method(rb_cRegisterSet, "update_if_greater", rb_register_set_update_if_greater, 2);
192
+ rb_define_method(rb_cRegisterSet, "merge", rb_register_set_merge, 1);
193
+ rb_define_method(rb_cRegisterSet, "each", rb_register_set_each, 0);
194
+ rb_define_method(rb_cRegisterSet, "serialize", rb_register_set_serialize, 0);
195
+ rb_define_method(rb_cRegisterSet, "size", rb_register_set_size, 0);
196
+ }
@@ -0,0 +1,19 @@
1
// Packed array of small registers stored in 32-bit words.
//
// The include guard avoids a leading double underscore, since such
// identifiers are reserved for the C implementation.
#ifndef HYPERLL_REGISTER_SET_H
#define HYPERLL_REGISTER_SET_H

#include <stdint.h>

typedef struct {
  int count;        // number of registers
  int size;         // number of 32-bit words in `values`
  uint32_t *values; // packed register storage, allocated by register_set_init
} register_set;

// Fills in a caller-allocated set so that it holds `count` zeroed registers.
void register_set_init(register_set *set, int count);

// Overwrites the register at `position` with `value`.
void register_set_set(register_set *set, int position, uint32_t value);

// Returns the register at `position`.
uint32_t register_set_get(register_set *set, int position);

// Stores `value` at `position` only if it is greater than the current value.
// Returns 1 if the register changed, 0 otherwise.
int register_set_update_if_greater(register_set *set, int position, uint32_t value);

// Merges `other` into `set`, keeping the larger value of each register.
void register_set_merge(register_set *set, register_set *other);

// Frees the values array and the set structure itself.
void register_set_free(register_set *set);

#endif // HYPERLL_REGISTER_SET_H
@@ -0,0 +1,76 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include "sparse_set.h"
5
+
6
+ void sparse_set_init(sparse_set *set, int sm, int capacity) {
7
+ set->sm = sm;
8
+ set->capacity = capacity;
9
+ set->size = 0;
10
+ set->values = (uint32_t*)calloc(set->capacity, sizeof(uint32_t));
11
+ }
12
+
13
+ int sparse_set_cardinality(sparse_set *set) {
14
+ return (int)round(set->sm * log(((double)set->sm) / (set->sm - set->size)));
15
+ }
16
+
17
// Extracts the index part of an encoded sparse value `k`.
//
// The lowest bit selects the layout: when it is set the index is k >> 7,
// otherwise it is k >> 1.
uint32_t sparse_set_sparse_index(uint32_t k) {
  unsigned int drop = ((k & 1) == 1) ? 7 : 1;
  return k >> drop;
}
24
+
25
+ int sparse_set_merge(sparse_set *set, sparse_set *other) {
26
+ uint32_t *svals = set->values;
27
+ int ssize = set->size;
28
+ uint32_t *ovals = other->values;
29
+ int osize = other->size;
30
+
31
+ int capacity = set->capacity;
32
+ uint32_t *values = (uint32_t*)calloc(capacity, sizeof(uint32_t));
33
+
34
+ // s tracks svals, o tracks ovals, size tracks values
35
+ int s = 0, o = 0, size = 0;
36
+ while (s < ssize || o < osize) {
37
+ // Check that we don't grow over capacity
38
+ if (size >= capacity) {
39
+ free(values);
40
+ return -1;
41
+ }
42
+
43
+ if (o >= osize) {
44
+ values[size++] = svals[s++];
45
+ } else if (s >= ssize) {
46
+ values[size++] = ovals[o++];
47
+ } else {
48
+ uint32_t sval = svals[s];
49
+ uint32_t sidx = sparse_set_sparse_index(sval);
50
+ uint32_t oval = ovals[o];
51
+ uint32_t oidx = sparse_set_sparse_index(oval);
52
+
53
+ if (sidx == oidx) {
54
+ values[size++] = sval < oval ? sval : oval;
55
+ s++;
56
+ o++;
57
+ } else if (sidx < oidx) {
58
+ values[size++] = sval;
59
+ s++;
60
+ } else {
61
+ values[size++] = oval;
62
+ o++;
63
+ }
64
+ }
65
+ }
66
+
67
+ free(set->values);
68
+ set->values = values;
69
+ set->size = size;
70
+ return 0;
71
+ }
72
+
73
+ void sparse_set_free(sparse_set *set) {
74
+ free(set->values);
75
+ free(set);
76
+ }
@@ -0,0 +1,24 @@
1
// Array of encoded sparse values with a fixed capacity.
//
// The include guard avoids a leading double underscore, since such
// identifiers are reserved for the C implementation.
#ifndef HYPERLL_SPARSE_SET_H
#define HYPERLL_SPARSE_SET_H

#include <stdint.h>

typedef struct {
  int sm;           // parameter of the cardinality formula
  int capacity;     // number of entries allocated in `values`
  int size;         // number of entries currently in use
  uint32_t *values; // encoded values, allocated by sparse_set_init
} sparse_set;

// Fills in a caller-allocated set with room for `capacity` values.
void sparse_set_init(sparse_set *set, int sm, int capacity);

// Returns sm * ln(sm / (sm - size)), rounded to the nearest integer.
int sparse_set_cardinality(sparse_set *set);

// Merges two sparse sets together.
//
// Returns 0 on success; -1 if the sparse set would grow too large
int sparse_set_merge(sparse_set *set, sparse_set *other);

// Extracts the index part of the encoded value `k`.
uint32_t sparse_set_sparse_index(uint32_t k);

// Frees the values array and the set structure itself.
void sparse_set_free(sparse_set *set);

#endif // HYPERLL_SPARSE_SET_H
@@ -0,0 +1,83 @@
1
+ #include "hyperll.h"
2
+ #include "varint.h"
3
+
4
+ extern VALUE rb_mHyperll; // Hyperll module handle; Varint is defined under it below.
5
+ VALUE rb_cVarint; // Hyperll::Varint class handle; assigned in Init_hyperll_varint.
6
+
7
// Encodes `value` as an unsigned varint: 7 bits per byte, least significant
// group first, with the high bit set on every byte except the last.
//
// bytes: destination buffer supplied by the caller; a 32-bit value needs at
//        most 5 bytes.
//
// Returns the number of bytes written.
int varint_write_unsigned(uint32_t value, uint8_t bytes[]) {
  int written = 0;

  // Emit continuation bytes while more than 7 bits remain.
  for (; (value & 0xFFFFFF80) != 0; value >>= 7) {
    bytes[written++] = (uint8_t)(0x80 | (value & 0x7F));
  }

  // Final byte: high bit clear.
  bytes[written++] = (uint8_t)(value & 0x7F);
  return written;
}
17
+
18
// Decodes an unsigned varint from `bytes`.
//
// bytes:  encoded input.
// maxlen: number of bytes that may be read from `bytes`.
// len:    receives the number of bytes consumed, or -1 on error.
//
// Returns the decoded value, or 0 when *len is -1. Decoding fails when
// maxlen is not positive, when the input ends before a byte without the
// continuation bit is found, or when the encoding runs past 5 bytes (the
// most a 32-bit value can need).
uint32_t varint_read_unsigned(uint8_t bytes[], int maxlen, int *len) {
  enum { MAX_VARINT_BYTES = 5 };

  if (maxlen <= 0) {
    *len = -1;
    return 0;
  }

  uint32_t value = 0;
  uint32_t b = 0;
  int i = 0;

  while (((b = bytes[i]) & 0x80) != 0) {
    value |= (b & 0x7F) << (i * 7);
    i++;

    // Stop before reading a byte the caller did not provide, and before a
    // sixth byte, whose shift (35 bits) would be undefined for a 32-bit
    // value.
    if (i >= maxlen || i >= MAX_VARINT_BYTES) {
      *len = -1;
      return 0;
    }
  }

  value |= (b << (i * 7));
  *len = (i + 1);
  return value;
}
41
+
42
+ static VALUE rb_varint_read_unsigned_var_int(VALUE klass, VALUE rbytes) {
43
+ int rlen = RARRAY_LEN(rbytes);
44
+ int maxlen = (rlen > 5) ? 5 : rlen;
45
+
46
+ uint8_t bytes[5];
47
+ for (int i = 0; i < maxlen; i++) {
48
+ bytes[i] = (uint8_t)NUM2INT(rb_ary_entry(rbytes, i));
49
+ }
50
+
51
+ int len = 0;
52
+ uint32_t value = varint_read_unsigned(bytes, maxlen, &len);
53
+
54
+ if (len == -1) {
55
+ rb_raise(rb_eRuntimeError, "Variable length quantity is too long");
56
+ return Qnil;
57
+ }
58
+
59
+ // Discard elements that were used to retrieve the value
60
+ for (int i = 0; i < len; i++) {
61
+ rb_ary_shift(rbytes);
62
+ }
63
+ return ULONG2NUM(value);
64
+ }
65
+
66
+ static VALUE rb_varint_write_unsigned_var_int(VALUE klass, VALUE value) {
67
+ VALUE rbytes = rb_ary_new2(5);
68
+
69
+ uint8_t bytes[5];
70
+ int len = varint_write_unsigned(NUM2ULONG(value), bytes);
71
+ for (int i = 0; i < len; i++) {
72
+ rb_ary_push(rbytes, INT2NUM(bytes[i]));
73
+ }
74
+
75
+ return rbytes;
76
+ }
77
+
78
+ void Init_hyperll_varint(void) {
79
+ rb_cVarint = rb_define_class_under(rb_mHyperll, "Varint", rb_cObject);
80
+
81
+ rb_define_singleton_method(rb_cVarint, "read_unsigned_var_int", rb_varint_read_unsigned_var_int, 1);
82
+ rb_define_singleton_method(rb_cVarint, "write_unsigned_var_int", rb_varint_write_unsigned_var_int, 1);
83
+ }
@@ -0,0 +1,21 @@
1
// Unsigned varint encoding and decoding for 32-bit values.
//
// The include guard avoids a leading double underscore, since such
// identifiers are reserved for the C implementation.
#ifndef HYPERLL_VARINT_H
#define HYPERLL_VARINT_H

// uint32_t and uint8_t are used below; include them here so that this
// header does not depend on what the including file pulled in first.
#include <stdint.h>

// Writes an unsigned varint.
//
// value: value to encode
// bytes: destination buffer. Caller is responsible for the allocation.
//        Must have room for 5 bytes.
//
// Returns the count of the bytes that were actually needed to store the varint
int varint_write_unsigned(uint32_t value, uint8_t bytes[]);

// Reads an unsigned varint.
//
// bytes: encoded value
// maxlen: the maximum number of bytes to read
// len: the number of bytes read to reconstruct the varint; -1 if an error
//      occurred
//
// Returns the decoded value (0 when an error occurred)
uint32_t varint_read_unsigned(uint8_t bytes[], int maxlen, int *len);

#endif // HYPERLL_VARINT_H