datasketches 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/README.md +117 -5
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/ext/datasketches/ext.cpp +8 -0
- data/ext/datasketches/extconf.rb +4 -1
- data/ext/datasketches/fi_wrapper.cpp +48 -0
- data/ext/datasketches/hll_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +87 -0
- data/ext/datasketches/theta_wrapper.cpp +12 -0
- data/ext/datasketches/vo_wrapper.cpp +39 -0
- data/lib/datasketches/version.rb +1 -1
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf7f5b6898a9f014d14fc5fcfa3a6a5da1dd5fc70d1b07bbbdf07a446126d9cd
|
4
|
+
data.tar.gz: 15d5fbb2ee6e0c1a6e7f25a145329eb73da9d7933f5f1475596b9e9c998b31f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e3b412132bdaf7e45f85e1da48690ecccd3c510cc17fd5259f0244cb580f712dc58290fca50a547a9e0952cf7c0548124ed33be9224ba3b83d6dd461d70c2aab
|
7
|
+
data.tar.gz: 5f6d9932824961e11d6f8bd390dbf20e487b6ca8aef6b6440e5f6a11296227382be5261ebaf1da416d5f2354c7c99a97e34871b0964facc724ba33f2b88e96f6
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -12,14 +12,26 @@ Add this line to your application’s Gemfile:
|
|
12
12
|
gem 'datasketches'
|
13
13
|
```
|
14
14
|
|
15
|
-
##
|
15
|
+
## Sketch Families
|
16
16
|
|
17
17
|
Distinct counting
|
18
18
|
|
19
|
-
- [CPC
|
20
|
-
- [HyperLogLog
|
19
|
+
- [CPC sketch](#cpc-sketch)
|
20
|
+
- [HyperLogLog sketch](#hyperloglog-sketch)
|
21
21
|
|
22
|
-
|
22
|
+
Most frequent
|
23
|
+
|
24
|
+
- [Frequent item sketch](#frequent-item-sketch)
|
25
|
+
|
26
|
+
Quantiles and histograms
|
27
|
+
|
28
|
+
- [KLL sketch](#kll-sketch)
|
29
|
+
|
30
|
+
Sampling
|
31
|
+
|
32
|
+
- [VarOpt sketch](#varopt-sketch)
|
33
|
+
|
34
|
+
## CPC Sketch
|
23
35
|
|
24
36
|
Create a sketch
|
25
37
|
|
@@ -53,7 +65,7 @@ Load a sketch
|
|
53
65
|
sketch = DataSketches::CpcSketch.deserialize(data)
|
54
66
|
```
|
55
67
|
|
56
|
-
|
68
|
+
## HyperLogLog Sketch
|
57
69
|
|
58
70
|
Create a sketch
|
59
71
|
|
@@ -98,6 +110,106 @@ u.update(sketch2)
|
|
98
110
|
u.estimate
|
99
111
|
```
|
100
112
|
|
113
|
+
## Frequent Item Sketch
|
114
|
+
|
115
|
+
Create a sketch
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
sketch = DataSketches::FrequentStringsSketch.new(64)
|
119
|
+
```
|
120
|
+
|
121
|
+
Add data
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
sketch.update("a")
|
125
|
+
sketch.update("b")
|
126
|
+
sketch.update("c")
|
127
|
+
```
|
128
|
+
|
129
|
+
Estimate the frequency of an item
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
sketch.estimate("a")
|
133
|
+
```
|
134
|
+
|
135
|
+
Save a sketch
|
136
|
+
|
137
|
+
```ruby
|
138
|
+
data = sketch.serialize
|
139
|
+
```
|
140
|
+
|
141
|
+
Load a sketch
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
sketch = DataSketches::FrequentStringsSketch.deserialize(data)
|
145
|
+
```
|
146
|
+
|
147
|
+
## KLL Sketch
|
148
|
+
|
149
|
+
Create a sketch
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
sketch = DataSketches::KllIntsSketch.new(200)
|
153
|
+
# or
|
154
|
+
sketch = DataSketches::KllFloatsSketch.new(200)
|
155
|
+
```
|
156
|
+
|
157
|
+
Add data
|
158
|
+
|
159
|
+
```ruby
|
160
|
+
sketch.update(1)
|
161
|
+
sketch.update(2)
|
162
|
+
sketch.update(3)
|
163
|
+
```
|
164
|
+
|
165
|
+
Get quantiles
|
166
|
+
|
167
|
+
```ruby
|
168
|
+
sketch.quantile(0.5)
|
169
|
+
sketch.quantiles([0.25, 0.5, 0.75])
|
170
|
+
```
|
171
|
+
|
172
|
+
Get the minimum and maximum values from the stream
|
173
|
+
|
174
|
+
```ruby
|
175
|
+
sketch.min_value
|
176
|
+
sketch.max_value
|
177
|
+
```
|
178
|
+
|
179
|
+
Save a sketch
|
180
|
+
|
181
|
+
```ruby
|
182
|
+
data = sketch.serialize
|
183
|
+
```
|
184
|
+
|
185
|
+
Load a sketch
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
sketch = DataSketches::KllIntsSketch.deserialize(data)
|
189
|
+
```
|
190
|
+
|
191
|
+
Merge sketches
|
192
|
+
|
193
|
+
```ruby
|
194
|
+
sketch.merge(sketch2)
|
195
|
+
```
|
196
|
+
|
197
|
+
## VarOpt Sketch
|
198
|
+
|
199
|
+
Create a sketch
|
200
|
+
|
201
|
+
```ruby
|
202
|
+
sketch = DataSketches::VarOptSketch.new(14)
|
203
|
+
```
|
204
|
+
|
205
|
+
Add data
|
206
|
+
|
207
|
+
```ruby
|
208
|
+
sketch.update(1)
|
209
|
+
sketch.update(2.0)
|
210
|
+
sketch.update("three")
|
211
|
+
```
|
212
|
+
|
101
213
|
## Credits
|
102
214
|
|
103
215
|
This library is modeled after the DataSketches [Python API](https://github.com/apache/datasketches-cpp/tree/master/python).
|
data/ext/datasketches/cpc_wrapper.cpp
CHANGED
@@ -15,7 +15,7 @@ void init_cpc(Rice::Module& m) {
|
|
15
15
|
.define_method(
|
16
16
|
"update",
|
17
17
|
*[](datasketches::cpc_sketch& self, Rice::Object datum) {
|
18
|
-
if (datum.
|
18
|
+
if (FIXNUM_P(datum.value())) {
|
19
19
|
return self.update(from_ruby<int64_t>(datum));
|
20
20
|
} else if (datum.is_a(rb_cNumeric)) {
|
21
21
|
return self.update(from_ruby<double>(datum));
|
data/ext/datasketches/ext.cpp
CHANGED
@@ -1,12 +1,20 @@
|
|
1
1
|
#include <rice/Module.hpp>
|
2
2
|
|
3
3
|
void init_cpc(Rice::Module& m);
|
4
|
+
void init_fi(Rice::Module& m);
|
4
5
|
void init_hll(Rice::Module& m);
|
6
|
+
void init_kll(Rice::Module& m);
|
7
|
+
void init_theta(Rice::Module& m);
|
8
|
+
void init_vo(Rice::Module& m);
|
5
9
|
|
6
10
|
extern "C"
|
7
11
|
void Init_ext()
|
8
12
|
{
|
9
13
|
Rice::Module m = Rice::define_module("DataSketches");
|
10
14
|
init_cpc(m);
|
15
|
+
init_fi(m);
|
11
16
|
init_hll(m);
|
17
|
+
init_kll(m);
|
18
|
+
init_theta(m);
|
19
|
+
init_vo(m);
|
12
20
|
}
|
data/ext/datasketches/extconf.rb
CHANGED
@@ -6,6 +6,9 @@ ext = File.expand_path(".", __dir__)
|
|
6
6
|
datasketches = File.expand_path("../../vendor/datasketches-cpp", __dir__)
|
7
7
|
|
8
8
|
$srcs = Dir["#{ext}/*.cpp"]
|
9
|
-
|
9
|
+
|
10
|
+
%w(common cpc fi hll kll sampling theta tuple).each do |v|
|
11
|
+
$INCFLAGS += " -I#{datasketches}/#{v}/include"
|
12
|
+
end
|
10
13
|
|
11
14
|
create_makefile("datasketches/ext")
|
data/ext/datasketches/fi_wrapper.cpp
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <frequent_items_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Constructor.hpp>
|
6
|
+
#include <rice/Module.hpp>
|
7
|
+
|
8
|
+
template<typename T>
|
9
|
+
void bind_fi_sketch(Rice::Module& m, const char* name) {
|
10
|
+
Rice::define_class_under<datasketches::frequent_items_sketch<T>>(m, name)
|
11
|
+
.define_constructor(Rice::Constructor<datasketches::frequent_items_sketch<T>, uint16_t>())
|
12
|
+
.define_method("empty?", &datasketches::frequent_items_sketch<T>::is_empty)
|
13
|
+
.define_method("num_active_items", &datasketches::frequent_items_sketch<T>::get_num_active_items)
|
14
|
+
.define_method("total_weight", &datasketches::frequent_items_sketch<T>::get_total_weight)
|
15
|
+
.define_method("estimate", &datasketches::frequent_items_sketch<T>::get_estimate)
|
16
|
+
.define_method("lower_bound", &datasketches::frequent_items_sketch<T>::get_lower_bound)
|
17
|
+
.define_method("upper_bound", &datasketches::frequent_items_sketch<T>::get_upper_bound)
|
18
|
+
.define_method("maximum_error", &datasketches::frequent_items_sketch<T>::get_maximum_error)
|
19
|
+
.define_method(
|
20
|
+
"update",
|
21
|
+
*[](datasketches::frequent_items_sketch<T>& self, const T item) {
|
22
|
+
self.update(item);
|
23
|
+
})
|
24
|
+
.define_method(
|
25
|
+
"serialize",
|
26
|
+
*[](datasketches::frequent_items_sketch<T>& self) {
|
27
|
+
std::ostringstream oss;
|
28
|
+
self.serialize(oss);
|
29
|
+
return oss.str();
|
30
|
+
})
|
31
|
+
// TODO change to summary?
|
32
|
+
.define_method(
|
33
|
+
"to_string",
|
34
|
+
*[](datasketches::frequent_items_sketch<T>& self) {
|
35
|
+
return self.to_string();
|
36
|
+
})
|
37
|
+
.define_singleton_method(
|
38
|
+
"deserialize",
|
39
|
+
// TODO figure out segfault
|
40
|
+
*[](std::string is) {
|
41
|
+
std::istringstream iss(is);
|
42
|
+
return datasketches::frequent_items_sketch<T>::deserialize(iss);
|
43
|
+
});
|
44
|
+
}
|
45
|
+
|
46
|
+
void init_fi(Rice::Module& m) {
|
47
|
+
bind_fi_sketch<std::string>(m, "FrequentStringsSketch");
|
48
|
+
}
|
data/ext/datasketches/hll_wrapper.cpp
CHANGED
@@ -14,7 +14,7 @@ void init_hll(Rice::Module& m) {
|
|
14
14
|
.define_method(
|
15
15
|
"update",
|
16
16
|
*[](datasketches::hll_sketch& self, Rice::Object datum) {
|
17
|
-
if (datum.
|
17
|
+
if (FIXNUM_P(datum.value())) {
|
18
18
|
return self.update(from_ruby<int64_t>(datum));
|
19
19
|
} else if (datum.is_a(rb_cNumeric)) {
|
20
20
|
return self.update(from_ruby<double>(datum));
|
@@ -54,7 +54,7 @@ void init_hll(Rice::Module& m) {
|
|
54
54
|
return datasketches::hll_sketch::deserialize(iss);
|
55
55
|
});
|
56
56
|
|
57
|
-
define_class_under<datasketches::hll_union>(m, "HllUnion")
|
57
|
+
Rice::define_class_under<datasketches::hll_union>(m, "HllUnion")
|
58
58
|
.define_constructor(Rice::Constructor<datasketches::hll_union, int>())
|
59
59
|
.define_method(
|
60
60
|
"update",
|
data/ext/datasketches/kll_wrapper.cpp
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <kll_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Constructor.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
|
9
|
+
template<>
|
10
|
+
std::vector<double> from_ruby<std::vector<double>>(Rice::Object x)
|
11
|
+
{
|
12
|
+
auto a = Rice::Array(x);
|
13
|
+
std::vector<double> vec(a.size());
|
14
|
+
for (size_t i = 0; i < a.size(); i++) {
|
15
|
+
vec[i] = from_ruby<double>(a[i]);
|
16
|
+
}
|
17
|
+
return vec;
|
18
|
+
}
|
19
|
+
|
20
|
+
template<>
|
21
|
+
Rice::Object to_ruby<std::vector<int>>(std::vector<int> const & x)
|
22
|
+
{
|
23
|
+
auto a = Rice::Array();
|
24
|
+
for (size_t i = 0; i < x.size(); i++) {
|
25
|
+
a.push(x[i]);
|
26
|
+
}
|
27
|
+
return a;
|
28
|
+
}
|
29
|
+
|
30
|
+
template<>
|
31
|
+
Rice::Object to_ruby<std::vector<float>>(std::vector<float> const & x)
|
32
|
+
{
|
33
|
+
auto a = Rice::Array();
|
34
|
+
for (size_t i = 0; i < x.size(); i++) {
|
35
|
+
a.push(x[i]);
|
36
|
+
}
|
37
|
+
return a;
|
38
|
+
}
|
39
|
+
|
40
|
+
template<typename T>
|
41
|
+
void bind_kll_sketch(Rice::Module& m, const char* name) {
|
42
|
+
Rice::define_class_under<datasketches::kll_sketch<T>>(m, name)
|
43
|
+
.define_constructor(Rice::Constructor<datasketches::kll_sketch<T>, uint16_t>())
|
44
|
+
.define_method("empty?", &datasketches::kll_sketch<T>::is_empty)
|
45
|
+
.define_method("min_value", &datasketches::kll_sketch<T>::get_min_value)
|
46
|
+
.define_method("max_value", &datasketches::kll_sketch<T>::get_max_value)
|
47
|
+
.define_method("quantile", &datasketches::kll_sketch<T>::get_quantile)
|
48
|
+
.define_method(
|
49
|
+
"quantiles",
|
50
|
+
*[](datasketches::kll_sketch<T>& self, std::vector<double> fractions) {
|
51
|
+
return self.get_quantiles(&fractions[0], fractions.size());
|
52
|
+
})
|
53
|
+
.define_method(
|
54
|
+
"merge",
|
55
|
+
*[](datasketches::kll_sketch<T>& self, const datasketches::kll_sketch<T>& other) {
|
56
|
+
self.merge(other);
|
57
|
+
})
|
58
|
+
.define_method(
|
59
|
+
"update",
|
60
|
+
*[](datasketches::kll_sketch<T>& self, const T item) {
|
61
|
+
self.update(item);
|
62
|
+
})
|
63
|
+
.define_method(
|
64
|
+
"serialize",
|
65
|
+
*[](datasketches::kll_sketch<T>& self) {
|
66
|
+
std::ostringstream oss;
|
67
|
+
self.serialize(oss);
|
68
|
+
return oss.str();
|
69
|
+
})
|
70
|
+
// TODO change to summary?
|
71
|
+
.define_method(
|
72
|
+
"to_string",
|
73
|
+
*[](datasketches::kll_sketch<T>& self) {
|
74
|
+
return self.to_string();
|
75
|
+
})
|
76
|
+
.define_singleton_method(
|
77
|
+
"deserialize",
|
78
|
+
*[](std::string& is) {
|
79
|
+
std::istringstream iss(is);
|
80
|
+
return datasketches::kll_sketch<T>::deserialize(iss);
|
81
|
+
});
|
82
|
+
}
|
83
|
+
|
84
|
+
void init_kll(Rice::Module& m) {
|
85
|
+
bind_kll_sketch<int>(m, "KllIntsSketch");
|
86
|
+
bind_kll_sketch<float>(m, "KllFloatsSketch");
|
87
|
+
}
|
data/ext/datasketches/theta_wrapper.cpp
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <theta_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Constructor.hpp>
|
6
|
+
#include <rice/Module.hpp>
|
7
|
+
|
8
|
+
void init_theta(Rice::Module& m) {
|
9
|
+
Rice::define_class_under<datasketches::theta_sketch>(m, "ThetaSketch")
|
10
|
+
.define_method("empty?", &datasketches::theta_sketch::is_empty)
|
11
|
+
.define_method("estimate", &datasketches::theta_sketch::get_estimate);
|
12
|
+
}
|
data/ext/datasketches/vo_wrapper.cpp
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <var_opt_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Constructor.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
|
9
|
+
template<typename T>
|
10
|
+
void bind_vo_sketch(Rice::Module &m, const char* name) {
|
11
|
+
Rice::define_class_under<datasketches::var_opt_sketch<T>>(m, "VarOptSketch")
|
12
|
+
.define_constructor(Rice::Constructor<datasketches::var_opt_sketch<T>, uint32_t>())
|
13
|
+
.define_method("k", &datasketches::var_opt_sketch<T>::get_k)
|
14
|
+
.define_method("n", &datasketches::var_opt_sketch<T>::get_n)
|
15
|
+
.define_method("num_samples", &datasketches::var_opt_sketch<T>::get_num_samples)
|
16
|
+
.define_method("empty?", &datasketches::var_opt_sketch<T>::is_empty)
|
17
|
+
.define_method("reset", &datasketches::var_opt_sketch<T>::reset)
|
18
|
+
.define_method(
|
19
|
+
"samples",
|
20
|
+
*[](datasketches::var_opt_sketch<T>& self) {
|
21
|
+
auto a = Rice::Array();
|
22
|
+
for (auto item : self) {
|
23
|
+
auto t = Rice::Array();
|
24
|
+
t.push(item.first);
|
25
|
+
t.push(item.second);
|
26
|
+
a.push(t);
|
27
|
+
}
|
28
|
+
return a;
|
29
|
+
})
|
30
|
+
.define_method(
|
31
|
+
"update",
|
32
|
+
*[](datasketches::var_opt_sketch<T>& self, const T item) {
|
33
|
+
self.update(item);
|
34
|
+
});
|
35
|
+
}
|
36
|
+
|
37
|
+
void init_vo(Rice::Module& m) {
|
38
|
+
bind_vo_sketch<Rice::Object>(m, "VarOptSketch");
|
39
|
+
}
|
data/lib/datasketches/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datasketches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
@@ -38,7 +38,11 @@ files:
|
|
38
38
|
- ext/datasketches/cpc_wrapper.cpp
|
39
39
|
- ext/datasketches/ext.cpp
|
40
40
|
- ext/datasketches/extconf.rb
|
41
|
+
- ext/datasketches/fi_wrapper.cpp
|
41
42
|
- ext/datasketches/hll_wrapper.cpp
|
43
|
+
- ext/datasketches/kll_wrapper.cpp
|
44
|
+
- ext/datasketches/theta_wrapper.cpp
|
45
|
+
- ext/datasketches/vo_wrapper.cpp
|
42
46
|
- lib/datasketches.rb
|
43
47
|
- lib/datasketches/version.rb
|
44
48
|
- vendor/datasketches-cpp/CMakeLists.txt
|