datasketches 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/README.md +117 -5
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/ext/datasketches/ext.cpp +8 -0
- data/ext/datasketches/extconf.rb +4 -1
- data/ext/datasketches/fi_wrapper.cpp +48 -0
- data/ext/datasketches/hll_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +87 -0
- data/ext/datasketches/theta_wrapper.cpp +12 -0
- data/ext/datasketches/vo_wrapper.cpp +39 -0
- data/lib/datasketches/version.rb +1 -1
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bf7f5b6898a9f014d14fc5fcfa3a6a5da1dd5fc70d1b07bbbdf07a446126d9cd
|
4
|
+
data.tar.gz: 15d5fbb2ee6e0c1a6e7f25a145329eb73da9d7933f5f1475596b9e9c998b31f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e3b412132bdaf7e45f85e1da48690ecccd3c510cc17fd5259f0244cb580f712dc58290fca50a547a9e0952cf7c0548124ed33be9224ba3b83d6dd461d70c2aab
|
7
|
+
data.tar.gz: 5f6d9932824961e11d6f8bd390dbf20e487b6ca8aef6b6440e5f6a11296227382be5261ebaf1da416d5f2354c7c99a97e34871b0964facc724ba33f2b88e96f6
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -12,14 +12,26 @@ Add this line to your application’s Gemfile:
|
|
12
12
|
gem 'datasketches'
|
13
13
|
```
|
14
14
|
|
15
|
-
##
|
15
|
+
## Sketch Families
|
16
16
|
|
17
17
|
Distinct counting
|
18
18
|
|
19
|
-
- [CPC
|
20
|
-
- [HyperLogLog
|
19
|
+
- [CPC sketch](#cpc-sketch)
|
20
|
+
- [HyperLogLog sketch](#hyperloglog-sketch)
|
21
21
|
|
22
|
-
|
22
|
+
Most frequent
|
23
|
+
|
24
|
+
- [Frequent item sketch](#frequent-item-sketch)
|
25
|
+
|
26
|
+
Quantiles and histograms
|
27
|
+
|
28
|
+
- [KLL sketch](#kll-sketch)
|
29
|
+
|
30
|
+
Sampling
|
31
|
+
|
32
|
+
- [VarOpt sketch](#varopt-sketch)
|
33
|
+
|
34
|
+
## CPC Sketch
|
23
35
|
|
24
36
|
Create a sketch
|
25
37
|
|
@@ -53,7 +65,7 @@ Load a sketch
|
|
53
65
|
sketch = DataSketches::CpcSketch.deserialize(data)
|
54
66
|
```
|
55
67
|
|
56
|
-
|
68
|
+
## HyperLogLog Sketch
|
57
69
|
|
58
70
|
Create a sketch
|
59
71
|
|
@@ -98,6 +110,106 @@ u.update(sketch2)
|
|
98
110
|
u.estimate
|
99
111
|
```
|
100
112
|
|
113
|
+
## Frequent Item Sketch
|
114
|
+
|
115
|
+
Create a sketch
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
sketch = DataSketches::FrequentStringsSketch.new(64)
|
119
|
+
```
|
120
|
+
|
121
|
+
Add data
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
sketch.update("a")
|
125
|
+
sketch.update("b")
|
126
|
+
sketch.update("c")
|
127
|
+
```
|
128
|
+
|
129
|
+
Estimate the frequency of an item
|
130
|
+
|
131
|
+
```ruby
|
132
|
+
sketch.estimate("a")
|
133
|
+
```
|
134
|
+
|
135
|
+
Save a sketch
|
136
|
+
|
137
|
+
```ruby
|
138
|
+
data = sketch.serialize
|
139
|
+
```
|
140
|
+
|
141
|
+
Load a sketch
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
sketch = DataSketches::FrequentStringsSketch.deserialize(data)
|
145
|
+
```
|
146
|
+
|
147
|
+
## KLL Sketch
|
148
|
+
|
149
|
+
Create a sketch
|
150
|
+
|
151
|
+
```ruby
|
152
|
+
sketch = DataSketches::KllIntsSketch.new(200)
|
153
|
+
# or
|
154
|
+
sketch = DataSketches::KllFloatsSketch.new(200)
|
155
|
+
```
|
156
|
+
|
157
|
+
Add data
|
158
|
+
|
159
|
+
```ruby
|
160
|
+
sketch.update(1)
|
161
|
+
sketch.update(2)
|
162
|
+
sketch.update(3)
|
163
|
+
```
|
164
|
+
|
165
|
+
Get quantiles
|
166
|
+
|
167
|
+
```ruby
|
168
|
+
sketch.quantile(0.5)
|
169
|
+
sketch.quantiles([0.25, 0.5, 0.75])
|
170
|
+
```
|
171
|
+
|
172
|
+
Get the minimum and maximum values from the stream
|
173
|
+
|
174
|
+
```ruby
|
175
|
+
sketch.min_value
|
176
|
+
sketch.max_value
|
177
|
+
```
|
178
|
+
|
179
|
+
Save a sketch
|
180
|
+
|
181
|
+
```ruby
|
182
|
+
data = sketch.serialize
|
183
|
+
```
|
184
|
+
|
185
|
+
Load a sketch
|
186
|
+
|
187
|
+
```ruby
|
188
|
+
sketch = DataSketches::KllIntsSketch.deserialize(data)
|
189
|
+
```
|
190
|
+
|
191
|
+
Merge sketches
|
192
|
+
|
193
|
+
```ruby
|
194
|
+
sketch.merge(sketch2)
|
195
|
+
```
|
196
|
+
|
197
|
+
## VarOpt Sketch
|
198
|
+
|
199
|
+
Create a sketch
|
200
|
+
|
201
|
+
```ruby
|
202
|
+
sketch = DataSketches::VarOptSketch.new(14)
|
203
|
+
```
|
204
|
+
|
205
|
+
Add data
|
206
|
+
|
207
|
+
```ruby
|
208
|
+
sketch.update(1)
|
209
|
+
sketch.update(2.0)
|
210
|
+
sketch.update("three")
|
211
|
+
```
|
212
|
+
|
101
213
|
## Credits
|
102
214
|
|
103
215
|
This library is modeled after the DataSketches [Python API](https://github.com/apache/datasketches-cpp/tree/master/python).
|
data/ext/datasketches/cpc_wrapper.cpp
CHANGED
@@ -15,7 +15,7 @@ void init_cpc(Rice::Module& m) {
|
|
15
15
|
.define_method(
|
16
16
|
"update",
|
17
17
|
*[](datasketches::cpc_sketch& self, Rice::Object datum) {
|
18
|
-
if (datum.
|
18
|
+
if (FIXNUM_P(datum.value())) {
|
19
19
|
return self.update(from_ruby<int64_t>(datum));
|
20
20
|
} else if (datum.is_a(rb_cNumeric)) {
|
21
21
|
return self.update(from_ruby<double>(datum));
|
data/ext/datasketches/ext.cpp
CHANGED
@@ -1,12 +1,20 @@
|
|
1
1
|
#include <rice/Module.hpp>
|
2
2
|
|
3
3
|
void init_cpc(Rice::Module& m);
|
4
|
+
void init_fi(Rice::Module& m);
|
4
5
|
void init_hll(Rice::Module& m);
|
6
|
+
void init_kll(Rice::Module& m);
|
7
|
+
void init_theta(Rice::Module& m);
|
8
|
+
void init_vo(Rice::Module& m);
|
5
9
|
|
6
10
|
extern "C"
|
7
11
|
void Init_ext()
|
8
12
|
{
|
9
13
|
Rice::Module m = Rice::define_module("DataSketches");
|
10
14
|
init_cpc(m);
|
15
|
+
init_fi(m);
|
11
16
|
init_hll(m);
|
17
|
+
init_kll(m);
|
18
|
+
init_theta(m);
|
19
|
+
init_vo(m);
|
12
20
|
}
|
data/ext/datasketches/extconf.rb
CHANGED
@@ -6,6 +6,9 @@ ext = File.expand_path(".", __dir__)
|
|
6
6
|
datasketches = File.expand_path("../../vendor/datasketches-cpp", __dir__)
|
7
7
|
|
8
8
|
$srcs = Dir["#{ext}/*.cpp"]
|
9
|
-
|
9
|
+
|
10
|
+
%w(common cpc fi hll kll sampling theta tuple).each do |v|
|
11
|
+
$INCFLAGS += " -I#{datasketches}/#{v}/include"
|
12
|
+
end
|
10
13
|
|
11
14
|
create_makefile("datasketches/ext")
|
data/ext/datasketches/fi_wrapper.cpp
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <frequent_items_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Constructor.hpp>
|
6
|
+
#include <rice/Module.hpp>
|
7
|
+
|
8
|
+
template<typename T>
|
9
|
+
void bind_fi_sketch(Rice::Module& m, const char* name) {
|
10
|
+
Rice::define_class_under<datasketches::frequent_items_sketch<T>>(m, name)
|
11
|
+
.define_constructor(Rice::Constructor<datasketches::frequent_items_sketch<T>, uint16_t>())
|
12
|
+
.define_method("empty?", &datasketches::frequent_items_sketch<T>::is_empty)
|
13
|
+
.define_method("num_active_items", &datasketches::frequent_items_sketch<T>::get_num_active_items)
|
14
|
+
.define_method("total_weight", &datasketches::frequent_items_sketch<T>::get_total_weight)
|
15
|
+
.define_method("estimate", &datasketches::frequent_items_sketch<T>::get_estimate)
|
16
|
+
.define_method("lower_bound", &datasketches::frequent_items_sketch<T>::get_lower_bound)
|
17
|
+
.define_method("upper_bound", &datasketches::frequent_items_sketch<T>::get_upper_bound)
|
18
|
+
.define_method("maximum_error", &datasketches::frequent_items_sketch<T>::get_maximum_error)
|
19
|
+
.define_method(
|
20
|
+
"update",
|
21
|
+
*[](datasketches::frequent_items_sketch<T>& self, const T item) {
|
22
|
+
self.update(item);
|
23
|
+
})
|
24
|
+
.define_method(
|
25
|
+
"serialize",
|
26
|
+
*[](datasketches::frequent_items_sketch<T>& self) {
|
27
|
+
std::ostringstream oss;
|
28
|
+
self.serialize(oss);
|
29
|
+
return oss.str();
|
30
|
+
})
|
31
|
+
// TODO change to summary?
|
32
|
+
.define_method(
|
33
|
+
"to_string",
|
34
|
+
*[](datasketches::frequent_items_sketch<T>& self) {
|
35
|
+
return self.to_string();
|
36
|
+
})
|
37
|
+
.define_singleton_method(
|
38
|
+
"deserialize",
|
39
|
+
// TODO figure out segfault
|
40
|
+
*[](std::string is) {
|
41
|
+
std::istringstream iss(is);
|
42
|
+
return datasketches::frequent_items_sketch<T>::deserialize(iss);
|
43
|
+
});
|
44
|
+
}
|
45
|
+
|
46
|
+
void init_fi(Rice::Module& m) {
|
47
|
+
bind_fi_sketch<std::string>(m, "FrequentStringsSketch");
|
48
|
+
}
|
data/ext/datasketches/hll_wrapper.cpp
CHANGED
@@ -14,7 +14,7 @@ void init_hll(Rice::Module& m) {
|
|
14
14
|
.define_method(
|
15
15
|
"update",
|
16
16
|
*[](datasketches::hll_sketch& self, Rice::Object datum) {
|
17
|
-
if (datum.
|
17
|
+
if (FIXNUM_P(datum.value())) {
|
18
18
|
return self.update(from_ruby<int64_t>(datum));
|
19
19
|
} else if (datum.is_a(rb_cNumeric)) {
|
20
20
|
return self.update(from_ruby<double>(datum));
|
@@ -54,7 +54,7 @@ void init_hll(Rice::Module& m) {
|
|
54
54
|
return datasketches::hll_sketch::deserialize(iss);
|
55
55
|
});
|
56
56
|
|
57
|
-
define_class_under<datasketches::hll_union>(m, "HllUnion")
|
57
|
+
Rice::define_class_under<datasketches::hll_union>(m, "HllUnion")
|
58
58
|
.define_constructor(Rice::Constructor<datasketches::hll_union, int>())
|
59
59
|
.define_method(
|
60
60
|
"update",
|
data/ext/datasketches/kll_wrapper.cpp
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <kll_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Constructor.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
|
9
|
+
template<>
|
10
|
+
std::vector<double> from_ruby<std::vector<double>>(Rice::Object x)
|
11
|
+
{
|
12
|
+
auto a = Rice::Array(x);
|
13
|
+
std::vector<double> vec(a.size());
|
14
|
+
for (size_t i = 0; i < a.size(); i++) {
|
15
|
+
vec[i] = from_ruby<double>(a[i]);
|
16
|
+
}
|
17
|
+
return vec;
|
18
|
+
}
|
19
|
+
|
20
|
+
template<>
|
21
|
+
Rice::Object to_ruby<std::vector<int>>(std::vector<int> const & x)
|
22
|
+
{
|
23
|
+
auto a = Rice::Array();
|
24
|
+
for (size_t i = 0; i < x.size(); i++) {
|
25
|
+
a.push(x[i]);
|
26
|
+
}
|
27
|
+
return a;
|
28
|
+
}
|
29
|
+
|
30
|
+
template<>
|
31
|
+
Rice::Object to_ruby<std::vector<float>>(std::vector<float> const & x)
|
32
|
+
{
|
33
|
+
auto a = Rice::Array();
|
34
|
+
for (size_t i = 0; i < x.size(); i++) {
|
35
|
+
a.push(x[i]);
|
36
|
+
}
|
37
|
+
return a;
|
38
|
+
}
|
39
|
+
|
40
|
+
template<typename T>
|
41
|
+
void bind_kll_sketch(Rice::Module& m, const char* name) {
|
42
|
+
Rice::define_class_under<datasketches::kll_sketch<T>>(m, name)
|
43
|
+
.define_constructor(Rice::Constructor<datasketches::kll_sketch<T>, uint16_t>())
|
44
|
+
.define_method("empty?", &datasketches::kll_sketch<T>::is_empty)
|
45
|
+
.define_method("min_value", &datasketches::kll_sketch<T>::get_min_value)
|
46
|
+
.define_method("max_value", &datasketches::kll_sketch<T>::get_max_value)
|
47
|
+
.define_method("quantile", &datasketches::kll_sketch<T>::get_quantile)
|
48
|
+
.define_method(
|
49
|
+
"quantiles",
|
50
|
+
*[](datasketches::kll_sketch<T>& self, std::vector<double> fractions) {
|
51
|
+
return self.get_quantiles(&fractions[0], fractions.size());
|
52
|
+
})
|
53
|
+
.define_method(
|
54
|
+
"merge",
|
55
|
+
*[](datasketches::kll_sketch<T>& self, const datasketches::kll_sketch<T>& other) {
|
56
|
+
self.merge(other);
|
57
|
+
})
|
58
|
+
.define_method(
|
59
|
+
"update",
|
60
|
+
*[](datasketches::kll_sketch<T>& self, const T item) {
|
61
|
+
self.update(item);
|
62
|
+
})
|
63
|
+
.define_method(
|
64
|
+
"serialize",
|
65
|
+
*[](datasketches::kll_sketch<T>& self) {
|
66
|
+
std::ostringstream oss;
|
67
|
+
self.serialize(oss);
|
68
|
+
return oss.str();
|
69
|
+
})
|
70
|
+
// TODO change to summary?
|
71
|
+
.define_method(
|
72
|
+
"to_string",
|
73
|
+
*[](datasketches::kll_sketch<T>& self) {
|
74
|
+
return self.to_string();
|
75
|
+
})
|
76
|
+
.define_singleton_method(
|
77
|
+
"deserialize",
|
78
|
+
*[](std::string& is) {
|
79
|
+
std::istringstream iss(is);
|
80
|
+
return datasketches::kll_sketch<T>::deserialize(iss);
|
81
|
+
});
|
82
|
+
}
|
83
|
+
|
84
|
+
void init_kll(Rice::Module& m) {
|
85
|
+
bind_kll_sketch<int>(m, "KllIntsSketch");
|
86
|
+
bind_kll_sketch<float>(m, "KllFloatsSketch");
|
87
|
+
}
|
data/ext/datasketches/theta_wrapper.cpp
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <theta_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Constructor.hpp>
|
6
|
+
#include <rice/Module.hpp>
|
7
|
+
|
8
|
+
void init_theta(Rice::Module& m) {
|
9
|
+
Rice::define_class_under<datasketches::theta_sketch>(m, "ThetaSketch")
|
10
|
+
.define_method("empty?", &datasketches::theta_sketch::is_empty)
|
11
|
+
.define_method("estimate", &datasketches::theta_sketch::get_estimate);
|
12
|
+
}
|
data/ext/datasketches/vo_wrapper.cpp
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
#include <sstream>
|
2
|
+
|
3
|
+
#include <var_opt_sketch.hpp>
|
4
|
+
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Constructor.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
|
9
|
+
template<typename T>
|
10
|
+
void bind_vo_sketch(Rice::Module &m, const char* name) {
|
11
|
+
Rice::define_class_under<datasketches::var_opt_sketch<T>>(m, "VarOptSketch")
|
12
|
+
.define_constructor(Rice::Constructor<datasketches::var_opt_sketch<T>, uint32_t>())
|
13
|
+
.define_method("k", &datasketches::var_opt_sketch<T>::get_k)
|
14
|
+
.define_method("n", &datasketches::var_opt_sketch<T>::get_n)
|
15
|
+
.define_method("num_samples", &datasketches::var_opt_sketch<T>::get_num_samples)
|
16
|
+
.define_method("empty?", &datasketches::var_opt_sketch<T>::is_empty)
|
17
|
+
.define_method("reset", &datasketches::var_opt_sketch<T>::reset)
|
18
|
+
.define_method(
|
19
|
+
"samples",
|
20
|
+
*[](datasketches::var_opt_sketch<T>& self) {
|
21
|
+
auto a = Rice::Array();
|
22
|
+
for (auto item : self) {
|
23
|
+
auto t = Rice::Array();
|
24
|
+
t.push(item.first);
|
25
|
+
t.push(item.second);
|
26
|
+
a.push(t);
|
27
|
+
}
|
28
|
+
return a;
|
29
|
+
})
|
30
|
+
.define_method(
|
31
|
+
"update",
|
32
|
+
*[](datasketches::var_opt_sketch<T>& self, const T item) {
|
33
|
+
self.update(item);
|
34
|
+
});
|
35
|
+
}
|
36
|
+
|
37
|
+
void init_vo(Rice::Module& m) {
|
38
|
+
bind_vo_sketch<Rice::Object>(m, "VarOptSketch");
|
39
|
+
}
|
data/lib/datasketches/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datasketches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
@@ -38,7 +38,11 @@ files:
|
|
38
38
|
- ext/datasketches/cpc_wrapper.cpp
|
39
39
|
- ext/datasketches/ext.cpp
|
40
40
|
- ext/datasketches/extconf.rb
|
41
|
+
- ext/datasketches/fi_wrapper.cpp
|
41
42
|
- ext/datasketches/hll_wrapper.cpp
|
43
|
+
- ext/datasketches/kll_wrapper.cpp
|
44
|
+
- ext/datasketches/theta_wrapper.cpp
|
45
|
+
- ext/datasketches/vo_wrapper.cpp
|
42
46
|
- lib/datasketches.rb
|
43
47
|
- lib/datasketches/version.rb
|
44
48
|
- vendor/datasketches-cpp/CMakeLists.txt
|