datasketches 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bf3a004350a15e151c0114a7351519d14f2d9fc793c5ac87a8daa9756be42af
4
- data.tar.gz: a48f65a18a1a099e68a4d0de0714b5a1db888c97348c021cd68d6e5a1b066abf
3
+ metadata.gz: bf7f5b6898a9f014d14fc5fcfa3a6a5da1dd5fc70d1b07bbbdf07a446126d9cd
4
+ data.tar.gz: 15d5fbb2ee6e0c1a6e7f25a145329eb73da9d7933f5f1475596b9e9c998b31f7
5
5
  SHA512:
6
- metadata.gz: 89cce97d6a892575dfbe1441d8815742fdd827d26b9f3faafef4e9cc99e5386f025cd00a5e0e1d0188520c2f41405062fcb58c4950d8238d234eff2513234566
7
- data.tar.gz: 3cc79a07e80f42b58ea6a3a5e792f72ea79b6f7980076bd92b9cb43c1906bc4c3f30bb6156347c1acce4a2b1d69407ec8178e7775e7ed45bab5930f052fc7b71
6
+ metadata.gz: e3b412132bdaf7e45f85e1da48690ecccd3c510cc17fd5259f0244cb580f712dc58290fca50a547a9e0952cf7c0548124ed33be9224ba3b83d6dd461d70c2aab
7
+ data.tar.gz: 5f6d9932824961e11d6f8bd390dbf20e487b6ca8aef6b6440e5f6a11296227382be5261ebaf1da416d5f2354c7c99a97e34871b0964facc724ba33f2b88e96f6
@@ -1,3 +1,8 @@
1
- ## 0.1.0 (unreleased)
1
+ ## 0.1.1 (2021-01-20)
2
+
3
+ - Added more sketches
4
+ - Fixed installation on Linux and Windows
5
+
6
+ ## 0.1.0 (2021-01-19)
2
7
 
3
8
  - First release
data/README.md CHANGED
@@ -12,14 +12,26 @@ Add this line to your application’s Gemfile:
12
12
  gem 'datasketches'
13
13
  ```
14
14
 
15
- ## Data Structures
15
+ ## Sketch Families
16
16
 
17
17
  Distinct counting
18
18
 
19
- - [CPC Sketch](#cpc-sketch)
20
- - [HyperLogLog Sketch](#hyperloglog-sketch)
19
+ - [CPC sketch](#cpc-sketch)
20
+ - [HyperLogLog sketch](#hyperloglog-sketch)
21
21
 
22
- ### CPC Sketch
22
+ Most frequent
23
+
24
+ - [Frequent item sketch](#frequent-item-sketch)
25
+
26
+ Quantiles and histograms
27
+
28
+ - [KLL sketch](#kll-sketch)
29
+
30
+ Sampling
31
+
32
+ - [VarOpt sketch](#varopt-sketch)
33
+
34
+ ## CPC Sketch
23
35
 
24
36
  Create a sketch
25
37
 
@@ -53,7 +65,7 @@ Load a sketch
53
65
  sketch = DataSketches::CpcSketch.deserialize(data)
54
66
  ```
55
67
 
56
- ### HyperLogLog Sketch
68
+ ## HyperLogLog Sketch
57
69
 
58
70
  Create a sketch
59
71
 
@@ -98,6 +110,106 @@ u.update(sketch2)
98
110
  u.estimate
99
111
  ```
100
112
 
113
+ ## Frequent Item Sketch
114
+
115
+ Create a sketch
116
+
117
+ ```ruby
118
+ sketch = DataSketches::FrequentStringsSketch.new(64)
119
+ ```
120
+
121
+ Add data
122
+
123
+ ```ruby
124
+ sketch.update("a")
125
+ sketch.update("b")
126
+ sketch.update("c")
127
+ ```
128
+
129
+ Estimate the frequency of an item
130
+
131
+ ```ruby
132
+ sketch.estimate("a")
133
+ ```
134
+
135
+ Save a sketch
136
+
137
+ ```ruby
138
+ data = sketch.serialize
139
+ ```
140
+
141
+ Load a sketch
142
+
143
+ ```ruby
144
+ sketch = DataSketches::FrequentStringsSketch.deserialize(data)
145
+ ```
146
+
147
+ ## KLL Sketch
148
+
149
+ Create a sketch
150
+
151
+ ```ruby
152
+ sketch = DataSketches::KllIntsSketch.new(200)
153
+ # or
154
+ sketch = DataSketches::KllFloatsSketch.new(200)
155
+ ```
156
+
157
+ Add data
158
+
159
+ ```ruby
160
+ sketch.update(1)
161
+ sketch.update(2)
162
+ sketch.update(3)
163
+ ```
164
+
165
+ Get quantiles
166
+
167
+ ```ruby
168
+ sketch.quantile(0.5)
169
+ sketch.quantiles([0.25, 0.5, 0.75])
170
+ ```
171
+
172
+ Get the minimum and maximum values from the stream
173
+
174
+ ```ruby
175
+ sketch.min_value
176
+ sketch.max_value
177
+ ```
178
+
179
+ Save a sketch
180
+
181
+ ```ruby
182
+ data = sketch.serialize
183
+ ```
184
+
185
+ Load a sketch
186
+
187
+ ```ruby
188
+ sketch = DataSketches::KllIntsSketch.deserialize(data)
189
+ ```
190
+
191
+ Merge sketches
192
+
193
+ ```ruby
194
+ sketch.merge(sketch2)
195
+ ```
196
+
197
+ ## VarOpt Sketch
198
+
199
+ Create a sketch
200
+
201
+ ```ruby
202
+ sketch = DataSketches::VarOptSketch.new(14)
203
+ ```
204
+
205
+ Add data
206
+
207
+ ```ruby
208
+ sketch.update(1)
209
+ sketch.update(2.0)
210
+ sketch.update("three")
211
+ ```
212
+
101
213
  ## Credits
102
214
 
103
215
  This library is modeled after the DataSketches [Python API](https://github.com/apache/datasketches-cpp/tree/master/python).
@@ -15,7 +15,7 @@ void init_cpc(Rice::Module& m) {
15
15
  .define_method(
16
16
  "update",
17
17
  *[](datasketches::cpc_sketch& self, Rice::Object datum) {
18
- if (datum.is_a(rb_cInteger)) {
18
+ if (FIXNUM_P(datum.value())) {
19
19
  return self.update(from_ruby<int64_t>(datum));
20
20
  } else if (datum.is_a(rb_cNumeric)) {
21
21
  return self.update(from_ruby<double>(datum));
@@ -1,12 +1,20 @@
1
1
  #include <rice/Module.hpp>
2
2
 
3
3
  void init_cpc(Rice::Module& m);
4
+ void init_fi(Rice::Module& m);
4
5
  void init_hll(Rice::Module& m);
6
+ void init_kll(Rice::Module& m);
7
+ void init_theta(Rice::Module& m);
8
+ void init_vo(Rice::Module& m);
5
9
 
6
10
  extern "C"
7
11
  void Init_ext()
8
12
  {
9
13
  Rice::Module m = Rice::define_module("DataSketches");
10
14
  init_cpc(m);
15
+ init_fi(m);
11
16
  init_hll(m);
17
+ init_kll(m);
18
+ init_theta(m);
19
+ init_vo(m);
12
20
  }
@@ -6,6 +6,9 @@ ext = File.expand_path(".", __dir__)
6
6
  datasketches = File.expand_path("../../vendor/datasketches-cpp", __dir__)
7
7
 
8
8
  $srcs = Dir["#{ext}/*.cpp"]
9
- $INCFLAGS += " -I#{datasketches}/common/include -I#{datasketches}/cpc/include -I#{datasketches}/hll/include"
9
+
10
# Append an -I flag for the include directory of each listed
# datasketches-cpp component.
include_dirs = %w(common cpc fi hll kll sampling theta tuple)
$INCFLAGS += include_dirs.map { |component| " -I#{datasketches}/#{component}/include" }.join
10
13
 
11
14
  create_makefile("datasketches/ext")
@@ -0,0 +1,48 @@
1
+ #include <sstream>
2
+
3
+ #include <frequent_items_sketch.hpp>
4
+
5
+ #include <rice/Constructor.hpp>
6
+ #include <rice/Module.hpp>
7
+
8
+ template<typename T>
9
+ void bind_fi_sketch(Rice::Module& m, const char* name) {
10
+ Rice::define_class_under<datasketches::frequent_items_sketch<T>>(m, name)
11
+ .define_constructor(Rice::Constructor<datasketches::frequent_items_sketch<T>, uint16_t>())
12
+ .define_method("empty?", &datasketches::frequent_items_sketch<T>::is_empty)
13
+ .define_method("num_active_items", &datasketches::frequent_items_sketch<T>::get_num_active_items)
14
+ .define_method("total_weight", &datasketches::frequent_items_sketch<T>::get_total_weight)
15
+ .define_method("estimate", &datasketches::frequent_items_sketch<T>::get_estimate)
16
+ .define_method("lower_bound", &datasketches::frequent_items_sketch<T>::get_lower_bound)
17
+ .define_method("upper_bound", &datasketches::frequent_items_sketch<T>::get_upper_bound)
18
+ .define_method("maximum_error", &datasketches::frequent_items_sketch<T>::get_maximum_error)
19
+ .define_method(
20
+ "update",
21
+ *[](datasketches::frequent_items_sketch<T>& self, const T item) {
22
+ self.update(item);
23
+ })
24
+ .define_method(
25
+ "serialize",
26
+ *[](datasketches::frequent_items_sketch<T>& self) {
27
+ std::ostringstream oss;
28
+ self.serialize(oss);
29
+ return oss.str();
30
+ })
31
+ // TODO change to summary?
32
+ .define_method(
33
+ "to_string",
34
+ *[](datasketches::frequent_items_sketch<T>& self) {
35
+ return self.to_string();
36
+ })
37
+ .define_singleton_method(
38
+ "deserialize",
39
+ // TODO figure out segfault
40
+ *[](std::string is) {
41
+ std::istringstream iss(is);
42
+ return datasketches::frequent_items_sketch<T>::deserialize(iss);
43
+ });
44
+ }
45
+
46
+ void init_fi(Rice::Module& m) {
47
+ bind_fi_sketch<std::string>(m, "FrequentStringsSketch");
48
+ }
@@ -14,7 +14,7 @@ void init_hll(Rice::Module& m) {
14
14
  .define_method(
15
15
  "update",
16
16
  *[](datasketches::hll_sketch& self, Rice::Object datum) {
17
- if (datum.is_a(rb_cInteger)) {
17
+ if (FIXNUM_P(datum.value())) {
18
18
  return self.update(from_ruby<int64_t>(datum));
19
19
  } else if (datum.is_a(rb_cNumeric)) {
20
20
  return self.update(from_ruby<double>(datum));
@@ -54,7 +54,7 @@ void init_hll(Rice::Module& m) {
54
54
  return datasketches::hll_sketch::deserialize(iss);
55
55
  });
56
56
 
57
- define_class_under<datasketches::hll_union>(m, "HllUnion")
57
+ Rice::define_class_under<datasketches::hll_union>(m, "HllUnion")
58
58
  .define_constructor(Rice::Constructor<datasketches::hll_union, int>())
59
59
  .define_method(
60
60
  "update",
@@ -0,0 +1,87 @@
1
+ #include <sstream>
2
+
3
+ #include <kll_sketch.hpp>
4
+
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Constructor.hpp>
7
+ #include <rice/Module.hpp>
8
+
9
+ template<>
10
+ std::vector<double> from_ruby<std::vector<double>>(Rice::Object x)
11
+ {
12
+ auto a = Rice::Array(x);
13
+ std::vector<double> vec(a.size());
14
+ for (size_t i = 0; i < a.size(); i++) {
15
+ vec[i] = from_ruby<double>(a[i]);
16
+ }
17
+ return vec;
18
+ }
19
+
20
+ template<>
21
+ Rice::Object to_ruby<std::vector<int>>(std::vector<int> const & x)
22
+ {
23
+ auto a = Rice::Array();
24
+ for (size_t i = 0; i < x.size(); i++) {
25
+ a.push(x[i]);
26
+ }
27
+ return a;
28
+ }
29
+
30
+ template<>
31
+ Rice::Object to_ruby<std::vector<float>>(std::vector<float> const & x)
32
+ {
33
+ auto a = Rice::Array();
34
+ for (size_t i = 0; i < x.size(); i++) {
35
+ a.push(x[i]);
36
+ }
37
+ return a;
38
+ }
39
+
40
+ template<typename T>
41
+ void bind_kll_sketch(Rice::Module& m, const char* name) {
42
+ Rice::define_class_under<datasketches::kll_sketch<T>>(m, name)
43
+ .define_constructor(Rice::Constructor<datasketches::kll_sketch<T>, uint16_t>())
44
+ .define_method("empty?", &datasketches::kll_sketch<T>::is_empty)
45
+ .define_method("min_value", &datasketches::kll_sketch<T>::get_min_value)
46
+ .define_method("max_value", &datasketches::kll_sketch<T>::get_max_value)
47
+ .define_method("quantile", &datasketches::kll_sketch<T>::get_quantile)
48
+ .define_method(
49
+ "quantiles",
50
+ *[](datasketches::kll_sketch<T>& self, std::vector<double> fractions) {
51
+ return self.get_quantiles(&fractions[0], fractions.size());
52
+ })
53
+ .define_method(
54
+ "merge",
55
+ *[](datasketches::kll_sketch<T>& self, const datasketches::kll_sketch<T>& other) {
56
+ self.merge(other);
57
+ })
58
+ .define_method(
59
+ "update",
60
+ *[](datasketches::kll_sketch<T>& self, const T item) {
61
+ self.update(item);
62
+ })
63
+ .define_method(
64
+ "serialize",
65
+ *[](datasketches::kll_sketch<T>& self) {
66
+ std::ostringstream oss;
67
+ self.serialize(oss);
68
+ return oss.str();
69
+ })
70
+ // TODO change to summary?
71
+ .define_method(
72
+ "to_string",
73
+ *[](datasketches::kll_sketch<T>& self) {
74
+ return self.to_string();
75
+ })
76
+ .define_singleton_method(
77
+ "deserialize",
78
+ *[](std::string& is) {
79
+ std::istringstream iss(is);
80
+ return datasketches::kll_sketch<T>::deserialize(iss);
81
+ });
82
+ }
83
+
84
+ void init_kll(Rice::Module& m) {
85
+ bind_kll_sketch<int>(m, "KllIntsSketch");
86
+ bind_kll_sketch<float>(m, "KllFloatsSketch");
87
+ }
@@ -0,0 +1,12 @@
1
+ #include <sstream>
2
+
3
+ #include <theta_sketch.hpp>
4
+
5
+ #include <rice/Constructor.hpp>
6
+ #include <rice/Module.hpp>
7
+
8
+ void init_theta(Rice::Module& m) {
9
+ Rice::define_class_under<datasketches::theta_sketch>(m, "ThetaSketch")
10
+ .define_method("empty?", &datasketches::theta_sketch::is_empty)
11
+ .define_method("estimate", &datasketches::theta_sketch::get_estimate);
12
+ }
@@ -0,0 +1,39 @@
1
+ #include <sstream>
2
+
3
+ #include <var_opt_sketch.hpp>
4
+
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Constructor.hpp>
7
+ #include <rice/Module.hpp>
8
+
9
+ template<typename T>
10
+ void bind_vo_sketch(Rice::Module &m, const char* name) {
11
+ Rice::define_class_under<datasketches::var_opt_sketch<T>>(m, "VarOptSketch")
12
+ .define_constructor(Rice::Constructor<datasketches::var_opt_sketch<T>, uint32_t>())
13
+ .define_method("k", &datasketches::var_opt_sketch<T>::get_k)
14
+ .define_method("n", &datasketches::var_opt_sketch<T>::get_n)
15
+ .define_method("num_samples", &datasketches::var_opt_sketch<T>::get_num_samples)
16
+ .define_method("empty?", &datasketches::var_opt_sketch<T>::is_empty)
17
+ .define_method("reset", &datasketches::var_opt_sketch<T>::reset)
18
+ .define_method(
19
+ "samples",
20
+ *[](datasketches::var_opt_sketch<T>& self) {
21
+ auto a = Rice::Array();
22
+ for (auto item : self) {
23
+ auto t = Rice::Array();
24
+ t.push(item.first);
25
+ t.push(item.second);
26
+ a.push(t);
27
+ }
28
+ return a;
29
+ })
30
+ .define_method(
31
+ "update",
32
+ *[](datasketches::var_opt_sketch<T>& self, const T item) {
33
+ self.update(item);
34
+ });
35
+ }
36
+
37
+ void init_vo(Rice::Module& m) {
38
+ bind_vo_sketch<Rice::Object>(m, "VarOptSketch");
39
+ }
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datasketches
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -38,7 +38,11 @@ files:
38
38
  - ext/datasketches/cpc_wrapper.cpp
39
39
  - ext/datasketches/ext.cpp
40
40
  - ext/datasketches/extconf.rb
41
+ - ext/datasketches/fi_wrapper.cpp
41
42
  - ext/datasketches/hll_wrapper.cpp
43
+ - ext/datasketches/kll_wrapper.cpp
44
+ - ext/datasketches/theta_wrapper.cpp
45
+ - ext/datasketches/vo_wrapper.cpp
42
46
  - lib/datasketches.rb
43
47
  - lib/datasketches/version.rb
44
48
  - vendor/datasketches-cpp/CMakeLists.txt