datasketches 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0bf3a004350a15e151c0114a7351519d14f2d9fc793c5ac87a8daa9756be42af
4
- data.tar.gz: a48f65a18a1a099e68a4d0de0714b5a1db888c97348c021cd68d6e5a1b066abf
3
+ metadata.gz: bf7f5b6898a9f014d14fc5fcfa3a6a5da1dd5fc70d1b07bbbdf07a446126d9cd
4
+ data.tar.gz: 15d5fbb2ee6e0c1a6e7f25a145329eb73da9d7933f5f1475596b9e9c998b31f7
5
5
  SHA512:
6
- metadata.gz: 89cce97d6a892575dfbe1441d8815742fdd827d26b9f3faafef4e9cc99e5386f025cd00a5e0e1d0188520c2f41405062fcb58c4950d8238d234eff2513234566
7
- data.tar.gz: 3cc79a07e80f42b58ea6a3a5e792f72ea79b6f7980076bd92b9cb43c1906bc4c3f30bb6156347c1acce4a2b1d69407ec8178e7775e7ed45bab5930f052fc7b71
6
+ metadata.gz: e3b412132bdaf7e45f85e1da48690ecccd3c510cc17fd5259f0244cb580f712dc58290fca50a547a9e0952cf7c0548124ed33be9224ba3b83d6dd461d70c2aab
7
+ data.tar.gz: 5f6d9932824961e11d6f8bd390dbf20e487b6ca8aef6b6440e5f6a11296227382be5261ebaf1da416d5f2354c7c99a97e34871b0964facc724ba33f2b88e96f6
@@ -1,3 +1,8 @@
1
- ## 0.1.0 (unreleased)
1
+ ## 0.1.1 (2021-01-20)
2
+
3
+ - Added more sketches
4
+ - Fixed installation on Linux and Windows
5
+
6
+ ## 0.1.0 (2021-01-19)
2
7
 
3
8
  - First release
data/README.md CHANGED
@@ -12,14 +12,26 @@ Add this line to your application’s Gemfile:
12
12
  gem 'datasketches'
13
13
  ```
14
14
 
15
- ## Data Structures
15
+ ## Sketch Families
16
16
 
17
17
  Distinct counting
18
18
 
19
- - [CPC Sketch](#cpc-sketch)
20
- - [HyperLogLog Sketch](#hyperloglog-sketch)
19
+ - [CPC sketch](#cpc-sketch)
20
+ - [HyperLogLog sketch](#hyperloglog-sketch)
21
21
 
22
- ### CPC Sketch
22
+ Most frequent
23
+
24
+ - [Frequent item sketch](#frequent-item-sketch)
25
+
26
+ Quantiles and histograms
27
+
28
+ - [KLL sketch](#kll-sketch)
29
+
30
+ Sampling
31
+
32
+ - [VarOpt sketch](#varopt-sketch)
33
+
34
+ ## CPC Sketch
23
35
 
24
36
  Create a sketch
25
37
 
@@ -53,7 +65,7 @@ Load a sketch
53
65
  sketch = DataSketches::CpcSketch.deserialize(data)
54
66
  ```
55
67
 
56
- ### HyperLogLog Sketch
68
+ ## HyperLogLog Sketch
57
69
 
58
70
  Create a sketch
59
71
 
@@ -98,6 +110,106 @@ u.update(sketch2)
98
110
  u.estimate
99
111
  ```
100
112
 
113
+ ## Frequent Item Sketch
114
+
115
+ Create a sketch
116
+
117
+ ```ruby
118
+ sketch = DataSketches::FrequentStringsSketch.new(64)
119
+ ```
120
+
121
+ Add data
122
+
123
+ ```ruby
124
+ sketch.update("a")
125
+ sketch.update("b")
126
+ sketch.update("c")
127
+ ```
128
+
129
+ Estimate the frequency of an item
130
+
131
+ ```ruby
132
+ sketch.estimate("a")
133
+ ```
134
+
135
+ Save a sketch
136
+
137
+ ```ruby
138
+ data = sketch.serialize
139
+ ```
140
+
141
+ Load a sketch
142
+
143
+ ```ruby
144
+ sketch = DataSketches::FrequentStringsSketch.deserialize(data)
145
+ ```
146
+
147
+ ## KLL Sketch
148
+
149
+ Create a sketch
150
+
151
+ ```ruby
152
+ sketch = DataSketches::KllIntsSketch.new(200)
153
+ # or
154
+ sketch = DataSketches::KllFloatsSketch.new(200)
155
+ ```
156
+
157
+ Add data
158
+
159
+ ```ruby
160
+ sketch.update(1)
161
+ sketch.update(2)
162
+ sketch.update(3)
163
+ ```
164
+
165
+ Get quantiles
166
+
167
+ ```ruby
168
+ sketch.quantile(0.5)
169
+ sketch.quantiles([0.25, 0.5, 0.75])
170
+ ```
171
+
172
+ Get the minimum and maximum values from the stream
173
+
174
+ ```ruby
175
+ sketch.min_value
176
+ sketch.max_value
177
+ ```
178
+
179
+ Save a sketch
180
+
181
+ ```ruby
182
+ data = sketch.serialize
183
+ ```
184
+
185
+ Load a sketch
186
+
187
+ ```ruby
188
+ sketch = DataSketches::KllIntsSketch.deserialize(data)
189
+ ```
190
+
191
+ Merge sketches
192
+
193
+ ```ruby
194
+ sketch.merge(sketch2)
195
+ ```
196
+
197
+ ## VarOpt Sketch
198
+
199
+ Create a sketch
200
+
201
+ ```ruby
202
+ sketch = DataSketches::VarOptSketch.new(14)
203
+ ```
204
+
205
+ Add data
206
+
207
+ ```ruby
208
+ sketch.update(1)
209
+ sketch.update(2.0)
210
+ sketch.update("three")
211
+ ```
212
+
101
213
  ## Credits
102
214
 
103
215
  This library is modeled after the DataSketches [Python API](https://github.com/apache/datasketches-cpp/tree/master/python).
@@ -15,7 +15,7 @@ void init_cpc(Rice::Module& m) {
15
15
  .define_method(
16
16
  "update",
17
17
  *[](datasketches::cpc_sketch& self, Rice::Object datum) {
18
- if (datum.is_a(rb_cInteger)) {
18
+ if (FIXNUM_P(datum.value())) {
19
19
  return self.update(from_ruby<int64_t>(datum));
20
20
  } else if (datum.is_a(rb_cNumeric)) {
21
21
  return self.update(from_ruby<double>(datum));
@@ -1,12 +1,20 @@
1
1
  #include <rice/Module.hpp>
2
2
 
3
3
  void init_cpc(Rice::Module& m);
4
+ void init_fi(Rice::Module& m);
4
5
  void init_hll(Rice::Module& m);
6
+ void init_kll(Rice::Module& m);
7
+ void init_theta(Rice::Module& m);
8
+ void init_vo(Rice::Module& m);
5
9
 
6
10
  extern "C"
7
11
  void Init_ext()
8
12
  {
9
13
  Rice::Module m = Rice::define_module("DataSketches");
10
14
  init_cpc(m);
15
+ init_fi(m);
11
16
  init_hll(m);
17
+ init_kll(m);
18
+ init_theta(m);
19
+ init_vo(m);
12
20
  }
@@ -6,6 +6,9 @@ ext = File.expand_path(".", __dir__)
6
6
  datasketches = File.expand_path("../../vendor/datasketches-cpp", __dir__)
7
7
 
8
8
  $srcs = Dir["#{ext}/*.cpp"]
9
- $INCFLAGS += " -I#{datasketches}/common/include -I#{datasketches}/cpc/include -I#{datasketches}/hll/include"
9
+
10
+ %w(common cpc fi hll kll sampling theta tuple).each do |v|
11
+ $INCFLAGS += " -I#{datasketches}/#{v}/include"
12
+ end
10
13
 
11
14
  create_makefile("datasketches/ext")
@@ -0,0 +1,48 @@
1
+ #include <sstream>
2
+
3
+ #include <frequent_items_sketch.hpp>
4
+
5
+ #include <rice/Constructor.hpp>
6
+ #include <rice/Module.hpp>
7
+
8
+ template<typename T>
9
+ void bind_fi_sketch(Rice::Module& m, const char* name) {
10
+ Rice::define_class_under<datasketches::frequent_items_sketch<T>>(m, name)
11
+ .define_constructor(Rice::Constructor<datasketches::frequent_items_sketch<T>, uint16_t>())
12
+ .define_method("empty?", &datasketches::frequent_items_sketch<T>::is_empty)
13
+ .define_method("num_active_items", &datasketches::frequent_items_sketch<T>::get_num_active_items)
14
+ .define_method("total_weight", &datasketches::frequent_items_sketch<T>::get_total_weight)
15
+ .define_method("estimate", &datasketches::frequent_items_sketch<T>::get_estimate)
16
+ .define_method("lower_bound", &datasketches::frequent_items_sketch<T>::get_lower_bound)
17
+ .define_method("upper_bound", &datasketches::frequent_items_sketch<T>::get_upper_bound)
18
+ .define_method("maximum_error", &datasketches::frequent_items_sketch<T>::get_maximum_error)
19
+ .define_method(
20
+ "update",
21
+ *[](datasketches::frequent_items_sketch<T>& self, const T item) {
22
+ self.update(item);
23
+ })
24
+ .define_method(
25
+ "serialize",
26
+ *[](datasketches::frequent_items_sketch<T>& self) {
27
+ std::ostringstream oss;
28
+ self.serialize(oss);
29
+ return oss.str();
30
+ })
31
+ // TODO change to summary?
32
+ .define_method(
33
+ "to_string",
34
+ *[](datasketches::frequent_items_sketch<T>& self) {
35
+ return self.to_string();
36
+ })
37
+ .define_singleton_method(
38
+ "deserialize",
39
+ // TODO figure out segfault
40
+ *[](std::string is) {
41
+ std::istringstream iss(is);
42
+ return datasketches::frequent_items_sketch<T>::deserialize(iss);
43
+ });
44
+ }
45
+
46
+ void init_fi(Rice::Module& m) {
47
+ bind_fi_sketch<std::string>(m, "FrequentStringsSketch");
48
+ }
@@ -14,7 +14,7 @@ void init_hll(Rice::Module& m) {
14
14
  .define_method(
15
15
  "update",
16
16
  *[](datasketches::hll_sketch& self, Rice::Object datum) {
17
- if (datum.is_a(rb_cInteger)) {
17
+ if (FIXNUM_P(datum.value())) {
18
18
  return self.update(from_ruby<int64_t>(datum));
19
19
  } else if (datum.is_a(rb_cNumeric)) {
20
20
  return self.update(from_ruby<double>(datum));
@@ -54,7 +54,7 @@ void init_hll(Rice::Module& m) {
54
54
  return datasketches::hll_sketch::deserialize(iss);
55
55
  });
56
56
 
57
- define_class_under<datasketches::hll_union>(m, "HllUnion")
57
+ Rice::define_class_under<datasketches::hll_union>(m, "HllUnion")
58
58
  .define_constructor(Rice::Constructor<datasketches::hll_union, int>())
59
59
  .define_method(
60
60
  "update",
@@ -0,0 +1,87 @@
1
+ #include <sstream>
2
+
3
+ #include <kll_sketch.hpp>
4
+
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Constructor.hpp>
7
+ #include <rice/Module.hpp>
8
+
9
+ template<>
10
+ std::vector<double> from_ruby<std::vector<double>>(Rice::Object x)
11
+ {
12
+ auto a = Rice::Array(x);
13
+ std::vector<double> vec(a.size());
14
+ for (size_t i = 0; i < a.size(); i++) {
15
+ vec[i] = from_ruby<double>(a[i]);
16
+ }
17
+ return vec;
18
+ }
19
+
20
+ template<>
21
+ Rice::Object to_ruby<std::vector<int>>(std::vector<int> const & x)
22
+ {
23
+ auto a = Rice::Array();
24
+ for (size_t i = 0; i < x.size(); i++) {
25
+ a.push(x[i]);
26
+ }
27
+ return a;
28
+ }
29
+
30
+ template<>
31
+ Rice::Object to_ruby<std::vector<float>>(std::vector<float> const & x)
32
+ {
33
+ auto a = Rice::Array();
34
+ for (size_t i = 0; i < x.size(); i++) {
35
+ a.push(x[i]);
36
+ }
37
+ return a;
38
+ }
39
+
40
+ template<typename T>
41
+ void bind_kll_sketch(Rice::Module& m, const char* name) {
42
+ Rice::define_class_under<datasketches::kll_sketch<T>>(m, name)
43
+ .define_constructor(Rice::Constructor<datasketches::kll_sketch<T>, uint16_t>())
44
+ .define_method("empty?", &datasketches::kll_sketch<T>::is_empty)
45
+ .define_method("min_value", &datasketches::kll_sketch<T>::get_min_value)
46
+ .define_method("max_value", &datasketches::kll_sketch<T>::get_max_value)
47
+ .define_method("quantile", &datasketches::kll_sketch<T>::get_quantile)
48
+ .define_method(
49
+ "quantiles",
50
+ *[](datasketches::kll_sketch<T>& self, std::vector<double> fractions) {
51
+ return self.get_quantiles(&fractions[0], fractions.size());
52
+ })
53
+ .define_method(
54
+ "merge",
55
+ *[](datasketches::kll_sketch<T>& self, const datasketches::kll_sketch<T>& other) {
56
+ self.merge(other);
57
+ })
58
+ .define_method(
59
+ "update",
60
+ *[](datasketches::kll_sketch<T>& self, const T item) {
61
+ self.update(item);
62
+ })
63
+ .define_method(
64
+ "serialize",
65
+ *[](datasketches::kll_sketch<T>& self) {
66
+ std::ostringstream oss;
67
+ self.serialize(oss);
68
+ return oss.str();
69
+ })
70
+ // TODO change to summary?
71
+ .define_method(
72
+ "to_string",
73
+ *[](datasketches::kll_sketch<T>& self) {
74
+ return self.to_string();
75
+ })
76
+ .define_singleton_method(
77
+ "deserialize",
78
+ *[](std::string& is) {
79
+ std::istringstream iss(is);
80
+ return datasketches::kll_sketch<T>::deserialize(iss);
81
+ });
82
+ }
83
+
84
+ void init_kll(Rice::Module& m) {
85
+ bind_kll_sketch<int>(m, "KllIntsSketch");
86
+ bind_kll_sketch<float>(m, "KllFloatsSketch");
87
+ }
@@ -0,0 +1,12 @@
1
+ #include <sstream>
2
+
3
+ #include <theta_sketch.hpp>
4
+
5
+ #include <rice/Constructor.hpp>
6
+ #include <rice/Module.hpp>
7
+
8
+ void init_theta(Rice::Module& m) {
9
+ Rice::define_class_under<datasketches::theta_sketch>(m, "ThetaSketch")
10
+ .define_method("empty?", &datasketches::theta_sketch::is_empty)
11
+ .define_method("estimate", &datasketches::theta_sketch::get_estimate);
12
+ }
@@ -0,0 +1,39 @@
1
+ #include <sstream>
2
+
3
+ #include <var_opt_sketch.hpp>
4
+
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Constructor.hpp>
7
+ #include <rice/Module.hpp>
8
+
9
+ template<typename T>
10
+ void bind_vo_sketch(Rice::Module &m, const char* name) {
11
+ Rice::define_class_under<datasketches::var_opt_sketch<T>>(m, "VarOptSketch")
12
+ .define_constructor(Rice::Constructor<datasketches::var_opt_sketch<T>, uint32_t>())
13
+ .define_method("k", &datasketches::var_opt_sketch<T>::get_k)
14
+ .define_method("n", &datasketches::var_opt_sketch<T>::get_n)
15
+ .define_method("num_samples", &datasketches::var_opt_sketch<T>::get_num_samples)
16
+ .define_method("empty?", &datasketches::var_opt_sketch<T>::is_empty)
17
+ .define_method("reset", &datasketches::var_opt_sketch<T>::reset)
18
+ .define_method(
19
+ "samples",
20
+ *[](datasketches::var_opt_sketch<T>& self) {
21
+ auto a = Rice::Array();
22
+ for (auto item : self) {
23
+ auto t = Rice::Array();
24
+ t.push(item.first);
25
+ t.push(item.second);
26
+ a.push(t);
27
+ }
28
+ return a;
29
+ })
30
+ .define_method(
31
+ "update",
32
+ *[](datasketches::var_opt_sketch<T>& self, const T item) {
33
+ self.update(item);
34
+ });
35
+ }
36
+
37
+ void init_vo(Rice::Module& m) {
38
+ bind_vo_sketch<Rice::Object>(m, "VarOptSketch");
39
+ }
@@ -1,3 +1,3 @@
1
1
  module DataSketches
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datasketches
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -38,7 +38,11 @@ files:
38
38
  - ext/datasketches/cpc_wrapper.cpp
39
39
  - ext/datasketches/ext.cpp
40
40
  - ext/datasketches/extconf.rb
41
+ - ext/datasketches/fi_wrapper.cpp
41
42
  - ext/datasketches/hll_wrapper.cpp
43
+ - ext/datasketches/kll_wrapper.cpp
44
+ - ext/datasketches/theta_wrapper.cpp
45
+ - ext/datasketches/vo_wrapper.cpp
42
46
  - lib/datasketches.rb
43
47
  - lib/datasketches/version.rb
44
48
  - vendor/datasketches-cpp/CMakeLists.txt