datasketches 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
@@ -16,11 +16,12 @@
|
|
16
16
|
# under the License.
|
17
17
|
|
18
18
|
import unittest
|
19
|
-
from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch
|
19
|
+
from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch
|
20
|
+
from datasketches import kll_items_sketch, ks_test, PyStringsSerDe
|
20
21
|
import numpy as np
|
21
22
|
|
22
23
|
class KllTest(unittest.TestCase):
|
23
|
-
def
|
24
|
+
def test_kll_floats_example(self):
|
24
25
|
k = 160
|
25
26
|
n = 2 ** 20
|
26
27
|
|
@@ -61,12 +62,14 @@ class KllTest(unittest.TestCase):
|
|
61
62
|
self.assertLess(kll.get_num_retained(), n)
|
62
63
|
|
63
64
|
# merging itself will double the number of items the sketch has seen
|
64
|
-
|
65
|
+
# but need to do that with a copy
|
66
|
+
kll_copy = kll_floats_sketch(kll)
|
67
|
+
kll.merge(kll_copy)
|
65
68
|
self.assertEqual(kll.get_n(), 2*n)
|
66
69
|
|
67
70
|
# we can then serialize and reconstruct the sketch
|
68
71
|
kll_bytes = kll.serialize()
|
69
|
-
new_kll =
|
72
|
+
new_kll = kll_floats_sketch.deserialize(kll_bytes)
|
70
73
|
self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
|
71
74
|
self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
|
72
75
|
self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
|
@@ -78,6 +81,12 @@ class KllTest(unittest.TestCase):
|
|
78
81
|
# they come from the same distribution (since they do)
|
79
82
|
self.assertFalse(ks_test(kll, new_kll, 0.001))
|
80
83
|
|
84
|
+
total_weight = 0
|
85
|
+
for tuple in kll:
|
86
|
+
item = tuple[0]
|
87
|
+
weight = tuple[1]
|
88
|
+
total_weight = total_weight + weight
|
89
|
+
self.assertEqual(total_weight, kll.get_n())
|
81
90
|
|
82
91
|
def test_kll_ints_sketch(self):
|
83
92
|
k = 100
|
@@ -108,8 +117,9 @@ class KllTest(unittest.TestCase):
|
|
108
117
|
|
109
118
|
self.assertEqual(kll.get_rank(round(n/2)), 0.5)
|
110
119
|
|
111
|
-
# merge self
|
112
|
-
|
120
|
+
# merge copy of self
|
121
|
+
kll_copy = kll_ints_sketch(kll)
|
122
|
+
kll.merge(kll_copy)
|
113
123
|
self.assertEqual(kll.get_n(), 2 * n)
|
114
124
|
|
115
125
|
sk_bytes = kll.serialize()
|
@@ -121,5 +131,29 @@ class KllTest(unittest.TestCase):
|
|
121
131
|
kll = kll_doubles_sketch(k)
|
122
132
|
self.assertTrue(kll.is_empty())
|
123
133
|
|
134
|
+
def test_kll_items_sketch(self):
|
135
|
+
# most functionality has been tested, but we need to ensure objects and sorting work
|
136
|
+
# as well as serialization
|
137
|
+
k = 100
|
138
|
+
n = 2 ** 16
|
139
|
+
|
140
|
+
# create a sketch and inject enough points to force compaction
|
141
|
+
kll = kll_items_sketch(k)
|
142
|
+
for i in range(0, n):
|
143
|
+
kll.update(str(i))
|
144
|
+
|
145
|
+
kll_copy = kll_items_sketch(kll)
|
146
|
+
kll.merge(kll_copy)
|
147
|
+
self.assertEqual(kll.get_n(), 2 * n)
|
148
|
+
|
149
|
+
kll_bytes = kll.serialize(PyStringsSerDe())
|
150
|
+
new_kll = kll_items_sketch.deserialize(kll_bytes, PyStringsSerDe())
|
151
|
+
self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
|
152
|
+
self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
|
153
|
+
self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
|
154
|
+
self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
|
155
|
+
self.assertEqual(kll.get_rank(str(n/4)), new_kll.get_rank(str(n/4)))
|
156
|
+
|
157
|
+
|
124
158
|
if __name__ == '__main__':
|
125
159
|
unittest.main()
|
@@ -16,11 +16,12 @@
|
|
16
16
|
# under the License.
|
17
17
|
|
18
18
|
import unittest
|
19
|
-
from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
|
19
|
+
from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
|
20
|
+
from datasketches import quantiles_items_sketch, ks_test, PyStringsSerDe
|
20
21
|
import numpy as np
|
21
22
|
|
22
23
|
class QuantilesTest(unittest.TestCase):
|
23
|
-
def
|
24
|
+
def test_quantiles_floats_example(self):
|
24
25
|
k = 128
|
25
26
|
n = 2 ** 20
|
26
27
|
|
@@ -61,12 +62,13 @@ class QuantilesTest(unittest.TestCase):
|
|
61
62
|
self.assertLess(quantiles.get_num_retained(), n)
|
62
63
|
|
63
64
|
# merging itself will double the number of items the sketch has seen
|
64
|
-
|
65
|
+
quantiles_copy = quantiles_floats_sketch(quantiles)
|
66
|
+
quantiles.merge(quantiles_copy)
|
65
67
|
self.assertEqual(quantiles.get_n(), 2*n)
|
66
68
|
|
67
69
|
# we can then serialize and reconstruct the sketch
|
68
70
|
quantiles_bytes = quantiles.serialize()
|
69
|
-
new_quantiles =
|
71
|
+
new_quantiles = quantiles_floats_sketch.deserialize(quantiles_bytes)
|
70
72
|
self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
|
71
73
|
self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
|
72
74
|
self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
|
@@ -80,6 +82,13 @@ class QuantilesTest(unittest.TestCase):
|
|
80
82
|
unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
|
81
83
|
self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
|
82
84
|
|
85
|
+
total_weight = 0
|
86
|
+
for tuple in quantiles:
|
87
|
+
item = tuple[0]
|
88
|
+
weight = tuple[1]
|
89
|
+
total_weight = total_weight + weight
|
90
|
+
self.assertEqual(total_weight, quantiles.get_n())
|
91
|
+
|
83
92
|
def test_quantiles_ints_sketch(self):
|
84
93
|
k = 128
|
85
94
|
n = 10
|
@@ -110,7 +119,8 @@ class QuantilesTest(unittest.TestCase):
|
|
110
119
|
self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
|
111
120
|
|
112
121
|
# merge self
|
113
|
-
|
122
|
+
quantiles_copy = quantiles_ints_sketch(quantiles)
|
123
|
+
quantiles.merge(quantiles_copy)
|
114
124
|
self.assertEqual(quantiles.get_n(), 2 * n)
|
115
125
|
|
116
126
|
sk_bytes = quantiles.serialize()
|
@@ -122,5 +132,29 @@ class QuantilesTest(unittest.TestCase):
|
|
122
132
|
quantiles = quantiles_doubles_sketch(k)
|
123
133
|
self.assertTrue(quantiles.is_empty())
|
124
134
|
|
135
|
+
def test_quantiles_items_sketch(self):
|
136
|
+
# most functionality has been tested, but we need to ensure objects and sorting work
|
137
|
+
# as well as serialization
|
138
|
+
k = 128
|
139
|
+
n = 2 ** 16
|
140
|
+
|
141
|
+
# create a sketch and inject enough points to force compaction
|
142
|
+
quantiles = quantiles_items_sketch(k)
|
143
|
+
for i in range(0, n):
|
144
|
+
quantiles.update(str(i))
|
145
|
+
|
146
|
+
quantiles_copy = quantiles_items_sketch(quantiles)
|
147
|
+
quantiles.merge(quantiles_copy)
|
148
|
+
self.assertEqual(quantiles.get_n(), 2 * n)
|
149
|
+
|
150
|
+
quantiles_bytes = quantiles.serialize(PyStringsSerDe())
|
151
|
+
new_quantiles = quantiles_items_sketch.deserialize(quantiles_bytes, PyStringsSerDe())
|
152
|
+
self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
|
153
|
+
self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
|
154
|
+
self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
|
155
|
+
self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
|
156
|
+
self.assertEqual(quantiles.get_rank(str(n/4)), new_quantiles.get_rank(str(n/4)))
|
157
|
+
|
158
|
+
|
125
159
|
if __name__ == '__main__':
|
126
160
|
unittest.main()
|
@@ -16,7 +16,7 @@
|
|
16
16
|
# under the License.
|
17
17
|
|
18
18
|
import unittest
|
19
|
-
from datasketches import req_ints_sketch, req_floats_sketch
|
19
|
+
from datasketches import req_ints_sketch, req_floats_sketch, req_items_sketch, PyStringsSerDe
|
20
20
|
import numpy as np
|
21
21
|
|
22
22
|
class reqTest(unittest.TestCase):
|
@@ -67,18 +67,26 @@ class reqTest(unittest.TestCase):
|
|
67
67
|
self.assertEqual(req.get_k(), k)
|
68
68
|
|
69
69
|
# merging itself will double the number of items the sketch has seen
|
70
|
-
|
70
|
+
req_copy = req_floats_sketch(req)
|
71
|
+
req.merge(req_copy)
|
71
72
|
self.assertEqual(req.get_n(), 2*n)
|
72
73
|
|
73
74
|
# we can then serialize and reconstruct the sketch
|
74
75
|
req_bytes = req.serialize()
|
75
|
-
new_req =
|
76
|
+
new_req = req_floats_sketch.deserialize(req_bytes)
|
76
77
|
self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
|
77
78
|
self.assertEqual(req.get_min_value(), new_req.get_min_value())
|
78
79
|
self.assertEqual(req.get_max_value(), new_req.get_max_value())
|
79
80
|
self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
|
80
81
|
self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
|
81
82
|
|
83
|
+
total_weight = 0
|
84
|
+
for tuple in req:
|
85
|
+
item = tuple[0]
|
86
|
+
weight = tuple[1]
|
87
|
+
total_weight = total_weight + weight
|
88
|
+
self.assertEqual(total_weight, req.get_n())
|
89
|
+
|
82
90
|
def test_req_ints_sketch(self):
|
83
91
|
k = 100
|
84
92
|
n = 10
|
@@ -109,18 +117,43 @@ class reqTest(unittest.TestCase):
|
|
109
117
|
self.assertEqual(req.get_rank(round(n/2)), 0.5)
|
110
118
|
|
111
119
|
# merge self
|
112
|
-
|
120
|
+
req_copy = req_ints_sketch(req)
|
121
|
+
req.merge(req_copy)
|
113
122
|
self.assertEqual(req.get_n(), 2 * n)
|
114
123
|
|
115
124
|
sk_bytes = req.serialize()
|
116
125
|
self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
|
117
126
|
|
118
127
|
def test_req_floats_sketch(self):
|
119
|
-
# already tested
|
128
|
+
# floats were already tested above, so just check that LRA (low rank accuracy) mode works
|
120
129
|
k = 75
|
121
130
|
req = req_floats_sketch(k, False) # low rank accuracy
|
122
131
|
self.assertTrue(req.is_empty())
|
123
132
|
self.assertFalse(req.is_hra())
|
124
133
|
|
134
|
+
def test_req_items_sketch(self):
|
135
|
+
# most functionality has been tested, but we need to ensure objects and sorting work
|
136
|
+
# as well as serialization
|
137
|
+
k = 100
|
138
|
+
n = 2 ** 16
|
139
|
+
|
140
|
+
# create a sketch and inject enough points to force compaction
|
141
|
+
req = req_items_sketch(k)
|
142
|
+
for i in range(0, n):
|
143
|
+
req.update(str(i))
|
144
|
+
|
145
|
+
req_copy = req_items_sketch(req)
|
146
|
+
req.merge(req_copy)
|
147
|
+
self.assertEqual(req.get_n(), 2 * n)
|
148
|
+
|
149
|
+
req_bytes = req.serialize(PyStringsSerDe())
|
150
|
+
new_req = req_items_sketch.deserialize(req_bytes, PyStringsSerDe())
|
151
|
+
self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
|
152
|
+
self.assertEqual(req.get_min_value(), new_req.get_min_value())
|
153
|
+
self.assertEqual(req.get_max_value(), new_req.get_max_value())
|
154
|
+
self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
|
155
|
+
self.assertEqual(req.get_rank(str(n/4)), new_req.get_rank(str(n/4)))
|
156
|
+
|
157
|
+
|
125
158
|
if __name__ == '__main__':
|
126
159
|
unittest.main()
|
@@ -14,7 +14,7 @@
|
|
14
14
|
# KIND, either express or implied. See the License for the
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
|
-
|
17
|
+
|
18
18
|
import unittest
|
19
19
|
|
20
20
|
from datasketches import theta_sketch, update_theta_sketch
|
@@ -24,11 +24,11 @@ from datasketches import theta_jaccard_similarity
|
|
24
24
|
|
25
25
|
class ThetaTest(unittest.TestCase):
|
26
26
|
def test_theta_basic_example(self):
|
27
|
-
|
27
|
+
lgk = 12 # 2^k = 4096 rows in the table
|
28
28
|
n = 1 << 18 # ~256k unique values
|
29
29
|
|
30
30
|
# create a sketch and inject some values
|
31
|
-
sk = self.generate_theta_sketch(n,
|
31
|
+
sk = self.generate_theta_sketch(n, lgk)
|
32
32
|
|
33
33
|
# we can check that the upper and lower bounds bracket the
|
34
34
|
# estimate, without needing to know the exact value.
|
@@ -48,20 +48,26 @@ class ThetaTest(unittest.TestCase):
|
|
48
48
|
self.assertFalse(sk.is_empty())
|
49
49
|
self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
|
50
50
|
|
51
|
+
count = 0
|
52
|
+
for hash in new_sk:
|
53
|
+
self.assertLess(hash, new_sk.get_theta64())
|
54
|
+
count = count + 1
|
55
|
+
self.assertEqual(count, new_sk.get_num_retained())
|
56
|
+
|
51
57
|
def test_theta_set_operations(self):
|
52
|
-
|
58
|
+
lgk = 12 # 2^k = 4096 rows in the table
|
53
59
|
n = 1 << 18 # ~256k unique values
|
54
60
|
|
55
61
|
# we'll have 1/4 of the values overlap
|
56
62
|
offset = int(3 * n / 4) # it's a float w/o cast
|
57
63
|
|
58
64
|
# create a couple sketches and inject some values
|
59
|
-
sk1 = self.generate_theta_sketch(n,
|
60
|
-
sk2 = self.generate_theta_sketch(n,
|
65
|
+
sk1 = self.generate_theta_sketch(n, lgk)
|
66
|
+
sk2 = self.generate_theta_sketch(n, lgk, offset)
|
61
67
|
|
62
68
|
# UNIONS
|
63
69
|
# create a union object
|
64
|
-
union = theta_union(
|
70
|
+
union = theta_union(lgk)
|
65
71
|
union.update(sk1)
|
66
72
|
union.update(sk2)
|
67
73
|
|
@@ -77,7 +83,6 @@ class ThetaTest(unittest.TestCase):
|
|
77
83
|
self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
|
78
84
|
self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
|
79
85
|
|
80
|
-
|
81
86
|
# INTERSECTIONS
|
82
87
|
# create an intersection object
|
83
88
|
intersect = theta_intersection() # no lg_k
|
@@ -96,7 +101,6 @@ class ThetaTest(unittest.TestCase):
|
|
96
101
|
self.assertLessEqual(result.get_lower_bound(1), n / 4)
|
97
102
|
self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
|
98
103
|
|
99
|
-
|
100
104
|
# A NOT B
|
101
105
|
# create an a_not_b object
|
102
106
|
anb = theta_a_not_b() # no lg_k
|
@@ -134,13 +138,11 @@ class ThetaTest(unittest.TestCase):
|
|
134
138
|
self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
|
135
139
|
|
136
140
|
|
137
|
-
def generate_theta_sketch(self, n,
|
138
|
-
sk = update_theta_sketch(
|
141
|
+
def generate_theta_sketch(self, n, lgk, offset=0):
|
142
|
+
sk = update_theta_sketch(lgk)
|
139
143
|
for i in range(0, n):
|
140
144
|
sk.update(i + offset)
|
141
145
|
return sk
|
142
|
-
|
146
|
+
|
143
147
|
if __name__ == '__main__':
|
144
148
|
unittest.main()
|
145
|
-
|
146
|
-
|
@@ -0,0 +1,206 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
import unittest
|
19
|
+
|
20
|
+
from datasketches import update_tuple_sketch
|
21
|
+
from datasketches import compact_tuple_sketch, tuple_union
|
22
|
+
from datasketches import tuple_intersection, tuple_a_not_b
|
23
|
+
from datasketches import tuple_jaccard_similarity
|
24
|
+
from datasketches import tuple_jaccard_similarity, PyIntsSerDe
|
25
|
+
from datasketches import AccumulatorPolicy, MaxIntPolicy, MinIntPolicy
|
26
|
+
from datasketches import update_theta_sketch
|
27
|
+
|
28
|
+
class TupleTest(unittest.TestCase):
|
29
|
+
def test_tuple_basic_example(self):
|
30
|
+
lgk = 12 # 2^k = 4096 rows in the table
|
31
|
+
n = 1 << 18 # ~256k unique values
|
32
|
+
|
33
|
+
# create a sketch and inject some values -- summary is 2 so we can sum them
|
34
|
+
# and know the result
|
35
|
+
sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2)
|
36
|
+
|
37
|
+
# we can check that the upper and lower bounds bracket the
|
38
|
+
# estimate, without needing to know the exact value.
|
39
|
+
self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
|
40
|
+
self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
|
41
|
+
|
42
|
+
# because this sketch is deterministically generated, we can
|
43
|
+
# also compare against the exact value
|
44
|
+
self.assertLessEqual(sk.get_lower_bound(1), n)
|
45
|
+
self.assertGreaterEqual(sk.get_upper_bound(1), n)
|
46
|
+
|
47
|
+
# compact and serialize for storage, then reconstruct
|
48
|
+
sk_bytes = sk.compact().serialize(PyIntsSerDe())
|
49
|
+
new_sk = compact_tuple_sketch.deserialize(sk_bytes, serde=PyIntsSerDe())
|
50
|
+
|
51
|
+
# estimate remains unchanged
|
52
|
+
self.assertFalse(sk.is_empty())
|
53
|
+
self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
|
54
|
+
|
55
|
+
# we can also iterate over the sketch entries
|
56
|
+
# the iterator provides a (hashkey, summary) pair where the
|
57
|
+
# first value is the raw hash value and the second the summary
|
58
|
+
count = 0
|
59
|
+
cumSum = 0
|
60
|
+
for pair in new_sk:
|
61
|
+
self.assertLess(pair[0], new_sk.get_theta64())
|
62
|
+
count += 1
|
63
|
+
cumSum += pair[1]
|
64
|
+
self.assertEqual(count, new_sk.get_num_retained())
|
65
|
+
self.assertEqual(cumSum, 2 * new_sk.get_num_retained())
|
66
|
+
|
67
|
+
# we can even create a tuple sketch from an existing theta sketch
|
68
|
+
# as long as we provide a summary to use
|
69
|
+
theta_sk = update_theta_sketch(lgk)
|
70
|
+
for i in range(n, 2*n):
|
71
|
+
theta_sk.update(i)
|
72
|
+
cts = compact_tuple_sketch(theta_sk, 5)
|
73
|
+
cumSum = 0
|
74
|
+
for pair in cts:
|
75
|
+
cumSum += pair[1]
|
76
|
+
self.assertEqual(cumSum, 5 * cts.get_num_retained())
|
77
|
+
|
78
|
+
|
79
|
+
def test_tuple_set_operations(self):
|
80
|
+
lgk = 12 # 2^k = 4096 rows in the table
|
81
|
+
n = 1 << 18 # ~256k unique values
|
82
|
+
|
83
|
+
# we'll have 1/4 of the values overlap
|
84
|
+
offset = int(3 * n / 4) # it's a float w/o cast
|
85
|
+
|
86
|
+
# create a couple sketches and inject some values, with different summaries
|
87
|
+
sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5)
|
88
|
+
sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset)
|
89
|
+
|
90
|
+
# UNIONS
|
91
|
+
# create a union object
|
92
|
+
union = tuple_union(MaxIntPolicy(), lgk)
|
93
|
+
union.update(sk1)
|
94
|
+
union.update(sk2)
|
95
|
+
|
96
|
+
# getting result from union returns a compact_tuple_sketch
|
97
|
+
# compact tuple sketches can be used in additional unions
|
98
|
+
# or set operations but cannot accept further item updates
|
99
|
+
result = union.get_result()
|
100
|
+
self.assertTrue(isinstance(result, compact_tuple_sketch))
|
101
|
+
|
102
|
+
# since our process here is deterministic, we have
|
103
|
+
# checked and know the exact answer is within one
|
104
|
+
# standard deviation of the estimate
|
105
|
+
self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
|
106
|
+
self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
|
107
|
+
|
108
|
+
# we unioned two equal-sized sketches with overlap and used
|
109
|
+
# the max value as the resulting summary, meaning we should
|
110
|
+
# have more summaries with value 7 than value 5 in the result
|
111
|
+
count5 = 0
|
112
|
+
count7 = 0
|
113
|
+
for pair in result:
|
114
|
+
if pair[1] == 5:
|
115
|
+
count5 += 1
|
116
|
+
elif pair[1] == 7:
|
117
|
+
count7 += 1
|
118
|
+
else:
|
119
|
+
self.fail()
|
120
|
+
self.assertLess(count5, count7)
|
121
|
+
|
122
|
+
# INTERSECTIONS
|
123
|
+
# create an intersection object
|
124
|
+
intersect = tuple_intersection(MinIntPolicy()) # no lg_k
|
125
|
+
intersect.update(sk1)
|
126
|
+
intersect.update(sk2)
|
127
|
+
|
128
|
+
# has_result() indicates the intersection has been used,
|
129
|
+
# although the result may be the empty set
|
130
|
+
self.assertTrue(intersect.has_result())
|
131
|
+
|
132
|
+
# as with unions, the result is a compact sketch
|
133
|
+
result = intersect.get_result()
|
134
|
+
self.assertTrue(isinstance(result, compact_tuple_sketch))
|
135
|
+
|
136
|
+
# we know the sets overlap by 1/4
|
137
|
+
self.assertLessEqual(result.get_lower_bound(1), n / 4)
|
138
|
+
self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
|
139
|
+
|
140
|
+
# in this example, we intersected the sketches and took the
|
141
|
+
# min value as the resulting summary, so all summaries
|
142
|
+
# must be exactly equal to that value
|
143
|
+
count5 = 0
|
144
|
+
for pair in result:
|
145
|
+
if pair[1] == 5:
|
146
|
+
count5 += 1
|
147
|
+
else:
|
148
|
+
self.fail()
|
149
|
+
self.assertEqual(count5, result.get_num_retained())
|
150
|
+
|
151
|
+
# A NOT B
|
152
|
+
# create an a_not_b object
|
153
|
+
anb = tuple_a_not_b() # no lg_k or policy
|
154
|
+
result = anb.compute(sk1, sk2)
|
155
|
+
|
156
|
+
# as with unions, the result is a compact sketch
|
157
|
+
self.assertTrue(isinstance(result, compact_tuple_sketch))
|
158
|
+
|
159
|
+
# we know the sets overlap by 1/4, so the remainder is 3/4
|
160
|
+
self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
|
161
|
+
self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
|
162
|
+
|
163
|
+
# here, we have only values with a summary of 5 as any keys that
|
164
|
+
# existed in both sketches were removed
|
165
|
+
count5 = 0
|
166
|
+
for pair in result:
|
167
|
+
if pair[1] == 5:
|
168
|
+
count5 += 1
|
169
|
+
else:
|
170
|
+
self.fail()
|
171
|
+
self.assertEqual(count5, result.get_num_retained())
|
172
|
+
|
173
|
+
# JACCARD SIMILARITY
|
174
|
+
# Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
|
175
|
+
# and does not examine summaries, even for (dis)similarity tests.
|
176
|
+
jac = tuple_jaccard_similarity.jaccard(sk1, sk2)
|
177
|
+
|
178
|
+
# we can check that results are in the expected order
|
179
|
+
self.assertLess(jac[0], jac[1])
|
180
|
+
self.assertLess(jac[1], jac[2])
|
181
|
+
|
182
|
+
# checks for sketch equivalence
|
183
|
+
self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1))
|
184
|
+
self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2))
|
185
|
+
|
186
|
+
# we can apply a check for similarity or dissimilarity at a
|
187
|
+
# given threshold, at 97.7% confidence.
|
188
|
+
|
189
|
+
# check that the Jaccard Index is at most (upper bound) 0.2.
|
190
|
+
# exact result would be 1/7
|
191
|
+
self.assertTrue(tuple_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
|
192
|
+
|
193
|
+
# check that the Jaccard Index is at least (lower bound) 0.7
|
194
|
+
# exact result would be 3/4, using result from A NOT B test
|
195
|
+
self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
|
196
|
+
|
197
|
+
|
198
|
+
# Generates a basic tuple sketch with a fixed value for each update
|
199
|
+
def generate_tuple_sketch(self, policy, n, lgk, value, offset=0):
|
200
|
+
sk = update_tuple_sketch(policy, lgk)
|
201
|
+
for i in range(0, n):
|
202
|
+
sk.update(i + offset, value)
|
203
|
+
return sk
|
204
|
+
|
205
|
+
if __name__ == '__main__':
|
206
|
+
unittest.main()
|
@@ -45,6 +45,13 @@ class VoTest(unittest.TestCase):
|
|
45
45
|
items = vo.get_samples()
|
46
46
|
self.assertEqual(len(items), k)
|
47
47
|
|
48
|
+
count = 0
|
49
|
+
for tuple in vo:
|
50
|
+
sample = tuple[0]
|
51
|
+
weight = tuple[1]
|
52
|
+
count = count + 1
|
53
|
+
self.assertEqual(count, vo.num_samples)
|
54
|
+
|
48
55
|
# we can also apply a predicate to the sketch to get an estimate
|
49
56
|
# (with optimally minimal variance) of the subset sum of items
|
50
57
|
# matching that predicate among the entire population
|
@@ -580,15 +580,20 @@ private:
|
|
580
580
|
|
581
581
|
|
582
582
|
template<typename T, typename C, typename A>
|
583
|
-
class quantiles_sketch<T, C, A>::const_iterator
|
583
|
+
class quantiles_sketch<T, C, A>::const_iterator {
|
584
584
|
public:
|
585
|
+
using iterator_category = std::input_iterator_tag;
|
585
586
|
using value_type = std::pair<const T&, const uint64_t>;
|
587
|
+
using difference_type = void;
|
588
|
+
using pointer = const return_value_holder<value_type>;
|
589
|
+
using reference = const value_type;
|
590
|
+
|
586
591
|
const_iterator& operator++();
|
587
592
|
const_iterator& operator++(int);
|
588
593
|
bool operator==(const const_iterator& other) const;
|
589
594
|
bool operator!=(const const_iterator& other) const;
|
590
|
-
|
591
|
-
|
595
|
+
reference operator*() const;
|
596
|
+
pointer operator->() const;
|
592
597
|
private:
|
593
598
|
friend class quantiles_sketch<T, C, A>;
|
594
599
|
using Level = std::vector<T, A>;
|
@@ -645,12 +645,12 @@ string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_ite
|
|
645
645
|
uint8_t level = 0;
|
646
646
|
os << " BB:" << std::endl;
|
647
647
|
for (const T& item : base_buffer_) {
|
648
|
-
os << " " <<
|
648
|
+
os << " " << item << std::endl;
|
649
649
|
}
|
650
650
|
for (uint8_t i = 0; i < levels_.size(); ++i) {
|
651
651
|
os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
|
652
652
|
for (const T& item : levels_[i]) {
|
653
|
-
os << " " <<
|
653
|
+
os << " " << item << std::endl;
|
654
654
|
}
|
655
655
|
}
|
656
656
|
os << "### End sketch data" << std::endl;
|
@@ -1354,12 +1354,12 @@ bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator&
|
|
1354
1354
|
}
|
1355
1355
|
|
1356
1356
|
template<typename T, typename C, typename A>
|
1357
|
-
auto quantiles_sketch<T, C, A>::const_iterator::operator*() const ->
|
1357
|
+
auto quantiles_sketch<T, C, A>::const_iterator::operator*() const -> reference {
|
1358
1358
|
return value_type(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
|
1359
1359
|
}
|
1360
1360
|
|
1361
1361
|
template<typename T, typename C, typename A>
|
1362
|
-
auto quantiles_sketch<T, C, A>::const_iterator::operator->() const ->
|
1362
|
+
auto quantiles_sketch<T, C, A>::const_iterator::operator->() const -> pointer {
|
1363
1363
|
return **this;
|
1364
1364
|
}
|
1365
1365
|
|
@@ -260,7 +260,7 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
|
|
260
260
|
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
261
261
|
}
|
262
262
|
subtotal_pmf += pmf[i];
|
263
|
-
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
263
|
+
if (std::abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
264
264
|
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
265
265
|
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
266
266
|
}
|
@@ -399,15 +399,20 @@ private:
|
|
399
399
|
};
|
400
400
|
|
401
401
|
template<typename T, typename C, typename A>
|
402
|
-
class req_sketch<T, C, A>::const_iterator
|
402
|
+
class req_sketch<T, C, A>::const_iterator {
|
403
403
|
public:
|
404
|
+
using iterator_category = std::input_iterator_tag;
|
404
405
|
using value_type = std::pair<const T&, const uint64_t>;
|
406
|
+
using difference_type = void;
|
407
|
+
using pointer = const return_value_holder<value_type>;
|
408
|
+
using reference = const value_type;
|
409
|
+
|
405
410
|
const_iterator& operator++();
|
406
411
|
const_iterator& operator++(int);
|
407
412
|
bool operator==(const const_iterator& other) const;
|
408
413
|
bool operator!=(const const_iterator& other) const;
|
409
|
-
|
410
|
-
|
414
|
+
reference operator*() const;
|
415
|
+
pointer operator->() const;
|
411
416
|
private:
|
412
417
|
using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
|
413
418
|
LevelsIterator levels_it_;
|
@@ -848,12 +848,12 @@ bool req_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other
|
|
848
848
|
}
|
849
849
|
|
850
850
|
template<typename T, typename C, typename A>
|
851
|
-
auto req_sketch<T, C, A>::const_iterator::operator*() const ->
|
851
|
+
auto req_sketch<T, C, A>::const_iterator::operator*() const -> reference {
|
852
852
|
return value_type(*compactor_it_, 1ULL << (*levels_it_).get_lg_weight());
|
853
853
|
}
|
854
854
|
|
855
855
|
template<typename T, typename C, typename A>
|
856
|
-
auto req_sketch<T, C, A>::const_iterator::operator->() const ->
|
856
|
+
auto req_sketch<T, C, A>::const_iterator::operator->() const -> pointer {
|
857
857
|
return **this;
|
858
858
|
}
|
859
859
|
|