datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -16,11 +16,12 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch, ks_test
19
+ from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch
20
+ from datasketches import kll_items_sketch, ks_test, PyStringsSerDe
20
21
  import numpy as np
21
22
 
22
23
  class KllTest(unittest.TestCase):
23
- def test_kll_example(self):
24
+ def test_kll_floats_example(self):
24
25
  k = 160
25
26
  n = 2 ** 20
26
27
 
@@ -61,12 +62,14 @@ class KllTest(unittest.TestCase):
61
62
  self.assertLess(kll.get_num_retained(), n)
62
63
 
63
64
  # merging itself will double the number of items the sketch has seen
64
- kll.merge(kll)
65
+ # but need to do that with a copy
66
+ kll_copy = kll_floats_sketch(kll)
67
+ kll.merge(kll_copy)
65
68
  self.assertEqual(kll.get_n(), 2*n)
66
69
 
67
70
  # we can then serialize and reconstruct the sketch
68
71
  kll_bytes = kll.serialize()
69
- new_kll = kll.deserialize(kll_bytes)
72
+ new_kll = kll_floats_sketch.deserialize(kll_bytes)
70
73
  self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
71
74
  self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
72
75
  self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
@@ -78,6 +81,12 @@ class KllTest(unittest.TestCase):
78
81
  # they come from the same distribution (since they do)
79
82
  self.assertFalse(ks_test(kll, new_kll, 0.001))
80
83
 
84
+ total_weight = 0
85
+ for tuple in kll:
86
+ item = tuple[0]
87
+ weight = tuple[1]
88
+ total_weight = total_weight + weight
89
+ self.assertEqual(total_weight, kll.get_n())
81
90
 
82
91
  def test_kll_ints_sketch(self):
83
92
  k = 100
@@ -108,8 +117,9 @@ class KllTest(unittest.TestCase):
108
117
 
109
118
  self.assertEqual(kll.get_rank(round(n/2)), 0.5)
110
119
 
111
- # merge self
112
- kll.merge(kll)
120
+ # merge copy of self
121
+ kll_copy = kll_ints_sketch(kll)
122
+ kll.merge(kll_copy)
113
123
  self.assertEqual(kll.get_n(), 2 * n)
114
124
 
115
125
  sk_bytes = kll.serialize()
@@ -121,5 +131,29 @@ class KllTest(unittest.TestCase):
121
131
  kll = kll_doubles_sketch(k)
122
132
  self.assertTrue(kll.is_empty())
123
133
 
134
+ def test_kll_items_sketch(self):
135
+ # most functionality has been tested, but we need to ensure objects and sorting work
136
+ # as well as serialization
137
+ k = 100
138
+ n = 2 ** 16
139
+
140
+ # create a sketch and inject enough points to force compaction
141
+ kll = kll_items_sketch(k)
142
+ for i in range(0, n):
143
+ kll.update(str(i))
144
+
145
+ kll_copy = kll_items_sketch(kll)
146
+ kll.merge(kll_copy)
147
+ self.assertEqual(kll.get_n(), 2 * n)
148
+
149
+ kll_bytes = kll.serialize(PyStringsSerDe())
150
+ new_kll = kll_items_sketch.deserialize(kll_bytes, PyStringsSerDe())
151
+ self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
152
+ self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
153
+ self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
154
+ self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
155
+ self.assertEqual(kll.get_rank(str(n/4)), new_kll.get_rank(str(n/4)))
156
+
157
+
124
158
  if __name__ == '__main__':
125
159
  unittest.main()
@@ -16,11 +16,12 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch, ks_test
19
+ from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
20
+ from datasketches import quantiles_items_sketch, ks_test, PyStringsSerDe
20
21
  import numpy as np
21
22
 
22
23
  class QuantilesTest(unittest.TestCase):
23
- def test_quantiles_example(self):
24
+ def test_quantiles_floats_example(self):
24
25
  k = 128
25
26
  n = 2 ** 20
26
27
 
@@ -61,12 +62,13 @@ class QuantilesTest(unittest.TestCase):
61
62
  self.assertLess(quantiles.get_num_retained(), n)
62
63
 
63
64
  # merging itself will double the number of items the sketch has seen
64
- quantiles.merge(quantiles)
65
+ quantiles_copy = quantiles_floats_sketch(quantiles)
66
+ quantiles.merge(quantiles_copy)
65
67
  self.assertEqual(quantiles.get_n(), 2*n)
66
68
 
67
69
  # we can then serialize and reconstruct the sketch
68
70
  quantiles_bytes = quantiles.serialize()
69
- new_quantiles = quantiles.deserialize(quantiles_bytes)
71
+ new_quantiles = quantiles_floats_sketch.deserialize(quantiles_bytes)
70
72
  self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
71
73
  self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
72
74
  self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
@@ -80,6 +82,13 @@ class QuantilesTest(unittest.TestCase):
80
82
  unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
81
83
  self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
82
84
 
85
+ total_weight = 0
86
+ for tuple in quantiles:
87
+ item = tuple[0]
88
+ weight = tuple[1]
89
+ total_weight = total_weight + weight
90
+ self.assertEqual(total_weight, quantiles.get_n())
91
+
83
92
  def test_quantiles_ints_sketch(self):
84
93
  k = 128
85
94
  n = 10
@@ -110,7 +119,8 @@ class QuantilesTest(unittest.TestCase):
110
119
  self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
111
120
 
112
121
  # merge self
113
- quantiles.merge(quantiles)
122
+ quantiles_copy = quantiles_ints_sketch(quantiles)
123
+ quantiles.merge(quantiles_copy)
114
124
  self.assertEqual(quantiles.get_n(), 2 * n)
115
125
 
116
126
  sk_bytes = quantiles.serialize()
@@ -122,5 +132,29 @@ class QuantilesTest(unittest.TestCase):
122
132
  quantiles = quantiles_doubles_sketch(k)
123
133
  self.assertTrue(quantiles.is_empty())
124
134
 
135
+ def test_quantiles_items_sketch(self):
136
+ # most functionality has been tested, but we need to ensure objects and sorting work
137
+ # as well as serialization
138
+ k = 128
139
+ n = 2 ** 16
140
+
141
+ # create a sketch and inject enough points to force compaction
142
+ quantiles = quantiles_items_sketch(k)
143
+ for i in range(0, n):
144
+ quantiles.update(str(i))
145
+
146
+ quantiles_copy = quantiles_items_sketch(quantiles)
147
+ quantiles.merge(quantiles_copy)
148
+ self.assertEqual(quantiles.get_n(), 2 * n)
149
+
150
+ quantiles_bytes = quantiles.serialize(PyStringsSerDe())
151
+ new_quantiles = quantiles_items_sketch.deserialize(quantiles_bytes, PyStringsSerDe())
152
+ self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
153
+ self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
154
+ self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
155
+ self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
156
+ self.assertEqual(quantiles.get_rank(str(n/4)), new_quantiles.get_rank(str(n/4)))
157
+
158
+
125
159
  if __name__ == '__main__':
126
160
  unittest.main()
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import req_ints_sketch, req_floats_sketch
19
+ from datasketches import req_ints_sketch, req_floats_sketch, req_items_sketch, PyStringsSerDe
20
20
  import numpy as np
21
21
 
22
22
  class reqTest(unittest.TestCase):
@@ -67,18 +67,26 @@ class reqTest(unittest.TestCase):
67
67
  self.assertEqual(req.get_k(), k)
68
68
 
69
69
  # merging itself will double the number of items the sketch has seen
70
- req.merge(req)
70
+ req_copy = req_floats_sketch(req)
71
+ req.merge(req_copy)
71
72
  self.assertEqual(req.get_n(), 2*n)
72
73
 
73
74
  # we can then serialize and reconstruct the sketch
74
75
  req_bytes = req.serialize()
75
- new_req = req.deserialize(req_bytes)
76
+ new_req = req_floats_sketch.deserialize(req_bytes)
76
77
  self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
77
78
  self.assertEqual(req.get_min_value(), new_req.get_min_value())
78
79
  self.assertEqual(req.get_max_value(), new_req.get_max_value())
79
80
  self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
80
81
  self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
81
82
 
83
+ total_weight = 0
84
+ for tuple in req:
85
+ item = tuple[0]
86
+ weight = tuple[1]
87
+ total_weight = total_weight + weight
88
+ self.assertEqual(total_weight, req.get_n())
89
+
82
90
  def test_req_ints_sketch(self):
83
91
  k = 100
84
92
  n = 10
@@ -109,18 +117,43 @@ class reqTest(unittest.TestCase):
109
117
  self.assertEqual(req.get_rank(round(n/2)), 0.5)
110
118
 
111
119
  # merge self
112
- req.merge(req)
120
+ req_copy = req_ints_sketch(req)
121
+ req.merge(req_copy)
113
122
  self.assertEqual(req.get_n(), 2 * n)
114
123
 
115
124
  sk_bytes = req.serialize()
116
125
  self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
117
126
 
118
127
  def test_req_floats_sketch(self):
119
- # already tested ints and it's templatized, so just make sure it instantiates properly
128
+ # already tested floats with HRA so just check that LRA works
120
129
  k = 75
121
130
  req = req_floats_sketch(k, False) # low rank accuracy
122
131
  self.assertTrue(req.is_empty())
123
132
  self.assertFalse(req.is_hra())
124
133
 
134
+ def test_req_items_sketch(self):
135
+ # most functionality has been tested, but we need to ensure objects and sorting work
136
+ # as well as serialization
137
+ k = 100
138
+ n = 2 ** 16
139
+
140
+ # create a sketch and inject enough points to force compaction
141
+ req = req_items_sketch(k)
142
+ for i in range(0, n):
143
+ req.update(str(i))
144
+
145
+ req_copy = req_items_sketch(req)
146
+ req.merge(req_copy)
147
+ self.assertEqual(req.get_n(), 2 * n)
148
+
149
+ req_bytes = req.serialize(PyStringsSerDe())
150
+ new_req = req_items_sketch.deserialize(req_bytes, PyStringsSerDe())
151
+ self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
152
+ self.assertEqual(req.get_min_value(), new_req.get_min_value())
153
+ self.assertEqual(req.get_max_value(), new_req.get_max_value())
154
+ self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
155
+ self.assertEqual(req.get_rank(str(n/4)), new_req.get_rank(str(n/4)))
156
+
157
+
125
158
  if __name__ == '__main__':
126
159
  unittest.main()
@@ -14,7 +14,7 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
-
17
+
18
18
  import unittest
19
19
 
20
20
  from datasketches import theta_sketch, update_theta_sketch
@@ -24,11 +24,11 @@ from datasketches import theta_jaccard_similarity
24
24
 
25
25
  class ThetaTest(unittest.TestCase):
26
26
  def test_theta_basic_example(self):
27
- k = 12 # 2^k = 4096 rows in the table
27
+ lgk = 12 # 2^lgk = 4096 rows in the table
28
28
  n = 1 << 18 # ~256k unique values
29
29
 
30
30
  # create a sketch and inject some values
31
- sk = self.generate_theta_sketch(n, k)
31
+ sk = self.generate_theta_sketch(n, lgk)
32
32
 
33
33
  # we can check that the upper and lower bounds bracket the
34
34
  # estimate, without needing to know the exact value.
@@ -48,20 +48,26 @@ class ThetaTest(unittest.TestCase):
48
48
  self.assertFalse(sk.is_empty())
49
49
  self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
50
50
 
51
+ count = 0
52
+ for hash in new_sk:
53
+ self.assertLess(hash, new_sk.get_theta64())
54
+ count = count + 1
55
+ self.assertEqual(count, new_sk.get_num_retained())
56
+
51
57
  def test_theta_set_operations(self):
52
- k = 12 # 2^k = 4096 rows in the table
58
+ lgk = 12 # 2^lgk = 4096 rows in the table
53
59
  n = 1 << 18 # ~256k unique values
54
60
 
55
61
  # we'll have 1/4 of the values overlap
56
62
  offset = int(3 * n / 4) # it's a float w/o cast
57
63
 
58
64
  # create a couple sketches and inject some values
59
- sk1 = self.generate_theta_sketch(n, k)
60
- sk2 = self.generate_theta_sketch(n, k, offset)
65
+ sk1 = self.generate_theta_sketch(n, lgk)
66
+ sk2 = self.generate_theta_sketch(n, lgk, offset)
61
67
 
62
68
  # UNIONS
63
69
  # create a union object
64
- union = theta_union(k)
70
+ union = theta_union(lgk)
65
71
  union.update(sk1)
66
72
  union.update(sk2)
67
73
 
@@ -77,7 +83,6 @@ class ThetaTest(unittest.TestCase):
77
83
  self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
78
84
  self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
79
85
 
80
-
81
86
  # INTERSECTIONS
82
87
  # create an intersection object
83
88
  intersect = theta_intersection() # no lg_k
@@ -96,7 +101,6 @@ class ThetaTest(unittest.TestCase):
96
101
  self.assertLessEqual(result.get_lower_bound(1), n / 4)
97
102
  self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
98
103
 
99
-
100
104
  # A NOT B
101
105
  # create an a_not_b object
102
106
  anb = theta_a_not_b() # no lg_k
@@ -134,13 +138,11 @@ class ThetaTest(unittest.TestCase):
134
138
  self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
135
139
 
136
140
 
137
- def generate_theta_sketch(self, n, k, offset=0):
138
- sk = update_theta_sketch(k)
141
+ def generate_theta_sketch(self, n, lgk, offset=0):
142
+ sk = update_theta_sketch(lgk)
139
143
  for i in range(0, n):
140
144
  sk.update(i + offset)
141
145
  return sk
142
-
146
+
143
147
  if __name__ == '__main__':
144
148
  unittest.main()
145
-
146
-
@@ -0,0 +1,206 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+
20
+ from datasketches import update_tuple_sketch
21
+ from datasketches import compact_tuple_sketch, tuple_union
22
+ from datasketches import tuple_intersection, tuple_a_not_b
23
+ from datasketches import tuple_jaccard_similarity
24
+ from datasketches import tuple_jaccard_similarity, PyIntsSerDe
25
+ from datasketches import AccumulatorPolicy, MaxIntPolicy, MinIntPolicy
26
+ from datasketches import update_theta_sketch
27
+
28
+ class TupleTest(unittest.TestCase):
29
+ def test_tuple_basic_example(self):
30
+ lgk = 12 # 2^lgk = 4096 rows in the table
31
+ n = 1 << 18 # ~256k unique values
32
+
33
+ # create a sketch and inject some values -- summary is 2 so we can sum them
34
+ # and know the result
35
+ sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2)
36
+
37
+ # we can check that the upper and lower bounds bracket the
38
+ # estimate, without needing to know the exact value.
39
+ self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
40
+ self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
41
+
42
+ # because this sketch is deterministically generated, we can
43
+ # also compare against the exact value
44
+ self.assertLessEqual(sk.get_lower_bound(1), n)
45
+ self.assertGreaterEqual(sk.get_upper_bound(1), n)
46
+
47
+ # compact and serialize for storage, then reconstruct
48
+ sk_bytes = sk.compact().serialize(PyIntsSerDe())
49
+ new_sk = compact_tuple_sketch.deserialize(sk_bytes, serde=PyIntsSerDe())
50
+
51
+ # estimate remains unchanged
52
+ self.assertFalse(sk.is_empty())
53
+ self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
54
+
55
+ # we can also iterate over the sketch entries
56
+ # the iterator provides a (hashkey, summary) pair where the
57
+ # first value is the raw hash value and the second the summary
58
+ count = 0
59
+ cumSum = 0
60
+ for pair in new_sk:
61
+ self.assertLess(pair[0], new_sk.get_theta64())
62
+ count += 1
63
+ cumSum += pair[1]
64
+ self.assertEqual(count, new_sk.get_num_retained())
65
+ self.assertEqual(cumSum, 2 * new_sk.get_num_retained())
66
+
67
+ # we can even create a tuple sketch from an existing theta sketch
68
+ # as long as we provide a summary to use
69
+ theta_sk = update_theta_sketch(lgk)
70
+ for i in range(n, 2*n):
71
+ theta_sk.update(i)
72
+ cts = compact_tuple_sketch(theta_sk, 5)
73
+ cumSum = 0
74
+ for pair in cts:
75
+ cumSum += pair[1]
76
+ self.assertEqual(cumSum, 5 * cts.get_num_retained())
77
+
78
+
79
+ def test_tuple_set_operations(self):
80
+ lgk = 12 # 2^lgk = 4096 rows in the table
81
+ n = 1 << 18 # ~256k unique values
82
+
83
+ # we'll have 1/4 of the values overlap
84
+ offset = int(3 * n / 4) # it's a float w/o cast
85
+
86
+ # create a couple sketches and inject some values, with different summaries
87
+ sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5)
88
+ sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset)
89
+
90
+ # UNIONS
91
+ # create a union object
92
+ union = tuple_union(MaxIntPolicy(), lgk)
93
+ union.update(sk1)
94
+ union.update(sk2)
95
+
96
+ # getting result from union returns a compact_tuple_sketch
97
+ # compact tuple sketches can be used in additional unions
98
+ # or set operations but cannot accept further item updates
99
+ result = union.get_result()
100
+ self.assertTrue(isinstance(result, compact_tuple_sketch))
101
+
102
+ # since our process here is deterministic, we have
103
+ # checked and know the exact answer is within one
104
+ # standard deviation of the estimate
105
+ self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
106
+ self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
107
+
108
+ # we unioned two equal-sized sketches with overlap and used
109
+ # the max value as the resulting summary, meaning we should
110
+ # have more summaries with value 7 than value 5 in the result
111
+ count5 = 0
112
+ count7 = 0
113
+ for pair in result:
114
+ if pair[1] == 5:
115
+ count5 += 1
116
+ elif pair[1] == 7:
117
+ count7 += 1
118
+ else:
119
+ self.fail()
120
+ self.assertLess(count5, count7)
121
+
122
+ # INTERSECTIONS
123
+ # create an intersection object
124
+ intersect = tuple_intersection(MinIntPolicy()) # no lg_k
125
+ intersect.update(sk1)
126
+ intersect.update(sk2)
127
+
128
+ # has_result() indicates the intersection has been used,
129
+ # although the result may be the empty set
130
+ self.assertTrue(intersect.has_result())
131
+
132
+ # as with unions, the result is a compact sketch
133
+ result = intersect.get_result()
134
+ self.assertTrue(isinstance(result, compact_tuple_sketch))
135
+
136
+ # we know the sets overlap by 1/4
137
+ self.assertLessEqual(result.get_lower_bound(1), n / 4)
138
+ self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
139
+
140
+ # in this example, we intersected the sketches and took the
141
+ # min value as the resulting summary, so all summaries
142
+ # must be exactly equal to that value
143
+ count5 = 0
144
+ for pair in result:
145
+ if pair[1] == 5:
146
+ count5 += 1
147
+ else:
148
+ self.fail()
149
+ self.assertEqual(count5, result.get_num_retained())
150
+
151
+ # A NOT B
152
+ # create an a_not_b object
153
+ anb = tuple_a_not_b() # no lg_k or policy
154
+ result = anb.compute(sk1, sk2)
155
+
156
+ # as with unions, the result is a compact sketch
157
+ self.assertTrue(isinstance(result, compact_tuple_sketch))
158
+
159
+ # we know the sets overlap by 1/4, so the remainder is 3/4
160
+ self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
161
+ self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
162
+
163
+ # here, we have only values with a summary of 5 as any keys that
164
+ # existed in both sketches were removed
165
+ count5 = 0
166
+ for pair in result:
167
+ if pair[1] == 5:
168
+ count5 += 1
169
+ else:
170
+ self.fail()
171
+ self.assertEqual(count5, result.get_num_retained())
172
+
173
+ # JACCARD SIMILARITY
174
+ # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
175
+ # and does not examine summaries, even for (dis)similarity tests.
176
+ jac = tuple_jaccard_similarity.jaccard(sk1, sk2)
177
+
178
+ # we can check that results are in the expected order
179
+ self.assertLess(jac[0], jac[1])
180
+ self.assertLess(jac[1], jac[2])
181
+
182
+ # checks for sketch equivalence
183
+ self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1))
184
+ self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2))
185
+
186
+ # we can apply a check for similarity or dissimilarity at a
187
+ # given threshold, at 97.7% confidence.
188
+
189
+ # check that the Jaccard Index is at most (upper bound) 0.2.
190
+ # exact result would be 1/7
191
+ self.assertTrue(tuple_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
192
+
193
+ # check that the Jaccard Index is at least (lower bound) 0.7
194
+ # exact result would be 3/4, using result from A NOT B test
195
+ self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
196
+
197
+
198
+ # Generates a basic tuple sketch with a fixed value for each update
199
+ def generate_tuple_sketch(self, policy, n, lgk, value, offset=0):
200
+ sk = update_tuple_sketch(policy, lgk)
201
+ for i in range(0, n):
202
+ sk.update(i + offset, value)
203
+ return sk
204
+
205
+ if __name__ == '__main__':
206
+ unittest.main()
@@ -45,6 +45,13 @@ class VoTest(unittest.TestCase):
45
45
  items = vo.get_samples()
46
46
  self.assertEqual(len(items), k)
47
47
 
48
+ count = 0
49
+ for tuple in vo:
50
+ sample = tuple[0]
51
+ weight = tuple[1]
52
+ count = count + 1
53
+ self.assertEqual(count, vo.num_samples)
54
+
48
55
  # we can also apply a predicate to the sketch to get an estimate
49
56
  # (with optimally minimal variance) of the subset sum of items
50
57
  # matching that predicate among the entire population
@@ -580,15 +580,20 @@ private:
580
580
 
581
581
 
582
582
  template<typename T, typename C, typename A>
583
- class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
583
+ class quantiles_sketch<T, C, A>::const_iterator {
584
584
  public:
585
+ using iterator_category = std::input_iterator_tag;
585
586
  using value_type = std::pair<const T&, const uint64_t>;
587
+ using difference_type = void;
588
+ using pointer = const return_value_holder<value_type>;
589
+ using reference = const value_type;
590
+
586
591
  const_iterator& operator++();
587
592
  const_iterator& operator++(int);
588
593
  bool operator==(const const_iterator& other) const;
589
594
  bool operator!=(const const_iterator& other) const;
590
- const value_type operator*() const;
591
- const return_value_holder<value_type> operator->() const;
595
+ reference operator*() const;
596
+ pointer operator->() const;
592
597
  private:
593
598
  friend class quantiles_sketch<T, C, A>;
594
599
  using Level = std::vector<T, A>;
@@ -645,12 +645,12 @@ string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_ite
645
645
  uint8_t level = 0;
646
646
  os << " BB:" << std::endl;
647
647
  for (const T& item : base_buffer_) {
648
- os << " " << std::to_string(item) << std::endl;
648
+ os << " " << item << std::endl;
649
649
  }
650
650
  for (uint8_t i = 0; i < levels_.size(); ++i) {
651
651
  os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
652
652
  for (const T& item : levels_[i]) {
653
- os << " " << std::to_string(item) << std::endl;
653
+ os << " " << item << std::endl;
654
654
  }
655
655
  }
656
656
  os << "### End sketch data" << std::endl;
@@ -1354,12 +1354,12 @@ bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator&
1354
1354
  }
1355
1355
 
1356
1356
  template<typename T, typename C, typename A>
1357
- auto quantiles_sketch<T, C, A>::const_iterator::operator*() const -> const value_type {
1357
+ auto quantiles_sketch<T, C, A>::const_iterator::operator*() const -> reference {
1358
1358
  return value_type(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
1359
1359
  }
1360
1360
 
1361
1361
  template<typename T, typename C, typename A>
1362
- auto quantiles_sketch<T, C, A>::const_iterator::operator->() const -> const return_value_holder<value_type> {
1362
+ auto quantiles_sketch<T, C, A>::const_iterator::operator->() const -> pointer {
1363
1363
  return **this;
1364
1364
  }
1365
1365
 
@@ -260,7 +260,7 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
260
260
  REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
261
261
  }
262
262
  subtotal_pmf += pmf[i];
263
- if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
263
+ if (std::abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
264
264
  std::cerr << "CDF vs PMF for value " << i << std::endl;
265
265
  REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
266
266
  }
@@ -28,8 +28,6 @@
28
28
  #include "conditional_forward.hpp"
29
29
  #include "common_defs.hpp"
30
30
 
31
- #include <iomanip>
32
-
33
31
  namespace datasketches {
34
32
 
35
33
  template<typename T, typename C, typename A>
@@ -399,15 +399,20 @@ private:
399
399
  };
400
400
 
401
401
  template<typename T, typename C, typename A>
402
- class req_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
402
+ class req_sketch<T, C, A>::const_iterator {
403
403
  public:
404
+ using iterator_category = std::input_iterator_tag;
404
405
  using value_type = std::pair<const T&, const uint64_t>;
406
+ using difference_type = void;
407
+ using pointer = const return_value_holder<value_type>;
408
+ using reference = const value_type;
409
+
405
410
  const_iterator& operator++();
406
411
  const_iterator& operator++(int);
407
412
  bool operator==(const const_iterator& other) const;
408
413
  bool operator!=(const const_iterator& other) const;
409
- const value_type operator*() const;
410
- const return_value_holder<value_type> operator->() const;
414
+ reference operator*() const;
415
+ pointer operator->() const;
411
416
  private:
412
417
  using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
413
418
  LevelsIterator levels_it_;
@@ -848,12 +848,12 @@ bool req_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other
848
848
  }
849
849
 
850
850
  template<typename T, typename C, typename A>
851
- auto req_sketch<T, C, A>::const_iterator::operator*() const -> const value_type {
851
+ auto req_sketch<T, C, A>::const_iterator::operator*() const -> reference {
852
852
  return value_type(*compactor_it_, 1ULL << (*levels_it_).get_lg_weight());
853
853
  }
854
854
 
855
855
  template<typename T, typename C, typename A>
856
- auto req_sketch<T, C, A>::const_iterator::operator->() const -> const return_value_holder<value_type> {
856
+ auto req_sketch<T, C, A>::const_iterator::operator->() const -> pointer {
857
857
  return **this;
858
858
  }
859
859