datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -16,11 +16,12 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch, ks_test
19
+ from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch
20
+ from datasketches import kll_items_sketch, ks_test, PyStringsSerDe
20
21
  import numpy as np
21
22
 
22
23
  class KllTest(unittest.TestCase):
23
- def test_kll_example(self):
24
+ def test_kll_floats_example(self):
24
25
  k = 160
25
26
  n = 2 ** 20
26
27
 
@@ -61,12 +62,14 @@ class KllTest(unittest.TestCase):
61
62
  self.assertLess(kll.get_num_retained(), n)
62
63
 
63
64
  # merging itself will double the number of items the sketch has seen
64
- kll.merge(kll)
65
+ # but need to do that with a copy
66
+ kll_copy = kll_floats_sketch(kll)
67
+ kll.merge(kll_copy)
65
68
  self.assertEqual(kll.get_n(), 2*n)
66
69
 
67
70
  # we can then serialize and reconstruct the sketch
68
71
  kll_bytes = kll.serialize()
69
- new_kll = kll.deserialize(kll_bytes)
72
+ new_kll = kll_floats_sketch.deserialize(kll_bytes)
70
73
  self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
71
74
  self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
72
75
  self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
@@ -78,6 +81,12 @@ class KllTest(unittest.TestCase):
78
81
  # they come from the same distribution (since they do)
79
82
  self.assertFalse(ks_test(kll, new_kll, 0.001))
80
83
 
84
+ total_weight = 0
85
+ for tuple in kll:
86
+ item = tuple[0]
87
+ weight = tuple[1]
88
+ total_weight = total_weight + weight
89
+ self.assertEqual(total_weight, kll.get_n())
81
90
 
82
91
  def test_kll_ints_sketch(self):
83
92
  k = 100
@@ -108,8 +117,9 @@ class KllTest(unittest.TestCase):
108
117
 
109
118
  self.assertEqual(kll.get_rank(round(n/2)), 0.5)
110
119
 
111
- # merge self
112
- kll.merge(kll)
120
+ # merge copy of self
121
+ kll_copy = kll_ints_sketch(kll)
122
+ kll.merge(kll_copy)
113
123
  self.assertEqual(kll.get_n(), 2 * n)
114
124
 
115
125
  sk_bytes = kll.serialize()
@@ -121,5 +131,29 @@ class KllTest(unittest.TestCase):
121
131
  kll = kll_doubles_sketch(k)
122
132
  self.assertTrue(kll.is_empty())
123
133
 
134
+ def test_kll_items_sketch(self):
135
+ # most functionality has been tested, but we need to ensure objects and sorting work
136
+ # as well as serialization
137
+ k = 100
138
+ n = 2 ** 16
139
+
140
+ # create a sketch and inject enough points to force compaction
141
+ kll = kll_items_sketch(k)
142
+ for i in range(0, n):
143
+ kll.update(str(i))
144
+
145
+ kll_copy = kll_items_sketch(kll)
146
+ kll.merge(kll_copy)
147
+ self.assertEqual(kll.get_n(), 2 * n)
148
+
149
+ kll_bytes = kll.serialize(PyStringsSerDe())
150
+ new_kll = kll_items_sketch.deserialize(kll_bytes, PyStringsSerDe())
151
+ self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
152
+ self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
153
+ self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
154
+ self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
155
+ self.assertEqual(kll.get_rank(str(n/4)), new_kll.get_rank(str(n/4)))
156
+
157
+
124
158
  if __name__ == '__main__':
125
159
  unittest.main()
@@ -16,11 +16,12 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch, ks_test
19
+ from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
20
+ from datasketches import quantiles_items_sketch, ks_test, PyStringsSerDe
20
21
  import numpy as np
21
22
 
22
23
  class QuantilesTest(unittest.TestCase):
23
- def test_quantiles_example(self):
24
+ def test_quantiles_floats_example(self):
24
25
  k = 128
25
26
  n = 2 ** 20
26
27
 
@@ -61,12 +62,13 @@ class QuantilesTest(unittest.TestCase):
61
62
  self.assertLess(quantiles.get_num_retained(), n)
62
63
 
63
64
  # merging itself will double the number of items the sketch has seen
64
- quantiles.merge(quantiles)
65
+ quantiles_copy = quantiles_floats_sketch(quantiles)
66
+ quantiles.merge(quantiles_copy)
65
67
  self.assertEqual(quantiles.get_n(), 2*n)
66
68
 
67
69
  # we can then serialize and reconstruct the sketch
68
70
  quantiles_bytes = quantiles.serialize()
69
- new_quantiles = quantiles.deserialize(quantiles_bytes)
71
+ new_quantiles = quantiles_floats_sketch.deserialize(quantiles_bytes)
70
72
  self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
71
73
  self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
72
74
  self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
@@ -80,6 +82,13 @@ class QuantilesTest(unittest.TestCase):
80
82
  unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
81
83
  self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
82
84
 
85
+ total_weight = 0
86
+ for tuple in quantiles:
87
+ item = tuple[0]
88
+ weight = tuple[1]
89
+ total_weight = total_weight + weight
90
+ self.assertEqual(total_weight, quantiles.get_n())
91
+
83
92
  def test_quantiles_ints_sketch(self):
84
93
  k = 128
85
94
  n = 10
@@ -110,7 +119,8 @@ class QuantilesTest(unittest.TestCase):
110
119
  self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
111
120
 
112
121
  # merge self
113
- quantiles.merge(quantiles)
122
+ quantiles_copy = quantiles_ints_sketch(quantiles)
123
+ quantiles.merge(quantiles_copy)
114
124
  self.assertEqual(quantiles.get_n(), 2 * n)
115
125
 
116
126
  sk_bytes = quantiles.serialize()
@@ -122,5 +132,29 @@ class QuantilesTest(unittest.TestCase):
122
132
  quantiles = quantiles_doubles_sketch(k)
123
133
  self.assertTrue(quantiles.is_empty())
124
134
 
135
+ def test_quantiles_items_sketch(self):
136
+ # most functionality has been tested, but we need to ensure objects and sorting work
137
+ # as well as serialization
138
+ k = 128
139
+ n = 2 ** 16
140
+
141
+ # create a sketch and inject enough points to force compaction
142
+ quantiles = quantiles_items_sketch(k)
143
+ for i in range(0, n):
144
+ quantiles.update(str(i))
145
+
146
+ quantiles_copy = quantiles_items_sketch(quantiles)
147
+ quantiles.merge(quantiles_copy)
148
+ self.assertEqual(quantiles.get_n(), 2 * n)
149
+
150
+ quantiles_bytes = quantiles.serialize(PyStringsSerDe())
151
+ new_quantiles = quantiles_items_sketch.deserialize(quantiles_bytes, PyStringsSerDe())
152
+ self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
153
+ self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
154
+ self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
155
+ self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
156
+ self.assertEqual(quantiles.get_rank(str(n/4)), new_quantiles.get_rank(str(n/4)))
157
+
158
+
125
159
  if __name__ == '__main__':
126
160
  unittest.main()
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import req_ints_sketch, req_floats_sketch
19
+ from datasketches import req_ints_sketch, req_floats_sketch, req_items_sketch, PyStringsSerDe
20
20
  import numpy as np
21
21
 
22
22
  class reqTest(unittest.TestCase):
@@ -67,18 +67,26 @@ class reqTest(unittest.TestCase):
67
67
  self.assertEqual(req.get_k(), k)
68
68
 
69
69
  # merging itself will double the number of items the sketch has seen
70
- req.merge(req)
70
+ req_copy = req_floats_sketch(req)
71
+ req.merge(req_copy)
71
72
  self.assertEqual(req.get_n(), 2*n)
72
73
 
73
74
  # we can then serialize and reconstruct the sketch
74
75
  req_bytes = req.serialize()
75
- new_req = req.deserialize(req_bytes)
76
+ new_req = req_floats_sketch.deserialize(req_bytes)
76
77
  self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
77
78
  self.assertEqual(req.get_min_value(), new_req.get_min_value())
78
79
  self.assertEqual(req.get_max_value(), new_req.get_max_value())
79
80
  self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
80
81
  self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
81
82
 
83
+ total_weight = 0
84
+ for tuple in req:
85
+ item = tuple[0]
86
+ weight = tuple[1]
87
+ total_weight = total_weight + weight
88
+ self.assertEqual(total_weight, req.get_n())
89
+
82
90
  def test_req_ints_sketch(self):
83
91
  k = 100
84
92
  n = 10
@@ -109,18 +117,43 @@ class reqTest(unittest.TestCase):
109
117
  self.assertEqual(req.get_rank(round(n/2)), 0.5)
110
118
 
111
119
  # merge self
112
- req.merge(req)
120
+ req_copy = req_ints_sketch(req)
121
+ req.merge(req_copy)
113
122
  self.assertEqual(req.get_n(), 2 * n)
114
123
 
115
124
  sk_bytes = req.serialize()
116
125
  self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
117
126
 
118
127
  def test_req_floats_sketch(self):
119
- # already tested ints and it's templatized, so just make sure it instantiates properly
128
+ # already tested floats elsewhere, so just check that LRA (low rank accuracy) mode works
120
129
  k = 75
121
130
  req = req_floats_sketch(k, False) # low rank accuracy
122
131
  self.assertTrue(req.is_empty())
123
132
  self.assertFalse(req.is_hra())
124
133
 
134
+ def test_req_items_sketch(self):
135
+ # most functionality has been tested, but we need to ensure objects and sorting work
136
+ # as well as serialization
137
+ k = 100
138
+ n = 2 ** 16
139
+
140
+ # create a sketch and inject enough points to force compaction
141
+ req = req_items_sketch(k)
142
+ for i in range(0, n):
143
+ req.update(str(i))
144
+
145
+ req_copy = req_items_sketch(req)
146
+ req.merge(req_copy)
147
+ self.assertEqual(req.get_n(), 2 * n)
148
+
149
+ req_bytes = req.serialize(PyStringsSerDe())
150
+ new_req = req_items_sketch.deserialize(req_bytes, PyStringsSerDe())
151
+ self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
152
+ self.assertEqual(req.get_min_value(), new_req.get_min_value())
153
+ self.assertEqual(req.get_max_value(), new_req.get_max_value())
154
+ self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
155
+ self.assertEqual(req.get_rank(str(n/4)), new_req.get_rank(str(n/4)))
156
+
157
+
125
158
  if __name__ == '__main__':
126
159
  unittest.main()
@@ -14,7 +14,7 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
-
17
+
18
18
  import unittest
19
19
 
20
20
  from datasketches import theta_sketch, update_theta_sketch
@@ -24,11 +24,11 @@ from datasketches import theta_jaccard_similarity
24
24
 
25
25
  class ThetaTest(unittest.TestCase):
26
26
  def test_theta_basic_example(self):
27
- k = 12 # 2^k = 4096 rows in the table
27
+ lgk = 12 # 2^lgk = 4096 rows in the table
28
28
  n = 1 << 18 # ~256k unique values
29
29
 
30
30
  # create a sketch and inject some values
31
- sk = self.generate_theta_sketch(n, k)
31
+ sk = self.generate_theta_sketch(n, lgk)
32
32
 
33
33
  # we can check that the upper and lower bounds bracket the
34
34
  # estimate, without needing to know the exact value.
@@ -48,20 +48,26 @@ class ThetaTest(unittest.TestCase):
48
48
  self.assertFalse(sk.is_empty())
49
49
  self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
50
50
 
51
+ count = 0
52
+ for hash in new_sk:
53
+ self.assertLess(hash, new_sk.get_theta64())
54
+ count = count + 1
55
+ self.assertEqual(count, new_sk.get_num_retained())
56
+
51
57
  def test_theta_set_operations(self):
52
- k = 12 # 2^k = 4096 rows in the table
58
+ lgk = 12 # 2^lgk = 4096 rows in the table
53
59
  n = 1 << 18 # ~256k unique values
54
60
 
55
61
  # we'll have 1/4 of the values overlap
56
62
  offset = int(3 * n / 4) # it's a float w/o cast
57
63
 
58
64
  # create a couple sketches and inject some values
59
- sk1 = self.generate_theta_sketch(n, k)
60
- sk2 = self.generate_theta_sketch(n, k, offset)
65
+ sk1 = self.generate_theta_sketch(n, lgk)
66
+ sk2 = self.generate_theta_sketch(n, lgk, offset)
61
67
 
62
68
  # UNIONS
63
69
  # create a union object
64
- union = theta_union(k)
70
+ union = theta_union(lgk)
65
71
  union.update(sk1)
66
72
  union.update(sk2)
67
73
 
@@ -77,7 +83,6 @@ class ThetaTest(unittest.TestCase):
77
83
  self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
78
84
  self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
79
85
 
80
-
81
86
  # INTERSECTIONS
82
87
  # create an intersection object
83
88
  intersect = theta_intersection() # no lg_k
@@ -96,7 +101,6 @@ class ThetaTest(unittest.TestCase):
96
101
  self.assertLessEqual(result.get_lower_bound(1), n / 4)
97
102
  self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
98
103
 
99
-
100
104
  # A NOT B
101
105
  # create an a_not_b object
102
106
  anb = theta_a_not_b() # no lg_k
@@ -134,13 +138,11 @@ class ThetaTest(unittest.TestCase):
134
138
  self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
135
139
 
136
140
 
137
- def generate_theta_sketch(self, n, k, offset=0):
138
- sk = update_theta_sketch(k)
141
+ def generate_theta_sketch(self, n, lgk, offset=0):
142
+ sk = update_theta_sketch(lgk)
139
143
  for i in range(0, n):
140
144
  sk.update(i + offset)
141
145
  return sk
142
-
146
+
143
147
  if __name__ == '__main__':
144
148
  unittest.main()
145
-
146
-
@@ -0,0 +1,206 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ import unittest
19
+
20
+ from datasketches import update_tuple_sketch
21
+ from datasketches import compact_tuple_sketch, tuple_union
22
+ from datasketches import tuple_intersection, tuple_a_not_b
23
+ from datasketches import tuple_jaccard_similarity
24
+ from datasketches import tuple_jaccard_similarity, PyIntsSerDe
25
+ from datasketches import AccumulatorPolicy, MaxIntPolicy, MinIntPolicy
26
+ from datasketches import update_theta_sketch
27
+
28
+ class TupleTest(unittest.TestCase):
29
+ def test_tuple_basic_example(self):
30
+ lgk = 12 # 2^lgk = 4096 rows in the table
31
+ n = 1 << 18 # ~256k unique values
32
+
33
+ # create a sketch and inject some values -- summary is 2 so we can sum them
34
+ # and know the result
35
+ sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2)
36
+
37
+ # we can check that the upper and lower bounds bracket the
38
+ # estimate, without needing to know the exact value.
39
+ self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
40
+ self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
41
+
42
+ # because this sketch is deterministically generated, we can
43
+ # also compare against the exact value
44
+ self.assertLessEqual(sk.get_lower_bound(1), n)
45
+ self.assertGreaterEqual(sk.get_upper_bound(1), n)
46
+
47
+ # compact and serialize for storage, then reconstruct
48
+ sk_bytes = sk.compact().serialize(PyIntsSerDe())
49
+ new_sk = compact_tuple_sketch.deserialize(sk_bytes, serde=PyIntsSerDe())
50
+
51
+ # estimate remains unchanged
52
+ self.assertFalse(sk.is_empty())
53
+ self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
54
+
55
+ # we can also iterate over the sketch entries
56
+ # the iterator provides a (hashkey, summary) pair where the
57
+ # first value is the raw hash value and the second the summary
58
+ count = 0
59
+ cumSum = 0
60
+ for pair in new_sk:
61
+ self.assertLess(pair[0], new_sk.get_theta64())
62
+ count += 1
63
+ cumSum += pair[1]
64
+ self.assertEqual(count, new_sk.get_num_retained())
65
+ self.assertEqual(cumSum, 2 * new_sk.get_num_retained())
66
+
67
+ # we can even create a tuple sketch from an existing theta sketch
68
+ # as long as we provide a summary to use
69
+ theta_sk = update_theta_sketch(lgk)
70
+ for i in range(n, 2*n):
71
+ theta_sk.update(i)
72
+ cts = compact_tuple_sketch(theta_sk, 5)
73
+ cumSum = 0
74
+ for pair in cts:
75
+ cumSum += pair[1]
76
+ self.assertEqual(cumSum, 5 * cts.get_num_retained())
77
+
78
+
79
+ def test_tuple_set_operations(self):
80
+ lgk = 12 # 2^lgk = 4096 rows in the table
81
+ n = 1 << 18 # ~256k unique values
82
+
83
+ # we'll have 1/4 of the values overlap
84
+ offset = int(3 * n / 4) # it's a float w/o cast
85
+
86
+ # create a couple sketches and inject some values, with different summaries
87
+ sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5)
88
+ sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset)
89
+
90
+ # UNIONS
91
+ # create a union object
92
+ union = tuple_union(MaxIntPolicy(), lgk)
93
+ union.update(sk1)
94
+ union.update(sk2)
95
+
96
+ # getting result from union returns a compact_tuple_sketch
97
+ # compact tuple sketches can be used in additional unions
98
+ # or set operations but cannot accept further item updates
99
+ result = union.get_result()
100
+ self.assertTrue(isinstance(result, compact_tuple_sketch))
101
+
102
+ # since our process here is deterministic, we have
103
+ # checked and know the exact answer is within one
104
+ # standard deviation of the estimate
105
+ self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
106
+ self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
107
+
108
+ # we unioned two equal-sized sketches with overlap and used
109
+ # the max value as the resulting summary, meaning we should
110
+ # have more summaries with value 7 than value 5 in the result
111
+ count5 = 0
112
+ count7 = 0
113
+ for pair in result:
114
+ if pair[1] == 5:
115
+ count5 += 1
116
+ elif pair[1] == 7:
117
+ count7 += 1
118
+ else:
119
+ self.fail()
120
+ self.assertLess(count5, count7)
121
+
122
+ # INTERSECTIONS
123
+ # create an intersection object
124
+ intersect = tuple_intersection(MinIntPolicy()) # no lg_k
125
+ intersect.update(sk1)
126
+ intersect.update(sk2)
127
+
128
+ # has_result() indicates the intersection has been used,
129
+ # although the result may be the empty set
130
+ self.assertTrue(intersect.has_result())
131
+
132
+ # as with unions, the result is a compact sketch
133
+ result = intersect.get_result()
134
+ self.assertTrue(isinstance(result, compact_tuple_sketch))
135
+
136
+ # we know the sets overlap by 1/4
137
+ self.assertLessEqual(result.get_lower_bound(1), n / 4)
138
+ self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
139
+
140
+ # in this example, we intersected the sketches and took the
141
+ # min value as the resulting summary, so all summaries
142
+ # must be exactly equal to that value
143
+ count5 = 0
144
+ for pair in result:
145
+ if pair[1] == 5:
146
+ count5 += 1
147
+ else:
148
+ self.fail()
149
+ self.assertEqual(count5, result.get_num_retained())
150
+
151
+ # A NOT B
152
+ # create an a_not_b object
153
+ anb = tuple_a_not_b() # no lg_k or policy
154
+ result = anb.compute(sk1, sk2)
155
+
156
+ # as with unions, the result is a compact sketch
157
+ self.assertTrue(isinstance(result, compact_tuple_sketch))
158
+
159
+ # we know the sets overlap by 1/4, so the remainder is 3/4
160
+ self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
161
+ self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
162
+
163
+ # here, we have only values with a summary of 5 as any keys that
164
+ # existed in both sketches were removed
165
+ count5 = 0
166
+ for pair in result:
167
+ if pair[1] == 5:
168
+ count5 += 1
169
+ else:
170
+ self.fail()
171
+ self.assertEqual(count5, result.get_num_retained())
172
+
173
+ # JACCARD SIMILARITY
174
+ # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
175
+ # and does not examine summaries, even for (dis)similarity tests.
176
+ jac = tuple_jaccard_similarity.jaccard(sk1, sk2)
177
+
178
+ # we can check that results are in the expected order
179
+ self.assertLess(jac[0], jac[1])
180
+ self.assertLess(jac[1], jac[2])
181
+
182
+ # checks for sketch equivalence
183
+ self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1))
184
+ self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2))
185
+
186
+ # we can apply a check for similarity or dissimilarity at a
187
+ # given threshold, at 97.7% confidence.
188
+
189
+ # check that the Jaccard Index is at most (upper bound) 0.2.
190
+ # exact result would be 1/7
191
+ self.assertTrue(tuple_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
192
+
193
+ # check that the Jaccard Index is at least (lower bound) 0.7
194
+ # exact result would be 3/4, using result from A NOT B test
195
+ self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
196
+
197
+
198
+ # Generates a basic tuple sketch with a fixed value for each update
199
+ def generate_tuple_sketch(self, policy, n, lgk, value, offset=0):
200
+ sk = update_tuple_sketch(policy, lgk)
201
+ for i in range(0, n):
202
+ sk.update(i + offset, value)
203
+ return sk
204
+
205
+ if __name__ == '__main__':
206
+ unittest.main()
@@ -45,6 +45,13 @@ class VoTest(unittest.TestCase):
45
45
  items = vo.get_samples()
46
46
  self.assertEqual(len(items), k)
47
47
 
48
+ count = 0
49
+ for tuple in vo:
50
+ sample = tuple[0]
51
+ weight = tuple[1]
52
+ count = count + 1
53
+ self.assertEqual(count, vo.num_samples)
54
+
48
55
  # we can also apply a predicate to the sketch to get an estimate
49
56
  # (with optimally minimal variance) of the subset sum of items
50
57
  # matching that predicate among the entire population
@@ -580,15 +580,20 @@ private:
580
580
 
581
581
 
582
582
  template<typename T, typename C, typename A>
583
- class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
583
+ class quantiles_sketch<T, C, A>::const_iterator {
584
584
  public:
585
+ using iterator_category = std::input_iterator_tag;
585
586
  using value_type = std::pair<const T&, const uint64_t>;
587
+ using difference_type = void;
588
+ using pointer = const return_value_holder<value_type>;
589
+ using reference = const value_type;
590
+
586
591
  const_iterator& operator++();
587
592
  const_iterator& operator++(int);
588
593
  bool operator==(const const_iterator& other) const;
589
594
  bool operator!=(const const_iterator& other) const;
590
- const value_type operator*() const;
591
- const return_value_holder<value_type> operator->() const;
595
+ reference operator*() const;
596
+ pointer operator->() const;
592
597
  private:
593
598
  friend class quantiles_sketch<T, C, A>;
594
599
  using Level = std::vector<T, A>;
@@ -645,12 +645,12 @@ string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_ite
645
645
  uint8_t level = 0;
646
646
  os << " BB:" << std::endl;
647
647
  for (const T& item : base_buffer_) {
648
- os << " " << std::to_string(item) << std::endl;
648
+ os << " " << item << std::endl;
649
649
  }
650
650
  for (uint8_t i = 0; i < levels_.size(); ++i) {
651
651
  os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
652
652
  for (const T& item : levels_[i]) {
653
- os << " " << std::to_string(item) << std::endl;
653
+ os << " " << item << std::endl;
654
654
  }
655
655
  }
656
656
  os << "### End sketch data" << std::endl;
@@ -1354,12 +1354,12 @@ bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator&
1354
1354
  }
1355
1355
 
1356
1356
  template<typename T, typename C, typename A>
1357
- auto quantiles_sketch<T, C, A>::const_iterator::operator*() const -> const value_type {
1357
+ auto quantiles_sketch<T, C, A>::const_iterator::operator*() const -> reference {
1358
1358
  return value_type(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
1359
1359
  }
1360
1360
 
1361
1361
  template<typename T, typename C, typename A>
1362
- auto quantiles_sketch<T, C, A>::const_iterator::operator->() const -> const return_value_holder<value_type> {
1362
+ auto quantiles_sketch<T, C, A>::const_iterator::operator->() const -> pointer {
1363
1363
  return **this;
1364
1364
  }
1365
1365
 
@@ -260,7 +260,7 @@ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
260
260
  REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
261
261
  }
262
262
  subtotal_pmf += pmf[i];
263
- if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
263
+ if (std::abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
264
264
  std::cerr << "CDF vs PMF for value " << i << std::endl;
265
265
  REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
266
266
  }
@@ -28,8 +28,6 @@
28
28
  #include "conditional_forward.hpp"
29
29
  #include "common_defs.hpp"
30
30
 
31
- #include <iomanip>
32
-
33
31
  namespace datasketches {
34
32
 
35
33
  template<typename T, typename C, typename A>
@@ -399,15 +399,20 @@ private:
399
399
  };
400
400
 
401
401
  template<typename T, typename C, typename A>
402
- class req_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
402
+ class req_sketch<T, C, A>::const_iterator {
403
403
  public:
404
+ using iterator_category = std::input_iterator_tag;
404
405
  using value_type = std::pair<const T&, const uint64_t>;
406
+ using difference_type = void;
407
+ using pointer = const return_value_holder<value_type>;
408
+ using reference = const value_type;
409
+
405
410
  const_iterator& operator++();
406
411
  const_iterator& operator++(int);
407
412
  bool operator==(const const_iterator& other) const;
408
413
  bool operator!=(const const_iterator& other) const;
409
- const value_type operator*() const;
410
- const return_value_holder<value_type> operator->() const;
414
+ reference operator*() const;
415
+ pointer operator->() const;
411
416
  private:
412
417
  using LevelsIterator = typename std::vector<Compactor, AllocCompactor>::const_iterator;
413
418
  LevelsIterator levels_it_;
@@ -848,12 +848,12 @@ bool req_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other
848
848
  }
849
849
 
850
850
  template<typename T, typename C, typename A>
851
- auto req_sketch<T, C, A>::const_iterator::operator*() const -> const value_type {
851
+ auto req_sketch<T, C, A>::const_iterator::operator*() const -> reference {
852
852
  return value_type(*compactor_it_, 1ULL << (*levels_it_).get_lg_weight());
853
853
  }
854
854
 
855
855
  template<typename T, typename C, typename A>
856
- auto req_sketch<T, C, A>::const_iterator::operator->() const -> const return_value_holder<value_type> {
856
+ auto req_sketch<T, C, A>::const_iterator::operator->() const -> pointer {
857
857
  return **this;
858
858
  }
859
859