RubyGems - datasketches - Versions diffs - 0.3.2 → 0.4.0 - Mend

datasketches 0.3.2 → 0.4.0

Files changed (237) hide show

data/vendor/datasketches-cpp/python/tests/quantiles_test.py DELETED Viewed

@@ -1,160 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import unittest
-from datasketches import quantiles_ints_sketch, quantiles_floats_sketch, quantiles_doubles_sketch
-from datasketches import quantiles_items_sketch, ks_test, PyStringsSerDe
-import numpy as np
-class QuantilesTest(unittest.TestCase):
-    def test_quantiles_floats_example(self):
-      k = 128
-      n = 2 ** 20
-      # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
-      quantiles = quantiles_floats_sketch(k)
-      quantiles.update(np.random.normal(size=n-1))
-      quantiles.update(0.0)
-      # 0 should be near the median
-      self.assertAlmostEqual(0.5, quantiles.get_rank(0.0), delta=0.035)
-      # the median should be near 0
-      self.assertAlmostEqual(0.0, quantiles.get_quantile(0.5), delta=0.035)
-      # we also track the min/max independently from the rest of the data
-      # which lets us know the full observed data range
-      self.assertLessEqual(quantiles.get_min_value(), quantiles.get_quantile(0.01))
-      self.assertLessEqual(0.0, quantiles.get_rank(quantiles.get_min_value()))
-      self.assertGreaterEqual(quantiles.get_max_value(), quantiles.get_quantile(0.99))
-      self.assertGreaterEqual(1.0, quantiles.get_rank(quantiles.get_max_value()))
-      # we can also extract a list of values at a time,
-      # here the values should give us something close to [-2, -1, 0, 1, 2].
-      # then get the CDF, which will return something close to
-      # the original values used in get_quantiles()
-      # finally, can check the normalized rank error bound
-      pts = quantiles.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
-      cdf = quantiles.get_cdf(pts)  # include 1.0 at end to account for all probability mass
-      self.assertEqual(len(cdf), len(pts)+1)
-      err = quantiles.normalized_rank_error(False)
-      self.assertEqual(err, quantiles_floats_sketch.get_normalized_rank_error(k, False))
-      # and a few basic queries about the sketch
-      self.assertFalse(quantiles.is_empty())
-      self.assertTrue(quantiles.is_estimation_mode())
-      self.assertEqual(quantiles.get_n(), n)
-      self.assertEqual(quantiles.get_k(), k)
-      self.assertLess(quantiles.get_num_retained(), n)
-      # merging itself will double the number of items the sketch has seen
-      quantiles_copy = quantiles_floats_sketch(quantiles)
-      quantiles.merge(quantiles_copy)
-      self.assertEqual(quantiles.get_n(), 2*n)
-      # we can then serialize and reconstruct the sketch
-      quantiles_bytes = quantiles.serialize()
-      new_quantiles = quantiles_floats_sketch.deserialize(quantiles_bytes)
-      self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
-      self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
-      self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
-      self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
-      self.assertEqual(quantiles.get_rank(0.0), new_quantiles.get_rank(0.0))
-      # If we create a new sketch with a very different distribution, a Kolmogorov-Smirnov Test
-      # of the two should return True: we can reject the null hypothesis that the sketches
-      # come from the same distributions.
-      unif_quantiles = quantiles_floats_sketch(k)
-      unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
-      self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
-      total_weight = 0
-      for tuple in quantiles:
-        item = tuple[0]
-        weight = tuple[1]
-        total_weight = total_weight + weight
-      self.assertEqual(total_weight, quantiles.get_n())
-    def test_quantiles_ints_sketch(self):
-        k = 128
-        n = 10
-        quantiles = quantiles_ints_sketch(k)
-        for i in range(0, n):
-          quantiles.update(i)
-        self.assertEqual(quantiles.get_min_value(), 0)
-        self.assertEqual(quantiles.get_max_value(), n-1)
-        self.assertEqual(quantiles.get_n(), n)
-        self.assertFalse(quantiles.is_empty())
-        self.assertFalse(quantiles.is_estimation_mode()) # n < k
-        self.assertEqual(quantiles.get_k(), k)
-        pmf = quantiles.get_pmf([round(n/2)])
-        self.assertIsNotNone(pmf)
-        self.assertEqual(len(pmf), 2)
-        cdf = quantiles.get_cdf([round(n/2)])
-        self.assertIsNotNone(cdf)
-        self.assertEqual(len(cdf), 2)
-        self.assertEqual(quantiles.get_quantile(0.5), round(n/2))
-        quants = quantiles.get_quantiles([0.25, 0.5, 0.75])
-        self.assertIsNotNone(quants)
-        self.assertEqual(len(quants), 3)
-        self.assertEqual(quantiles.get_rank(round(n/2)), 0.5)
-        # merge self
-        quantiles_copy = quantiles_ints_sketch(quantiles)
-        quantiles.merge(quantiles_copy)
-        self.assertEqual(quantiles.get_n(), 2 * n)
-        sk_bytes = quantiles.serialize()
-        self.assertTrue(isinstance(quantiles_ints_sketch.deserialize(sk_bytes), quantiles_ints_sketch))
-    def test_quantiles_doubles_sketch(self):
-      # already tested floats and ints and it's templatized, so just make sure it instantiates properly
-      k = 128
-      quantiles = quantiles_doubles_sketch(k)
-      self.assertTrue(quantiles.is_empty())
-    def test_quantiles_items_sketch(self):
-      # most functionality has been tested, but we need to ensure objects and sorting work
-      # as well as serialization
-      k = 128
-      n = 2 ** 16
-      # create a sketch and inject enough points to force compaction
-      quantiles = quantiles_items_sketch(k)
-      for i in range(0, n):
-        quantiles.update(str(i))
-      quantiles_copy = quantiles_items_sketch(quantiles)
-      quantiles.merge(quantiles_copy)
-      self.assertEqual(quantiles.get_n(), 2 * n)
-      quantiles_bytes = quantiles.serialize(PyStringsSerDe())
-      new_quantiles = quantiles_items_sketch.deserialize(quantiles_bytes, PyStringsSerDe())
-      self.assertEqual(quantiles.get_num_retained(), new_quantiles.get_num_retained())
-      self.assertEqual(quantiles.get_min_value(), new_quantiles.get_min_value())
-      self.assertEqual(quantiles.get_max_value(), new_quantiles.get_max_value())
-      self.assertEqual(quantiles.get_quantile(0.7), new_quantiles.get_quantile(0.7))
-      self.assertEqual(quantiles.get_rank(str(n/4)), new_quantiles.get_rank(str(n/4)))
-if __name__ == '__main__':
-    unittest.main()

data/vendor/datasketches-cpp/python/tests/req_test.py DELETED Viewed

@@ -1,159 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import unittest
-from datasketches import req_ints_sketch, req_floats_sketch, req_items_sketch, PyStringsSerDe
-import numpy as np
-class reqTest(unittest.TestCase):
-    def test_req_example(self):
-      k = 12
-      n = 2 ** 20
-      # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
-      req = req_floats_sketch(k, True) # high rank accuracy
-      req.update(np.random.normal(size=n-1))
-      req.update(0.0)
-      # 0 should be near the median
-      self.assertAlmostEqual(0.5, req.get_rank(0.0), delta=0.045)
-      # the median should be near 0
-      self.assertAlmostEqual(0.0, req.get_quantile(0.5), delta=0.045)
-      # we also track the min/max independently from the rest of the data
-      # which lets us know the full observed data range
-      self.assertLessEqual(req.get_min_value(), req.get_quantile(0.01))
-      self.assertLessEqual(0.0, req.get_rank(req.get_min_value()))
-      self.assertGreaterEqual(req.get_max_value(), req.get_quantile(0.99))
-      self.assertGreaterEqual(1.0, req.get_rank(req.get_max_value()))
-      # we can also extract a list of values at a time,
-      # here the values should give us something close to [-2, -1, 0, 1, 2].
-      # then get the CDF, which will return something close to
-      # the original values used in get_quantiles()
-      # finally, can check the normalized rank error bound
-      pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
-      cdf = req.get_cdf(pts)  # include 1.0 at end to account for all probability mass
-      self.assertEqual(len(cdf), len(pts)+1)
-      # For relative error quantiles, the error depends on the actual rank
-      # so we need to use that to determine the bounds
-      est = req.get_rank(0.999, True)
-      lb = req.get_rank_lower_bound(est, 1)
-      ub = req.get_rank_upper_bound(est, 1)
-      self.assertLessEqual(lb, est)
-      self.assertLessEqual(est, ub)
-      # and a few basic queries about the sketch
-      self.assertFalse(req.is_empty())
-      self.assertTrue(req.is_estimation_mode())
-      self.assertEqual(req.get_n(), n)
-      self.assertLess(req.get_num_retained(), n)
-      self.assertEqual(req.get_k(), k)
-      # merging itself will double the number of items the sketch has seen
-      req_copy = req_floats_sketch(req)
-      req.merge(req_copy)
-      self.assertEqual(req.get_n(), 2*n)
-      # we can then serialize and reconstruct the sketch
-      req_bytes = req.serialize()
-      new_req = req_floats_sketch.deserialize(req_bytes)
-      self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
-      self.assertEqual(req.get_min_value(), new_req.get_min_value())
-      self.assertEqual(req.get_max_value(), new_req.get_max_value())
-      self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
-      self.assertEqual(req.get_rank(0.0), new_req.get_rank(0.0))
-      total_weight = 0
-      for tuple in req:
-        item = tuple[0]
-        weight = tuple[1]
-        total_weight = total_weight + weight
-      self.assertEqual(total_weight, req.get_n())
-    def test_req_ints_sketch(self):
-        k = 100
-        n = 10
-        req = req_ints_sketch(k)
-        for i in range(0, n):
-          req.update(i)
-        self.assertEqual(req.get_min_value(), 0)
-        self.assertEqual(req.get_max_value(), n-1)
-        self.assertEqual(req.get_n(), n)
-        self.assertFalse(req.is_empty())
-        self.assertFalse(req.is_estimation_mode()) # n < k
-        self.assertEqual(req.get_k(), k)
-        pmf = req.get_pmf([round(n/2)])
-        self.assertIsNotNone(pmf)
-        self.assertEqual(len(pmf), 2)
-        cdf = req.get_cdf([round(n/2)])
-        self.assertIsNotNone(cdf)
-        self.assertEqual(len(cdf), 2)
-        self.assertEqual(req.get_quantile(0.5), round(n/2))
-        quants = req.get_quantiles([0.25, 0.5, 0.75])
-        self.assertIsNotNone(quants)
-        self.assertEqual(len(quants), 3)
-        self.assertEqual(req.get_rank(round(n/2)), 0.5)
-        # merge self
-        req_copy = req_ints_sketch(req)
-        req.merge(req_copy)
-        self.assertEqual(req.get_n(), 2 * n)
-        sk_bytes = req.serialize()
-        self.assertTrue(isinstance(req_ints_sketch.deserialize(sk_bytes), req_ints_sketch))
-    def test_req_floats_sketch(self):
-      # already tested floats with HRA so just check that LRA works
-      k = 75
-      req = req_floats_sketch(k, False) # low rank accuracy
-      self.assertTrue(req.is_empty())
-      self.assertFalse(req.is_hra())
-    def test_req_items_sketch(self):
-      # most functionality has been tested, but we need to ensure objects and sorting work
-      # as well as serialization
-      k = 100
-      n = 2 ** 16
-      # create a sketch and inject enough points to force compaction
-      req = req_items_sketch(k)
-      for i in range(0, n):
-        req.update(str(i))
-      req_copy = req_items_sketch(req)
-      req.merge(req_copy)
-      self.assertEqual(req.get_n(), 2 * n)
-      req_bytes = req.serialize(PyStringsSerDe())
-      new_req = req_items_sketch.deserialize(req_bytes, PyStringsSerDe())
-      self.assertEqual(req.get_num_retained(), new_req.get_num_retained())
-      self.assertEqual(req.get_min_value(), new_req.get_min_value())
-      self.assertEqual(req.get_max_value(), new_req.get_max_value())
-      self.assertEqual(req.get_quantile(0.7), new_req.get_quantile(0.7))
-      self.assertEqual(req.get_rank(str(n/4)), new_req.get_rank(str(n/4)))
-if __name__ == '__main__':
-    unittest.main()

data/vendor/datasketches-cpp/python/tests/theta_test.py DELETED Viewed

@@ -1,148 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import unittest
-from datasketches import theta_sketch, update_theta_sketch
-from datasketches import compact_theta_sketch, theta_union
-from datasketches import theta_intersection, theta_a_not_b
-from datasketches import theta_jaccard_similarity
-class ThetaTest(unittest.TestCase):
-    def test_theta_basic_example(self):
-        lgk = 12    # 2^lgk = 4096 rows in the table
-        n = 1 << 18 # ~256k unique values
-        # create a sketch and inject some values
-        sk = self.generate_theta_sketch(n, lgk)
-        # we can check that the upper and lower bounds bracket the
-        # estimate, without needing to know the exact value.
-        self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
-        self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
-        # because this sketch is deterministically generated, we can
-        # also compare against the exact value
-        self.assertLessEqual(sk.get_lower_bound(1), n)
-        self.assertGreaterEqual(sk.get_upper_bound(1), n)
-        # compact and serialize for storage, then reconstruct
-        sk_bytes = sk.compact().serialize()
-        new_sk = compact_theta_sketch.deserialize(sk_bytes)
-        # estimate remains unchanged
-        self.assertFalse(sk.is_empty())
-        self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
-        count = 0
-        for hash in new_sk:
-          self.assertLess(hash, new_sk.get_theta64())
-          count = count + 1
-        self.assertEqual(count, new_sk.get_num_retained())
-    def test_theta_set_operations(self):
-        lgk = 12    # 2^lgk = 4096 rows in the table
-        n = 1 << 18 # ~256k unique values
-        # we'll have 1/4 of the values overlap
-        offset = int(3 * n / 4) # it's a float w/o cast
-        # create a couple sketches and inject some values
-        sk1 = self.generate_theta_sketch(n, lgk)
-        sk2 = self.generate_theta_sketch(n, lgk, offset)
-        # UNIONS
-        # create a union object
-        union = theta_union(lgk)
-        union.update(sk1)
-        union.update(sk2)
-        # getting result from union returns a compact_theta_sketch
-        # compact theta sketches can be used in additional unions
-        # or set operations but cannot accept further item updates
-        result = union.get_result()
-        self.assertTrue(isinstance(result, compact_theta_sketch))
-        # since our process here is deterministic, we have
-        # checked and know the exact answer is within one
-        # standard deviation of the estimate
-        self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
-        self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
-        # INTERSECTIONS
-        # create an intersection object
-        intersect = theta_intersection() # no lg_k
-        intersect.update(sk1)
-        intersect.update(sk2)
-        # has_result() indicates the intersection has been used,
-        # although the result may be the empty set
-        self.assertTrue(intersect.has_result())
-        # as with unions, the result is a compact sketch
-        result = intersect.get_result()
-        self.assertTrue(isinstance(result, compact_theta_sketch))
-        # we know the sets overlap by 1/4
-        self.assertLessEqual(result.get_lower_bound(1), n / 4)
-        self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
-        # A NOT B
-        # create an a_not_b object
-        anb = theta_a_not_b() # no lg_k
-        result = anb.compute(sk1, sk2)
-        # as with unions, the result is a compact sketch
-        self.assertTrue(isinstance(result, compact_theta_sketch))
-        # we know the sets overlap by 1/4, so the remainder is 3/4
-        self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
-        self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
-        # JACCARD SIMILARITY
-        # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
-        jac = theta_jaccard_similarity.jaccard(sk1, sk2)
-        # we can check that results are in the expected order
-        self.assertLess(jac[0], jac[1])
-        self.assertLess(jac[1], jac[2])
-        # checks for sketch equivalency
-        self.assertTrue(theta_jaccard_similarity.exactly_equal(sk1, sk1))
-        self.assertFalse(theta_jaccard_similarity.exactly_equal(sk1, sk2))
-        # we can apply a check for similarity or dissimilarity at a
-        # given threshold, at 97.7% confidence.
-        # check that the Jaccard Index is at most (upper bound) 0.2.
-        # exact result would be 1/7
-        self.assertTrue(theta_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
-        # check that the Jaccard Index is at least (lower bound) 0.7
-        # exact result would be 3/4, using result from A NOT B test
-        self.assertTrue(theta_jaccard_similarity.similarity_test(sk1, result, 0.7))
-    def generate_theta_sketch(self, n, lgk, offset=0):
-      sk = update_theta_sketch(lgk)
-      for i in range(0, n):
-        sk.update(i + offset)
-      return sk
-if __name__ == '__main__':
-    unittest.main()

data/vendor/datasketches-cpp/python/tests/tuple_test.py DELETED Viewed

@@ -1,206 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import unittest
-from datasketches import update_tuple_sketch
-from datasketches import compact_tuple_sketch, tuple_union
-from datasketches import tuple_intersection, tuple_a_not_b
-from datasketches import tuple_jaccard_similarity
-from datasketches import tuple_jaccard_similarity, PyIntsSerDe
-from datasketches import AccumulatorPolicy, MaxIntPolicy, MinIntPolicy
-from datasketches import update_theta_sketch
-class TupleTest(unittest.TestCase):
-    def test_tuple_basic_example(self):
-        lgk = 12    # 2^lgk = 4096 rows in the table
-        n = 1 << 18 # ~256k unique values
-        # create a sketch and inject some values -- summary is 2 so we can sum them
-        # and know the result
-        sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=2)
-        # we can check that the upper and lower bounds bracket the
-        # estimate, without needing to know the exact value.
-        self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
-        self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
-        # because this sketch is deterministically generated, we can
-        # also compare against the exact value
-        self.assertLessEqual(sk.get_lower_bound(1), n)
-        self.assertGreaterEqual(sk.get_upper_bound(1), n)
-        # compact and serialize for storage, then reconstruct
-        sk_bytes = sk.compact().serialize(PyIntsSerDe())
-        new_sk = compact_tuple_sketch.deserialize(sk_bytes, serde=PyIntsSerDe())
-        # estimate remains unchanged
-        self.assertFalse(sk.is_empty())
-        self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
-        # we can also iterate over the sketch entries
-        # the iterator provides a (hashkey, summary) pair where the
-        # first value is the raw hash value and the second the summary
-        count = 0
-        cumSum = 0
-        for pair in new_sk:
-          self.assertLess(pair[0], new_sk.get_theta64())
-          count += 1
-          cumSum += pair[1]
-        self.assertEqual(count, new_sk.get_num_retained())
-        self.assertEqual(cumSum, 2 * new_sk.get_num_retained())
-        # we can even create a tuple sketch from an existing theta sketch
-        # as long as we provide a summary to use
-        theta_sk = update_theta_sketch(lgk)
-        for i in range(n, 2*n):
-          theta_sk.update(i)
-        cts = compact_tuple_sketch(theta_sk, 5)
-        cumSum = 0
-        for pair in cts:
-          cumSum += pair[1]
-        self.assertEqual(cumSum, 5 * cts.get_num_retained())
-    def test_tuple_set_operations(self):
-        lgk = 12    # 2^lgk = 4096 rows in the table
-        n = 1 << 18 # ~256k unique values
-        # we'll have 1/4 of the values overlap
-        offset = int(3 * n / 4) # it's a float w/o cast
-        # create a couple sketches and inject some values, with different summaries
-        sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=5)
-        sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, lgk, value=7, offset=offset)
-        # UNIONS
-        # create a union object
-        union = tuple_union(MaxIntPolicy(), lgk)
-        union.update(sk1)
-        union.update(sk2)
-        # getting result from union returns a compact_tuple_sketch
-        # compact tuple sketches can be used in additional unions
-        # or set operations but cannot accept further item updates
-        result = union.get_result()
-        self.assertTrue(isinstance(result, compact_tuple_sketch))
-        # since our process here is deterministic, we have
-        # checked and know the exact answer is within one
-        # standard deviation of the estimate
-        self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
-        self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
-        # we unioned two equal-sized sketches with overlap and used
-        # the max value as the resulting summary, meaning we should
-        # have more summaries with value 7 than value 5 in the result
-        count5 = 0
-        count7 = 0
-        for pair in result:
-          if pair[1] == 5:
-            count5 += 1
-          elif pair[1] == 7:
-            count7 += 1
-          else:
-            self.fail()
-        self.assertLess(count5, count7)
-        # INTERSECTIONS
-        # create an intersection object
-        intersect = tuple_intersection(MinIntPolicy()) # no lg_k
-        intersect.update(sk1)
-        intersect.update(sk2)
-        # has_result() indicates the intersection has been used,
-        # although the result may be the empty set
-        self.assertTrue(intersect.has_result())
-        # as with unions, the result is a compact sketch
-        result = intersect.get_result()
-        self.assertTrue(isinstance(result, compact_tuple_sketch))
-        # we know the sets overlap by 1/4
-        self.assertLessEqual(result.get_lower_bound(1), n / 4)
-        self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
-        # in this example, we intersected the sketches and took the
-        # min value as the resulting summary, so all summaries
-        # must be exactly equal to that value
-        count5 = 0
-        for pair in result:
-          if pair[1] == 5:
-            count5 += 1
-          else:
-            self.fail()
-        self.assertEqual(count5, result.get_num_retained())
-        # A NOT B
-        # create an a_not_b object
-        anb = tuple_a_not_b() # no lg_k or policy
-        result = anb.compute(sk1, sk2)
-        # as with unions, the result is a compact sketch
-        self.assertTrue(isinstance(result, compact_tuple_sketch))
-        # we know the sets overlap by 1/4, so the remainder is 3/4
-        self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
-        self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
-        # here, we have only values with a summary of 5 as any keys that
-        # existed in both sketches were removed
-        count5 = 0
-        for pair in result:
-          if pair[1] == 5:
-            count5 += 1
-          else:
-            self.fail()
-        self.assertEqual(count5, result.get_num_retained())
-        # JACCARD SIMILARITY
-        # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
-        # and does not examine summaries, even for (dis)similarity tests.
-        jac = tuple_jaccard_similarity.jaccard(sk1, sk2)
-        # we can check that results are in the expected order
-        self.assertLess(jac[0], jac[1])
-        self.assertLess(jac[1], jac[2])
-        # checks for sketch equivalence
-        self.assertTrue(tuple_jaccard_similarity.exactly_equal(sk1, sk1))
-        self.assertFalse(tuple_jaccard_similarity.exactly_equal(sk1, sk2))
-        # we can apply a check for similarity or dissimilarity at a
-        # given threshold, at 97.7% confidence.
-        # check that the Jaccard Index is at most (upper bound) 0.2.
-        # exact result would be 1/7
-        self.assertTrue(tuple_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))
-        # check that the Jaccard Index is at least (lower bound) 0.7
-        # exact result would be 3/4, using result from A NOT B test
-        self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7))
-    # Generates a basic tuple sketch with a fixed value for each update
-    def generate_tuple_sketch(self, policy, n, lgk, value, offset=0):
-      sk = update_tuple_sketch(policy, lgk)
-      for i in range(0, n):
-        sk.update(i + offset, value)
-      return sk
-if __name__ == '__main__':
-    unittest.main()