RubyGems - datasketches - Versions diffs - 0.1.0 - Mend

datasketches 0.1.0

Files changed (247) hide show

data/vendor/datasketches-cpp/python/tests/kll_test.py ADDED

@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import unittest
+from datasketches import (kll_ints_sketch, kll_floats_sketch,
+                          vector_of_kll_ints_sketches,
+                          vector_of_kll_floats_sketches)
+import numpy as np
+class KllTest(unittest.TestCase):
+    def test_kll_example(self):
+      k = 160
+      n = 2 ** 20
+      # create a sketch and inject ~1 million N(0,1) points as an array and as a single item
+      kll = kll_floats_sketch(k)
+      kll.update(np.random.normal(size=n-1))
+      kll.update(0.0)
+      # 0 should be near the median
+      self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.025)
+      # the median should be near 0
+      self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.025)
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      self.assertLessEqual(kll.get_min_value(), kll.get_quantile(0.01))
+      self.assertLessEqual(0.0, kll.get_rank(kll.get_min_value()))
+      self.assertGreaterEqual(kll.get_max_value(), kll.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, kll.get_rank(kll.get_max_value()))
+      # we can also extract a list of values at a time,
+      # here the values should give us something close to [-2, -1, 0, 1, 2].
+      # then get the CDF, which will return something close to
+      # the original values used in get_quantiles()
+      # finally, can check the normalized rank error bound
+      pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
+      cdf = kll.get_cdf(pts)  # include 1.0 at end to account for all probability mass
+      self.assertEqual(len(cdf), len(pts)+1)
+      err = kll.normalized_rank_error(False)
+      self.assertEqual(err, kll_floats_sketch.get_normalized_rank_error(k, False))
+      # and a few basic queries about the sketch
+      self.assertFalse(kll.is_empty())
+      self.assertTrue(kll.is_estimation_mode())
+      self.assertEqual(kll.get_n(), n)
+      self.assertLess(kll.get_num_retained(), n)
+      # merging itself will double the number of items the sketch has seen
+      kll.merge(kll)
+      self.assertEqual(kll.get_n(), 2*n)
+      # we can then serialize and reconstruct the sketch
+      kll_bytes = kll.serialize()
+      new_kll = kll.deserialize(kll_bytes)
+      self.assertEqual(kll.get_num_retained(), new_kll.get_num_retained())
+      self.assertEqual(kll.get_min_value(), new_kll.get_min_value())
+      self.assertEqual(kll.get_max_value(), new_kll.get_max_value())
+      self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
+      self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0))
+    def test_kll_ints_sketch(self):
+        k = 100
+        n = 10
+        kll = kll_ints_sketch(k)
+        for i in range(0, n):
+          kll.update(i)
+        self.assertEqual(kll.get_min_value(), 0)
+        self.assertEqual(kll.get_max_value(), n-1)
+        self.assertEqual(kll.get_n(), n)
+        self.assertFalse(kll.is_empty())
+        self.assertFalse(kll.is_estimation_mode()) # n < k
+        pmf = kll.get_pmf([round(n/2)])
+        self.assertIsNotNone(pmf)
+        self.assertEqual(len(pmf), 2)
+        cdf = kll.get_cdf([round(n/2)])
+        self.assertIsNotNone(cdf)
+        self.assertEqual(len(cdf), 2)
+        self.assertEqual(kll.get_quantile(0.5), round(n/2))
+        quants = kll.get_quantiles([0.25, 0.5, 0.75])
+        self.assertIsNotNone(quants)
+        self.assertEqual(len(quants), 3)
+        self.assertEqual(kll.get_rank(round(n/2)), 0.5)
+        # merge self
+        kll.merge(kll)
+        self.assertEqual(kll.get_n(), 2 * n)
+        sk_bytes = kll.serialize()
+        self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))
+    def test_kll_floats_sketch(self):
+      # already tested ints and it's templatized, so just make sure it instantiates properly
+      k = 75
+      kll = kll_floats_sketch(k)
+      self.assertTrue(kll.is_empty())
+if __name__ == '__main__':  # run the suite only when this file is executed as a script
+    unittest.main()

data/vendor/datasketches-cpp/python/tests/theta_test.py ADDED

@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import unittest
+from datasketches import theta_sketch, update_theta_sketch
+from datasketches import compact_theta_sketch, theta_union
+from datasketches import theta_intersection, theta_a_not_b
+class ThetaTest(unittest.TestCase):
+    def test_theta_basic_example(self):
+        k = 12      # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+        # create a sketch and inject some values
+        sk = self.generate_theta_sketch(n, k)
+        # we can check that the upper and lower bounds bracket the
+        # estimate, without needing to know the exact value.
+        self.assertLessEqual(sk.get_lower_bound(1), sk.get_estimate())
+        self.assertGreaterEqual(sk.get_upper_bound(1), sk.get_estimate())
+        # because this sketch is deterministically generated, we can
+        # also compare against the exact value
+        self.assertLessEqual(sk.get_lower_bound(1), n)
+        self.assertGreaterEqual(sk.get_upper_bound(1), n)
+        # serialize for storage and reconstruct
+        sk_bytes = sk.serialize()
+        new_sk = update_theta_sketch.deserialize(sk_bytes)
+        # estimate remains unchanged
+        self.assertFalse(sk.is_empty())
+        self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
+    def test_theta_set_operations(self):
+        k = 12      # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+        # we'll have 1/4 of the values overlap
+        offset = int(3 * n / 4) # it's a float w/o cast
+        # create a couple sketches and inject some values
+        sk1 = self.generate_theta_sketch(n, k)
+        sk2 = self.generate_theta_sketch(n, k, offset)
+        # UNIONS
+        # create a union object
+        union = theta_union(k)
+        union.update(sk1)
+        union.update(sk2)
+        # getting result from union returns a compact_theta_sketch
+        # compact theta sketches can be used in additional unions
+        # or set operations but cannot accept further item updates
+        result = union.get_result()
+        self.assertTrue(isinstance(result, compact_theta_sketch))
+        # since our process here is deterministic, we have
+        # checked and know the exact answer is within one
+        # standard deviation of the estimate
+        self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
+        # INTERSECTIONS
+        # create an intersection object
+        intersect = theta_intersection() # no lg_k
+        intersect.update(sk1)
+        intersect.update(sk2)
+        # has_result() indicates the intersection has been used,
+        # although the result may be the empty set
+        self.assertTrue(intersect.has_result())
+        # as with unions, the result is a compact sketch
+        result = intersect.get_result()
+        self.assertTrue(isinstance(result, compact_theta_sketch))
+        # we know the sets overlap by 1/4
+        self.assertLessEqual(result.get_lower_bound(1), n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
+        # A NOT B
+        # create an a_not_b object
+        anb = theta_a_not_b() # no lg_k
+        result = anb.compute(sk1, sk2)
+        # as with unions, the result is a compact sketch
+        self.assertTrue(isinstance(result, compact_theta_sketch))
+        # we know the sets overlap by 1/4, so the remainder is 3/4
+        self.assertLessEqual(result.get_lower_bound(1), 3 * n / 4)
+        self.assertGreaterEqual(result.get_upper_bound(1), 3 * n / 4)
+    def generate_theta_sketch(self, n, k, offset=0):
+      sk = update_theta_sketch(k)
+      for i in range(0, n):
+        sk.update(i + offset)
+      return sk
+if __name__ == '__main__':  # run the suite only when this file is executed as a script
+    unittest.main()

data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py ADDED

@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import unittest
+from datasketches import (vector_of_kll_ints_sketches,
+                          vector_of_kll_floats_sketches)
+import numpy as np
+class VectorOfKllSketchesTest(unittest.TestCase):
+    def test_vector_of_kll_floats_sketches_example(self):
+      k = 200
+      d = 3
+      n = 2 ** 20
+      # create a sketch and inject ~1 million N(0,1) points
+      kll = vector_of_kll_floats_sketches(k, d)
+      # Track the min/max for each sketch to test later
+      smin = np.zeros(d) + np.inf
+      smax = np.zeros(d) - np.inf
+      for i in range(0, n):
+        dat  = np.random.randn(d)
+        smin = np.amin([smin, dat], axis=0)
+        smax = np.amax([smax, dat], axis=0)
+        kll.update(dat)
+      # 0 should be near the median
+      np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.025)
+      # the median should be near 0
+      np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.025)
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      np.testing.assert_allclose(kll.get_min_values(), smin)
+      np.testing.assert_allclose(kll.get_max_values(), smax)
+      np.testing.assert_array_less(kll.get_min_values(), kll.get_quantiles(0.01)[:,0])
+      np.testing.assert_array_less(kll.get_quantiles(0.99)[:,0], kll.get_max_values())
+      # we can also extract a list of values at a time,
+      # here the values should give us something close to [-2, -1, 0, 1, 2].
+      # then get the CDF, which will return something close to
+      # the original values used in get_quantiles()
+      # finally, can check the normalized rank error bound
+      pts = kll.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
+      # use the mean pts for the CDF, include 1.0 at end to account for all probability mass
+      meanpts = np.mean(pts, axis=0)
+      cdf = kll.get_cdf(meanpts)
+      self.assertEqual(cdf.shape[0], pts.shape[0])
+      self.assertEqual(cdf.shape[1], pts.shape[1]+1)
+      # and a few basic queries about the sketch
+      self.assertFalse(np.all(kll.is_empty()))
+      self.assertTrue(np.all(kll.is_estimation_mode()))
+      self.assertTrue(np.all(kll.get_n() == n))
+      self.assertTrue(np.all(kll.get_num_retained() < n))
+      # we can combine sketches across all dimensions and get the reuslt
+      result = kll.collapse()
+      self.assertEqual(result.get_n(), d * n)
+      # merging a copy of itself will double the number of items the sketch has seen
+      kll_copy = vector_of_kll_floats_sketches(kll)
+      kll.merge(kll_copy)
+      np.testing.assert_equal(kll.get_n(), 2*n)
+      # we can then serialize and reconstruct the sketch
+      kll_bytes = kll.serialize() # serializes each sketch as a list
+      new_kll = vector_of_kll_floats_sketches(k, d)
+      for s in range(len(kll_bytes)):
+        new_kll.deserialize(kll_bytes[s], s)
+      # everything should be exactly equal
+      np.testing.assert_equal(kll.get_num_retained(), new_kll.get_num_retained())
+      np.testing.assert_equal;(kll.get_min_values(), new_kll.get_min_values())
+      np.testing.assert_equal(kll.get_max_values(), new_kll.get_max_values())
+      np.testing.assert_equal(kll.get_quantiles(0.7), new_kll.get_quantiles(0.7))
+      np.testing.assert_equal(kll.get_ranks(0.0), new_kll.get_ranks(0.0))
+    def test_kll_ints_sketches(self):
+      # already tested floats and it's templatized, so just make sure it instantiates properly
+      k = 100
+      d = 5
+      kll = vector_of_kll_ints_sketches(k, d)
+      self.assertTrue(np.all(kll.is_empty()))
+    def test_kll_2Dupdates(self):
+      # 1D case tested in the first example
+      # 2D case will follow same idea, but focusing on update()
+      k = 200
+      d = 3
+      # we'll do ~250k updates of 4 values each (total ~1mil updates, as above)
+      n = 2 ** 18
+      nbatch = 4
+      # create a sketch and inject ~1 million N(0,1) points
+      kll = vector_of_kll_floats_sketches(k, d)
+      # Track the min/max for each sketch to test later
+      smin = np.zeros(d) + np.inf
+      smax = np.zeros(d) - np.inf
+      for i in range(0, n):
+        dat  = np.random.randn(nbatch, d)
+        smin = np.amin(np.row_stack((smin, dat)), axis=0)
+        smax = np.amax(np.row_stack((smax, dat)), axis=0)
+        kll.update(dat)
+      # 0 should be near the median
+      np.testing.assert_allclose(0.5, kll.get_ranks(0.0), atol=0.025)
+      # the median should be near 0
+      np.testing.assert_allclose(0.0, kll.get_quantiles(0.5), atol=0.025)
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      np.testing.assert_allclose(kll.get_min_values(), smin)
+      np.testing.assert_allclose(kll.get_max_values(), smax)
+    def test_kll_3Dupdates(self):
+      # now test 3D update, which should fail
+      k = 200
+      d = 3
+      # create a sketch
+      kll = vector_of_kll_floats_sketches(k, d)
+      # we'll try 1 3D update
+      dat = np.random.randn(10, 7, d)
+      try:
+        kll.update(dat)
+      except:
+        # this is what we expect
+        pass
+      # the sketches should still be empty
+      self.assertTrue(np.all(kll.is_empty()))
+if __name__ == '__main__':  # run the suite only when this file is executed as a script
+    unittest.main()

data/vendor/datasketches-cpp/python/tests/vo_test.py ADDED

@@ -0,0 +1,101 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import unittest
+from datasketches import var_opt_sketch, var_opt_union
+class VoTest(unittest.TestCase):
+  def test_vo_example(self):
+    k = 50  # a small value so we can easily fill the sketch
+    vo = var_opt_sketch(k)
+    # varopt sampling reduces to standard reservoir sampling
+    # if the items are all equally weighted, although the
+    # algorithm will be significantly slower than an optimized
+    # reservoir sampler
+    n = 5 * k
+    for i in range(0, n):
+      vo.update(i)
+    # we can also add a heavy item, using a negative weight for
+    # easy filtering later.  keep in mind that "heavy" is a
+    # relative concept, so using a fixed multiple of n may not
+    # be considered a heavy item for larger values of n
+    vo.update(-1, 1000 * n)
+    self.assertEqual(k, vo.k)
+    self.assertEqual(k, vo.num_samples)
+    self.assertEqual(n + 1, vo.n)
+    self.assertFalse(vo.is_empty())
+    # we can easily get the list of items in the sample
+    items = vo.get_samples()
+    self.assertEqual(len(items), k)
+    # we can also apply a predicate to the sketch to get an estimate
+    # (with optimially minimal variance) of the subset sum of items
+    # matching that predicate among the entire population
+    # we'll use a lambda here, but any function operating on a single
+    # item which returns a boolean value should work
+    summary = vo.estimate_subset_sum(lambda x: x < 0)
+    self.assertEqual(summary['estimate'], 1000 * n)
+    self.assertEqual(summary['total_sketch_weight'], 1001 * n)
+    # a regular function is similarly handled
+    def geq_zero(x):
+      return x >= 0
+    summary = vo.estimate_subset_sum(geq_zero)
+    self.assertEqual(summary['estimate'], n)
+    self.assertEqual(summary['total_sketch_weight'], 1001 * n)
+    # next we'll create a second, smaller sketch with
+    # only heavier items relative to the previous sketch,
+    # but with the sketch in sampling mode
+    k2 = 5
+    vo2 = var_opt_sketch(k2)
+    # for weight, use the estimate of all items >=0 from before
+    wt = summary['estimate']
+    for i in range(0, k2 + 1):
+      vo2.update((2 * n) + i, wt)
+    # now union the sketches, demonstrating how the
+    # union's k may not be equal to that of either
+    # input value
+    union = var_opt_union(k)
+    union.update(vo)
+    union.update(vo2)
+    result = union.get_result()
+    self.assertEqual(n + k2 + 2, result.n)
+    self.assertFalse(result.is_empty())
+    self.assertGreater(result.k, k2)
+    self.assertLess(result.k, k)
+    # we can compare what information is available from both
+    # the union and a sketch.
+    print(union)
+    # if we want to print the list of itmes, there must be a
+    # __str__() method for each item (which need not be the same
+    # type; they're all generic python objects when used from
+    # python), otherwise you may trigger an exception.
+    # to_string() is provided as a convenince to avoid direct
+    # calls to __str__() with parameters.
+    print(result.to_string(True))
+if __name__ == '__main__':  # run the suite only when this file is executed as a script
+  unittest.main()