PyPI - datasketch - Versions diffs - 1.6.4__tar.gz → 1.6.5__tar.gz - Mend

datasketch 1.6.4.tar.gz → 1.6.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

{datasketch-1.6.4 → datasketch-1.6.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datasketch
-Version: 1.6.4
+Version: 1.6.5
 Summary: Probabilistic data structures for processing and searching very large datasets
 Home-page: https://ekzhu.github.io/datasketch
 Author: ekzhu
@@ -51,11 +51,11 @@ Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
 datasketch: Big Data Looks Small
 ================================
-.. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
-    :target: https://github.com/ekzhu/datasketch/actions
+.. image:: https://static.pepy.tech/badge/datasketch/month
+    :target: https://pepy.tech/project/datasketch
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
-   :target: https://doi.org/10.5281/zenodo.290602
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
+   :target: https://zenodo.org/doi/10.5281/zenodo.598238
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of

{datasketch-1.6.4 → datasketch-1.6.5}/README.rst RENAMED Viewed

@@ -1,11 +1,11 @@
 datasketch: Big Data Looks Small
 ================================
-.. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
-    :target: https://github.com/ekzhu/datasketch/actions
+.. image:: https://static.pepy.tech/badge/datasketch/month
+    :target: https://pepy.tech/project/datasketch
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
-   :target: https://doi.org/10.5281/zenodo.290602
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
+   :target: https://zenodo.org/doi/10.5281/zenodo.598238
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of

{datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lsh.py RENAMED Viewed

@@ -226,6 +226,29 @@ class MinHashLSH(object):
         """
         self._insert(key, minhash, check_duplication=check_duplication, buffer=False)
+    def merge(
+            self,
+            other: MinHashLSH,
+            check_overlap: bool = False
+    ):
+        """Merge the other MinHashLSH into this one, making this one the union
+        of both.
+        Note:
+            Only num_perm, the number of bands and the size of each band are checked
+            when deciding whether two MinHashLSH indexes are equivalent.
+            The other initialization parameters (threshold, weights, storage_config,
+            prepickle and hash_func) are not checked.
+        Args:
+            other (MinHashLSH): The other MinHashLSH.
+            check_overlap (bool): Check if there are any overlapping keys before merging and raise if there are any.
+                (`default=False`)
+        Raises:
+            ValueError: If `other` is not of the same type as this index, if the
+                two MinHashLSH have different initialization parameters, or if
+                `check_overlap` is `True` and there are overlapping keys.
+        """
+        # Public entry point: always merges unbuffered.
+        self._merge(other, check_overlap=check_overlap, buffer=False)
     def insertion_session(self, buffer_size: int = 50000) -> MinHashLSHInsertionSession:
         """
         Create a context manager for fast insertion into this index.
@@ -282,6 +305,38 @@ class MinHashLSH(object):
         for H, hashtable in zip(Hs, self.hashtables):
             hashtable.insert(H, key, buffer=buffer)
+    def __equivalent(self, other: MinHashLSH) -> bool:
+        """
+        Tell whether ``other`` can be merged with this index.
+        Returns:
+            bool: True when both objects are of exactly the same type and have equal
+                num_perm (``h``), number of bands (``b``) and size of each band (``r``).
+        """
+        # Exact type match is required; a subclass is not considered equivalent.
+        if type(self) is not type(other):
+            return False
+        return (self.h, self.b, self.r) == (other.h, other.b, other.r)
+    def _merge(
+        self,
+        other: MinHashLSH,
+        check_overlap: bool = False,
+        buffer: bool = False
+    ) -> None:
+        """Copy every key of ``other`` into this index (implementation of ``merge``).
+        Args:
+            other (MinHashLSH): The index whose keys are copied into this one.
+            check_overlap (bool): When True, raise instead of merging if any key
+                is present in both indexes. (`default=False`)
+            buffer (bool): Passed through unchanged to the ``insert`` calls on the
+                key storage and on each hashtable. (`default=False`)
+        Raises:
+            ValueError: If ``other`` is not of the same type as this index, if the
+                two indexes are not equivalent, or if `check_overlap` is `True`
+                and there are overlapping keys.
+        """
+        # All validation happens before the first insert, so a rejected merge
+        # leaves this index untouched.
+        if type(self) is not type(other):
+            raise ValueError(f"Cannot merge type MinHashLSH and type {type(other).__name__}.")
+        if not self.__equivalent(other):
+            raise ValueError(
+                "Cannot merge MinHashLSH with different initialization parameters.")
+        if check_overlap and set(self.keys).intersection(other.keys):
+            raise ValueError("The keys are overlapping, duplicate key exists.")
+        for key in other.keys:
+            # Hs holds one stored hash per band, in the same order as self.hashtables.
+            Hs = other.keys.get(key)
+            self.keys.insert(key, *Hs, buffer=buffer)
+            for H, hashtable in zip(Hs, self.hashtables):
+                hashtable.insert(H, key, buffer=buffer)
     def query(self, minhash) -> List[Hashable]:
         """
         Giving the MinHash of the query set, retrieve

{datasketch-1.6.4 → datasketch-1.6.5}/datasketch/lshforest.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from collections import defaultdict
 from typing import Hashable, List
+import numpy as np
 from datasketch.minhash import MinHash
@@ -128,6 +129,30 @@ class MinHashLSHForest(object):
             r -= 1
         return list(results)
+    def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
+        """
+        Returns the hashvalues from the MinHash object that corresponds to the given key in the LSHForest,
+        if it exists. This is useful for when we want to reconstruct the original MinHash
+        object to manually check the Jaccard Similarity for the top-k results from a query.
+        Args:
+            key (Hashable): The key whose MinHash hashvalues we want to retrieve.
+        Returns:
+            hashvalues: The hashvalues for the MinHash object corresponding to the given key,
+                as a one-dimensional ``uint64`` array.
+        Raises:
+            KeyError: If the key does not exist in the LSHForest.
+        """
+        byteslist = self.keys.get(key, None)
+        if byteslist is None:
+            raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
+        # The stored segments are laid out back to back in the result, so they can be
+        # decoded in a single pass. byteswap() unswaps the bytes, as their
+        # representation is flipped during storage, and returns a new array.
+        return np.frombuffer(b"".join(byteslist), dtype=np.uint64).byteswap()
     def _binary_search(self, n, func):
         """
         https://golang.org/src/sort/search.go?s=2247:2287#L49

datasketch-1.6.5/datasketch/version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.6.5"

{datasketch-1.6.4 → datasketch-1.6.5}/datasketch.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datasketch
-Version: 1.6.4
+Version: 1.6.5
 Summary: Probabilistic data structures for processing and searching very large datasets
 Home-page: https://ekzhu.github.io/datasketch
 Author: ekzhu
@@ -51,11 +51,11 @@ Requires-Dist: motor; python_version >= "3.6" and extra == "experimental-aio"
 datasketch: Big Data Looks Small
 ================================
-.. image:: https://github.com/ekzhu/datasketch/workflows/Python%20package/badge.svg
-    :target: https://github.com/ekzhu/datasketch/actions
+.. image:: https://static.pepy.tech/badge/datasketch/month
+    :target: https://pepy.tech/project/datasketch
-.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.290602.svg
-   :target: https://doi.org/10.5281/zenodo.290602
+.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
+   :target: https://zenodo.org/doi/10.5281/zenodo.598238
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of

{datasketch-1.6.4 → datasketch-1.6.5}/test/test_lsh.py RENAMED Viewed

@@ -240,6 +240,117 @@ class TestMinHashLSH(unittest.TestCase):
         for table in counts:
             self.assertEqual(sum(table.values()), 2)
+    def test_merge(self):
+        # Merging must union keys and hashtables; overlapping keys raise only
+        # when check_overlap=True.
+        lsh1 = MinHashLSH(threshold=0.5, num_perm=16)
+        m1 = MinHash(16)
+        m1.update("a".encode("utf-8"))
+        m2 = MinHash(16)
+        m2.update("b".encode("utf-8"))
+        lsh1.insert("a", m1)
+        lsh1.insert("b", m2)
+        lsh2 = MinHashLSH(threshold=0.5, num_perm=16)
+        m3 = MinHash(16)
+        m3.update("c".encode("utf-8"))
+        m4 = MinHash(16)
+        m4.update("d".encode("utf-8"))
+        # Index the MinHashes built for "c" and "d"; m1/m2 were inserted here
+        # before, which left m3/m4 unused.
+        lsh2.insert("c", m3)
+        lsh2.insert("d", m4)
+        lsh1.merge(lsh2)
+        for t in lsh1.hashtables:
+            self.assertGreaterEqual(len(t), 1)
+            items = []
+            for H in t:
+                items.extend(t[H])
+            self.assertIn("c", items)
+            self.assertIn("d", items)
+        for key in ("a", "b", "c", "d"):
+            self.assertIn(key, lsh1)
+        for i, H in enumerate(lsh1.keys["c"]):
+            self.assertIn("c", lsh1.hashtables[i][H])
+        # Merging overlapping keys without the overlap check must not raise.
+        # (assertTrue(lsh1.merge, lsh2) never called merge: a bound method is
+        # always truthy and lsh2 was taken as the failure message.)
+        lsh1.merge(lsh2)
+        self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
+        m5 = MinHash(16)
+        m5.update("e".encode("utf-8"))
+        lsh3 = MinHashLSH(threshold=0.5, num_perm=16)
+        lsh3.insert("a", m5)
+        self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
+        lsh1.merge(lsh3)
+        m6 = MinHash(16)
+        m6.update("e".encode("utf-8"))
+        lsh4 = MinHashLSH(threshold=0.5, num_perm=16)
+        lsh4.insert("a", m6)
+        lsh1.merge(lsh4, check_overlap=False)
+    def test_merge_redis(self):
+        # Same scenario as test_merge, run against the patched redis storage.
+        # Keys are compared in pickled form where the storage is read directly.
+        with patch('redis.Redis', fake_redis):
+            lsh1 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            lsh2 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            m1 = MinHash(16)
+            m1.update("a".encode("utf8"))
+            m2 = MinHash(16)
+            m2.update("b".encode("utf8"))
+            lsh1.insert("a", m1)
+            lsh1.insert("b", m2)
+            m3 = MinHash(16)
+            m3.update("c".encode("utf8"))
+            m4 = MinHash(16)
+            m4.update("d".encode("utf8"))
+            lsh2.insert("c", m3)
+            lsh2.insert("d", m4)
+            lsh1.merge(lsh2)
+            pickled_c = pickle.dumps("c")
+            pickled_d = pickle.dumps("d")
+            for t in lsh1.hashtables:
+                self.assertGreaterEqual(len(t), 1)
+                items = []
+                for H in t:
+                    items.extend(t[H])
+                self.assertIn(pickled_c, items)
+                self.assertIn(pickled_d, items)
+            for key in ("a", "b", "c", "d"):
+                self.assertIn(key, lsh1)
+            for i, H in enumerate(lsh1.keys[pickled_c]):
+                self.assertIn(pickled_c, lsh1.hashtables[i][H])
+            # Merging overlapping keys without the overlap check must not raise.
+            # (assertTrue(lsh1.merge, lsh2) never called merge: a bound method is
+            # always truthy and lsh2 was taken as the failure message.)
+            lsh1.merge(lsh2)
+            self.assertRaises(ValueError, lsh1.merge, lsh2, check_overlap=True)
+            m5 = MinHash(16)
+            m5.update("e".encode("utf-8"))
+            lsh3 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            lsh3.insert("a", m5)
+            self.assertRaises(ValueError, lsh1.merge, lsh3, check_overlap=True)
+            m6 = MinHash(16)
+            m6.update("e".encode("utf-8"))
+            lsh4 = MinHashLSH(threshold=0.5, num_perm=16, storage_config={
+                'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}
+            })
+            lsh4.insert("a", m6)
+            lsh1.merge(lsh4, check_overlap=False)
 class TestWeightedMinHashLSH(unittest.TestCase):

{datasketch-1.6.4 → datasketch-1.6.5}/test/test_lshforest.py RENAMED Viewed

@@ -62,6 +62,18 @@ class TestMinHashLSHForest(unittest.TestCase):
             results = forest.query(data[key], 10)
             self.assertIn(key, results)
+    def test_get_minhash_hashvalues(self):
+        # Hashvalues read back from the forest must rebuild a MinHash that is
+        # identical to the one originally indexed under the same key.
+        forest, data = self._setup()
+        for key in data:
+            original = data[key]
+            hashvalues = forest.get_minhash_hashvalues(key)
+            rebuilt = MinHash(hashvalues=hashvalues)
+            rebuilt_values = rebuilt.hashvalues
+            self.assertEqual(len(hashvalues), len(rebuilt_values))
+            self.assertEqual(rebuilt.jaccard(original), 1.0)
+            # Lengths were asserted equal above, so zip covers every position.
+            for expected, actual in zip(hashvalues, rebuilt_values):
+                self.assertEqual(expected, actual)
     def test_pickle(self):
         forest, _ = self._setup()
         forest2 = pickle.loads(pickle.dumps(forest))