PyPI - datasketch - Versions diffs - 1.9.0__tar.gz → 1.10.0__tar.gz - Mend

datasketch 1.9.0.tar.gz → 1.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

{datasketch-1.9.0 → datasketch-1.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datasketch
-Version: 1.9.0
+Version: 1.10.0
 Summary: Probabilistic data structures for processing and searching very large datasets
 Project-URL: Homepage, https://ekzhu.github.io/datasketch
 Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -23,10 +23,16 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: >=3.9
 Requires-Dist: numpy>=1.11
 Requires-Dist: scipy>=1.0.0
+Provides-Extra: aio
+Requires-Dist: aiounittest; extra == 'aio'
+Requires-Dist: motor>3.6.0; extra == 'aio'
 Provides-Extra: benchmark
+Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
 Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
-Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
+Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
+Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
 Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
+Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
 Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
 Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
 Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
@@ -48,11 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
 Requires-Dist: mockredispy; extra == 'test'
 Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
 Requires-Dist: nose>=1.3.7; extra == 'test'
+Requires-Dist: pygments>=2.20.0; extra == 'test'
 Requires-Dist: pymongo>=3.9.0; extra == 'test'
-Requires-Dist: pytest; extra == 'test'
 Requires-Dist: pytest-asyncio; extra == 'test'
 Requires-Dist: pytest-cov; extra == 'test'
 Requires-Dist: pytest-rerunfailures; extra == 'test'
+Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
+Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
 Requires-Dist: redis>=2.10.0; extra == 'test'
 Description-Content-Type: text/x-rst

{datasketch-1.9.0 → datasketch-1.10.0}/datasketch/__init__.py RENAMED Viewed

@@ -7,6 +7,7 @@ except importlib.metadata.PackageNotFoundError:
     _version = "0.0.0"  # Fallback for development mode
 __version__: Final[str] = _version
+from datasketch.aio import AsyncMinHashLSH  # Instantiation requires motor/redis.asyncio; import itself is always safe.
 from datasketch.b_bit_minhash import bBitMinHash
 from datasketch.hashfunc import sha1_hash32
 from datasketch.hnsw import HNSW
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
 WeightedMinHashLSH = MinHashLSH
 WeightedMinHashLSHForest = MinHashLSHForest
 __all__ = [
     "HNSW",
+    "AsyncMinHashLSH",
     "HyperLogLog",
     "HyperLogLogPlusPlus",
     "LeanMinHash",

datasketch-1.10.0/datasketch/aio/__init__.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""Async MinHash LSH module.
+This module provides asynchronous implementations of MinHash LSH for use with
+async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
+Example:
+    .. code-block:: python
+        import asyncio
+        from datasketch.aio import AsyncMinHashLSH
+        from datasketch import MinHash
+        async def main():
+            # prepickle=True lets you use non-bytes keys (e.g. str). With the
+            # default prepickle=False, keys passed to insert() must be bytes.
+            async with AsyncMinHashLSH(
+                storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
+                threshold=0.5,
+                num_perm=128,
+                prepickle=True,
+            ) as lsh:
+                m = MinHash(num_perm=128)
+                m.update(b"data")
+                await lsh.insert("key", m)
+                result = await lsh.query(m)
+        asyncio.run(main())
+"""
+from datasketch.aio.lsh import (
+    AsyncMinHashLSH,
+    AsyncMinHashLSHDeleteSession,
+    AsyncMinHashLSHInsertionSession,
+)
+__all__ = [
+    "AsyncMinHashLSH",
+    "AsyncMinHashLSHDeleteSession",
+    "AsyncMinHashLSHInsertionSession",
+]

{datasketch-1.9.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/lsh.py RENAMED Viewed

@@ -1,9 +1,15 @@
+"""Asynchronous MinHash LSH implementation.
+This module provides AsyncMinHashLSH for use with async storage backends
+like MongoDB (via motor) and Redis (via redis.asyncio).
+"""
 import asyncio
 import pickle
 from itertools import chain
 from typing import Optional
-from datasketch.experimental.aio.storage import (
+from datasketch.aio.storage import (
     async_ordered_storage,
     async_unordered_storage,
 )
@@ -34,8 +40,6 @@ class AsyncMinHashLSH:
         MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
     .. note::
-        * The module supports Python version >=3.6, and is currently experimental.
-          So the interface may change slightly in the future.
         * For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
         * For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
     """
@@ -129,7 +133,7 @@ class AsyncMinHashLSH:
         if self.keys is not None:
             self.keys.batch_size = value
         else:
-            raise AttributeError("AsyncMinHash is not initialized.")
+            raise AttributeError("AsyncMinHashLSH is not initialized.")
         for t in self.hashtables:
             t.batch_size = value
@@ -163,12 +167,6 @@ class AsyncMinHashLSH:
         if self.keys is None:
             await self._create_storages()
-        if not self.keys.initialized:
-            await self.keys
-        fs = (ht for ht in self.hashtables if not ht.initialized)
-        await asyncio.gather(*fs)
     async def close(self):
         """Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
         async with self._lock:
@@ -189,41 +187,26 @@ class AsyncMinHashLSH:
         :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
-        :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
+        :return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
         Example:
             .. code-block:: python
-                from datasketch.experimental.aio.lsh import AsyncMinHashLSH
+                import asyncio
+                from datasketch.aio import AsyncMinHashLSH
                 from datasketch import MinHash
-                def chunk(it, size):
-                    it = iter(it)
-                    return iter(lambda: tuple(islice(it, size)), ())
-                _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
-                seq = frozenset(
-                    chain(
-                        ("".join(s) for s in _chunked_str),
-                        ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
-                    )
-                )
-                objs = [MinHash(16) for _ in range(len(seq))]
-                for e, obj in zip(seq, objs):
-                    for i in e:
-                        obj.update(i.encode("utf-8"))
-                data = [(e, m) for e, m in zip(seq, objs)]
-                _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
-                async def func():
-                    async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
+                async def main():
+                    storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
+                    async with AsyncMinHashLSH(
+                        storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
+                    ) as lsh:
                         async with lsh.insertion_session(batch_size=1000) as session:
-                            fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
-                            await asyncio.gather(*fs)
+                            m = MinHash(num_perm=16)
+                            m.update(b"data")
+                            await session.insert("key", m)
+                asyncio.run(main())
         """
         return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
@@ -232,47 +215,32 @@ class AsyncMinHashLSH:
         """Create a asynchronous context manager for fast removal of keys
         from index.
-        :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
+        :param int batch_size: the size of chunks to use in delete_session mode (default=10000).
-        :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
+        :return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
         Example:
             .. code-block:: python
-                from datasketch.experimental.aio.lsh import AsyncMinHashLSH
+                import asyncio
+                from datasketch.aio import AsyncMinHashLSH
                 from datasketch import MinHash
+                async def main():
+                    storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
+                    async with AsyncMinHashLSH(
+                        storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
+                    ) as lsh:
+                        # Insert some data first
+                        m = MinHash(num_perm=16)
+                        m.update(b"data")
+                        await lsh.insert("key1", m)
-                def chunk(it, size):
-                    it = iter(it)
-                    return iter(lambda: tuple(islice(it, size)), ())
+                        # Delete using session
+                        async with lsh.delete_session(batch_size=100) as session:
+                            await session.remove("key1")
-                _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
-                seq = frozenset(
-                    chain(
-                        ("".join(s) for s in _chunked_str),
-                        ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
-                    )
-                )
-                objs = [MinHash(16) for _ in range(len(seq))]
-                for e, obj in zip(seq, objs):
-                    for i in e:
-                        obj.update(i.encode("utf-8"))
-                data = [(e, m) for e, m in zip(seq, objs)]
-                _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
-                async def func():
-                    async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
-                        async with lsh.insertion_session(batch_size=1000) as session:
-                            fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
-                            await asyncio.gather(*fs)
-                        async with lsh.delete_session(batch_size=3) as session:
-                            fs = (session.remove(key) for key in keys_to_remove)
-                            await asyncio.gather(*fs)
+                asyncio.run(main())
         """
         return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
@@ -288,7 +256,9 @@ class AsyncMinHashLSH:
         if self.prepickle:
             key = pickle.dumps(key)
-        if check_duplication and await self.has_key(key):
+        # `key` is already pickled at this point under prepickle=True; call the
+        # storage primitive directly so we don't re-pickle through has_key().
+        if check_duplication and await self.keys.has_key(key):
             raise ValueError("The given key already exists")
         Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
@@ -314,6 +284,8 @@ class AsyncMinHashLSH:
     async def has_key(self, key):
         """See :class:`datasketch.MinHashLSH`."""
+        if self.prepickle:
+            key = pickle.dumps(key)
         return await self.keys.has_key(key)
     async def remove(self, key):
@@ -321,7 +293,12 @@ class AsyncMinHashLSH:
         await self._remove(key, buffer=False)
     async def _remove(self, key, buffer=False):
-        if not await self.has_key(key):
+        if self.prepickle:
+            key = pickle.dumps(key)
+        # `key` is already pickled here; call storage primitives directly so
+        # the existence check, lookup, and deletes all use the stored form.
+        if not await self.keys.has_key(key):
             raise ValueError("The given key does not exist")
         for H, hashtable in zip(await self.keys.get(key), self.hashtables):
@@ -352,7 +329,10 @@ class AsyncMinHashLSH:
             H = self._H(minhash.hashvalues[start:end])
             if await hashtable.has_key(H):
                 fs.append(hashtable.get(H))
-        return set(chain.from_iterable(await asyncio.gather(*fs)))  # candidates
+        candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
+        if self.prepickle:
+            return {pickle.loads(key) for key in candidates}
+        return candidates
     async def get_counts(self):
         """See :class:`datasketch.MinHashLSH`."""
@@ -361,6 +341,10 @@ class AsyncMinHashLSH:
     async def get_subset_counts(self, *keys):
         """See :class:`datasketch.MinHashLSH`."""
+        # Keys in storage are pickled when prepickle is enabled, so we have to
+        # pickle the query keys to match the stored representation.
+        if self.prepickle:
+            keys = tuple(pickle.dumps(key) for key in keys)
         key_set = list(set(keys))
         hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
         Hss = await self.keys.getmany(*key_set)

{datasketch-1.9.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/storage.py RENAMED Viewed

@@ -1,9 +1,23 @@
+"""Async storage backends for MinHash LSH.
+This module provides async storage implementations for use with AsyncMinHashLSH:
+- AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
+- AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
+"""
 import asyncio
 import os
 from abc import ABCMeta
 from itertools import chain
-from datasketch.storage import OrderedStorage, RedisStorage, Storage, UnorderedStorage, _random_name
+from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
+# RedisStorage is only available when redis package is installed (optional dependency)
+# Import it conditionally to avoid ImportError when redis is not installed
+try:
+    from datasketch.storage import RedisStorage
+except ImportError:
+    RedisStorage = None
 ABC = ABCMeta("ABC", (object,), {})
@@ -24,6 +38,12 @@ except ImportError:
     redis = None
+__all__ = [
+    "async_ordered_storage",
+    "async_unordered_storage",
+]
 async def async_ordered_storage(config, name=None):
     tp = config["type"]
     if tp == "aiomongo":
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
     class AsyncMongoBuffer:
         def __init__(self, aio_mongo_collection, batch_size):
             self._batch_size = batch_size
-            self._insert_documents_stack = tuple()
-            self._delete_by_key_documents_stack = tuple()
-            self._delete_by_val_documents_stack = tuple()
+            self._insert_documents_stack = []
+            self._delete_by_key_documents_stack = []
+            self._delete_by_val_documents_stack = []
             self._mongo_coll = aio_mongo_collection
         @property
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
             if command == "insert":
                 if len(self._insert_documents_stack) >= self.batch_size:
                     await self.execute(command)
-                self._insert_documents_stack += (kwargs["obj"],)
+                self._insert_documents_stack.append(kwargs["obj"])
             elif command == "delete_by_key":
                 if len(self._delete_by_key_documents_stack) >= self.batch_size:
                     await self.execute(command)
-                self._delete_by_key_documents_stack += (kwargs["key"],)
+                self._delete_by_key_documents_stack.append(kwargs["key"])
             elif command == "delete_by_val":
                 if len(self._delete_by_val_documents_stack) >= self.batch_size:
                     await self.execute(command)
-                self._delete_by_val_documents_stack += (kwargs["val"],)
+                self._delete_by_val_documents_stack.append(kwargs["val"])
         async def execute(self, command):
             if command == "insert" and self._insert_documents_stack:
                 buffer = self._insert_documents_stack
-                self._insert_documents_stack = tuple()
+                self._insert_documents_stack = []
                 await self._mongo_coll.insert_many(buffer, ordered=False)
             elif command == "delete_by_key" and self._delete_by_key_documents_stack:
                 buffer = self._delete_by_key_documents_stack
-                self._delete_by_key_documents_stack = tuple()
+                self._delete_by_key_documents_stack = []
                 await self._mongo_coll.delete_many({"key": {"$in": buffer}})
             elif command == "delete_by_val" and self._delete_by_val_documents_stack:
                 buffer = self._delete_by_val_documents_stack
-                self._delete_by_val_documents_stack = tuple()
+                self._delete_by_val_documents_stack = []
                 await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
         async def insert_one(self, **kwargs):
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
         async def has_key(self, key):
             return bool(await self._collection.find_one({"key": key}))
+        async def getmany(self, *keys):
+            return await asyncio.gather(*(self.get(key) for key in keys))
         async def status(self):
             status = self._parse_config(self.config["mongo"])
             status.update({"keyspace_size": await self.size()})
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
                 await self._collection.find_one_and_delete({"key": key, "vals": val})
-if redis is not None:
+# Redis-based async storage classes are only defined when both redis package
+# and RedisStorage are available (optional dependencies)
+if redis is not None and RedisStorage is not None:
     class AsyncRedisBuffer(redis.client.Pipeline):
         def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
@@ -304,7 +329,7 @@ if redis is not None:
         async def execute_command(self, *args, **kwargs):
             if len(self.command_stack) >= self._buffer_size:
-                self.execute()
+                await self.execute()
             await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
     class AsyncRedisStorage(RedisStorage):
@@ -347,9 +372,8 @@ if redis is not None:
         async def getmany(self, *keys):
             pipe = self._redis.pipeline()
-            pipe.multi()
             for key in keys:
-                await self._get_items(pipe, self.redis_key(key))
+                pipe.lrange(self.redis_key(key), 0, -1)
             return await pipe.execute()
         @staticmethod
@@ -422,6 +446,12 @@ if redis is not None:
         async def _get_items(r, k):
             return await r.smembers(k)
+        async def getmany(self, *keys):
+            pipe = self._redis.pipeline()
+            for key in keys:
+                pipe.smembers(self.redis_key(key))
+            return await pipe.execute()
         async def remove_val(self, key, val, **kwargs):
             buffer = kwargs.pop("buffer", False)
             redis_key = self.redis_key(key)

datasketch-1.10.0/datasketch/experimental/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Deprecated experimental module.
+.. deprecated::
+    The `datasketch.experimental` module is deprecated and will be removed in a future version.
+    Please use `datasketch.aio` instead:
+    Old: ``from datasketch.experimental import AsyncMinHashLSH``
+    New: ``from datasketch.aio import AsyncMinHashLSH``
+    Or simply: ``from datasketch import AsyncMinHashLSH``
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    # Visible to static analyzers so they know `__all__` is satisfied.
+    # Not imported at runtime - the real dispatch happens in __getattr__.
+    from datasketch.aio import AsyncMinHashLSH
+__all__ = ["AsyncMinHashLSH"]
+def __getattr__(name):
+    # PEP 562: only emit the DeprecationWarning when the user actually pulls a
+    # symbol out of this package, not on every `import datasketch.experimental`.
+    # This avoids the noisy triple-warning that fired when each intermediate
+    # __init__.py warned eagerly.
+    #
+    # We cache the resolved symbol back into globals() so subsequent accesses
+    # bypass __getattr__. This matters for two reasons:
+    #   1. `from pkg import x` internally performs both `hasattr(pkg, x)` and
+    #      `getattr(pkg, x)`, so without caching __getattr__ fires twice.
+    #   2. It makes the warning a one-shot per process, which is the normal
+    #      expectation for deprecation warnings.
+    if name == "AsyncMinHashLSH":
+        import warnings
+        warnings.warn(
+            "datasketch.experimental is deprecated. "
+            "Use 'from datasketch.aio import AsyncMinHashLSH' or "
+            "'from datasketch import AsyncMinHashLSH' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        from datasketch.aio import AsyncMinHashLSH
+        globals()[name] = AsyncMinHashLSH
+        return AsyncMinHashLSH
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

datasketch-1.10.0/datasketch/experimental/aio/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Deprecated experimental aio module.
+.. deprecated::
+    The `datasketch.experimental.aio` module is deprecated and will be removed in a future version.
+    Please use `datasketch.aio` instead:
+    Old: ``from datasketch.experimental.aio import AsyncMinHashLSH``
+    New: ``from datasketch.aio import AsyncMinHashLSH``
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    # Visible to static analyzers so they know `__all__` is satisfied.
+    # Not imported at runtime - the real dispatch happens in __getattr__.
+    from datasketch.aio import (
+        AsyncMinHashLSH,
+        AsyncMinHashLSHDeleteSession,
+        AsyncMinHashLSHInsertionSession,
+    )
+__all__ = [
+    "AsyncMinHashLSH",
+    "AsyncMinHashLSHDeleteSession",
+    "AsyncMinHashLSHInsertionSession",
+]
+_DEPRECATED = frozenset(__all__)
+def __getattr__(name):
+    # PEP 562: emit the warning lazily on attribute access so that merely
+    # importing the parent package (e.g. as an intermediate step of
+    # `from datasketch.experimental.aio.lsh import ...`) does not fire
+    # a second, redundant warning. See the long comment in
+    # datasketch/experimental/__init__.py for why we cache into globals().
+    if name in _DEPRECATED:
+        import warnings
+        warnings.warn(
+            "datasketch.experimental.aio is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        import datasketch.aio as _new
+        value = getattr(_new, name)
+        globals()[name] = value
+        return value
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

datasketch-1.10.0/datasketch/experimental/aio/lsh.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Deprecated experimental aio lsh module.
+.. deprecated::
+    The `datasketch.experimental.aio.lsh` module is deprecated and will be removed in a future version.
+    Please use `datasketch.aio.lsh` instead:
+    Old: ``from datasketch.experimental.aio.lsh import AsyncMinHashLSH``
+    New: ``from datasketch.aio import AsyncMinHashLSH``
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    # Visible to static analyzers so they know `__all__` is satisfied.
+    # Not imported at runtime - the real dispatch happens in __getattr__.
+    from datasketch.aio.lsh import (
+        AsyncMinHashLSH,
+        AsyncMinHashLSHDeleteSession,
+        AsyncMinHashLSHInsertionSession,
+    )
+__all__ = [
+    "AsyncMinHashLSH",
+    "AsyncMinHashLSHDeleteSession",
+    "AsyncMinHashLSHInsertionSession",
+]
+_DEPRECATED = frozenset(__all__)
+def __getattr__(name):
+    # Lazy warning via PEP 562: fires exactly once per attribute access on the
+    # deprecated module, and we cache the resolved symbol back into globals()
+    # so the warning is emitted once per process (see the long comment in
+    # datasketch/experimental/__init__.py for rationale).
+    if name in _DEPRECATED:
+        import warnings
+        warnings.warn(
+            "datasketch.experimental.aio.lsh is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        import datasketch.aio.lsh as _new
+        value = getattr(_new, name)
+        globals()[name] = value
+        return value
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

{datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lsh.py RENAMED Viewed

@@ -463,16 +463,24 @@ class MinHashLSH:
             list: a list of unique keys.
         """
-        collected_result_sets = [
-            set(collected_result_lists)
-            for hashtable in self.hashtables
-            for collected_result_lists in hashtable.collect_select_buffer()
+        collected_result_lists = [hashtable.collect_select_buffer() for hashtable in self.hashtables]
+        if not any(collected_result_lists):
+            return []
+        # Each buffered query contributes one result list per hashtable. We first
+        # union candidates across bands for each query, then intersect across the
+        # buffered queries to match repeated calls to `query()`.
+        per_query_result_sets = [
+            set().union(*query_result_lists)
+            for query_result_lists in zip(*collected_result_lists)
         ]
-        if not collected_result_sets:
+        if not per_query_result_sets:
             return []
+        candidates = set.intersection(*per_query_result_sets)
         if self.prepickle:
-            return [pickle.loads(key) for key in set.intersection(*collected_result_sets)]
-        return list(set.intersection(*collected_result_sets))
+            return [pickle.loads(key) for key in candidates]
+        return list(candidates)
     def __contains__(self, key: Hashable) -> bool:
         """Args:

{datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshensemble.py RENAMED Viewed

@@ -204,7 +204,7 @@ class MinHashLSHEnsemble:
         if not self.is_empty():
             raise ValueError("Cannot call index again on a non-empty index")
         if not isinstance(entries, list):
-            queue = deque([])
+            queue = deque()
             for key, minhash, size in entries:
                 if size <= 0:
                     raise ValueError("Set size must be positive")

{datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshforest.py RENAMED Viewed

@@ -9,8 +9,8 @@ from datasketch.minhash import MinHash
 class MinHashLSHForest:
     """The LSH Forest for MinHash. It supports top-k query in Jaccard
     similarity.
-    Instead of using prefix trees as the `original paper
-    <http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf>`_,
+    Instead of using prefix trees as described in the original LSH Forest
+    paper by Bawa et al. (WWW 2005),
     I use a sorted array to store the hash values in every
     hash table.
@@ -37,7 +37,8 @@ class MinHashLSHForest:
         # Maximum depth of the prefix tree
         self.k = int(num_perm / l)
         self.hashtables = [defaultdict(list) for _ in range(self.l)]
-        self.hashranges = [(i * self.k, (i + 1) * self.k) for i in range(self.l)]
+        self.hashranges = [(i * self.k, (i + 1) * self.k)
+                           for i in range(self.l)]
         self.keys = dict()
         # This is the sorted array implementation for the prefix trees
         self.sorted_hashtables = [[] for _ in range(self.l)]
@@ -59,7 +60,8 @@ class MinHashLSHForest:
             raise ValueError("The num_perm of MinHash out of range")
         if key in self.keys:
             raise ValueError("The given key has already been added")
-        self.keys[key] = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
+        self.keys[key] = [self._H(minhash.hashvalues[start:end])
+                          for start, end in self.hashranges]
         for H, hashtable in zip(self.keys[key], self.hashtables):
             hashtable[H].append(key)
@@ -73,11 +75,13 @@ class MinHashLSHForest:
         if r > self.k or r <= 0 or b > self.l or b <= 0:
             raise ValueError("parameter outside range")
         # Generate prefixes of concatenated hash values
-        hps = [self._H(minhash.hashvalues[start : start + r]) for start, _ in self.hashranges]
+        hps = [self._H(minhash.hashvalues[start: start + r])
+               for start, _ in self.hashranges]
         # Set the prefix length for look-ups in the sorted hash values list
         prefix_size = len(hps[0])
         for ht, hp, hashtable in zip(self.sorted_hashtables, hps, self.hashtables):
-            i = self._binary_search(len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
+            i = self._binary_search(
+                len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
             if i < len(ht) and ht[i][:prefix_size] == hp:
                 j = i
                 while j < len(ht) and ht[j][:prefix_size] == hp:
@@ -137,14 +141,17 @@ class MinHashLSHForest:
         """
         byteslist = self.keys.get(key, None)
         if byteslist is None:
-            raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
+            raise KeyError(
+                f"The provided key does not exist in the LSHForest: {key}")
         hashvalue_byte_size = len(byteslist[0]) // 8
-        hashvalues = np.empty(len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
+        hashvalues = np.empty(
+            len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
         for index, item in enumerate(byteslist):
             # unswap the bytes, as their representation is flipped during storage
             hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
             curr_index = index * hashvalue_byte_size
-            hashvalues[curr_index : curr_index + hashvalue_byte_size] = hv_segment
+            hashvalues[curr_index: curr_index +
+                       hashvalue_byte_size] = hv_segment
         return hashvalues
     def _binary_search(self, n, func):

{datasketch-1.9.0 → datasketch-1.10.0}/datasketch/storage.py RENAMED Viewed

@@ -603,12 +603,14 @@ if cassandra is not None:
             del self._select_statements_and_parameters_with_decoders[:]
             statements_and_parameters, decoders = zip(*buffer)
-            ret = collections.defaultdict(list)
             query_results = self._select(statements_and_parameters)
-            for rows, (key_decoder, val_decoder) in zip(query_results, decoders):
+            ret = []
+            for rows, (_key_decoder, val_decoder) in zip(query_results, decoders):
+                values = []
                 for row in rows:
-                    ret[key_decoder(row.key)].append((val_decoder(row.value), row.ts))
-            return [[x[0] for x in sorted(v, key=operator.itemgetter(1))] for v in ret.values()]
+                    values.append((val_decoder(row.value), row.ts))
+                ret.append([x[0] for x in sorted(values, key=operator.itemgetter(1))])
+            return ret
         def select(self, keys):
             """Select all values for the given keys.

{datasketch-1.9.0 → datasketch-1.10.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "datasketch"
-version = "1.9.0"
+version = "1.10.0"
 description = "Probabilistic data structures for processing and searching very large datasets"
 readme = "README.rst"
 requires-python = ">=3.9"
@@ -37,7 +37,11 @@ benchmark = [
   "pandas>=0.25.3",
   "SetSimilaritySearch>=0.1.7",
   "pyfarmhash>=0.2.2",
-  "nltk>=3.4.5",
+  "nltk>=3.4.5; python_version < '3.10'",
+  "nltk>=3.9.4; python_version >= '3.10'",
+  # Transitive deps of matplotlib listed to avoid dependabot uv.lock-only PRs.
+  "pillow>=12.2.0; python_version >= '3.10'",
+  "fonttools>=4.60.2",
 ]
 test = [
   "cassandra-driver>=3.20",
@@ -49,10 +53,17 @@ test = [
   "pymongo>=3.9.0",
   "nose>=1.3.7",
   "nose-exclude>=0.5.0",
-  "pytest",
+  "pytest; python_version < '3.10'",
+  "pytest>=9.0.3; python_version >= '3.10'",
   "pytest-rerunfailures",
   "pytest-asyncio",
+  # Transitive dep of pytest listed to avoid dependabot uv.lock-only PRs.
+  "pygments>=2.20.0",
 ]
+aio = ["aiounittest", "motor>3.6.0"]
+# KEEP IN SYNC WITH `aio` ABOVE. Deprecated alias retained for backwards compat;
+# PEP 621 does not support referencing one optional-dependency group from
+# another, so the dependency list must be duplicated verbatim.
 experimental_aio = ["aiounittest", "motor>3.6.0"]
 [project.urls]
@@ -194,4 +205,4 @@ reportCallIssue = "none"
 [tool.coverage.run]
 source = ["datasketch"]
-omit = ["*/experimental/*", "*/tests/*", "*/test/*"]
+omit = ["*/tests/*", "*/test/*"]

datasketch-1.9.0/datasketch/experimental/__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-"""Warning.
-datasketch.experimental is dedicated to new modules that are to be merged into
-the stable interface of datasketch. So their interfaces may change in future
-versions.
-To add a new class or function, register it here in this file. For example:
-from new_module import NewModuleClass
-"""
-from datasketch.experimental.aio.lsh import AsyncMinHashLSH
-__all__ = ["AsyncMinHashLSH"]