PyPI - datasketch - Versions diffs - 1.8.0__tar.gz → 1.10.0__tar.gz - Mend

datasketch 1.8.0.tar.gz → 1.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

{datasketch-1.8.0 → datasketch-1.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datasketch
-Version: 1.8.0
+Version: 1.10.0
 Summary: Probabilistic data structures for processing and searching very large datasets
 Project-URL: Homepage, https://ekzhu.github.io/datasketch
 Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -17,15 +17,22 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: >=3.9
 Requires-Dist: numpy>=1.11
 Requires-Dist: scipy>=1.0.0
+Provides-Extra: aio
+Requires-Dist: aiounittest; extra == 'aio'
+Requires-Dist: motor>3.6.0; extra == 'aio'
 Provides-Extra: benchmark
+Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
 Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
-Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
+Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
+Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
 Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
+Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
 Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
 Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
 Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
@@ -47,9 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
 Requires-Dist: mockredispy; extra == 'test'
 Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
 Requires-Dist: nose>=1.3.7; extra == 'test'
+Requires-Dist: pygments>=2.20.0; extra == 'test'
 Requires-Dist: pymongo>=3.9.0; extra == 'test'
-Requires-Dist: pytest; extra == 'test'
+Requires-Dist: pytest-asyncio; extra == 'test'
+Requires-Dist: pytest-cov; extra == 'test'
 Requires-Dist: pytest-rerunfailures; extra == 'test'
+Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
+Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
 Requires-Dist: redis>=2.10.0; extra == 'test'
 Description-Content-Type: text/x-rst
@@ -62,6 +73,9 @@ datasketch: Big Data Looks Small
 .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
    :target: https://zenodo.org/doi/10.5281/zenodo.598238
+.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
+    :target: https://codecov.io/gh/ekzhu/datasketch
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
 accuracy.

{datasketch-1.8.0 → datasketch-1.10.0}/README.rst RENAMED Viewed

@@ -7,6 +7,9 @@ datasketch: Big Data Looks Small
 .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
    :target: https://zenodo.org/doi/10.5281/zenodo.598238
+.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
+    :target: https://codecov.io/gh/ekzhu/datasketch
 datasketch gives you probabilistic data structures that can process and
 search very large amount of data super fast, with little loss of
 accuracy.

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/__init__.py RENAMED Viewed

@@ -7,6 +7,7 @@ except importlib.metadata.PackageNotFoundError:
     _version = "0.0.0"  # Fallback for development mode
 __version__: Final[str] = _version
+from datasketch.aio import AsyncMinHashLSH  # Instantiation requires motor/redis.asyncio; import itself is always safe.
 from datasketch.b_bit_minhash import bBitMinHash
 from datasketch.hashfunc import sha1_hash32
 from datasketch.hnsw import HNSW
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
 WeightedMinHashLSH = MinHashLSH
 WeightedMinHashLSHForest = MinHashLSHForest
 __all__ = [
     "HNSW",
+    "AsyncMinHashLSH",
     "HyperLogLog",
     "HyperLogLogPlusPlus",
     "LeanMinHash",

datasketch-1.10.0/datasketch/aio/__init__.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""Async MinHash LSH module.
+This module provides asynchronous implementations of MinHash LSH for use with
+async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
+Example:
+    .. code-block:: python
+        import asyncio
+        from datasketch.aio import AsyncMinHashLSH
+        from datasketch import MinHash
+        async def main():
+            # prepickle=True lets you use non-bytes keys (e.g. str). With the
+            # default prepickle=False, keys passed to insert() must be bytes.
+            async with AsyncMinHashLSH(
+                storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
+                threshold=0.5,
+                num_perm=128,
+                prepickle=True,
+            ) as lsh:
+                m = MinHash(num_perm=128)
+                m.update(b"data")
+                await lsh.insert("key", m)
+                result = await lsh.query(m)
+        asyncio.run(main())
+"""
+from datasketch.aio.lsh import (
+    AsyncMinHashLSH,
+    AsyncMinHashLSHDeleteSession,
+    AsyncMinHashLSHInsertionSession,
+)
+__all__ = [
+    "AsyncMinHashLSH",
+    "AsyncMinHashLSHDeleteSession",
+    "AsyncMinHashLSHInsertionSession",
+]

{datasketch-1.8.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/lsh.py RENAMED Viewed

@@ -1,9 +1,15 @@
+"""Asynchronous MinHash LSH implementation.
+This module provides AsyncMinHashLSH for use with async storage backends
+like MongoDB (via motor) and Redis (via redis.asyncio).
+"""
 import asyncio
 import pickle
 from itertools import chain
 from typing import Optional
-from datasketch.experimental.aio.storage import (
+from datasketch.aio.storage import (
     async_ordered_storage,
     async_unordered_storage,
 )
@@ -34,8 +40,6 @@ class AsyncMinHashLSH:
         MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
     .. note::
-        * The module supports Python version >=3.6, and is currently experimental.
-          So the interface may change slightly in the future.
         * For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
         * For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
     """
@@ -60,6 +64,7 @@ class AsyncMinHashLSH:
         self._weights = weights
         self._params = params
         self.prepickle = storage_config["type"] == "aioredis" if prepickle is None else prepickle
+        self._require_bytes_keys = not self.prepickle
         if self._threshold > 1.0 or self._threshold < 0.0:
             raise ValueError("threshold must be in [0.0, 1.0]")
@@ -115,7 +120,9 @@ class AsyncMinHashLSH:
     def __setstate__(self, state):
         state["_lock"] = asyncio.Lock()
         self.__dict__ = state
-        self.__init__(self._threshold, self._num_perm, self._weights, self._params, self._storage_config)
+        self.__init__(
+            self._threshold, self._num_perm, self._weights, self._params, self._storage_config, self.prepickle
+        )
     @property
     def batch_size(self):
@@ -126,7 +133,7 @@ class AsyncMinHashLSH:
         if self.keys is not None:
             self.keys.batch_size = value
         else:
-            raise AttributeError("AsyncMinHash is not initialized.")
+            raise AttributeError("AsyncMinHashLSH is not initialized.")
         for t in self.hashtables:
             t.batch_size = value
@@ -160,12 +167,6 @@ class AsyncMinHashLSH:
         if self.keys is None:
             await self._create_storages()
-        if not self.keys.initialized:
-            await self.keys
-        fs = (ht for ht in self.hashtables if not ht.initialized)
-        await asyncio.gather(*fs)
     async def close(self):
         """Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
         async with self._lock:
@@ -186,41 +187,26 @@ class AsyncMinHashLSH:
         :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
-        :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
+        :return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
         Example:
             .. code-block:: python
-                from datasketch.experimental.aio.lsh import AsyncMinHashLSH
+                import asyncio
+                from datasketch.aio import AsyncMinHashLSH
                 from datasketch import MinHash
-                def chunk(it, size):
-                    it = iter(it)
-                    return iter(lambda: tuple(islice(it, size)), ())
-                _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
-                seq = frozenset(
-                    chain(
-                        ("".join(s) for s in _chunked_str),
-                        ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
-                    )
-                )
-                objs = [MinHash(16) for _ in range(len(seq))]
-                for e, obj in zip(seq, objs):
-                    for i in e:
-                        obj.update(i.encode("utf-8"))
-                data = [(e, m) for e, m in zip(seq, objs)]
-                _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
-                async def func():
-                    async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
+                async def main():
+                    storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
+                    async with AsyncMinHashLSH(
+                        storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
+                    ) as lsh:
                         async with lsh.insertion_session(batch_size=1000) as session:
-                            fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
-                            await asyncio.gather(*fs)
+                            m = MinHash(num_perm=16)
+                            m.update(b"data")
+                            await session.insert("key", m)
+                asyncio.run(main())
         """
         return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
@@ -229,47 +215,32 @@ class AsyncMinHashLSH:
         """Create a asynchronous context manager for fast removal of keys
         from index.
-        :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
+        :param int batch_size: the size of chunks to use in delete_session mode (default=10000).
-        :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
+        :return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
         Example:
             .. code-block:: python
-                from datasketch.experimental.aio.lsh import AsyncMinHashLSH
+                import asyncio
+                from datasketch.aio import AsyncMinHashLSH
                 from datasketch import MinHash
+                async def main():
+                    storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
+                    async with AsyncMinHashLSH(
+                        storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
+                    ) as lsh:
+                        # Insert some data first
+                        m = MinHash(num_perm=16)
+                        m.update(b"data")
+                        await lsh.insert("key1", m)
-                def chunk(it, size):
-                    it = iter(it)
-                    return iter(lambda: tuple(islice(it, size)), ())
+                        # Delete using session
+                        async with lsh.delete_session(batch_size=100) as session:
+                            await session.remove("key1")
-                _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
-                seq = frozenset(
-                    chain(
-                        ("".join(s) for s in _chunked_str),
-                        ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
-                    )
-                )
-                objs = [MinHash(16) for _ in range(len(seq))]
-                for e, obj in zip(seq, objs):
-                    for i in e:
-                        obj.update(i.encode("utf-8"))
-                data = [(e, m) for e, m in zip(seq, objs)]
-                _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
-                async def func():
-                    async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
-                        async with lsh.insertion_session(batch_size=1000) as session:
-                            fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
-                            await asyncio.gather(*fs)
-                        async with lsh.delete_session(batch_size=3) as session:
-                            fs = (session.remove(key) for key in keys_to_remove)
-                            await asyncio.gather(*fs)
+                asyncio.run(main())
         """
         return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
@@ -277,10 +248,17 @@ class AsyncMinHashLSH:
     async def _insert(self, key, minhash, check_duplication=True, buffer=False):
         if len(minhash) != self.h:
             raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
+        if self._require_bytes_keys and not isinstance(key, bytes):
+            raise TypeError(
+                f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
+                "Either pass bytes keys or use prepickle=True for automatic serialization."
+            )
         if self.prepickle:
             key = pickle.dumps(key)
-        if check_duplication and await self.has_key(key):
+        # `key` is already pickled at this point under prepickle=True; call the
+        # storage primitive directly so we don't re-pickle through has_key().
+        if check_duplication and await self.keys.has_key(key):
             raise ValueError("The given key already exists")
         Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
@@ -306,6 +284,8 @@ class AsyncMinHashLSH:
     async def has_key(self, key):
         """See :class:`datasketch.MinHashLSH`."""
+        if self.prepickle:
+            key = pickle.dumps(key)
         return await self.keys.has_key(key)
     async def remove(self, key):
@@ -313,7 +293,12 @@ class AsyncMinHashLSH:
         await self._remove(key, buffer=False)
     async def _remove(self, key, buffer=False):
-        if not await self.has_key(key):
+        if self.prepickle:
+            key = pickle.dumps(key)
+        # `key` is already pickled here; call storage primitives directly so
+        # the existence check, lookup, and deletes all use the stored form.
+        if not await self.keys.has_key(key):
             raise ValueError("The given key does not exist")
         for H, hashtable in zip(await self.keys.get(key), self.hashtables):
@@ -344,7 +329,10 @@ class AsyncMinHashLSH:
             H = self._H(minhash.hashvalues[start:end])
             if await hashtable.has_key(H):
                 fs.append(hashtable.get(H))
-        return set(chain.from_iterable(await asyncio.gather(*fs)))  # candidates
+        candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
+        if self.prepickle:
+            return {pickle.loads(key) for key in candidates}
+        return candidates
     async def get_counts(self):
         """See :class:`datasketch.MinHashLSH`."""
@@ -353,6 +341,10 @@ class AsyncMinHashLSH:
     async def get_subset_counts(self, *keys):
         """See :class:`datasketch.MinHashLSH`."""
+        # Keys in storage are pickled when prepickle is enabled, so we have to
+        # pickle the query keys to match the stored representation.
+        if self.prepickle:
+            keys = tuple(pickle.dumps(key) for key in keys)
         key_set = list(set(keys))
         hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
         Hss = await self.keys.getmany(*key_set)

{datasketch-1.8.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/storage.py RENAMED Viewed

@@ -1,9 +1,23 @@
+"""Async storage backends for MinHash LSH.
+This module provides async storage implementations for use with AsyncMinHashLSH:
+- AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
+- AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
+"""
 import asyncio
 import os
 from abc import ABCMeta
 from itertools import chain
-from datasketch.storage import OrderedStorage, RedisStorage, Storage, UnorderedStorage, _random_name
+from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
+# RedisStorage is only available when redis package is installed (optional dependency)
+# Import it conditionally to avoid ImportError when redis is not installed
+try:
+    from datasketch.storage import RedisStorage
+except ImportError:
+    RedisStorage = None
 ABC = ABCMeta("ABC", (object,), {})
@@ -24,6 +38,12 @@ except ImportError:
     redis = None
+__all__ = [
+    "async_ordered_storage",
+    "async_unordered_storage",
+]
 async def async_ordered_storage(config, name=None):
     tp = config["type"]
     if tp == "aiomongo":
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
     class AsyncMongoBuffer:
         def __init__(self, aio_mongo_collection, batch_size):
             self._batch_size = batch_size
-            self._insert_documents_stack = tuple()
-            self._delete_by_key_documents_stack = tuple()
-            self._delete_by_val_documents_stack = tuple()
+            self._insert_documents_stack = []
+            self._delete_by_key_documents_stack = []
+            self._delete_by_val_documents_stack = []
             self._mongo_coll = aio_mongo_collection
         @property
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
             if command == "insert":
                 if len(self._insert_documents_stack) >= self.batch_size:
                     await self.execute(command)
-                self._insert_documents_stack += (kwargs["obj"],)
+                self._insert_documents_stack.append(kwargs["obj"])
             elif command == "delete_by_key":
                 if len(self._delete_by_key_documents_stack) >= self.batch_size:
                     await self.execute(command)
-                self._delete_by_key_documents_stack += (kwargs["key"],)
+                self._delete_by_key_documents_stack.append(kwargs["key"])
             elif command == "delete_by_val":
                 if len(self._delete_by_val_documents_stack) >= self.batch_size:
                     await self.execute(command)
-                self._delete_by_val_documents_stack += (kwargs["val"],)
+                self._delete_by_val_documents_stack.append(kwargs["val"])
         async def execute(self, command):
             if command == "insert" and self._insert_documents_stack:
                 buffer = self._insert_documents_stack
-                self._insert_documents_stack = tuple()
+                self._insert_documents_stack = []
                 await self._mongo_coll.insert_many(buffer, ordered=False)
             elif command == "delete_by_key" and self._delete_by_key_documents_stack:
                 buffer = self._delete_by_key_documents_stack
-                self._delete_by_key_documents_stack = tuple()
+                self._delete_by_key_documents_stack = []
                 await self._mongo_coll.delete_many({"key": {"$in": buffer}})
             elif command == "delete_by_val" and self._delete_by_val_documents_stack:
                 buffer = self._delete_by_val_documents_stack
-                self._delete_by_val_documents_stack = tuple()
+                self._delete_by_val_documents_stack = []
                 await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
         async def insert_one(self, **kwargs):
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
         async def has_key(self, key):
             return bool(await self._collection.find_one({"key": key}))
+        async def getmany(self, *keys):
+            return await asyncio.gather(*(self.get(key) for key in keys))
         async def status(self):
             status = self._parse_config(self.config["mongo"])
             status.update({"keyspace_size": await self.size()})
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
                 await self._collection.find_one_and_delete({"key": key, "vals": val})
-if redis is not None:
+# Redis-based async storage classes are only defined when both redis package
+# and RedisStorage are available (optional dependencies)
+if redis is not None and RedisStorage is not None:
     class AsyncRedisBuffer(redis.client.Pipeline):
         def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
@@ -304,7 +329,7 @@ if redis is not None:
         async def execute_command(self, *args, **kwargs):
             if len(self.command_stack) >= self._buffer_size:
-                self.execute()
+                await self.execute()
             await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
     class AsyncRedisStorage(RedisStorage):
@@ -323,16 +348,19 @@ if redis is not None:
             )
             self._initialized = True
+        async def close(self):
+            await self._redis.aclose()
         @property
         def initialized(self):
             return self._initialized
     class AsyncRedisListStorage(OrderedStorage, AsyncRedisStorage):
         async def keys(self):
-            return await self._redis.hkeys(self._name)
+            return await self._redis.hkeys(self._name)  # type: ignore
         async def redis_keys(self):
-            return await self._redis.hvals(self._name)
+            return await self._redis.hvals(self._name)  # type: ignore
         def status(self):
             status = self._parse_config(self.config["redis"])
@@ -344,24 +372,34 @@ if redis is not None:
         async def getmany(self, *keys):
             pipe = self._redis.pipeline()
-            pipe.multi()
             for key in keys:
-                await self._get_items(pipe, self.redis_key(key))
+                pipe.lrange(self.redis_key(key), 0, -1)
             return await pipe.execute()
         @staticmethod
         async def _get_items(r, k):
             return await r.lrange(k, 0, -1)
-        async def remove(self, *keys):
-            await self._redis.hdel(self._name, *keys)
-            await self._redis.delete(*[self.redis_key(key) for key in keys])
+        async def remove(self, *keys, **kwargs):
+            buffer = kwargs.pop("buffer", False)
+            if buffer:
+                await self._remove(self._buffer, *keys)
+            else:
+                await self._remove(self._redis, *keys)
+        async def _remove(self, r, *keys):
+            await r.hdel(self._name, *keys)
+            await r.delete(*[self.redis_key(key) for key in keys])
-        async def remove_val(self, key, val):
+        async def remove_val(self, key, val, **kwargs):
+            buffer = kwargs.pop("buffer", False)
             redis_key = self.redis_key(key)
-            await self._redis.lrem(redis_key, val)
-            if not await self._redis.exists(redis_key):
-                await self._redis.hdel(self._name, redis_key)
+            if buffer:
+                await self._buffer.lrem(redis_key, val)
+            else:
+                await self._redis.lrem(redis_key, val)
+                if not await self._redis.exists(redis_key):  # type: ignore
+                    await self._redis.hdel(self._name, redis_key)  # type: ignore
         async def insert(self, key, *vals, **kwargs):
             # Using buffer=True outside of an `insertion_session`
@@ -380,7 +418,7 @@ if redis is not None:
             await r.rpush(redis_key, *values)
         async def size(self):
-            return await self._redis.hlen(self._name)
+            return await self._redis.hlen(self._name)  # type: ignore
         async def itemcounts(self):
             pipe = self._redis.pipeline()
@@ -395,7 +433,7 @@ if redis is not None:
             return await r.llen(k)
         async def has_key(self, key):
-            return await self._redis.hexists(self._name, key)
+            return await self._redis.hexists(self._name, key)  # type: ignore
         async def empty_buffer(self):
             await self._buffer.execute()
@@ -408,11 +446,21 @@ if redis is not None:
         async def _get_items(r, k):
             return await r.smembers(k)
-        async def remove_val(self, key, val):
+        async def getmany(self, *keys):
+            pipe = self._redis.pipeline()
+            for key in keys:
+                pipe.smembers(self.redis_key(key))
+            return await pipe.execute()
+        async def remove_val(self, key, val, **kwargs):
+            buffer = kwargs.pop("buffer", False)
             redis_key = self.redis_key(key)
-            await self._redis.srem(redis_key, val)
-            if not await self._redis.exists(redis_key):
-                await self._redis.hdel(self._name, redis_key)
+            if buffer:
+                await self._buffer.srem(redis_key, val)
+            else:
+                await self._redis.srem(redis_key, val)
+                if not await self._redis.exists(redis_key):  # type: ignore
+                    await self._redis.hdel(self._name, redis_key)  # type: ignore
         async def _insert(self, r, key, *values):
             redis_key = self.redis_key(key)

datasketch-1.10.0/datasketch/experimental/__init__.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Deprecated experimental module.
+.. deprecated::
+    The `datasketch.experimental` module is deprecated and will be removed in a future version.
+    Please use `datasketch.aio` instead:
+    Old: ``from datasketch.experimental import AsyncMinHashLSH``
+    New: ``from datasketch.aio import AsyncMinHashLSH``
+    Or simply: ``from datasketch import AsyncMinHashLSH``
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    # Visible to static analyzers so they know `__all__` is satisfied.
+    # Not imported at runtime - the real dispatch happens in __getattr__.
+    from datasketch.aio import AsyncMinHashLSH
+__all__ = ["AsyncMinHashLSH"]
+def __getattr__(name):
+    # PEP 562: only emit the DeprecationWarning when the user actually pulls a
+    # symbol out of this package, not on every `import datasketch.experimental`.
+    # This avoids the noisy triple-warning that fired when each intermediate
+    # __init__.py warned eagerly.
+    #
+    # We cache the resolved symbol back into globals() so subsequent accesses
+    # bypass __getattr__. This matters for two reasons:
+    #   1. `from pkg import x` internally performs both `hasattr(pkg, x)` and
+    #      `getattr(pkg, x)`, so without caching __getattr__ fires twice.
+    #   2. It makes the warning a one-shot per process, which is the normal
+    #      expectation for deprecation warnings.
+    if name == "AsyncMinHashLSH":
+        import warnings
+        warnings.warn(
+            "datasketch.experimental is deprecated. "
+            "Use 'from datasketch.aio import AsyncMinHashLSH' or "
+            "'from datasketch import AsyncMinHashLSH' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        from datasketch.aio import AsyncMinHashLSH
+        globals()[name] = AsyncMinHashLSH
+        return AsyncMinHashLSH
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

datasketch-1.10.0/datasketch/experimental/aio/__init__.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Deprecated experimental aio module.
+.. deprecated::
+    The `datasketch.experimental.aio` module is deprecated and will be removed in a future version.
+    Please use `datasketch.aio` instead:
+    Old: ``from datasketch.experimental.aio import AsyncMinHashLSH``
+    New: ``from datasketch.aio import AsyncMinHashLSH``
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    # Visible to static analyzers so they know `__all__` is satisfied.
+    # Not imported at runtime - the real dispatch happens in __getattr__.
+    from datasketch.aio import (
+        AsyncMinHashLSH,
+        AsyncMinHashLSHDeleteSession,
+        AsyncMinHashLSHInsertionSession,
+    )
+__all__ = [
+    "AsyncMinHashLSH",
+    "AsyncMinHashLSHDeleteSession",
+    "AsyncMinHashLSHInsertionSession",
+]
+_DEPRECATED = frozenset(__all__)
+def __getattr__(name):
+    # PEP 562: emit the warning lazily on attribute access so that merely
+    # importing the parent package (e.g. as an intermediate step of
+    # `from datasketch.experimental.aio.lsh import ...`) does not fire
+    # a second, redundant warning. See the long comment in
+    # datasketch/experimental/__init__.py for why we cache into globals().
+    if name in _DEPRECATED:
+        import warnings
+        warnings.warn(
+            "datasketch.experimental.aio is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        import datasketch.aio as _new
+        value = getattr(_new, name)
+        globals()[name] = value
+        return value
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

datasketch-1.10.0/datasketch/experimental/aio/lsh.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Deprecated experimental aio lsh module.
+.. deprecated::
+    The `datasketch.experimental.aio.lsh` module is deprecated and will be removed in a future version.
+    Please use `datasketch.aio.lsh` instead:
+    Old: ``from datasketch.experimental.aio.lsh import AsyncMinHashLSH``
+    New: ``from datasketch.aio import AsyncMinHashLSH``
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    # Visible to static analyzers so they know `__all__` is satisfied.
+    # Not imported at runtime - the real dispatch happens in __getattr__.
+    from datasketch.aio.lsh import (
+        AsyncMinHashLSH,
+        AsyncMinHashLSHDeleteSession,
+        AsyncMinHashLSHInsertionSession,
+    )
+__all__ = [
+    "AsyncMinHashLSH",
+    "AsyncMinHashLSHDeleteSession",
+    "AsyncMinHashLSHInsertionSession",
+]
+_DEPRECATED = frozenset(__all__)
+def __getattr__(name):
+    # Lazy warning via PEP 562: fires exactly once per attribute access on the
+    # deprecated module, and we cache the resolved symbol back into globals()
+    # so the warning is emitted once per process (see the long comment in
+    # datasketch/experimental/__init__.py for rationale).
+    if name in _DEPRECATED:
+        import warnings
+        warnings.warn(
+            "datasketch.experimental.aio.lsh is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        import datasketch.aio.lsh as _new
+        value = getattr(_new, name)
+        globals()[name] = value
+        return value
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lsh.py RENAMED Viewed

@@ -3,12 +3,18 @@ from __future__ import annotations
 import pickle
 import struct
 from collections.abc import Hashable
-from typing import Callable, Optional, Union
+from typing import Callable, List, Optional, Union
 from scipy.integrate import quad as integrate
 from datasketch.minhash import MinHash
-from datasketch.storage import _random_name, ordered_storage, unordered_storage
+from datasketch.storage import (
+    OrderedStorage,
+    UnorderedStorage,
+    _random_name,
+    ordered_storage,
+    unordered_storage,
+)
 from datasketch.weighted_minhash import WeightedMinHash
@@ -183,7 +189,7 @@ class MinHashLSH:
             self._H = self._byteswap
         basename = storage_config.get("basename", _random_name(11))
-        self.hashtables = [
+        self.hashtables: List[UnorderedStorage] = [
             unordered_storage(
                 storage_config,
                 name=b"".join([basename, b"_bucket_", struct.pack(">H", i)]),
@@ -191,7 +197,7 @@ class MinHashLSH:
             for i in range(self.b)
         ]
         self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
-        self.keys = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
+        self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
     @property
     def buffer_size(self) -> int:
@@ -347,7 +353,7 @@ class MinHashLSH:
         """
         return type(self) is type(other) and self.h == other.h and self.b == other.b and self.r == other.r
-    def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> MinHashLSH:
+    def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
         if self.__equivalent(other):
             if check_overlap and set(self.keys).intersection(set(other.keys)):
                 raise ValueError("The keys are overlapping, duplicate key exists.")
@@ -457,16 +463,24 @@ class MinHashLSH:
             list: a list of unique keys.
         """
-        collected_result_sets = [
-            set(collected_result_lists)
-            for hashtable in self.hashtables
-            for collected_result_lists in hashtable.collect_select_buffer()
+        collected_result_lists = [hashtable.collect_select_buffer() for hashtable in self.hashtables]
+        if not any(collected_result_lists):
+            return []
+        # Each buffered query contributes one result list per hashtable. We first
+        # union candidates across bands for each query, then intersect across the
+        # buffered queries to match repeated calls to `query()`.
+        per_query_result_sets = [
+            set().union(*query_result_lists)
+            for query_result_lists in zip(*collected_result_lists)
         ]
-        if not collected_result_sets:
+        if not per_query_result_sets:
             return []
+        candidates = set.intersection(*per_query_result_sets)
         if self.prepickle:
-            return [pickle.loads(key) for key in set.intersection(*collected_result_sets)]
-        return list(set.intersection(*collected_result_sets))
+            return [pickle.loads(key) for key in candidates]
+        return list(candidates)
     def __contains__(self, key: Hashable) -> bool:
         """Args:
@@ -524,6 +538,8 @@ class MinHashLSH:
         return bytes(hs.byteswap().data)
     def _hashed_byteswap(self, hs):
+        if self.hashfunc is None:
+            raise RuntimeError("Hash function not configured.")
         return self.hashfunc(bytes(hs.byteswap().data))
     def _query_b(self, minhash, b):

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lsh_bloom.py RENAMED Viewed

@@ -252,9 +252,9 @@ class MinHashLSHBloom:
             raise ValueError("threshold must be in [0.0, 1.0]")
         if num_perm < 2:
             raise ValueError("Too few permutation functions")
-        if n <= 0:
+        if n is None or n <= 0:
             raise ValueError("n for LSHBloom must be >= 0")
-        if fp >= 1.0 or fp <= 0.0:
+        if fp is None or fp >= 1.0 or fp <= 0.0:
             raise ValueError("fp must be in (0.0, 1.0)")
         if save_dir is None:
             warnings.warn(

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshensemble.py RENAMED Viewed

@@ -204,7 +204,7 @@ class MinHashLSHEnsemble:
         if not self.is_empty():
             raise ValueError("Cannot call index again on a non-empty index")
         if not isinstance(entries, list):
-            queue = deque([])
+            queue = deque()
             for key, minhash, size in entries:
                 if size <= 0:
                     raise ValueError("Set size must be positive")
@@ -221,7 +221,8 @@ class MinHashLSHEnsemble:
         entries.sort(key=lambda e: e[2])
         curr_part = 0
         for key, minhash, size in entries:
-            if size > self.uppers[curr_part]:
+            u = self.uppers[curr_part]
+            if size > u:
                 curr_part += 1
             for r in self.indexes[curr_part]:
                 self.indexes[curr_part][r].insert(key, minhash)

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshforest.py RENAMED Viewed

@@ -9,8 +9,8 @@ from datasketch.minhash import MinHash
 class MinHashLSHForest:
     """The LSH Forest for MinHash. It supports top-k query in Jaccard
     similarity.
-    Instead of using prefix trees as the `original paper
-    <http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf>`_,
+    Instead of using prefix trees as described in the original LSH Forest
+    paper by Bawa et al. (WWW 2005),
     I use a sorted array to store the hash values in every
     hash table.
@@ -37,7 +37,8 @@ class MinHashLSHForest:
         # Maximum depth of the prefix tree
         self.k = int(num_perm / l)
         self.hashtables = [defaultdict(list) for _ in range(self.l)]
-        self.hashranges = [(i * self.k, (i + 1) * self.k) for i in range(self.l)]
+        self.hashranges = [(i * self.k, (i + 1) * self.k)
+                           for i in range(self.l)]
         self.keys = dict()
         # This is the sorted array implementation for the prefix trees
         self.sorted_hashtables = [[] for _ in range(self.l)]
@@ -59,7 +60,8 @@ class MinHashLSHForest:
             raise ValueError("The num_perm of MinHash out of range")
         if key in self.keys:
             raise ValueError("The given key has already been added")
-        self.keys[key] = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
+        self.keys[key] = [self._H(minhash.hashvalues[start:end])
+                          for start, end in self.hashranges]
         for H, hashtable in zip(self.keys[key], self.hashtables):
             hashtable[H].append(key)
@@ -73,11 +75,13 @@ class MinHashLSHForest:
         if r > self.k or r <= 0 or b > self.l or b <= 0:
             raise ValueError("parameter outside range")
         # Generate prefixes of concatenated hash values
-        hps = [self._H(minhash.hashvalues[start : start + r]) for start, _ in self.hashranges]
+        hps = [self._H(minhash.hashvalues[start: start + r])
+               for start, _ in self.hashranges]
         # Set the prefix length for look-ups in the sorted hash values list
         prefix_size = len(hps[0])
         for ht, hp, hashtable in zip(self.sorted_hashtables, hps, self.hashtables):
-            i = self._binary_search(len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
+            i = self._binary_search(
+                len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
             if i < len(ht) and ht[i][:prefix_size] == hp:
                 j = i
                 while j < len(ht) and ht[j][:prefix_size] == hp:
@@ -137,14 +141,17 @@ class MinHashLSHForest:
         """
         byteslist = self.keys.get(key, None)
         if byteslist is None:
-            raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
+            raise KeyError(
+                f"The provided key does not exist in the LSHForest: {key}")
         hashvalue_byte_size = len(byteslist[0]) // 8
-        hashvalues = np.empty(len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
+        hashvalues = np.empty(
+            len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
         for index, item in enumerate(byteslist):
             # unswap the bytes, as their representation is flipped during storage
             hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
             curr_index = index * hashvalue_byte_size
-            hashvalues[curr_index : curr_index + hashvalue_byte_size] = hv_segment
+            hashvalues[curr_index: curr_index +
+                       hashvalue_byte_size] = hv_segment
         return hashvalues
     def _binary_search(self, n, func):

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/minhash.py RENAMED Viewed

@@ -3,15 +3,18 @@ from __future__ import annotations
 import copy
 import warnings
 from collections.abc import Generator, Iterable
-from typing import Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional, Union
 try:
     from typing import Literal  # py3.8+; if older, you can fall back to typing_extensions
-except Exception:
+except ImportError:
     from typing_extensions import Literal
 import numpy as np
+if TYPE_CHECKING:
+    from numpy.typing import ArrayLike
 # GPU backend
 try:
     import cupy as cp
@@ -114,8 +117,8 @@ class MinHash:
         gpu_mode: Literal["disable", "detect", "always"] = "disable",
         hashfunc: Callable = sha1_hash32,
         hashobj: Optional[object] = None,  # Deprecated.
-        hashvalues: Optional[Iterable] = None,
-        permutations: Optional[tuple[Iterable, Iterable]] = None,
+        hashvalues: Optional[ArrayLike] = None,
+        permutations: Optional[Union[tuple[ArrayLike, ArrayLike], ArrayLike]] = None,
     ) -> None:
         if hashvalues is not None:
             num_perm = len(hashvalues)
@@ -180,7 +183,7 @@ class MinHash:
             dtype=np.uint64,
         ).T
-    def _parse_hashvalues(self, hashvalues):
+    def _parse_hashvalues(self, hashvalues) -> np.ndarray:
         return np.array(hashvalues, dtype=np.uint64)
     def update(self, b) -> None:

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/storage.py RENAMED Viewed

@@ -26,7 +26,7 @@ except ImportError:
     c_concurrent = None
-def ordered_storage(config, name=None):
+def ordered_storage(config, name=None) -> "OrderedStorage":
     """Return ordered storage system based on the specified config.
     The canonical example of such a storage container is
@@ -62,10 +62,10 @@ def ordered_storage(config, name=None):
         return RedisListStorage(config, name=name)
     if tp == "cassandra":
         return CassandraListStorage(config, name=name)
-    return None
+    raise ValueError(f"Unknown storage type: {tp}")
-def unordered_storage(config, name=None):
+def unordered_storage(config, name=None) -> "UnorderedStorage":
     """Return an unordered storage system based on the specified config.
     The canonical example of such a storage container is
@@ -100,7 +100,7 @@ def unordered_storage(config, name=None):
         return RedisSetStorage(config, name=name)
     if tp == "cassandra":
         return CassandraSetStorage(config, name=name)
-    return None
+    raise ValueError(f"Unknown storage type: {tp}")
 class Storage(ABC):
@@ -144,7 +144,7 @@ class Storage(ABC):
         pass
     @abstractmethod
-    def remove(self, *keys):
+    def remove(self, *keys, **kwargs):
         """Remove `keys` from storage."""
         pass
@@ -154,12 +154,12 @@ class Storage(ABC):
         pass
     @abstractmethod
-    def size(self):
+    def size(self) -> int:
         """Return size of storage with respect to number of keys."""
         pass
     @abstractmethod
-    def itemcounts(self, **kwargs):
+    def itemcounts(self, **kwargs) -> dict:
         """Returns the number of items stored under each key."""
         pass
@@ -168,6 +168,14 @@ class Storage(ABC):
         """Determines whether the key is in the storage or not."""
         pass
+    @property
+    def buffer_size(self) -> int:
+        return getattr(self, "_buffer_size", 50000)
+    @buffer_size.setter
+    def buffer_size(self, value: int):
+        self._buffer_size = value
     def status(self):
         return {"keyspace_size": len(self)}
@@ -595,12 +603,14 @@ if cassandra is not None:
             del self._select_statements_and_parameters_with_decoders[:]
             statements_and_parameters, decoders = zip(*buffer)
-            ret = collections.defaultdict(list)
             query_results = self._select(statements_and_parameters)
-            for rows, (key_decoder, val_decoder) in zip(query_results, decoders):
+            ret = []
+            for rows, (_key_decoder, val_decoder) in zip(query_results, decoders):
+                values = []
                 for row in rows:
-                    ret[key_decoder(row.key)].append((val_decoder(row.value), row.ts))
-            return [[x[0] for x in sorted(v, key=operator.itemgetter(1))] for v in ret.values()]
+                    values.append((val_decoder(row.value), row.ts))
+                ret.append([x[0] for x in sorted(values, key=operator.itemgetter(1))])
+            return ret
         def select(self, keys):
             """Select all values for the given keys.

{datasketch-1.8.0 → datasketch-1.10.0}/datasketch/weighted_minhash.py RENAMED Viewed

@@ -133,14 +133,15 @@ class WeightedMinHashGenerator:
             WeightedMinHash: The weighted MinHash.
         """
-        if not isinstance(v, collections.abc.Iterable):
-            raise TypeError("Input vector must be an iterable")
+        if not isinstance(v, collections.abc.Sized):
+            raise TypeError("Input vector must be sized")
         if not len(v) == self.dim:
             raise ValueError("Input dimension mismatch, expecting %d" % self.dim)
         if not isinstance(v, np.ndarray):
             v = np.array(v, dtype=np.float32)
         elif v.dtype != np.float32:
             v = v.astype(np.float32)
+        v: np.ndarray = v
         hashvalues = np.zeros((self.sample_size, 2), dtype=int)
         vzeros = v == 0
         if vzeros.all():
@@ -226,9 +227,8 @@ class WeightedMinHashGenerator:
                 doc_argmin = np.argmin(doc_ln_a, axis=1)
                 doc_k = doc_cidx[doc_argmin]
-                all_hashvalues[it_doc] = np.zeros((self.sample_size, 2), dtype=int)
-                hashvalues = all_hashvalues[it_doc]
+                hashvalues = np.zeros((self.sample_size, 2), dtype=int)
+                all_hashvalues[it_doc] = hashvalues
                 hashvalues[:, 0], hashvalues[:, 1] = (
                     doc_k,
                     t[np.arange(self.sample_size), doc_begin + doc_argmin],

{datasketch-1.8.0 → datasketch-1.10.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "datasketch"
-version = "1.8.0"
+version = "1.10.0"
 description = "Probabilistic data structures for processing and searching very large datasets"
 readme = "README.rst"
 requires-python = ">=3.9"
@@ -21,6 +21,7 @@ classifiers = [
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
 ]
 dependencies = ["numpy>=1.11", "scipy>=1.0.0"]
@@ -36,7 +37,11 @@ benchmark = [
   "pandas>=0.25.3",
   "SetSimilaritySearch>=0.1.7",
   "pyfarmhash>=0.2.2",
-  "nltk>=3.4.5",
+  "nltk>=3.4.5; python_version < '3.10'",
+  "nltk>=3.9.4; python_version >= '3.10'",
+  # Transitive deps of matplotlib listed to avoid dependabot uv.lock-only PRs.
+  "pillow>=12.2.0; python_version >= '3.10'",
+  "fonttools>=4.60.2",
 ]
 test = [
   "cassandra-driver>=3.20",
@@ -44,12 +49,21 @@ test = [
   "mock>=2.0.0",
   "mockredispy",
   "coverage",
+  "pytest-cov",
   "pymongo>=3.9.0",
   "nose>=1.3.7",
   "nose-exclude>=0.5.0",
-  "pytest",
+  "pytest; python_version < '3.10'",
+  "pytest>=9.0.3; python_version >= '3.10'",
   "pytest-rerunfailures",
+  "pytest-asyncio",
+  # Transitive dep of pytest listed to avoid dependabot uv.lock-only PRs.
+  "pygments>=2.20.0",
 ]
+aio = ["aiounittest", "motor>3.6.0"]
+# KEEP IN SYNC WITH `aio` ABOVE. Deprecated alias retained for backwards compat;
+# PEP 621 does not support referencing one optional-dependency group from
+# another, so the dependency list must be duplicated verbatim.
 experimental_aio = ["aiounittest", "motor>3.6.0"]
 [project.urls]
@@ -91,7 +105,6 @@ exclude = [
   "dist",
   "docs",
   "examples",
-  "travis",
   "datasketch/hyperloglog_const.py",
 ]
@@ -158,5 +171,38 @@ include = ["pyproject.toml", "README.rst", "LICENSE", "datasketch/**"]
 [tool.pytest.ini_options]
 minversion = "6.0"
-addopts = ["--strict-markers", "--color=yes"]
+addopts = ["--strict-markers", "--color=yes", "--cov-report=xml"]
 testpaths = ["test"]
+asyncio_mode = "auto"
+[tool.pyright]
+include = ["datasketch"]
+exclude = [
+  "benchmark",
+  "docs",
+  "examples",
+  "test",
+  "travis",
+  "**/.venv/**",
+  "**/__pycache__",
+]
+pythonVersion = "3.9"
+typeCheckingMode = "basic" # todo: change to "strict" in future
+reportMissingImports = "none"
+reportUnusedVariable = "warning"
+reportAttributeAccessIssue = "none"
+reportOptionalMemberAccess = "none"
+reportGeneralTypeIssues = "none"
+reportArgumentType = "none"
+reportOptionalIterable = "none"
+reportReturnType = "none"
+reportRedeclaration = "none"
+reportOperatorIssue = "none"
+reportAssignmentType = "none"
+reportOptionalSubscript = "none"
+reportCallIssue = "none"
+[tool.coverage.run]
+source = ["datasketch"]
+omit = ["*/tests/*", "*/test/*"]

datasketch-1.8.0/datasketch/experimental/__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-"""Warning.
-datasketch.experimental is dedicated to new modules that are to be merged into
-the stable interface of datasketch. So their interfaces may change in future
-versions.
-To add a new class or function, register it here in this file. For example:
-from new_module import NewModuleClass
-"""
-from datasketch.experimental.aio.lsh import AsyncMinHashLSH
-__all__ = ["AsyncMinHashLSH"]