datasketch 1.8.0__tar.gz → 1.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {datasketch-1.8.0 → datasketch-1.10.0}/PKG-INFO +17 -3
  2. {datasketch-1.8.0 → datasketch-1.10.0}/README.rst +3 -0
  3. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/__init__.py +2 -1
  4. datasketch-1.10.0/datasketch/aio/__init__.py +44 -0
  5. {datasketch-1.8.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/lsh.py +66 -74
  6. {datasketch-1.8.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/storage.py +77 -29
  7. datasketch-1.10.0/datasketch/experimental/__init__.py +49 -0
  8. datasketch-1.10.0/datasketch/experimental/aio/__init__.py +50 -0
  9. datasketch-1.10.0/datasketch/experimental/aio/lsh.py +49 -0
  10. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lsh.py +28 -12
  11. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lsh_bloom.py +2 -2
  12. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshensemble.py +3 -2
  13. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshforest.py +16 -9
  14. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/minhash.py +8 -5
  15. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/storage.py +21 -11
  16. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/weighted_minhash.py +5 -5
  17. {datasketch-1.8.0 → datasketch-1.10.0}/pyproject.toml +51 -5
  18. datasketch-1.8.0/datasketch/experimental/__init__.py +0 -15
  19. datasketch-1.8.0/datasketch/experimental/aio/__init__.py +0 -0
  20. {datasketch-1.8.0 → datasketch-1.10.0}/.gitignore +0 -0
  21. {datasketch-1.8.0 → datasketch-1.10.0}/LICENSE +0 -0
  22. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/b_bit_minhash.py +0 -0
  23. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hashfunc.py +0 -0
  24. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hnsw.py +0 -0
  25. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hyperloglog.py +0 -0
  26. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hyperloglog_const.py +0 -0
  27. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lean_minhash.py +0 -0
  28. {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshensemble_partition.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.8.0
3
+ Version: 1.10.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
6
  Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -17,15 +17,22 @@ Classifier: Programming Language :: Python :: 3.9
17
17
  Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.11
19
19
  Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
20
21
  Classifier: Topic :: Database
21
22
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
23
  Requires-Python: >=3.9
23
24
  Requires-Dist: numpy>=1.11
24
25
  Requires-Dist: scipy>=1.0.0
26
+ Provides-Extra: aio
27
+ Requires-Dist: aiounittest; extra == 'aio'
28
+ Requires-Dist: motor>3.6.0; extra == 'aio'
25
29
  Provides-Extra: benchmark
30
+ Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
26
31
  Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
27
- Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
32
+ Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
33
+ Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
28
34
  Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
35
+ Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
29
36
  Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
30
37
  Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
31
38
  Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
@@ -47,9 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
47
54
  Requires-Dist: mockredispy; extra == 'test'
48
55
  Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
49
56
  Requires-Dist: nose>=1.3.7; extra == 'test'
57
+ Requires-Dist: pygments>=2.20.0; extra == 'test'
50
58
  Requires-Dist: pymongo>=3.9.0; extra == 'test'
51
- Requires-Dist: pytest; extra == 'test'
59
+ Requires-Dist: pytest-asyncio; extra == 'test'
60
+ Requires-Dist: pytest-cov; extra == 'test'
52
61
  Requires-Dist: pytest-rerunfailures; extra == 'test'
62
+ Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
63
+ Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
53
64
  Requires-Dist: redis>=2.10.0; extra == 'test'
54
65
  Description-Content-Type: text/x-rst
55
66
 
@@ -62,6 +73,9 @@ datasketch: Big Data Looks Small
62
73
  .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
63
74
  :target: https://zenodo.org/doi/10.5281/zenodo.598238
64
75
 
76
+ .. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
77
+ :target: https://codecov.io/gh/ekzhu/datasketch
78
+
65
79
  datasketch gives you probabilistic data structures that can process and
66
80
  search very large amount of data super fast, with little loss of
67
81
  accuracy.
@@ -7,6 +7,9 @@ datasketch: Big Data Looks Small
7
7
  .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
8
8
  :target: https://zenodo.org/doi/10.5281/zenodo.598238
9
9
 
10
+ .. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
11
+ :target: https://codecov.io/gh/ekzhu/datasketch
12
+
10
13
  datasketch gives you probabilistic data structures that can process and
11
14
  search very large amount of data super fast, with little loss of
12
15
  accuracy.
@@ -7,6 +7,7 @@ except importlib.metadata.PackageNotFoundError:
7
7
  _version = "0.0.0" # Fallback for development mode
8
8
  __version__: Final[str] = _version
9
9
 
10
+ from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
10
11
  from datasketch.b_bit_minhash import bBitMinHash
11
12
  from datasketch.hashfunc import sha1_hash32
12
13
  from datasketch.hnsw import HNSW
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
23
24
  WeightedMinHashLSH = MinHashLSH
24
25
  WeightedMinHashLSHForest = MinHashLSHForest
25
26
 
26
-
27
27
  __all__ = [
28
28
  "HNSW",
29
+ "AsyncMinHashLSH",
29
30
  "HyperLogLog",
30
31
  "HyperLogLogPlusPlus",
31
32
  "LeanMinHash",
@@ -0,0 +1,44 @@
1
+ """Async MinHash LSH module.
2
+
3
+ This module provides asynchronous implementations of MinHash LSH for use with
4
+ async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
5
+
6
+ Example:
7
+ .. code-block:: python
8
+
9
+ import asyncio
10
+
11
+ from datasketch.aio import AsyncMinHashLSH
12
+ from datasketch import MinHash
13
+
14
+
15
+ async def main():
16
+ # prepickle=True lets you use non-bytes keys (e.g. str). With the
17
+ # default prepickle=False, keys passed to insert() must be bytes.
18
+ async with AsyncMinHashLSH(
19
+ storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
20
+ threshold=0.5,
21
+ num_perm=128,
22
+ prepickle=True,
23
+ ) as lsh:
24
+ m = MinHash(num_perm=128)
25
+ m.update(b"data")
26
+ await lsh.insert("key", m)
27
+ result = await lsh.query(m)
28
+
29
+
30
+ asyncio.run(main())
31
+
32
+ """
33
+
34
+ from datasketch.aio.lsh import (
35
+ AsyncMinHashLSH,
36
+ AsyncMinHashLSHDeleteSession,
37
+ AsyncMinHashLSHInsertionSession,
38
+ )
39
+
40
+ __all__ = [
41
+ "AsyncMinHashLSH",
42
+ "AsyncMinHashLSHDeleteSession",
43
+ "AsyncMinHashLSHInsertionSession",
44
+ ]
@@ -1,9 +1,15 @@
1
+ """Asynchronous MinHash LSH implementation.
2
+
3
+ This module provides AsyncMinHashLSH for use with async storage backends
4
+ like MongoDB (via motor) and Redis (via redis.asyncio).
5
+ """
6
+
1
7
  import asyncio
2
8
  import pickle
3
9
  from itertools import chain
4
10
  from typing import Optional
5
11
 
6
- from datasketch.experimental.aio.storage import (
12
+ from datasketch.aio.storage import (
7
13
  async_ordered_storage,
8
14
  async_unordered_storage,
9
15
  )
@@ -34,8 +40,6 @@ class AsyncMinHashLSH:
34
40
  MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
35
41
 
36
42
  .. note::
37
- * The module supports Python version >=3.6, and is currently experimental.
38
- So the interface may change slightly in the future.
39
43
  * For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
40
44
  * For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
41
45
  """
@@ -60,6 +64,7 @@ class AsyncMinHashLSH:
60
64
  self._weights = weights
61
65
  self._params = params
62
66
  self.prepickle = storage_config["type"] == "aioredis" if prepickle is None else prepickle
67
+ self._require_bytes_keys = not self.prepickle
63
68
 
64
69
  if self._threshold > 1.0 or self._threshold < 0.0:
65
70
  raise ValueError("threshold must be in [0.0, 1.0]")
@@ -115,7 +120,9 @@ class AsyncMinHashLSH:
115
120
  def __setstate__(self, state):
116
121
  state["_lock"] = asyncio.Lock()
117
122
  self.__dict__ = state
118
- self.__init__(self._threshold, self._num_perm, self._weights, self._params, self._storage_config)
123
+ self.__init__(
124
+ self._threshold, self._num_perm, self._weights, self._params, self._storage_config, self.prepickle
125
+ )
119
126
 
120
127
  @property
121
128
  def batch_size(self):
@@ -126,7 +133,7 @@ class AsyncMinHashLSH:
126
133
  if self.keys is not None:
127
134
  self.keys.batch_size = value
128
135
  else:
129
- raise AttributeError("AsyncMinHash is not initialized.")
136
+ raise AttributeError("AsyncMinHashLSH is not initialized.")
130
137
 
131
138
  for t in self.hashtables:
132
139
  t.batch_size = value
@@ -160,12 +167,6 @@ class AsyncMinHashLSH:
160
167
  if self.keys is None:
161
168
  await self._create_storages()
162
169
 
163
- if not self.keys.initialized:
164
- await self.keys
165
-
166
- fs = (ht for ht in self.hashtables if not ht.initialized)
167
- await asyncio.gather(*fs)
168
-
169
170
  async def close(self):
170
171
  """Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
171
172
  async with self._lock:
@@ -186,41 +187,26 @@ class AsyncMinHashLSH:
186
187
 
187
188
  :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
188
189
 
189
- :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
190
+ :return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
190
191
 
191
192
  Example:
192
193
  .. code-block:: python
193
194
 
194
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
195
+ import asyncio
196
+ from datasketch.aio import AsyncMinHashLSH
195
197
  from datasketch import MinHash
196
198
 
197
-
198
- def chunk(it, size):
199
- it = iter(it)
200
- return iter(lambda: tuple(islice(it, size)), ())
201
-
202
-
203
- _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
204
- seq = frozenset(
205
- chain(
206
- ("".join(s) for s in _chunked_str),
207
- ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
208
- )
209
- )
210
- objs = [MinHash(16) for _ in range(len(seq))]
211
- for e, obj in zip(seq, objs):
212
- for i in e:
213
- obj.update(i.encode("utf-8"))
214
- data = [(e, m) for e, m in zip(seq, objs)]
215
-
216
- _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
217
-
218
-
219
- async def func():
220
- async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
199
+ async def main():
200
+ storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
201
+ async with AsyncMinHashLSH(
202
+ storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
203
+ ) as lsh:
221
204
  async with lsh.insertion_session(batch_size=1000) as session:
222
- fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
223
- await asyncio.gather(*fs)
205
+ m = MinHash(num_perm=16)
206
+ m.update(b"data")
207
+ await session.insert("key", m)
208
+
209
+ asyncio.run(main())
224
210
 
225
211
  """
226
212
  return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
@@ -229,47 +215,32 @@ class AsyncMinHashLSH:
229
215
  """Create a asynchronous context manager for fast removal of keys
230
216
  from index.
231
217
 
232
- :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
218
+ :param int batch_size: the size of chunks to use in delete_session mode (default=10000).
233
219
 
234
- :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
220
+ :return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
235
221
 
236
222
  Example:
237
223
  .. code-block:: python
238
224
 
239
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
225
+ import asyncio
226
+ from datasketch.aio import AsyncMinHashLSH
240
227
  from datasketch import MinHash
241
228
 
229
+ async def main():
230
+ storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
231
+ async with AsyncMinHashLSH(
232
+ storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
233
+ ) as lsh:
234
+ # Insert some data first
235
+ m = MinHash(num_perm=16)
236
+ m.update(b"data")
237
+ await lsh.insert("key1", m)
242
238
 
243
- def chunk(it, size):
244
- it = iter(it)
245
- return iter(lambda: tuple(islice(it, size)), ())
239
+ # Delete using session
240
+ async with lsh.delete_session(batch_size=100) as session:
241
+ await session.remove("key1")
246
242
 
247
-
248
- _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
249
- seq = frozenset(
250
- chain(
251
- ("".join(s) for s in _chunked_str),
252
- ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
253
- )
254
- )
255
- objs = [MinHash(16) for _ in range(len(seq))]
256
- for e, obj in zip(seq, objs):
257
- for i in e:
258
- obj.update(i.encode("utf-8"))
259
- data = [(e, m) for e, m in zip(seq, objs)]
260
-
261
- _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
262
-
263
-
264
- async def func():
265
- async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
266
- async with lsh.insertion_session(batch_size=1000) as session:
267
- fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
268
- await asyncio.gather(*fs)
269
-
270
- async with lsh.delete_session(batch_size=3) as session:
271
- fs = (session.remove(key) for key in keys_to_remove)
272
- await asyncio.gather(*fs)
243
+ asyncio.run(main())
273
244
 
274
245
  """
275
246
  return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
@@ -277,10 +248,17 @@ class AsyncMinHashLSH:
277
248
  async def _insert(self, key, minhash, check_duplication=True, buffer=False):
278
249
  if len(minhash) != self.h:
279
250
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
251
+ if self._require_bytes_keys and not isinstance(key, bytes):
252
+ raise TypeError(
253
+ f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
254
+ "Either pass bytes keys or use prepickle=True for automatic serialization."
255
+ )
280
256
  if self.prepickle:
281
257
  key = pickle.dumps(key)
282
258
 
283
- if check_duplication and await self.has_key(key):
259
+ # `key` is already pickled at this point under prepickle=True; call the
260
+ # storage primitive directly so we don't re-pickle through has_key().
261
+ if check_duplication and await self.keys.has_key(key):
284
262
  raise ValueError("The given key already exists")
285
263
  Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
286
264
 
@@ -306,6 +284,8 @@ class AsyncMinHashLSH:
306
284
 
307
285
  async def has_key(self, key):
308
286
  """See :class:`datasketch.MinHashLSH`."""
287
+ if self.prepickle:
288
+ key = pickle.dumps(key)
309
289
  return await self.keys.has_key(key)
310
290
 
311
291
  async def remove(self, key):
@@ -313,7 +293,12 @@ class AsyncMinHashLSH:
313
293
  await self._remove(key, buffer=False)
314
294
 
315
295
  async def _remove(self, key, buffer=False):
316
- if not await self.has_key(key):
296
+ if self.prepickle:
297
+ key = pickle.dumps(key)
298
+
299
+ # `key` is already pickled here; call storage primitives directly so
300
+ # the existence check, lookup, and deletes all use the stored form.
301
+ if not await self.keys.has_key(key):
317
302
  raise ValueError("The given key does not exist")
318
303
 
319
304
  for H, hashtable in zip(await self.keys.get(key), self.hashtables):
@@ -344,7 +329,10 @@ class AsyncMinHashLSH:
344
329
  H = self._H(minhash.hashvalues[start:end])
345
330
  if await hashtable.has_key(H):
346
331
  fs.append(hashtable.get(H))
347
- return set(chain.from_iterable(await asyncio.gather(*fs))) # candidates
332
+ candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
333
+ if self.prepickle:
334
+ return {pickle.loads(key) for key in candidates}
335
+ return candidates
348
336
 
349
337
  async def get_counts(self):
350
338
  """See :class:`datasketch.MinHashLSH`."""
@@ -353,6 +341,10 @@ class AsyncMinHashLSH:
353
341
 
354
342
  async def get_subset_counts(self, *keys):
355
343
  """See :class:`datasketch.MinHashLSH`."""
344
+ # Keys in storage are pickled when prepickle is enabled, so we have to
345
+ # pickle the query keys to match the stored representation.
346
+ if self.prepickle:
347
+ keys = tuple(pickle.dumps(key) for key in keys)
356
348
  key_set = list(set(keys))
357
349
  hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
358
350
  Hss = await self.keys.getmany(*key_set)
@@ -1,9 +1,23 @@
1
+ """Async storage backends for MinHash LSH.
2
+
3
+ This module provides async storage implementations for use with AsyncMinHashLSH:
4
+ - AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
5
+ - AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
6
+ """
7
+
1
8
  import asyncio
2
9
  import os
3
10
  from abc import ABCMeta
4
11
  from itertools import chain
5
12
 
6
- from datasketch.storage import OrderedStorage, RedisStorage, Storage, UnorderedStorage, _random_name
13
+ from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
14
+
15
+ # RedisStorage is only available when redis package is installed (optional dependency)
16
+ # Import it conditionally to avoid ImportError when redis is not installed
17
+ try:
18
+ from datasketch.storage import RedisStorage
19
+ except ImportError:
20
+ RedisStorage = None
7
21
 
8
22
  ABC = ABCMeta("ABC", (object,), {})
9
23
 
@@ -24,6 +38,12 @@ except ImportError:
24
38
  redis = None
25
39
 
26
40
 
41
+ __all__ = [
42
+ "async_ordered_storage",
43
+ "async_unordered_storage",
44
+ ]
45
+
46
+
27
47
  async def async_ordered_storage(config, name=None):
28
48
  tp = config["type"]
29
49
  if tp == "aiomongo":
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
55
75
  class AsyncMongoBuffer:
56
76
  def __init__(self, aio_mongo_collection, batch_size):
57
77
  self._batch_size = batch_size
58
- self._insert_documents_stack = tuple()
59
- self._delete_by_key_documents_stack = tuple()
60
- self._delete_by_val_documents_stack = tuple()
78
+ self._insert_documents_stack = []
79
+ self._delete_by_key_documents_stack = []
80
+ self._delete_by_val_documents_stack = []
61
81
  self._mongo_coll = aio_mongo_collection
62
82
 
63
83
  @property
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
73
93
  if command == "insert":
74
94
  if len(self._insert_documents_stack) >= self.batch_size:
75
95
  await self.execute(command)
76
- self._insert_documents_stack += (kwargs["obj"],)
96
+ self._insert_documents_stack.append(kwargs["obj"])
77
97
  elif command == "delete_by_key":
78
98
  if len(self._delete_by_key_documents_stack) >= self.batch_size:
79
99
  await self.execute(command)
80
- self._delete_by_key_documents_stack += (kwargs["key"],)
100
+ self._delete_by_key_documents_stack.append(kwargs["key"])
81
101
  elif command == "delete_by_val":
82
102
  if len(self._delete_by_val_documents_stack) >= self.batch_size:
83
103
  await self.execute(command)
84
- self._delete_by_val_documents_stack += (kwargs["val"],)
104
+ self._delete_by_val_documents_stack.append(kwargs["val"])
85
105
 
86
106
  async def execute(self, command):
87
107
  if command == "insert" and self._insert_documents_stack:
88
108
  buffer = self._insert_documents_stack
89
- self._insert_documents_stack = tuple()
109
+ self._insert_documents_stack = []
90
110
  await self._mongo_coll.insert_many(buffer, ordered=False)
91
111
  elif command == "delete_by_key" and self._delete_by_key_documents_stack:
92
112
  buffer = self._delete_by_key_documents_stack
93
- self._delete_by_key_documents_stack = tuple()
113
+ self._delete_by_key_documents_stack = []
94
114
  await self._mongo_coll.delete_many({"key": {"$in": buffer}})
95
115
  elif command == "delete_by_val" and self._delete_by_val_documents_stack:
96
116
  buffer = self._delete_by_val_documents_stack
97
- self._delete_by_val_documents_stack = tuple()
117
+ self._delete_by_val_documents_stack = []
98
118
  await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
99
119
 
100
120
  async def insert_one(self, **kwargs):
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
254
274
  async def has_key(self, key):
255
275
  return bool(await self._collection.find_one({"key": key}))
256
276
 
277
+ async def getmany(self, *keys):
278
+ return await asyncio.gather(*(self.get(key) for key in keys))
279
+
257
280
  async def status(self):
258
281
  status = self._parse_config(self.config["mongo"])
259
282
  status.update({"keyspace_size": await self.size()})
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
285
308
  await self._collection.find_one_and_delete({"key": key, "vals": val})
286
309
 
287
310
 
288
- if redis is not None:
311
+ # Redis-based async storage classes are only defined when both redis package
312
+ # and RedisStorage are available (optional dependencies)
313
+ if redis is not None and RedisStorage is not None:
289
314
 
290
315
  class AsyncRedisBuffer(redis.client.Pipeline):
291
316
  def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
@@ -304,7 +329,7 @@ if redis is not None:
304
329
 
305
330
  async def execute_command(self, *args, **kwargs):
306
331
  if len(self.command_stack) >= self._buffer_size:
307
- self.execute()
332
+ await self.execute()
308
333
  await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
309
334
 
310
335
  class AsyncRedisStorage(RedisStorage):
@@ -323,16 +348,19 @@ if redis is not None:
323
348
  )
324
349
  self._initialized = True
325
350
 
351
+ async def close(self):
352
+ await self._redis.aclose()
353
+
326
354
  @property
327
355
  def initialized(self):
328
356
  return self._initialized
329
357
 
330
358
  class AsyncRedisListStorage(OrderedStorage, AsyncRedisStorage):
331
359
  async def keys(self):
332
- return await self._redis.hkeys(self._name)
360
+ return await self._redis.hkeys(self._name) # type: ignore
333
361
 
334
362
  async def redis_keys(self):
335
- return await self._redis.hvals(self._name)
363
+ return await self._redis.hvals(self._name) # type: ignore
336
364
 
337
365
  def status(self):
338
366
  status = self._parse_config(self.config["redis"])
@@ -344,24 +372,34 @@ if redis is not None:
344
372
 
345
373
  async def getmany(self, *keys):
346
374
  pipe = self._redis.pipeline()
347
- pipe.multi()
348
375
  for key in keys:
349
- await self._get_items(pipe, self.redis_key(key))
376
+ pipe.lrange(self.redis_key(key), 0, -1)
350
377
  return await pipe.execute()
351
378
 
352
379
  @staticmethod
353
380
  async def _get_items(r, k):
354
381
  return await r.lrange(k, 0, -1)
355
382
 
356
- async def remove(self, *keys):
357
- await self._redis.hdel(self._name, *keys)
358
- await self._redis.delete(*[self.redis_key(key) for key in keys])
383
+ async def remove(self, *keys, **kwargs):
384
+ buffer = kwargs.pop("buffer", False)
385
+ if buffer:
386
+ await self._remove(self._buffer, *keys)
387
+ else:
388
+ await self._remove(self._redis, *keys)
389
+
390
+ async def _remove(self, r, *keys):
391
+ await r.hdel(self._name, *keys)
392
+ await r.delete(*[self.redis_key(key) for key in keys])
359
393
 
360
- async def remove_val(self, key, val):
394
+ async def remove_val(self, key, val, **kwargs):
395
+ buffer = kwargs.pop("buffer", False)
361
396
  redis_key = self.redis_key(key)
362
- await self._redis.lrem(redis_key, val)
363
- if not await self._redis.exists(redis_key):
364
- await self._redis.hdel(self._name, redis_key)
397
+ if buffer:
398
+ await self._buffer.lrem(redis_key, val)
399
+ else:
400
+ await self._redis.lrem(redis_key, val)
401
+ if not await self._redis.exists(redis_key): # type: ignore
402
+ await self._redis.hdel(self._name, redis_key) # type: ignore
365
403
 
366
404
  async def insert(self, key, *vals, **kwargs):
367
405
  # Using buffer=True outside of an `insertion_session`
@@ -380,7 +418,7 @@ if redis is not None:
380
418
  await r.rpush(redis_key, *values)
381
419
 
382
420
  async def size(self):
383
- return await self._redis.hlen(self._name)
421
+ return await self._redis.hlen(self._name) # type: ignore
384
422
 
385
423
  async def itemcounts(self):
386
424
  pipe = self._redis.pipeline()
@@ -395,7 +433,7 @@ if redis is not None:
395
433
  return await r.llen(k)
396
434
 
397
435
  async def has_key(self, key):
398
- return await self._redis.hexists(self._name, key)
436
+ return await self._redis.hexists(self._name, key) # type: ignore
399
437
 
400
438
  async def empty_buffer(self):
401
439
  await self._buffer.execute()
@@ -408,11 +446,21 @@ if redis is not None:
408
446
  async def _get_items(r, k):
409
447
  return await r.smembers(k)
410
448
 
411
- async def remove_val(self, key, val):
449
+ async def getmany(self, *keys):
450
+ pipe = self._redis.pipeline()
451
+ for key in keys:
452
+ pipe.smembers(self.redis_key(key))
453
+ return await pipe.execute()
454
+
455
+ async def remove_val(self, key, val, **kwargs):
456
+ buffer = kwargs.pop("buffer", False)
412
457
  redis_key = self.redis_key(key)
413
- await self._redis.srem(redis_key, val)
414
- if not await self._redis.exists(redis_key):
415
- await self._redis.hdel(self._name, redis_key)
458
+ if buffer:
459
+ await self._buffer.srem(redis_key, val)
460
+ else:
461
+ await self._redis.srem(redis_key, val)
462
+ if not await self._redis.exists(redis_key): # type: ignore
463
+ await self._redis.hdel(self._name, redis_key) # type: ignore
416
464
 
417
465
  async def _insert(self, r, key, *values):
418
466
  redis_key = self.redis_key(key)
@@ -0,0 +1,49 @@
1
+ """Deprecated experimental module.
2
+
3
+ .. deprecated::
4
+ The `datasketch.experimental` module is deprecated and will be removed in a future version.
5
+ Please use `datasketch.aio` instead:
6
+
7
+ Old: ``from datasketch.experimental import AsyncMinHashLSH``
8
+ New: ``from datasketch.aio import AsyncMinHashLSH``
9
+
10
+ Or simply: ``from datasketch import AsyncMinHashLSH``
11
+ """
12
+
13
+ from typing import TYPE_CHECKING
14
+
15
+ if TYPE_CHECKING:
16
+ # Visible to static analyzers so they know `__all__` is satisfied.
17
+ # Not imported at runtime - the real dispatch happens in __getattr__.
18
+ from datasketch.aio import AsyncMinHashLSH
19
+
20
+ __all__ = ["AsyncMinHashLSH"]
21
+
22
+
23
+ def __getattr__(name):
24
+ # PEP 562: only emit the DeprecationWarning when the user actually pulls a
25
+ # symbol out of this package, not on every `import datasketch.experimental`.
26
+ # This avoids the noisy triple-warning that fired when each intermediate
27
+ # __init__.py warned eagerly.
28
+ #
29
+ # We cache the resolved symbol back into globals() so subsequent accesses
30
+ # bypass __getattr__. This matters for two reasons:
31
+ # 1. `from pkg import x` internally performs both `hasattr(pkg, x)` and
32
+ # `getattr(pkg, x)`, so without caching __getattr__ fires twice.
33
+ # 2. It makes the warning a one-shot per process, which is the normal
34
+ # expectation for deprecation warnings.
35
+ if name == "AsyncMinHashLSH":
36
+ import warnings
37
+
38
+ warnings.warn(
39
+ "datasketch.experimental is deprecated. "
40
+ "Use 'from datasketch.aio import AsyncMinHashLSH' or "
41
+ "'from datasketch import AsyncMinHashLSH' instead.",
42
+ DeprecationWarning,
43
+ stacklevel=2,
44
+ )
45
+ from datasketch.aio import AsyncMinHashLSH
46
+
47
+ globals()[name] = AsyncMinHashLSH
48
+ return AsyncMinHashLSH
49
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,50 @@
1
+ """Deprecated experimental aio module.
2
+
3
+ .. deprecated::
4
+ The `datasketch.experimental.aio` module is deprecated and will be removed in a future version.
5
+ Please use `datasketch.aio` instead:
6
+
7
+ Old: ``from datasketch.experimental.aio import AsyncMinHashLSH``
8
+ New: ``from datasketch.aio import AsyncMinHashLSH``
9
+ """
10
+
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ # Visible to static analyzers so they know `__all__` is satisfied.
15
+ # Not imported at runtime - the real dispatch happens in __getattr__.
16
+ from datasketch.aio import (
17
+ AsyncMinHashLSH,
18
+ AsyncMinHashLSHDeleteSession,
19
+ AsyncMinHashLSHInsertionSession,
20
+ )
21
+
22
+ __all__ = [
23
+ "AsyncMinHashLSH",
24
+ "AsyncMinHashLSHDeleteSession",
25
+ "AsyncMinHashLSHInsertionSession",
26
+ ]
27
+
28
+ _DEPRECATED = frozenset(__all__)
29
+
30
+
31
+ def __getattr__(name):
32
+ # PEP 562: emit the warning lazily on attribute access so that merely
33
+ # importing the parent package (e.g. as an intermediate step of
34
+ # `from datasketch.experimental.aio.lsh import ...`) does not fire
35
+ # a second, redundant warning. See the long comment in
36
+ # datasketch/experimental/__init__.py for why we cache into globals().
37
+ if name in _DEPRECATED:
38
+ import warnings
39
+
40
+ warnings.warn(
41
+ "datasketch.experimental.aio is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
42
+ DeprecationWarning,
43
+ stacklevel=2,
44
+ )
45
+ import datasketch.aio as _new
46
+
47
+ value = getattr(_new, name)
48
+ globals()[name] = value
49
+ return value
50
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,49 @@
1
+ """Deprecated experimental aio lsh module.
2
+
3
+ .. deprecated::
4
+ The `datasketch.experimental.aio.lsh` module is deprecated and will be removed in a future version.
5
+ Please use `datasketch.aio.lsh` instead:
6
+
7
+ Old: ``from datasketch.experimental.aio.lsh import AsyncMinHashLSH``
8
+ New: ``from datasketch.aio import AsyncMinHashLSH``
9
+ """
10
+
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ # Visible to static analyzers so they know `__all__` is satisfied.
15
+ # Not imported at runtime - the real dispatch happens in __getattr__.
16
+ from datasketch.aio.lsh import (
17
+ AsyncMinHashLSH,
18
+ AsyncMinHashLSHDeleteSession,
19
+ AsyncMinHashLSHInsertionSession,
20
+ )
21
+
22
+ __all__ = [
23
+ "AsyncMinHashLSH",
24
+ "AsyncMinHashLSHDeleteSession",
25
+ "AsyncMinHashLSHInsertionSession",
26
+ ]
27
+
28
+ _DEPRECATED = frozenset(__all__)
29
+
30
+
31
+ def __getattr__(name):
32
+ # Lazy warning via PEP 562: fires exactly once per attribute access on the
33
+ # deprecated module, and we cache the resolved symbol back into globals()
34
+ # so the warning is emitted once per process (see the long comment in
35
+ # datasketch/experimental/__init__.py for rationale).
36
+ if name in _DEPRECATED:
37
+ import warnings
38
+
39
+ warnings.warn(
40
+ "datasketch.experimental.aio.lsh is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
41
+ DeprecationWarning,
42
+ stacklevel=2,
43
+ )
44
+ import datasketch.aio.lsh as _new
45
+
46
+ value = getattr(_new, name)
47
+ globals()[name] = value
48
+ return value
49
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -3,12 +3,18 @@ from __future__ import annotations
3
3
  import pickle
4
4
  import struct
5
5
  from collections.abc import Hashable
6
- from typing import Callable, Optional, Union
6
+ from typing import Callable, List, Optional, Union
7
7
 
8
8
  from scipy.integrate import quad as integrate
9
9
 
10
10
  from datasketch.minhash import MinHash
11
- from datasketch.storage import _random_name, ordered_storage, unordered_storage
11
+ from datasketch.storage import (
12
+ OrderedStorage,
13
+ UnorderedStorage,
14
+ _random_name,
15
+ ordered_storage,
16
+ unordered_storage,
17
+ )
12
18
  from datasketch.weighted_minhash import WeightedMinHash
13
19
 
14
20
 
@@ -183,7 +189,7 @@ class MinHashLSH:
183
189
  self._H = self._byteswap
184
190
 
185
191
  basename = storage_config.get("basename", _random_name(11))
186
- self.hashtables = [
192
+ self.hashtables: List[UnorderedStorage] = [
187
193
  unordered_storage(
188
194
  storage_config,
189
195
  name=b"".join([basename, b"_bucket_", struct.pack(">H", i)]),
@@ -191,7 +197,7 @@ class MinHashLSH:
191
197
  for i in range(self.b)
192
198
  ]
193
199
  self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
194
- self.keys = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
200
+ self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
195
201
 
196
202
  @property
197
203
  def buffer_size(self) -> int:
@@ -347,7 +353,7 @@ class MinHashLSH:
347
353
  """
348
354
  return type(self) is type(other) and self.h == other.h and self.b == other.b and self.r == other.r
349
355
 
350
- def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> MinHashLSH:
356
+ def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
351
357
  if self.__equivalent(other):
352
358
  if check_overlap and set(self.keys).intersection(set(other.keys)):
353
359
  raise ValueError("The keys are overlapping, duplicate key exists.")
@@ -457,16 +463,24 @@ class MinHashLSH:
457
463
  list: a list of unique keys.
458
464
 
459
465
  """
460
- collected_result_sets = [
461
- set(collected_result_lists)
462
- for hashtable in self.hashtables
463
- for collected_result_lists in hashtable.collect_select_buffer()
466
+ collected_result_lists = [hashtable.collect_select_buffer() for hashtable in self.hashtables]
467
+ if not any(collected_result_lists):
468
+ return []
469
+
470
+ # Each buffered query contributes one result list per hashtable. We first
471
+ # union candidates across bands for each query, then intersect across the
472
+ # buffered queries to match repeated calls to `query()`.
473
+ per_query_result_sets = [
474
+ set().union(*query_result_lists)
475
+ for query_result_lists in zip(*collected_result_lists)
464
476
  ]
465
- if not collected_result_sets:
477
+ if not per_query_result_sets:
466
478
  return []
479
+
480
+ candidates = set.intersection(*per_query_result_sets)
467
481
  if self.prepickle:
468
- return [pickle.loads(key) for key in set.intersection(*collected_result_sets)]
469
- return list(set.intersection(*collected_result_sets))
482
+ return [pickle.loads(key) for key in candidates]
483
+ return list(candidates)
470
484
 
471
485
  def __contains__(self, key: Hashable) -> bool:
472
486
  """Args:
@@ -524,6 +538,8 @@ class MinHashLSH:
524
538
  return bytes(hs.byteswap().data)
525
539
 
526
540
  def _hashed_byteswap(self, hs):
541
+ if self.hashfunc is None:
542
+ raise RuntimeError("Hash function not configured.")
527
543
  return self.hashfunc(bytes(hs.byteswap().data))
528
544
 
529
545
  def _query_b(self, minhash, b):
@@ -252,9 +252,9 @@ class MinHashLSHBloom:
252
252
  raise ValueError("threshold must be in [0.0, 1.0]")
253
253
  if num_perm < 2:
254
254
  raise ValueError("Too few permutation functions")
255
- if n <= 0:
255
+ if n is None or n <= 0:
256
256
  raise ValueError("n for LSHBloom must be >= 0")
257
- if fp >= 1.0 or fp <= 0.0:
257
+ if fp is None or fp >= 1.0 or fp <= 0.0:
258
258
  raise ValueError("fp must be in (0.0, 1.0)")
259
259
  if save_dir is None:
260
260
  warnings.warn(
@@ -204,7 +204,7 @@ class MinHashLSHEnsemble:
204
204
  if not self.is_empty():
205
205
  raise ValueError("Cannot call index again on a non-empty index")
206
206
  if not isinstance(entries, list):
207
- queue = deque([])
207
+ queue = deque()
208
208
  for key, minhash, size in entries:
209
209
  if size <= 0:
210
210
  raise ValueError("Set size must be positive")
@@ -221,7 +221,8 @@ class MinHashLSHEnsemble:
221
221
  entries.sort(key=lambda e: e[2])
222
222
  curr_part = 0
223
223
  for key, minhash, size in entries:
224
- if size > self.uppers[curr_part]:
224
+ u = self.uppers[curr_part]
225
+ if size > u:
225
226
  curr_part += 1
226
227
  for r in self.indexes[curr_part]:
227
228
  self.indexes[curr_part][r].insert(key, minhash)
@@ -9,8 +9,8 @@ from datasketch.minhash import MinHash
9
9
  class MinHashLSHForest:
10
10
  """The LSH Forest for MinHash. It supports top-k query in Jaccard
11
11
  similarity.
12
- Instead of using prefix trees as the `original paper
13
- <http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf>`_,
12
+ Instead of using prefix trees as described in the original LSH Forest
13
+ paper by Bawa et al. (WWW 2005),
14
14
  I use a sorted array to store the hash values in every
15
15
  hash table.
16
16
 
@@ -37,7 +37,8 @@ class MinHashLSHForest:
37
37
  # Maximum depth of the prefix tree
38
38
  self.k = int(num_perm / l)
39
39
  self.hashtables = [defaultdict(list) for _ in range(self.l)]
40
- self.hashranges = [(i * self.k, (i + 1) * self.k) for i in range(self.l)]
40
+ self.hashranges = [(i * self.k, (i + 1) * self.k)
41
+ for i in range(self.l)]
41
42
  self.keys = dict()
42
43
  # This is the sorted array implementation for the prefix trees
43
44
  self.sorted_hashtables = [[] for _ in range(self.l)]
@@ -59,7 +60,8 @@ class MinHashLSHForest:
59
60
  raise ValueError("The num_perm of MinHash out of range")
60
61
  if key in self.keys:
61
62
  raise ValueError("The given key has already been added")
62
- self.keys[key] = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
63
+ self.keys[key] = [self._H(minhash.hashvalues[start:end])
64
+ for start, end in self.hashranges]
63
65
  for H, hashtable in zip(self.keys[key], self.hashtables):
64
66
  hashtable[H].append(key)
65
67
 
@@ -73,11 +75,13 @@ class MinHashLSHForest:
73
75
  if r > self.k or r <= 0 or b > self.l or b <= 0:
74
76
  raise ValueError("parameter outside range")
75
77
  # Generate prefixes of concatenated hash values
76
- hps = [self._H(minhash.hashvalues[start : start + r]) for start, _ in self.hashranges]
78
+ hps = [self._H(minhash.hashvalues[start: start + r])
79
+ for start, _ in self.hashranges]
77
80
  # Set the prefix length for look-ups in the sorted hash values list
78
81
  prefix_size = len(hps[0])
79
82
  for ht, hp, hashtable in zip(self.sorted_hashtables, hps, self.hashtables):
80
- i = self._binary_search(len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
83
+ i = self._binary_search(
84
+ len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
81
85
  if i < len(ht) and ht[i][:prefix_size] == hp:
82
86
  j = i
83
87
  while j < len(ht) and ht[j][:prefix_size] == hp:
@@ -137,14 +141,17 @@ class MinHashLSHForest:
137
141
  """
138
142
  byteslist = self.keys.get(key, None)
139
143
  if byteslist is None:
140
- raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
144
+ raise KeyError(
145
+ f"The provided key does not exist in the LSHForest: {key}")
141
146
  hashvalue_byte_size = len(byteslist[0]) // 8
142
- hashvalues = np.empty(len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
147
+ hashvalues = np.empty(
148
+ len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
143
149
  for index, item in enumerate(byteslist):
144
150
  # unswap the bytes, as their representation is flipped during storage
145
151
  hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
146
152
  curr_index = index * hashvalue_byte_size
147
- hashvalues[curr_index : curr_index + hashvalue_byte_size] = hv_segment
153
+ hashvalues[curr_index: curr_index +
154
+ hashvalue_byte_size] = hv_segment
148
155
  return hashvalues
149
156
 
150
157
  def _binary_search(self, n, func):
@@ -3,15 +3,18 @@ from __future__ import annotations
3
3
  import copy
4
4
  import warnings
5
5
  from collections.abc import Generator, Iterable
6
- from typing import Callable, Optional
6
+ from typing import TYPE_CHECKING, Callable, Optional, Union
7
7
 
8
8
  try:
9
9
  from typing import Literal # py3.8+; if older, you can fallback to typing_extensions
10
- except Exception:
10
+ except ImportError:
11
11
  from typing_extensions import Literal
12
12
 
13
13
  import numpy as np
14
14
 
15
+ if TYPE_CHECKING:
16
+ from numpy.typing import ArrayLike
17
+
15
18
  # GPU backend
16
19
  try:
17
20
  import cupy as cp
@@ -114,8 +117,8 @@ class MinHash:
114
117
  gpu_mode: Literal["disable", "detect", "always"] = "disable",
115
118
  hashfunc: Callable = sha1_hash32,
116
119
  hashobj: Optional[object] = None, # Deprecated.
117
- hashvalues: Optional[Iterable] = None,
118
- permutations: Optional[tuple[Iterable, Iterable]] = None,
120
+ hashvalues: Optional[ArrayLike] = None,
121
+ permutations: Optional[Union[tuple[ArrayLike, ArrayLike], ArrayLike]] = None,
119
122
  ) -> None:
120
123
  if hashvalues is not None:
121
124
  num_perm = len(hashvalues)
@@ -180,7 +183,7 @@ class MinHash:
180
183
  dtype=np.uint64,
181
184
  ).T
182
185
 
183
- def _parse_hashvalues(self, hashvalues):
186
+ def _parse_hashvalues(self, hashvalues) -> np.ndarray:
184
187
  return np.array(hashvalues, dtype=np.uint64)
185
188
 
186
189
  def update(self, b) -> None:
@@ -26,7 +26,7 @@ except ImportError:
26
26
  c_concurrent = None
27
27
 
28
28
 
29
- def ordered_storage(config, name=None):
29
+ def ordered_storage(config, name=None) -> "OrderedStorage":
30
30
  """Return ordered storage system based on the specified config.
31
31
 
32
32
  The canonical example of such a storage container is
@@ -62,10 +62,10 @@ def ordered_storage(config, name=None):
62
62
  return RedisListStorage(config, name=name)
63
63
  if tp == "cassandra":
64
64
  return CassandraListStorage(config, name=name)
65
- return None
65
+ raise ValueError(f"Unknown storage type: {tp}")
66
66
 
67
67
 
68
- def unordered_storage(config, name=None):
68
+ def unordered_storage(config, name=None) -> "UnorderedStorage":
69
69
  """Return an unordered storage system based on the specified config.
70
70
 
71
71
  The canonical example of such a storage container is
@@ -100,7 +100,7 @@ def unordered_storage(config, name=None):
100
100
  return RedisSetStorage(config, name=name)
101
101
  if tp == "cassandra":
102
102
  return CassandraSetStorage(config, name=name)
103
- return None
103
+ raise ValueError(f"Unknown storage type: {tp}")
104
104
 
105
105
 
106
106
  class Storage(ABC):
@@ -144,7 +144,7 @@ class Storage(ABC):
144
144
  pass
145
145
 
146
146
  @abstractmethod
147
- def remove(self, *keys):
147
+ def remove(self, *keys, **kwargs):
148
148
  """Remove `keys` from storage."""
149
149
  pass
150
150
 
@@ -154,12 +154,12 @@ class Storage(ABC):
154
154
  pass
155
155
 
156
156
  @abstractmethod
157
- def size(self):
157
+ def size(self) -> int:
158
158
  """Return size of storage with respect to number of keys."""
159
159
  pass
160
160
 
161
161
  @abstractmethod
162
- def itemcounts(self, **kwargs):
162
+ def itemcounts(self, **kwargs) -> dict:
163
163
  """Returns the number of items stored under each key."""
164
164
  pass
165
165
 
@@ -168,6 +168,14 @@ class Storage(ABC):
168
168
  """Determines whether the key is in the storage or not."""
169
169
  pass
170
170
 
171
+ @property
172
+ def buffer_size(self) -> int:
173
+ return getattr(self, "_buffer_size", 50000)
174
+
175
+ @buffer_size.setter
176
+ def buffer_size(self, value: int):
177
+ self._buffer_size = value
178
+
171
179
  def status(self):
172
180
  return {"keyspace_size": len(self)}
173
181
 
@@ -595,12 +603,14 @@ if cassandra is not None:
595
603
  del self._select_statements_and_parameters_with_decoders[:]
596
604
  statements_and_parameters, decoders = zip(*buffer)
597
605
 
598
- ret = collections.defaultdict(list)
599
606
  query_results = self._select(statements_and_parameters)
600
- for rows, (key_decoder, val_decoder) in zip(query_results, decoders):
607
+ ret = []
608
+ for rows, (_key_decoder, val_decoder) in zip(query_results, decoders):
609
+ values = []
601
610
  for row in rows:
602
- ret[key_decoder(row.key)].append((val_decoder(row.value), row.ts))
603
- return [[x[0] for x in sorted(v, key=operator.itemgetter(1))] for v in ret.values()]
611
+ values.append((val_decoder(row.value), row.ts))
612
+ ret.append([x[0] for x in sorted(values, key=operator.itemgetter(1))])
613
+ return ret
604
614
 
605
615
  def select(self, keys):
606
616
  """Select all values for the given keys.
@@ -133,14 +133,15 @@ class WeightedMinHashGenerator:
133
133
  WeightedMinHash: The weighted MinHash.
134
134
 
135
135
  """
136
- if not isinstance(v, collections.abc.Iterable):
137
- raise TypeError("Input vector must be an iterable")
136
+ if not isinstance(v, collections.abc.Sized):
137
+ raise TypeError("Input vector must be sized")
138
138
  if not len(v) == self.dim:
139
139
  raise ValueError("Input dimension mismatch, expecting %d" % self.dim)
140
140
  if not isinstance(v, np.ndarray):
141
141
  v = np.array(v, dtype=np.float32)
142
142
  elif v.dtype != np.float32:
143
143
  v = v.astype(np.float32)
144
+ v: np.ndarray = v
144
145
  hashvalues = np.zeros((self.sample_size, 2), dtype=int)
145
146
  vzeros = v == 0
146
147
  if vzeros.all():
@@ -226,9 +227,8 @@ class WeightedMinHashGenerator:
226
227
  doc_argmin = np.argmin(doc_ln_a, axis=1)
227
228
  doc_k = doc_cidx[doc_argmin]
228
229
 
229
- all_hashvalues[it_doc] = np.zeros((self.sample_size, 2), dtype=int)
230
-
231
- hashvalues = all_hashvalues[it_doc]
230
+ hashvalues = np.zeros((self.sample_size, 2), dtype=int)
231
+ all_hashvalues[it_doc] = hashvalues
232
232
  hashvalues[:, 0], hashvalues[:, 1] = (
233
233
  doc_k,
234
234
  t[np.arange(self.sample_size), doc_begin + doc_argmin],
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "datasketch"
7
- version = "1.8.0"
7
+ version = "1.10.0"
8
8
  description = "Probabilistic data structures for processing and searching very large datasets"
9
9
  readme = "README.rst"
10
10
  requires-python = ">=3.9"
@@ -21,6 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.10",
22
22
  "Programming Language :: Python :: 3.11",
23
23
  "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
24
25
  ]
25
26
  dependencies = ["numpy>=1.11", "scipy>=1.0.0"]
26
27
 
@@ -36,7 +37,11 @@ benchmark = [
36
37
  "pandas>=0.25.3",
37
38
  "SetSimilaritySearch>=0.1.7",
38
39
  "pyfarmhash>=0.2.2",
39
- "nltk>=3.4.5",
40
+ "nltk>=3.4.5; python_version < '3.10'",
41
+ "nltk>=3.9.4; python_version >= '3.10'",
42
+ # Transitive deps of matplotlib listed to avoid dependabot uv.lock-only PRs.
43
+ "pillow>=12.2.0; python_version >= '3.10'",
44
+ "fonttools>=4.60.2",
40
45
  ]
41
46
  test = [
42
47
  "cassandra-driver>=3.20",
@@ -44,12 +49,21 @@ test = [
44
49
  "mock>=2.0.0",
45
50
  "mockredispy",
46
51
  "coverage",
52
+ "pytest-cov",
47
53
  "pymongo>=3.9.0",
48
54
  "nose>=1.3.7",
49
55
  "nose-exclude>=0.5.0",
50
- "pytest",
56
+ "pytest; python_version < '3.10'",
57
+ "pytest>=9.0.3; python_version >= '3.10'",
51
58
  "pytest-rerunfailures",
59
+ "pytest-asyncio",
60
+ # Transitive dep of pytest listed to avoid dependabot uv.lock-only PRs.
61
+ "pygments>=2.20.0",
52
62
  ]
63
+ aio = ["aiounittest", "motor>3.6.0"]
64
+ # KEEP IN SYNC WITH `aio` ABOVE. Deprecated alias retained for backwards compat;
65
+ # PEP 621 does not support referencing one optional-dependency group from
66
+ # another, so the dependency list must be duplicated verbatim.
53
67
  experimental_aio = ["aiounittest", "motor>3.6.0"]
54
68
 
55
69
  [project.urls]
@@ -91,7 +105,6 @@ exclude = [
91
105
  "dist",
92
106
  "docs",
93
107
  "examples",
94
- "travis",
95
108
  "datasketch/hyperloglog_const.py",
96
109
  ]
97
110
 
@@ -158,5 +171,38 @@ include = ["pyproject.toml", "README.rst", "LICENSE", "datasketch/**"]
158
171
 
159
172
  [tool.pytest.ini_options]
160
173
  minversion = "6.0"
161
- addopts = ["--strict-markers", "--color=yes"]
174
+ addopts = ["--strict-markers", "--color=yes", "--cov-report=xml"]
162
175
  testpaths = ["test"]
176
+ asyncio_mode = "auto"
177
+
178
+ [tool.pyright]
179
+ include = ["datasketch"]
180
+ exclude = [
181
+ "benchmark",
182
+ "docs",
183
+ "examples",
184
+ "test",
185
+ "travis",
186
+ "**/.venv/**",
187
+ "**/__pycache__",
188
+ ]
189
+ pythonVersion = "3.9"
190
+ typeCheckingMode = "basic" # todo: change to "strict" in future
191
+
192
+ reportMissingImports = "none"
193
+ reportUnusedVariable = "warning"
194
+ reportAttributeAccessIssue = "none"
195
+ reportOptionalMemberAccess = "none"
196
+ reportGeneralTypeIssues = "none"
197
+ reportArgumentType = "none"
198
+ reportOptionalIterable = "none"
199
+ reportReturnType = "none"
200
+ reportRedeclaration = "none"
201
+ reportOperatorIssue = "none"
202
+ reportAssignmentType = "none"
203
+ reportOptionalSubscript = "none"
204
+ reportCallIssue = "none"
205
+
206
+ [tool.coverage.run]
207
+ source = ["datasketch"]
208
+ omit = ["*/tests/*", "*/test/*"]
@@ -1,15 +0,0 @@
1
- """Warning.
2
-
3
- datasketch.experimental is dedicated to new modules that are to be merged into
4
- the stable interface of datasketch. So their interfaces may change in future
5
- versions.
6
-
7
- To add a new class or function, register it here in this file. For example:
8
-
9
- from new_module import NewModuleClass
10
-
11
- """
12
-
13
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
14
-
15
- __all__ = ["AsyncMinHashLSH"]
File without changes
File without changes
File without changes