datasketch 1.9.0__tar.gz → 1.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {datasketch-1.9.0 → datasketch-1.10.0}/PKG-INFO +11 -3
  2. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/__init__.py +2 -1
  3. datasketch-1.10.0/datasketch/aio/__init__.py +44 -0
  4. {datasketch-1.9.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/lsh.py +57 -73
  5. {datasketch-1.9.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/storage.py +44 -14
  6. datasketch-1.10.0/datasketch/experimental/__init__.py +49 -0
  7. datasketch-1.10.0/datasketch/experimental/aio/__init__.py +50 -0
  8. datasketch-1.10.0/datasketch/experimental/aio/lsh.py +49 -0
  9. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lsh.py +15 -7
  10. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshensemble.py +1 -1
  11. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshforest.py +16 -9
  12. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/storage.py +6 -4
  13. {datasketch-1.9.0 → datasketch-1.10.0}/pyproject.toml +15 -4
  14. datasketch-1.9.0/datasketch/experimental/__init__.py +0 -15
  15. datasketch-1.9.0/datasketch/experimental/aio/__init__.py +0 -0
  16. {datasketch-1.9.0 → datasketch-1.10.0}/.gitignore +0 -0
  17. {datasketch-1.9.0 → datasketch-1.10.0}/LICENSE +0 -0
  18. {datasketch-1.9.0 → datasketch-1.10.0}/README.rst +0 -0
  19. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/b_bit_minhash.py +0 -0
  20. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hashfunc.py +0 -0
  21. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hnsw.py +0 -0
  22. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hyperloglog.py +0 -0
  23. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hyperloglog_const.py +0 -0
  24. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lean_minhash.py +0 -0
  25. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lsh_bloom.py +0 -0
  26. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshensemble_partition.py +0 -0
  27. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/minhash.py +0 -0
  28. {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/weighted_minhash.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.9.0
3
+ Version: 1.10.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
6
  Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -23,10 +23,16 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
23
  Requires-Python: >=3.9
24
24
  Requires-Dist: numpy>=1.11
25
25
  Requires-Dist: scipy>=1.0.0
26
+ Provides-Extra: aio
27
+ Requires-Dist: aiounittest; extra == 'aio'
28
+ Requires-Dist: motor>3.6.0; extra == 'aio'
26
29
  Provides-Extra: benchmark
30
+ Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
27
31
  Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
28
- Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
32
+ Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
33
+ Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
29
34
  Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
35
+ Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
30
36
  Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
31
37
  Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
32
38
  Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
@@ -48,11 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
48
54
  Requires-Dist: mockredispy; extra == 'test'
49
55
  Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
50
56
  Requires-Dist: nose>=1.3.7; extra == 'test'
57
+ Requires-Dist: pygments>=2.20.0; extra == 'test'
51
58
  Requires-Dist: pymongo>=3.9.0; extra == 'test'
52
- Requires-Dist: pytest; extra == 'test'
53
59
  Requires-Dist: pytest-asyncio; extra == 'test'
54
60
  Requires-Dist: pytest-cov; extra == 'test'
55
61
  Requires-Dist: pytest-rerunfailures; extra == 'test'
62
+ Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
63
+ Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
56
64
  Requires-Dist: redis>=2.10.0; extra == 'test'
57
65
  Description-Content-Type: text/x-rst
58
66
 
@@ -7,6 +7,7 @@ except importlib.metadata.PackageNotFoundError:
7
7
  _version = "0.0.0" # Fallback for development mode
8
8
  __version__: Final[str] = _version
9
9
 
10
+ from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
10
11
  from datasketch.b_bit_minhash import bBitMinHash
11
12
  from datasketch.hashfunc import sha1_hash32
12
13
  from datasketch.hnsw import HNSW
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
23
24
  WeightedMinHashLSH = MinHashLSH
24
25
  WeightedMinHashLSHForest = MinHashLSHForest
25
26
 
26
-
27
27
  __all__ = [
28
28
  "HNSW",
29
+ "AsyncMinHashLSH",
29
30
  "HyperLogLog",
30
31
  "HyperLogLogPlusPlus",
31
32
  "LeanMinHash",
@@ -0,0 +1,44 @@
1
+ """Async MinHash LSH module.
2
+
3
+ This module provides asynchronous implementations of MinHash LSH for use with
4
+ async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
5
+
6
+ Example:
7
+ .. code-block:: python
8
+
9
+ import asyncio
10
+
11
+ from datasketch.aio import AsyncMinHashLSH
12
+ from datasketch import MinHash
13
+
14
+
15
+ async def main():
16
+ # prepickle=True lets you use non-bytes keys (e.g. str). With the
17
+ # default prepickle=False, keys passed to insert() must be bytes.
18
+ async with AsyncMinHashLSH(
19
+ storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
20
+ threshold=0.5,
21
+ num_perm=128,
22
+ prepickle=True,
23
+ ) as lsh:
24
+ m = MinHash(num_perm=128)
25
+ m.update(b"data")
26
+ await lsh.insert("key", m)
27
+ result = await lsh.query(m)
28
+
29
+
30
+ asyncio.run(main())
31
+
32
+ """
33
+
34
+ from datasketch.aio.lsh import (
35
+ AsyncMinHashLSH,
36
+ AsyncMinHashLSHDeleteSession,
37
+ AsyncMinHashLSHInsertionSession,
38
+ )
39
+
40
+ __all__ = [
41
+ "AsyncMinHashLSH",
42
+ "AsyncMinHashLSHDeleteSession",
43
+ "AsyncMinHashLSHInsertionSession",
44
+ ]
@@ -1,9 +1,15 @@
1
+ """Asynchronous MinHash LSH implementation.
2
+
3
+ This module provides AsyncMinHashLSH for use with async storage backends
4
+ like MongoDB (via motor) and Redis (via redis.asyncio).
5
+ """
6
+
1
7
  import asyncio
2
8
  import pickle
3
9
  from itertools import chain
4
10
  from typing import Optional
5
11
 
6
- from datasketch.experimental.aio.storage import (
12
+ from datasketch.aio.storage import (
7
13
  async_ordered_storage,
8
14
  async_unordered_storage,
9
15
  )
@@ -34,8 +40,6 @@ class AsyncMinHashLSH:
34
40
  MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
35
41
 
36
42
  .. note::
37
- * The module supports Python version >=3.6, and is currently experimental.
38
- So the interface may change slightly in the future.
39
43
  * For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
40
44
  * For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
41
45
  """
@@ -129,7 +133,7 @@ class AsyncMinHashLSH:
129
133
  if self.keys is not None:
130
134
  self.keys.batch_size = value
131
135
  else:
132
- raise AttributeError("AsyncMinHash is not initialized.")
136
+ raise AttributeError("AsyncMinHashLSH is not initialized.")
133
137
 
134
138
  for t in self.hashtables:
135
139
  t.batch_size = value
@@ -163,12 +167,6 @@ class AsyncMinHashLSH:
163
167
  if self.keys is None:
164
168
  await self._create_storages()
165
169
 
166
- if not self.keys.initialized:
167
- await self.keys
168
-
169
- fs = (ht for ht in self.hashtables if not ht.initialized)
170
- await asyncio.gather(*fs)
171
-
172
170
  async def close(self):
173
171
  """Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
174
172
  async with self._lock:
@@ -189,41 +187,26 @@ class AsyncMinHashLSH:
189
187
 
190
188
  :param int batch_size: the size of chunks to use in insertion_session mode (default=10000).
191
189
 
192
- :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
190
+ :return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
193
191
 
194
192
  Example:
195
193
  .. code-block:: python
196
194
 
197
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
195
+ import asyncio
196
+ from datasketch.aio import AsyncMinHashLSH
198
197
  from datasketch import MinHash
199
198
 
200
-
201
- def chunk(it, size):
202
- it = iter(it)
203
- return iter(lambda: tuple(islice(it, size)), ())
204
-
205
-
206
- _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
207
- seq = frozenset(
208
- chain(
209
- ("".join(s) for s in _chunked_str),
210
- ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
211
- )
212
- )
213
- objs = [MinHash(16) for _ in range(len(seq))]
214
- for e, obj in zip(seq, objs):
215
- for i in e:
216
- obj.update(i.encode("utf-8"))
217
- data = [(e, m) for e, m in zip(seq, objs)]
218
-
219
- _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
220
-
221
-
222
- async def func():
223
- async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
199
+ async def main():
200
+ storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
201
+ async with AsyncMinHashLSH(
202
+ storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
203
+ ) as lsh:
224
204
  async with lsh.insertion_session(batch_size=1000) as session:
225
- fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
226
- await asyncio.gather(*fs)
205
+ m = MinHash(num_perm=16)
206
+ m.update(b"data")
207
+ await session.insert("key", m)
208
+
209
+ asyncio.run(main())
227
210
 
228
211
  """
229
212
  return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
@@ -232,47 +215,32 @@ class AsyncMinHashLSH:
232
215
  """Create an asynchronous context manager for fast removal of keys
233
216
  from index.
234
217
 
235
- :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
218
+ :param int batch_size: the size of chunks to use in delete_session mode (default=10000).
236
219
 
237
- :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
220
+ :return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
238
221
 
239
222
  Example:
240
223
  .. code-block:: python
241
224
 
242
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
225
+ import asyncio
226
+ from datasketch.aio import AsyncMinHashLSH
243
227
  from datasketch import MinHash
244
228
 
229
+ async def main():
230
+ storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
231
+ async with AsyncMinHashLSH(
232
+ storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
233
+ ) as lsh:
234
+ # Insert some data first
235
+ m = MinHash(num_perm=16)
236
+ m.update(b"data")
237
+ await lsh.insert("key1", m)
245
238
 
246
- def chunk(it, size):
247
- it = iter(it)
248
- return iter(lambda: tuple(islice(it, size)), ())
239
+ # Delete using session
240
+ async with lsh.delete_session(batch_size=100) as session:
241
+ await session.remove("key1")
249
242
 
250
-
251
- _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
252
- seq = frozenset(
253
- chain(
254
- ("".join(s) for s in _chunked_str),
255
- ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
256
- )
257
- )
258
- objs = [MinHash(16) for _ in range(len(seq))]
259
- for e, obj in zip(seq, objs):
260
- for i in e:
261
- obj.update(i.encode("utf-8"))
262
- data = [(e, m) for e, m in zip(seq, objs)]
263
-
264
- _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
265
-
266
-
267
- async def func():
268
- async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
269
- async with lsh.insertion_session(batch_size=1000) as session:
270
- fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
271
- await asyncio.gather(*fs)
272
-
273
- async with lsh.delete_session(batch_size=3) as session:
274
- fs = (session.remove(key) for key in keys_to_remove)
275
- await asyncio.gather(*fs)
243
+ asyncio.run(main())
276
244
 
277
245
  """
278
246
  return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
@@ -288,7 +256,9 @@ class AsyncMinHashLSH:
288
256
  if self.prepickle:
289
257
  key = pickle.dumps(key)
290
258
 
291
- if check_duplication and await self.has_key(key):
259
+ # `key` is already pickled at this point under prepickle=True; call the
260
+ # storage primitive directly so we don't re-pickle through has_key().
261
+ if check_duplication and await self.keys.has_key(key):
292
262
  raise ValueError("The given key already exists")
293
263
  Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
294
264
 
@@ -314,6 +284,8 @@ class AsyncMinHashLSH:
314
284
 
315
285
  async def has_key(self, key):
316
286
  """See :class:`datasketch.MinHashLSH`."""
287
+ if self.prepickle:
288
+ key = pickle.dumps(key)
317
289
  return await self.keys.has_key(key)
318
290
 
319
291
  async def remove(self, key):
@@ -321,7 +293,12 @@ class AsyncMinHashLSH:
321
293
  await self._remove(key, buffer=False)
322
294
 
323
295
  async def _remove(self, key, buffer=False):
324
- if not await self.has_key(key):
296
+ if self.prepickle:
297
+ key = pickle.dumps(key)
298
+
299
+ # `key` is already pickled here; call storage primitives directly so
300
+ # the existence check, lookup, and deletes all use the stored form.
301
+ if not await self.keys.has_key(key):
325
302
  raise ValueError("The given key does not exist")
326
303
 
327
304
  for H, hashtable in zip(await self.keys.get(key), self.hashtables):
@@ -352,7 +329,10 @@ class AsyncMinHashLSH:
352
329
  H = self._H(minhash.hashvalues[start:end])
353
330
  if await hashtable.has_key(H):
354
331
  fs.append(hashtable.get(H))
355
- return set(chain.from_iterable(await asyncio.gather(*fs))) # candidates
332
+ candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
333
+ if self.prepickle:
334
+ return {pickle.loads(key) for key in candidates}
335
+ return candidates
356
336
 
357
337
  async def get_counts(self):
358
338
  """See :class:`datasketch.MinHashLSH`."""
@@ -361,6 +341,10 @@ class AsyncMinHashLSH:
361
341
 
362
342
  async def get_subset_counts(self, *keys):
363
343
  """See :class:`datasketch.MinHashLSH`."""
344
+ # Keys in storage are pickled when prepickle is enabled, so we have to
345
+ # pickle the query keys to match the stored representation.
346
+ if self.prepickle:
347
+ keys = tuple(pickle.dumps(key) for key in keys)
364
348
  key_set = list(set(keys))
365
349
  hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
366
350
  Hss = await self.keys.getmany(*key_set)
@@ -1,9 +1,23 @@
1
+ """Async storage backends for MinHash LSH.
2
+
3
+ This module provides async storage implementations for use with AsyncMinHashLSH:
4
+ - AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
5
+ - AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
6
+ """
7
+
1
8
  import asyncio
2
9
  import os
3
10
  from abc import ABCMeta
4
11
  from itertools import chain
5
12
 
6
- from datasketch.storage import OrderedStorage, RedisStorage, Storage, UnorderedStorage, _random_name
13
+ from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
14
+
15
+ # RedisStorage is only available when the redis package is installed (an
16
+ # optional dependency); import it conditionally to avoid an ImportError otherwise.
17
+ try:
18
+ from datasketch.storage import RedisStorage
19
+ except ImportError:
20
+ RedisStorage = None
7
21
 
8
22
  ABC = ABCMeta("ABC", (object,), {})
9
23
 
@@ -24,6 +38,12 @@ except ImportError:
24
38
  redis = None
25
39
 
26
40
 
41
+ __all__ = [
42
+ "async_ordered_storage",
43
+ "async_unordered_storage",
44
+ ]
45
+
46
+
27
47
  async def async_ordered_storage(config, name=None):
28
48
  tp = config["type"]
29
49
  if tp == "aiomongo":
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
55
75
  class AsyncMongoBuffer:
56
76
  def __init__(self, aio_mongo_collection, batch_size):
57
77
  self._batch_size = batch_size
58
- self._insert_documents_stack = tuple()
59
- self._delete_by_key_documents_stack = tuple()
60
- self._delete_by_val_documents_stack = tuple()
78
+ self._insert_documents_stack = []
79
+ self._delete_by_key_documents_stack = []
80
+ self._delete_by_val_documents_stack = []
61
81
  self._mongo_coll = aio_mongo_collection
62
82
 
63
83
  @property
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
73
93
  if command == "insert":
74
94
  if len(self._insert_documents_stack) >= self.batch_size:
75
95
  await self.execute(command)
76
- self._insert_documents_stack += (kwargs["obj"],)
96
+ self._insert_documents_stack.append(kwargs["obj"])
77
97
  elif command == "delete_by_key":
78
98
  if len(self._delete_by_key_documents_stack) >= self.batch_size:
79
99
  await self.execute(command)
80
- self._delete_by_key_documents_stack += (kwargs["key"],)
100
+ self._delete_by_key_documents_stack.append(kwargs["key"])
81
101
  elif command == "delete_by_val":
82
102
  if len(self._delete_by_val_documents_stack) >= self.batch_size:
83
103
  await self.execute(command)
84
- self._delete_by_val_documents_stack += (kwargs["val"],)
104
+ self._delete_by_val_documents_stack.append(kwargs["val"])
85
105
 
86
106
  async def execute(self, command):
87
107
  if command == "insert" and self._insert_documents_stack:
88
108
  buffer = self._insert_documents_stack
89
- self._insert_documents_stack = tuple()
109
+ self._insert_documents_stack = []
90
110
  await self._mongo_coll.insert_many(buffer, ordered=False)
91
111
  elif command == "delete_by_key" and self._delete_by_key_documents_stack:
92
112
  buffer = self._delete_by_key_documents_stack
93
- self._delete_by_key_documents_stack = tuple()
113
+ self._delete_by_key_documents_stack = []
94
114
  await self._mongo_coll.delete_many({"key": {"$in": buffer}})
95
115
  elif command == "delete_by_val" and self._delete_by_val_documents_stack:
96
116
  buffer = self._delete_by_val_documents_stack
97
- self._delete_by_val_documents_stack = tuple()
117
+ self._delete_by_val_documents_stack = []
98
118
  await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
99
119
 
100
120
  async def insert_one(self, **kwargs):
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
254
274
  async def has_key(self, key):
255
275
  return bool(await self._collection.find_one({"key": key}))
256
276
 
277
+ async def getmany(self, *keys):
278
+ return await asyncio.gather(*(self.get(key) for key in keys))
279
+
257
280
  async def status(self):
258
281
  status = self._parse_config(self.config["mongo"])
259
282
  status.update({"keyspace_size": await self.size()})
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
285
308
  await self._collection.find_one_and_delete({"key": key, "vals": val})
286
309
 
287
310
 
288
- if redis is not None:
311
+ # Redis-based async storage classes are only defined when both the redis
312
+ # package and RedisStorage are available (optional dependencies).
313
+ if redis is not None and RedisStorage is not None:
289
314
 
290
315
  class AsyncRedisBuffer(redis.client.Pipeline):
291
316
  def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
@@ -304,7 +329,7 @@ if redis is not None:
304
329
 
305
330
  async def execute_command(self, *args, **kwargs):
306
331
  if len(self.command_stack) >= self._buffer_size:
307
- self.execute()
332
+ await self.execute()
308
333
  await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
309
334
 
310
335
  class AsyncRedisStorage(RedisStorage):
@@ -347,9 +372,8 @@ if redis is not None:
347
372
 
348
373
  async def getmany(self, *keys):
349
374
  pipe = self._redis.pipeline()
350
- pipe.multi()
351
375
  for key in keys:
352
- await self._get_items(pipe, self.redis_key(key))
376
+ pipe.lrange(self.redis_key(key), 0, -1)
353
377
  return await pipe.execute()
354
378
 
355
379
  @staticmethod
@@ -422,6 +446,12 @@ if redis is not None:
422
446
  async def _get_items(r, k):
423
447
  return await r.smembers(k)
424
448
 
449
+ async def getmany(self, *keys):
450
+ pipe = self._redis.pipeline()
451
+ for key in keys:
452
+ pipe.smembers(self.redis_key(key))
453
+ return await pipe.execute()
454
+
425
455
  async def remove_val(self, key, val, **kwargs):
426
456
  buffer = kwargs.pop("buffer", False)
427
457
  redis_key = self.redis_key(key)
@@ -0,0 +1,49 @@
1
+ """Deprecated experimental module.
2
+
3
+ .. deprecated::
4
+ The `datasketch.experimental` module is deprecated and will be removed in a future version.
5
+ Please use `datasketch.aio` instead:
6
+
7
+ Old: ``from datasketch.experimental import AsyncMinHashLSH``
8
+ New: ``from datasketch.aio import AsyncMinHashLSH``
9
+
10
+ Or simply: ``from datasketch import AsyncMinHashLSH``
11
+ """
12
+
13
+ from typing import TYPE_CHECKING
14
+
15
+ if TYPE_CHECKING:
16
+ # Visible to static analyzers so they know `__all__` is satisfied.
17
+ # Not imported at runtime - the real dispatch happens in __getattr__.
18
+ from datasketch.aio import AsyncMinHashLSH
19
+
20
+ __all__ = ["AsyncMinHashLSH"]
21
+
22
+
23
+ def __getattr__(name):
24
+ # PEP 562: only emit the DeprecationWarning when the user actually pulls a
25
+ # symbol out of this package, not on every `import datasketch.experimental`.
26
+ # This avoids the noisy triple-warning that fired when each intermediate
27
+ # __init__.py warned eagerly.
28
+ #
29
+ # We cache the resolved symbol back into globals() so subsequent accesses
30
+ # bypass __getattr__. This matters for two reasons:
31
+ # 1. `from pkg import x` internally performs both `hasattr(pkg, x)` and
32
+ # `getattr(pkg, x)`, so without caching __getattr__ fires twice.
33
+ # 2. It makes the warning a one-shot per process, which is the normal
34
+ # expectation for deprecation warnings.
35
+ if name == "AsyncMinHashLSH":
36
+ import warnings
37
+
38
+ warnings.warn(
39
+ "datasketch.experimental is deprecated. "
40
+ "Use 'from datasketch.aio import AsyncMinHashLSH' or "
41
+ "'from datasketch import AsyncMinHashLSH' instead.",
42
+ DeprecationWarning,
43
+ stacklevel=2,
44
+ )
45
+ from datasketch.aio import AsyncMinHashLSH
46
+
47
+ globals()[name] = AsyncMinHashLSH
48
+ return AsyncMinHashLSH
49
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,50 @@
1
+ """Deprecated experimental aio module.
2
+
3
+ .. deprecated::
4
+ The `datasketch.experimental.aio` module is deprecated and will be removed in a future version.
5
+ Please use `datasketch.aio` instead:
6
+
7
+ Old: ``from datasketch.experimental.aio import AsyncMinHashLSH``
8
+ New: ``from datasketch.aio import AsyncMinHashLSH``
9
+ """
10
+
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ # Visible to static analyzers so they know `__all__` is satisfied.
15
+ # Not imported at runtime - the real dispatch happens in __getattr__.
16
+ from datasketch.aio import (
17
+ AsyncMinHashLSH,
18
+ AsyncMinHashLSHDeleteSession,
19
+ AsyncMinHashLSHInsertionSession,
20
+ )
21
+
22
+ __all__ = [
23
+ "AsyncMinHashLSH",
24
+ "AsyncMinHashLSHDeleteSession",
25
+ "AsyncMinHashLSHInsertionSession",
26
+ ]
27
+
28
+ _DEPRECATED = frozenset(__all__)
29
+
30
+
31
+ def __getattr__(name):
32
+ # PEP 562: emit the warning lazily on attribute access so that merely
33
+ # importing the parent package (e.g. as an intermediate step of
34
+ # `from datasketch.experimental.aio.lsh import ...`) does not fire
35
+ # a second, redundant warning. See the long comment in
36
+ # datasketch/experimental/__init__.py for why we cache into globals().
37
+ if name in _DEPRECATED:
38
+ import warnings
39
+
40
+ warnings.warn(
41
+ "datasketch.experimental.aio is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
42
+ DeprecationWarning,
43
+ stacklevel=2,
44
+ )
45
+ import datasketch.aio as _new
46
+
47
+ value = getattr(_new, name)
48
+ globals()[name] = value
49
+ return value
50
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -0,0 +1,49 @@
1
+ """Deprecated experimental aio lsh module.
2
+
3
+ .. deprecated::
4
+ The `datasketch.experimental.aio.lsh` module is deprecated and will be removed in a future version.
5
+ Please use `datasketch.aio.lsh` instead:
6
+
7
+ Old: ``from datasketch.experimental.aio.lsh import AsyncMinHashLSH``
8
+ New: ``from datasketch.aio import AsyncMinHashLSH``
9
+ """
10
+
11
+ from typing import TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ # Visible to static analyzers so they know `__all__` is satisfied.
15
+ # Not imported at runtime - the real dispatch happens in __getattr__.
16
+ from datasketch.aio.lsh import (
17
+ AsyncMinHashLSH,
18
+ AsyncMinHashLSHDeleteSession,
19
+ AsyncMinHashLSHInsertionSession,
20
+ )
21
+
22
+ __all__ = [
23
+ "AsyncMinHashLSH",
24
+ "AsyncMinHashLSHDeleteSession",
25
+ "AsyncMinHashLSHInsertionSession",
26
+ ]
27
+
28
+ _DEPRECATED = frozenset(__all__)
29
+
30
+
31
+ def __getattr__(name):
32
+ # Lazy warning via PEP 562: fires on the first access of each deprecated
33
+ # name; the resolved symbol is then cached back into globals(), so each
34
+ # name warns at most once per process (see the long comment in
35
+ # datasketch/experimental/__init__.py for rationale).
36
+ if name in _DEPRECATED:
37
+ import warnings
38
+
39
+ warnings.warn(
40
+ "datasketch.experimental.aio.lsh is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
41
+ DeprecationWarning,
42
+ stacklevel=2,
43
+ )
44
+ import datasketch.aio.lsh as _new
45
+
46
+ value = getattr(_new, name)
47
+ globals()[name] = value
48
+ return value
49
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@@ -463,16 +463,24 @@ class MinHashLSH:
463
463
  list: a list of unique keys.
464
464
 
465
465
  """
466
- collected_result_sets = [
467
- set(collected_result_lists)
468
- for hashtable in self.hashtables
469
- for collected_result_lists in hashtable.collect_select_buffer()
466
+ collected_result_lists = [hashtable.collect_select_buffer() for hashtable in self.hashtables]
467
+ if not any(collected_result_lists):
468
+ return []
469
+
470
+ # Each buffered query contributes one result list per hashtable. We first
471
+ # union candidates across bands for each query, then intersect across the
472
+ # buffered queries to match repeated calls to `query()`.
473
+ per_query_result_sets = [
474
+ set().union(*query_result_lists)
475
+ for query_result_lists in zip(*collected_result_lists)
470
476
  ]
471
- if not collected_result_sets:
477
+ if not per_query_result_sets:
472
478
  return []
479
+
480
+ candidates = set.intersection(*per_query_result_sets)
473
481
  if self.prepickle:
474
- return [pickle.loads(key) for key in set.intersection(*collected_result_sets)]
475
- return list(set.intersection(*collected_result_sets))
482
+ return [pickle.loads(key) for key in candidates]
483
+ return list(candidates)
476
484
 
477
485
  def __contains__(self, key: Hashable) -> bool:
478
486
  """Args:
@@ -204,7 +204,7 @@ class MinHashLSHEnsemble:
204
204
  if not self.is_empty():
205
205
  raise ValueError("Cannot call index again on a non-empty index")
206
206
  if not isinstance(entries, list):
207
- queue = deque([])
207
+ queue = deque()
208
208
  for key, minhash, size in entries:
209
209
  if size <= 0:
210
210
  raise ValueError("Set size must be positive")
@@ -9,8 +9,8 @@ from datasketch.minhash import MinHash
9
9
  class MinHashLSHForest:
10
10
  """The LSH Forest for MinHash. It supports top-k query in Jaccard
11
11
  similarity.
12
- Instead of using prefix trees as the `original paper
13
- <http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf>`_,
12
+ Instead of using prefix trees as described in the original LSH Forest
13
+ paper by Bawa et al. (WWW 2005),
14
14
  I use a sorted array to store the hash values in every
15
15
  hash table.
16
16
 
@@ -37,7 +37,8 @@ class MinHashLSHForest:
37
37
  # Maximum depth of the prefix tree
38
38
  self.k = int(num_perm / l)
39
39
  self.hashtables = [defaultdict(list) for _ in range(self.l)]
40
- self.hashranges = [(i * self.k, (i + 1) * self.k) for i in range(self.l)]
40
+ self.hashranges = [(i * self.k, (i + 1) * self.k)
41
+ for i in range(self.l)]
41
42
  self.keys = dict()
42
43
  # This is the sorted array implementation for the prefix trees
43
44
  self.sorted_hashtables = [[] for _ in range(self.l)]
@@ -59,7 +60,8 @@ class MinHashLSHForest:
59
60
  raise ValueError("The num_perm of MinHash out of range")
60
61
  if key in self.keys:
61
62
  raise ValueError("The given key has already been added")
62
- self.keys[key] = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
63
+ self.keys[key] = [self._H(minhash.hashvalues[start:end])
64
+ for start, end in self.hashranges]
63
65
  for H, hashtable in zip(self.keys[key], self.hashtables):
64
66
  hashtable[H].append(key)
65
67
 
@@ -73,11 +75,13 @@ class MinHashLSHForest:
73
75
  if r > self.k or r <= 0 or b > self.l or b <= 0:
74
76
  raise ValueError("parameter outside range")
75
77
  # Generate prefixes of concatenated hash values
76
- hps = [self._H(minhash.hashvalues[start : start + r]) for start, _ in self.hashranges]
78
+ hps = [self._H(minhash.hashvalues[start: start + r])
79
+ for start, _ in self.hashranges]
77
80
  # Set the prefix length for look-ups in the sorted hash values list
78
81
  prefix_size = len(hps[0])
79
82
  for ht, hp, hashtable in zip(self.sorted_hashtables, hps, self.hashtables):
80
- i = self._binary_search(len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
83
+ i = self._binary_search(
84
+ len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
81
85
  if i < len(ht) and ht[i][:prefix_size] == hp:
82
86
  j = i
83
87
  while j < len(ht) and ht[j][:prefix_size] == hp:
@@ -137,14 +141,17 @@ class MinHashLSHForest:
137
141
  """
138
142
  byteslist = self.keys.get(key, None)
139
143
  if byteslist is None:
140
- raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
144
+ raise KeyError(
145
+ f"The provided key does not exist in the LSHForest: {key}")
141
146
  hashvalue_byte_size = len(byteslist[0]) // 8
142
- hashvalues = np.empty(len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
147
+ hashvalues = np.empty(
148
+ len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
143
149
  for index, item in enumerate(byteslist):
144
150
  # unswap the bytes, as their representation is flipped during storage
145
151
  hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
146
152
  curr_index = index * hashvalue_byte_size
147
- hashvalues[curr_index : curr_index + hashvalue_byte_size] = hv_segment
153
+ hashvalues[curr_index: curr_index +
154
+ hashvalue_byte_size] = hv_segment
148
155
  return hashvalues
149
156
 
150
157
  def _binary_search(self, n, func):
@@ -603,12 +603,14 @@ if cassandra is not None:
603
603
  del self._select_statements_and_parameters_with_decoders[:]
604
604
  statements_and_parameters, decoders = zip(*buffer)
605
605
 
606
- ret = collections.defaultdict(list)
607
606
  query_results = self._select(statements_and_parameters)
608
- for rows, (key_decoder, val_decoder) in zip(query_results, decoders):
607
+ ret = []
608
+ for rows, (_key_decoder, val_decoder) in zip(query_results, decoders):
609
+ values = []
609
610
  for row in rows:
610
- ret[key_decoder(row.key)].append((val_decoder(row.value), row.ts))
611
- return [[x[0] for x in sorted(v, key=operator.itemgetter(1))] for v in ret.values()]
611
+ values.append((val_decoder(row.value), row.ts))
612
+ ret.append([x[0] for x in sorted(values, key=operator.itemgetter(1))])
613
+ return ret
612
614
 
613
615
  def select(self, keys):
614
616
  """Select all values for the given keys.
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "datasketch"
7
- version = "1.9.0"
7
+ version = "1.10.0"
8
8
  description = "Probabilistic data structures for processing and searching very large datasets"
9
9
  readme = "README.rst"
10
10
  requires-python = ">=3.9"
@@ -37,7 +37,11 @@ benchmark = [
37
37
  "pandas>=0.25.3",
38
38
  "SetSimilaritySearch>=0.1.7",
39
39
  "pyfarmhash>=0.2.2",
40
- "nltk>=3.4.5",
40
+ "nltk>=3.4.5; python_version < '3.10'",
41
+ "nltk>=3.9.4; python_version >= '3.10'",
42
+ # Transitive deps of matplotlib listed to avoid dependabot uv.lock-only PRs.
43
+ "pillow>=12.2.0; python_version >= '3.10'",
44
+ "fonttools>=4.60.2",
41
45
  ]
42
46
  test = [
43
47
  "cassandra-driver>=3.20",
@@ -49,10 +53,17 @@ test = [
49
53
  "pymongo>=3.9.0",
50
54
  "nose>=1.3.7",
51
55
  "nose-exclude>=0.5.0",
52
- "pytest",
56
+ "pytest; python_version < '3.10'",
57
+ "pytest>=9.0.3; python_version >= '3.10'",
53
58
  "pytest-rerunfailures",
54
59
  "pytest-asyncio",
60
+ # Transitive dep of pytest listed to avoid dependabot uv.lock-only PRs.
61
+ "pygments>=2.20.0",
55
62
  ]
63
+ aio = ["aiounittest", "motor>3.6.0"]
64
+ # KEEP IN SYNC WITH `aio` ABOVE. Deprecated alias retained for backwards compat;
65
+ # PEP 621 does not support referencing one optional-dependency group from
66
+ # another, so the dependency list must be duplicated verbatim.
56
67
  experimental_aio = ["aiounittest", "motor>3.6.0"]
57
68
 
58
69
  [project.urls]
@@ -194,4 +205,4 @@ reportCallIssue = "none"
194
205
 
195
206
  [tool.coverage.run]
196
207
  source = ["datasketch"]
197
- omit = ["*/experimental/*", "*/tests/*", "*/test/*"]
208
+ omit = ["*/tests/*", "*/test/*"]
@@ -1,15 +0,0 @@
1
- """Warning.
2
-
3
- datasketch.experimental is dedicated to new modules that are to be merged into
4
- the stable interface of datasketch. So their interfaces may change in future
5
- versions.
6
-
7
- To add a new class or function, register it here in this file. For example:
8
-
9
- from new_module import NewModuleClass
10
-
11
- """
12
-
13
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
14
-
15
- __all__ = ["AsyncMinHashLSH"]
File without changes
File without changes
File without changes
File without changes