datasketch 1.9.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {datasketch-1.9.0 → datasketch-2.0.0}/PKG-INFO +22 -3
  2. {datasketch-1.9.0 → datasketch-2.0.0}/README.rst +11 -0
  3. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/__init__.py +4 -2
  4. datasketch-2.0.0/datasketch/aio/__init__.py +44 -0
  5. {datasketch-1.9.0/datasketch/experimental → datasketch-2.0.0/datasketch}/aio/lsh.py +65 -73
  6. {datasketch-1.9.0/datasketch/experimental → datasketch-2.0.0/datasketch}/aio/storage.py +44 -14
  7. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/b_bit_minhash.py +64 -11
  8. datasketch-2.0.0/datasketch/experimental/__init__.py +49 -0
  9. datasketch-2.0.0/datasketch/experimental/aio/__init__.py +50 -0
  10. datasketch-2.0.0/datasketch/experimental/aio/lsh.py +49 -0
  11. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hyperloglog.py +1 -1
  12. datasketch-2.0.0/datasketch/lean_minhash.py +359 -0
  13. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lsh.py +32 -8
  14. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lsh_bloom.py +18 -8
  15. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lshensemble.py +6 -1
  16. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lshforest.py +40 -13
  17. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/minhash.py +314 -56
  18. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/storage.py +6 -4
  19. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/weighted_minhash.py +2 -0
  20. {datasketch-1.9.0 → datasketch-2.0.0}/pyproject.toml +15 -4
  21. datasketch-1.9.0/datasketch/experimental/__init__.py +0 -15
  22. datasketch-1.9.0/datasketch/experimental/aio/__init__.py +0 -0
  23. datasketch-1.9.0/datasketch/lean_minhash.py +0 -253
  24. {datasketch-1.9.0 → datasketch-2.0.0}/.gitignore +0 -0
  25. {datasketch-1.9.0 → datasketch-2.0.0}/LICENSE +0 -0
  26. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hashfunc.py +0 -0
  27. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hnsw.py +0 -0
  28. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hyperloglog_const.py +0 -0
  29. {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lshensemble_partition.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.9.0
3
+ Version: 2.0.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
6
  Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -23,10 +23,16 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
23
23
  Requires-Python: >=3.9
24
24
  Requires-Dist: numpy>=1.11
25
25
  Requires-Dist: scipy>=1.0.0
26
+ Provides-Extra: aio
27
+ Requires-Dist: aiounittest; extra == 'aio'
28
+ Requires-Dist: motor>3.6.0; extra == 'aio'
26
29
  Provides-Extra: benchmark
30
+ Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
27
31
  Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
28
- Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
32
+ Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
33
+ Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
29
34
  Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
35
+ Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
30
36
  Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
31
37
  Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
32
38
  Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
@@ -48,11 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
48
54
  Requires-Dist: mockredispy; extra == 'test'
49
55
  Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
50
56
  Requires-Dist: nose>=1.3.7; extra == 'test'
57
+ Requires-Dist: pygments>=2.20.0; extra == 'test'
51
58
  Requires-Dist: pymongo>=3.9.0; extra == 'test'
52
- Requires-Dist: pytest; extra == 'test'
53
59
  Requires-Dist: pytest-asyncio; extra == 'test'
54
60
  Requires-Dist: pytest-cov; extra == 'test'
55
61
  Requires-Dist: pytest-rerunfailures; extra == 'test'
62
+ Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
63
+ Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
56
64
  Requires-Dist: redis>=2.10.0; extra == 'test'
57
65
  Description-Content-Type: text/x-rst
58
66
 
@@ -72,6 +80,17 @@ datasketch gives you probabilistic data structures that can process and
72
80
  search very large amounts of data super fast, with little loss of
73
81
  accuracy.
74
82
 
83
+ .. note::
84
+ **Version 2.0.0** changes the default MinHash permutation scheme to
85
+ ``"affine32"``, which fixes a similarity over-estimation bias on large
86
+ sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
87
+ halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
88
+ ``"affine64"`` scheme is available for billion-scale sets. Hash values
89
+ differ from earlier versions: rebuild persisted sketches and LSH
90
+ indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
91
+ existing data. See the `MinHash documentation
92
+ <https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
93
+
75
94
  This package contains the following data sketches:
76
95
 
77
96
  +-------------------------+-----------------------------------------------+
@@ -14,6 +14,17 @@ datasketch gives you probabilistic data structures that can process and
14
14
  search very large amounts of data super fast, with little loss of
15
15
  accuracy.
16
16
 
17
+ .. note::
18
+ **Version 2.0.0** changes the default MinHash permutation scheme to
19
+ ``"affine32"``, which fixes a similarity over-estimation bias on large
20
+ sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
21
+ halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
22
+ ``"affine64"`` scheme is available for billion-scale sets. Hash values
23
+ differ from earlier versions: rebuild persisted sketches and LSH
24
+ indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
25
+ existing data. See the `MinHash documentation
26
+ <https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
27
+
17
28
  This package contains the following data sketches:
18
29
 
19
30
  +-------------------------+-----------------------------------------------+
@@ -7,8 +7,9 @@ except importlib.metadata.PackageNotFoundError:
7
7
  _version = "0.0.0" # Fallback for development mode
8
8
  __version__: Final[str] = _version
9
9
 
10
+ from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
10
11
  from datasketch.b_bit_minhash import bBitMinHash
11
- from datasketch.hashfunc import sha1_hash32
12
+ from datasketch.hashfunc import sha1_hash32, sha1_hash64
12
13
  from datasketch.hnsw import HNSW
13
14
  from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
14
15
  from datasketch.lean_minhash import LeanMinHash
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
23
24
  WeightedMinHashLSH = MinHashLSH
24
25
  WeightedMinHashLSHForest = MinHashLSHForest
25
26
 
26
-
27
27
  __all__ = [
28
28
  "HNSW",
29
+ "AsyncMinHashLSH",
29
30
  "HyperLogLog",
30
31
  "HyperLogLogPlusPlus",
31
32
  "LeanMinHash",
@@ -40,4 +41,5 @@ __all__ = [
40
41
  "WeightedMinHashLSHForest",
41
42
  "bBitMinHash",
42
43
  "sha1_hash32",
44
+ "sha1_hash64",
43
45
  ]
@@ -0,0 +1,44 @@
1
+ """Async MinHash LSH module.
2
+
3
+ This module provides asynchronous implementations of MinHash LSH for use with
4
+ async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
5
+
6
+ Example:
7
+ .. code-block:: python
8
+
9
+ import asyncio
10
+
11
+ from datasketch.aio import AsyncMinHashLSH
12
+ from datasketch import MinHash
13
+
14
+
15
+ async def main():
16
+ # prepickle=True lets you use non-bytes keys (e.g. str). With the
17
+ # default prepickle=False, keys passed to insert() must be bytes.
18
+ async with AsyncMinHashLSH(
19
+ storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
20
+ threshold=0.5,
21
+ num_perm=128,
22
+ prepickle=True,
23
+ ) as lsh:
24
+ m = MinHash(num_perm=128)
25
+ m.update(b"data")
26
+ await lsh.insert("key", m)
27
+ result = await lsh.query(m)
28
+
29
+
30
+ asyncio.run(main())
31
+
32
+ """
33
+
34
+ from datasketch.aio.lsh import (
35
+ AsyncMinHashLSH,
36
+ AsyncMinHashLSHDeleteSession,
37
+ AsyncMinHashLSHInsertionSession,
38
+ )
39
+
40
+ __all__ = [
41
+ "AsyncMinHashLSH",
42
+ "AsyncMinHashLSHDeleteSession",
43
+ "AsyncMinHashLSHInsertionSession",
44
+ ]
@@ -1,13 +1,20 @@
1
+ """Asynchronous MinHash LSH implementation.
2
+
3
+ This module provides AsyncMinHashLSH for use with async storage backends
4
+ like MongoDB (via motor) and Redis (via redis.asyncio).
5
+ """
6
+
1
7
  import asyncio
2
8
  import pickle
3
9
  from itertools import chain
4
10
  from typing import Optional
5
11
 
6
- from datasketch.experimental.aio.storage import (
12
+ from datasketch.aio.storage import (
7
13
  async_ordered_storage,
8
14
  async_unordered_storage,
9
15
  )
10
16
  from datasketch.lsh import _optimal_param
17
+ from datasketch.minhash import _check_scheme_consistency
11
18
  from datasketch.storage import _random_name, unordered_storage
12
19
 
13
20
 
@@ -34,8 +41,6 @@ class AsyncMinHashLSH:
34
41
  MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
35
42
 
36
43
  .. note::
37
- * The module supports Python version >=3.6, and is currently experimental.
38
- So the interface may change slightly in the future.
39
44
  * For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
40
45
  * For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
41
46
  """
@@ -84,6 +89,10 @@ class AsyncMinHashLSH:
84
89
  self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
85
90
  self.hashtables = None
86
91
  self.keys = None
92
+ # The permutation scheme of the indexed MinHash, learned from the
93
+ # first insert. Note that an index attached to pre-existing external
94
+ # storage re-learns the scheme on its first insert.
95
+ self._minhash_scheme: Optional[str] = None
87
96
 
88
97
  self._lock = asyncio.Lock()
89
98
  self._initialized = False
@@ -129,7 +138,7 @@ class AsyncMinHashLSH:
129
138
  if self.keys is not None:
130
139
  self.keys.batch_size = value
131
140
  else:
132
- raise AttributeError("AsyncMinHash is not initialized.")
141
+ raise AttributeError("AsyncMinHashLSH is not initialized.")
133
142
 
134
143
  for t in self.hashtables:
135
144
  t.batch_size = value
@@ -163,12 +172,6 @@ class AsyncMinHashLSH:
163
172
  if self.keys is None:
164
173
  await self._create_storages()
165
174
 
166
- if not self.keys.initialized:
167
- await self.keys
168
-
169
- fs = (ht for ht in self.hashtables if not ht.initialized)
170
- await asyncio.gather(*fs)
171
-
172
175
  async def close(self):
173
176
  """Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
174
177
  async with self._lock:
@@ -189,41 +192,26 @@ class AsyncMinHashLSH:
189
192
 
190
193
  :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
191
194
 
192
- :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
195
+ :return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
193
196
 
194
197
  Example:
195
198
  .. code-block:: python
196
199
 
197
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
200
+ import asyncio
201
+ from datasketch.aio import AsyncMinHashLSH
198
202
  from datasketch import MinHash
199
203
 
200
-
201
- def chunk(it, size):
202
- it = iter(it)
203
- return iter(lambda: tuple(islice(it, size)), ())
204
-
205
-
206
- _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
207
- seq = frozenset(
208
- chain(
209
- ("".join(s) for s in _chunked_str),
210
- ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
211
- )
212
- )
213
- objs = [MinHash(16) for _ in range(len(seq))]
214
- for e, obj in zip(seq, objs):
215
- for i in e:
216
- obj.update(i.encode("utf-8"))
217
- data = [(e, m) for e, m in zip(seq, objs)]
218
-
219
- _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
220
-
221
-
222
- async def func():
223
- async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
204
+ async def main():
205
+ storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
206
+ async with AsyncMinHashLSH(
207
+ storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
208
+ ) as lsh:
224
209
  async with lsh.insertion_session(batch_size=1000) as session:
225
- fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
226
- await asyncio.gather(*fs)
210
+ m = MinHash(num_perm=16)
211
+ m.update(b"data")
212
+ await session.insert("key", m)
213
+
214
+ asyncio.run(main())
227
215
 
228
216
  """
229
217
  return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
@@ -232,47 +220,32 @@ class AsyncMinHashLSH:
232
220
  """Create an asynchronous context manager for fast removal of keys
233
221
  from index.
234
222
 
235
- :param int batch_size: the size of chunks to use in insert_session mode (default=10000).
223
+ :param int batch_size: the size of chunks to use in delete_session mode (default=10000).
236
224
 
237
- :return: datasketch.experimental.aio.lsh.AsyncMinHashLSHSession
225
+ :return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
238
226
 
239
227
  Example:
240
228
  .. code-block:: python
241
229
 
242
- from datasketch.experimental.aio.lsh import AsyncMinHashLSH
230
+ import asyncio
231
+ from datasketch.aio import AsyncMinHashLSH
243
232
  from datasketch import MinHash
244
233
 
234
+ async def main():
235
+ storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
236
+ async with AsyncMinHashLSH(
237
+ storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
238
+ ) as lsh:
239
+ # Insert some data first
240
+ m = MinHash(num_perm=16)
241
+ m.update(b"data")
242
+ await lsh.insert("key1", m)
245
243
 
246
- def chunk(it, size):
247
- it = iter(it)
248
- return iter(lambda: tuple(islice(it, size)), ())
244
+ # Delete using session
245
+ async with lsh.delete_session(batch_size=100) as session:
246
+ await session.remove("key1")
249
247
 
250
-
251
- _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
252
- seq = frozenset(
253
- chain(
254
- ("".join(s) for s in _chunked_str),
255
- ("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
256
- )
257
- )
258
- objs = [MinHash(16) for _ in range(len(seq))]
259
- for e, obj in zip(seq, objs):
260
- for i in e:
261
- obj.update(i.encode("utf-8"))
262
- data = [(e, m) for e, m in zip(seq, objs)]
263
-
264
- _storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
265
-
266
-
267
- async def func():
268
- async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
269
- async with lsh.insertion_session(batch_size=1000) as session:
270
- fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
271
- await asyncio.gather(*fs)
272
-
273
- async with lsh.delete_session(batch_size=3) as session:
274
- fs = (session.remove(key) for key in keys_to_remove)
275
- await asyncio.gather(*fs)
248
+ asyncio.run(main())
276
249
 
277
250
  """
278
251
  return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
@@ -280,6 +253,7 @@ class AsyncMinHashLSH:
280
253
  async def _insert(self, key, minhash, check_duplication=True, buffer=False):
281
254
  if len(minhash) != self.h:
282
255
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
256
+ self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
283
257
  if self._require_bytes_keys and not isinstance(key, bytes):
284
258
  raise TypeError(
285
259
  f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
@@ -288,7 +262,9 @@ class AsyncMinHashLSH:
288
262
  if self.prepickle:
289
263
  key = pickle.dumps(key)
290
264
 
291
- if check_duplication and await self.has_key(key):
265
+ # `key` is already pickled at this point under prepickle=True; call the
266
+ # storage primitive directly so we don't re-pickle through has_key().
267
+ if check_duplication and await self.keys.has_key(key):
292
268
  raise ValueError("The given key already exists")
293
269
  Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
294
270
 
@@ -302,6 +278,7 @@ class AsyncMinHashLSH:
302
278
  """See :class:`datasketch.MinHashLSH`."""
303
279
  if len(minhash) != self.h:
304
280
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
281
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
305
282
 
306
283
  fs = (
307
284
  hashtable.get(self._H(minhash.hashvalues[start:end]))
@@ -314,6 +291,8 @@ class AsyncMinHashLSH:
314
291
 
315
292
  async def has_key(self, key):
316
293
  """See :class:`datasketch.MinHashLSH`."""
294
+ if self.prepickle:
295
+ key = pickle.dumps(key)
317
296
  return await self.keys.has_key(key)
318
297
 
319
298
  async def remove(self, key):
@@ -321,7 +300,12 @@ class AsyncMinHashLSH:
321
300
  await self._remove(key, buffer=False)
322
301
 
323
302
  async def _remove(self, key, buffer=False):
324
- if not await self.has_key(key):
303
+ if self.prepickle:
304
+ key = pickle.dumps(key)
305
+
306
+ # `key` is already pickled here; call storage primitives directly so
307
+ # the existence check, lookup, and deletes all use the stored form.
308
+ if not await self.keys.has_key(key):
325
309
  raise ValueError("The given key does not exist")
326
310
 
327
311
  for H, hashtable in zip(await self.keys.get(key), self.hashtables):
@@ -345,6 +329,7 @@ class AsyncMinHashLSH:
345
329
  async def _query_b(self, minhash, b):
346
330
  if len(minhash) != self.h:
347
331
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
332
+ _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
348
333
  if b > len(self.hashtables):
349
334
  raise ValueError("b must be less or equal to the number of hash tables")
350
335
  fs = []
@@ -352,7 +337,10 @@ class AsyncMinHashLSH:
352
337
  H = self._H(minhash.hashvalues[start:end])
353
338
  if await hashtable.has_key(H):
354
339
  fs.append(hashtable.get(H))
355
- return set(chain.from_iterable(await asyncio.gather(*fs))) # candidates
340
+ candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
341
+ if self.prepickle:
342
+ return {pickle.loads(key) for key in candidates}
343
+ return candidates
356
344
 
357
345
  async def get_counts(self):
358
346
  """See :class:`datasketch.MinHashLSH`."""
@@ -361,6 +349,10 @@ class AsyncMinHashLSH:
361
349
 
362
350
  async def get_subset_counts(self, *keys):
363
351
  """See :class:`datasketch.MinHashLSH`."""
352
+ # Keys in storage are pickled when prepickle is enabled, so we have to
353
+ # pickle the query keys to match the stored representation.
354
+ if self.prepickle:
355
+ keys = tuple(pickle.dumps(key) for key in keys)
364
356
  key_set = list(set(keys))
365
357
  hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
366
358
  Hss = await self.keys.getmany(*key_set)
@@ -1,9 +1,23 @@
1
+ """Async storage backends for MinHash LSH.
2
+
3
+ This module provides async storage implementations for use with AsyncMinHashLSH:
4
+ - AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
5
+ - AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
6
+ """
7
+
1
8
  import asyncio
2
9
  import os
3
10
  from abc import ABCMeta
4
11
  from itertools import chain
5
12
 
6
- from datasketch.storage import OrderedStorage, RedisStorage, Storage, UnorderedStorage, _random_name
13
+ from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
14
+
15
+ # RedisStorage is only available when the redis package is installed (optional dependency).
16
+ # Import it conditionally to avoid an ImportError when redis is not installed.
17
+ try:
18
+ from datasketch.storage import RedisStorage
19
+ except ImportError:
20
+ RedisStorage = None
7
21
 
8
22
  ABC = ABCMeta("ABC", (object,), {})
9
23
 
@@ -24,6 +38,12 @@ except ImportError:
24
38
  redis = None
25
39
 
26
40
 
41
+ __all__ = [
42
+ "async_ordered_storage",
43
+ "async_unordered_storage",
44
+ ]
45
+
46
+
27
47
  async def async_ordered_storage(config, name=None):
28
48
  tp = config["type"]
29
49
  if tp == "aiomongo":
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
55
75
  class AsyncMongoBuffer:
56
76
  def __init__(self, aio_mongo_collection, batch_size):
57
77
  self._batch_size = batch_size
58
- self._insert_documents_stack = tuple()
59
- self._delete_by_key_documents_stack = tuple()
60
- self._delete_by_val_documents_stack = tuple()
78
+ self._insert_documents_stack = []
79
+ self._delete_by_key_documents_stack = []
80
+ self._delete_by_val_documents_stack = []
61
81
  self._mongo_coll = aio_mongo_collection
62
82
 
63
83
  @property
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
73
93
  if command == "insert":
74
94
  if len(self._insert_documents_stack) >= self.batch_size:
75
95
  await self.execute(command)
76
- self._insert_documents_stack += (kwargs["obj"],)
96
+ self._insert_documents_stack.append(kwargs["obj"])
77
97
  elif command == "delete_by_key":
78
98
  if len(self._delete_by_key_documents_stack) >= self.batch_size:
79
99
  await self.execute(command)
80
- self._delete_by_key_documents_stack += (kwargs["key"],)
100
+ self._delete_by_key_documents_stack.append(kwargs["key"])
81
101
  elif command == "delete_by_val":
82
102
  if len(self._delete_by_val_documents_stack) >= self.batch_size:
83
103
  await self.execute(command)
84
- self._delete_by_val_documents_stack += (kwargs["val"],)
104
+ self._delete_by_val_documents_stack.append(kwargs["val"])
85
105
 
86
106
  async def execute(self, command):
87
107
  if command == "insert" and self._insert_documents_stack:
88
108
  buffer = self._insert_documents_stack
89
- self._insert_documents_stack = tuple()
109
+ self._insert_documents_stack = []
90
110
  await self._mongo_coll.insert_many(buffer, ordered=False)
91
111
  elif command == "delete_by_key" and self._delete_by_key_documents_stack:
92
112
  buffer = self._delete_by_key_documents_stack
93
- self._delete_by_key_documents_stack = tuple()
113
+ self._delete_by_key_documents_stack = []
94
114
  await self._mongo_coll.delete_many({"key": {"$in": buffer}})
95
115
  elif command == "delete_by_val" and self._delete_by_val_documents_stack:
96
116
  buffer = self._delete_by_val_documents_stack
97
- self._delete_by_val_documents_stack = tuple()
117
+ self._delete_by_val_documents_stack = []
98
118
  await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
99
119
 
100
120
  async def insert_one(self, **kwargs):
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
254
274
  async def has_key(self, key):
255
275
  return bool(await self._collection.find_one({"key": key}))
256
276
 
277
+ async def getmany(self, *keys):
278
+ return await asyncio.gather(*(self.get(key) for key in keys))
279
+
257
280
  async def status(self):
258
281
  status = self._parse_config(self.config["mongo"])
259
282
  status.update({"keyspace_size": await self.size()})
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
285
308
  await self._collection.find_one_and_delete({"key": key, "vals": val})
286
309
 
287
310
 
288
- if redis is not None:
311
+ # Redis-based async storage classes are only defined when both the redis package
312
+ # and RedisStorage are available (optional dependencies).
313
+ if redis is not None and RedisStorage is not None:
289
314
 
290
315
  class AsyncRedisBuffer(redis.client.Pipeline):
291
316
  def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
@@ -304,7 +329,7 @@ if redis is not None:
304
329
 
305
330
  async def execute_command(self, *args, **kwargs):
306
331
  if len(self.command_stack) >= self._buffer_size:
307
- self.execute()
332
+ await self.execute()
308
333
  await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
309
334
 
310
335
  class AsyncRedisStorage(RedisStorage):
@@ -347,9 +372,8 @@ if redis is not None:
347
372
 
348
373
  async def getmany(self, *keys):
349
374
  pipe = self._redis.pipeline()
350
- pipe.multi()
351
375
  for key in keys:
352
- await self._get_items(pipe, self.redis_key(key))
376
+ pipe.lrange(self.redis_key(key), 0, -1)
353
377
  return await pipe.execute()
354
378
 
355
379
  @staticmethod
@@ -422,6 +446,12 @@ if redis is not None:
422
446
  async def _get_items(r, k):
423
447
  return await r.smembers(k)
424
448
 
449
+ async def getmany(self, *keys):
450
+ pipe = self._redis.pipeline()
451
+ for key in keys:
452
+ pipe.smembers(self.redis_key(key))
453
+ return await pipe.execute()
454
+
425
455
  async def remove_val(self, key, val, **kwargs):
426
456
  buffer = kwargs.pop("buffer", False)
427
457
  redis_key = self.redis_key(key)