datasketch 1.9.0__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.9.0 → datasketch-2.0.0}/PKG-INFO +22 -3
- {datasketch-1.9.0 → datasketch-2.0.0}/README.rst +11 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/__init__.py +4 -2
- datasketch-2.0.0/datasketch/aio/__init__.py +44 -0
- {datasketch-1.9.0/datasketch/experimental → datasketch-2.0.0/datasketch}/aio/lsh.py +65 -73
- {datasketch-1.9.0/datasketch/experimental → datasketch-2.0.0/datasketch}/aio/storage.py +44 -14
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/b_bit_minhash.py +64 -11
- datasketch-2.0.0/datasketch/experimental/__init__.py +49 -0
- datasketch-2.0.0/datasketch/experimental/aio/__init__.py +50 -0
- datasketch-2.0.0/datasketch/experimental/aio/lsh.py +49 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hyperloglog.py +1 -1
- datasketch-2.0.0/datasketch/lean_minhash.py +359 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lsh.py +32 -8
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lsh_bloom.py +18 -8
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lshensemble.py +6 -1
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lshforest.py +40 -13
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/minhash.py +314 -56
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/storage.py +6 -4
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/weighted_minhash.py +2 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/pyproject.toml +15 -4
- datasketch-1.9.0/datasketch/experimental/__init__.py +0 -15
- datasketch-1.9.0/datasketch/experimental/aio/__init__.py +0 -0
- datasketch-1.9.0/datasketch/lean_minhash.py +0 -253
- {datasketch-1.9.0 → datasketch-2.0.0}/.gitignore +0 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/LICENSE +0 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.9.0 → datasketch-2.0.0}/datasketch/lshensemble_partition.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.9.0
|
|
3
|
+
Version: 2.0.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
@@ -23,10 +23,16 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
23
23
|
Requires-Python: >=3.9
|
|
24
24
|
Requires-Dist: numpy>=1.11
|
|
25
25
|
Requires-Dist: scipy>=1.0.0
|
|
26
|
+
Provides-Extra: aio
|
|
27
|
+
Requires-Dist: aiounittest; extra == 'aio'
|
|
28
|
+
Requires-Dist: motor>3.6.0; extra == 'aio'
|
|
26
29
|
Provides-Extra: benchmark
|
|
30
|
+
Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
|
|
27
31
|
Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
|
|
28
|
-
Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
|
|
32
|
+
Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
|
|
33
|
+
Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
|
|
29
34
|
Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
|
|
35
|
+
Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
|
|
30
36
|
Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
|
|
31
37
|
Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
|
|
32
38
|
Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
|
|
@@ -48,11 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
|
|
|
48
54
|
Requires-Dist: mockredispy; extra == 'test'
|
|
49
55
|
Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
50
56
|
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
57
|
+
Requires-Dist: pygments>=2.20.0; extra == 'test'
|
|
51
58
|
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
52
|
-
Requires-Dist: pytest; extra == 'test'
|
|
53
59
|
Requires-Dist: pytest-asyncio; extra == 'test'
|
|
54
60
|
Requires-Dist: pytest-cov; extra == 'test'
|
|
55
61
|
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
62
|
+
Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
|
|
63
|
+
Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
|
|
56
64
|
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
57
65
|
Description-Content-Type: text/x-rst
|
|
58
66
|
|
|
@@ -72,6 +80,17 @@ datasketch gives you probabilistic data structures that can process and
|
|
|
72
80
|
search very large amount of data super fast, with little loss of
|
|
73
81
|
accuracy.
|
|
74
82
|
|
|
83
|
+
.. note::
|
|
84
|
+
**Version 2.0.0** changes the default MinHash permutation scheme to
|
|
85
|
+
``"affine32"``, which fixes a similarity over-estimation bias on large
|
|
86
|
+
sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
|
|
87
|
+
halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
|
|
88
|
+
``"affine64"`` scheme is available for billion-scale sets. Hash values
|
|
89
|
+
differ from earlier versions: rebuild persisted sketches and LSH
|
|
90
|
+
indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
|
|
91
|
+
existing data. See the `MinHash documentation
|
|
92
|
+
<https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
|
|
93
|
+
|
|
75
94
|
This package contains the following data sketches:
|
|
76
95
|
|
|
77
96
|
+-------------------------+-----------------------------------------------+
|
|
@@ -14,6 +14,17 @@ datasketch gives you probabilistic data structures that can process and
|
|
|
14
14
|
search very large amount of data super fast, with little loss of
|
|
15
15
|
accuracy.
|
|
16
16
|
|
|
17
|
+
.. note::
|
|
18
|
+
**Version 2.0.0** changes the default MinHash permutation scheme to
|
|
19
|
+
``"affine32"``, which fixes a similarity over-estimation bias on large
|
|
20
|
+
sets (`issue #212 <https://github.com/ekzhu/datasketch/issues/212>`__),
|
|
21
|
+
halves sketch memory, and speeds up updates by roughly 4x. A 64-bit
|
|
22
|
+
``"affine64"`` scheme is available for billion-scale sets. Hash values
|
|
23
|
+
differ from earlier versions: rebuild persisted sketches and LSH
|
|
24
|
+
indexes, or pass ``MinHash(..., scheme="legacy")`` to interoperate with
|
|
25
|
+
existing data. See the `MinHash documentation
|
|
26
|
+
<https://ekzhu.github.io/datasketch/minhash.html>`__ for details.
|
|
27
|
+
|
|
17
28
|
This package contains the following data sketches:
|
|
18
29
|
|
|
19
30
|
+-------------------------+-----------------------------------------------+
|
|
@@ -7,8 +7,9 @@ except importlib.metadata.PackageNotFoundError:
|
|
|
7
7
|
_version = "0.0.0" # Fallback for development mode
|
|
8
8
|
__version__: Final[str] = _version
|
|
9
9
|
|
|
10
|
+
from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
|
|
10
11
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
11
|
-
from datasketch.hashfunc import sha1_hash32
|
|
12
|
+
from datasketch.hashfunc import sha1_hash32, sha1_hash64
|
|
12
13
|
from datasketch.hnsw import HNSW
|
|
13
14
|
from datasketch.hyperloglog import HyperLogLog, HyperLogLogPlusPlus
|
|
14
15
|
from datasketch.lean_minhash import LeanMinHash
|
|
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
|
|
|
23
24
|
WeightedMinHashLSH = MinHashLSH
|
|
24
25
|
WeightedMinHashLSHForest = MinHashLSHForest
|
|
25
26
|
|
|
26
|
-
|
|
27
27
|
__all__ = [
|
|
28
28
|
"HNSW",
|
|
29
|
+
"AsyncMinHashLSH",
|
|
29
30
|
"HyperLogLog",
|
|
30
31
|
"HyperLogLogPlusPlus",
|
|
31
32
|
"LeanMinHash",
|
|
@@ -40,4 +41,5 @@ __all__ = [
|
|
|
40
41
|
"WeightedMinHashLSHForest",
|
|
41
42
|
"bBitMinHash",
|
|
42
43
|
"sha1_hash32",
|
|
44
|
+
"sha1_hash64",
|
|
43
45
|
]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Async MinHash LSH module.
|
|
2
|
+
|
|
3
|
+
This module provides asynchronous implementations of MinHash LSH for use with
|
|
4
|
+
async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
|
|
5
|
+
|
|
6
|
+
Example:
|
|
7
|
+
.. code-block:: python
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
|
|
11
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
12
|
+
from datasketch import MinHash
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def main():
|
|
16
|
+
# prepickle=True lets you use non-bytes keys (e.g. str). With the
|
|
17
|
+
# default prepickle=False, keys passed to insert() must be bytes.
|
|
18
|
+
async with AsyncMinHashLSH(
|
|
19
|
+
storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
|
|
20
|
+
threshold=0.5,
|
|
21
|
+
num_perm=128,
|
|
22
|
+
prepickle=True,
|
|
23
|
+
) as lsh:
|
|
24
|
+
m = MinHash(num_perm=128)
|
|
25
|
+
m.update(b"data")
|
|
26
|
+
await lsh.insert("key", m)
|
|
27
|
+
result = await lsh.query(m)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
asyncio.run(main())
|
|
31
|
+
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from datasketch.aio.lsh import (
|
|
35
|
+
AsyncMinHashLSH,
|
|
36
|
+
AsyncMinHashLSHDeleteSession,
|
|
37
|
+
AsyncMinHashLSHInsertionSession,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"AsyncMinHashLSH",
|
|
42
|
+
"AsyncMinHashLSHDeleteSession",
|
|
43
|
+
"AsyncMinHashLSHInsertionSession",
|
|
44
|
+
]
|
|
@@ -1,13 +1,20 @@
|
|
|
1
|
+
"""Asynchronous MinHash LSH implementation.
|
|
2
|
+
|
|
3
|
+
This module provides AsyncMinHashLSH for use with async storage backends
|
|
4
|
+
like MongoDB (via motor) and Redis (via redis.asyncio).
|
|
5
|
+
"""
|
|
6
|
+
|
|
1
7
|
import asyncio
|
|
2
8
|
import pickle
|
|
3
9
|
from itertools import chain
|
|
4
10
|
from typing import Optional
|
|
5
11
|
|
|
6
|
-
from datasketch.experimental.aio.storage import (
|
|
12
|
+
from datasketch.aio.storage import (
|
|
7
13
|
async_ordered_storage,
|
|
8
14
|
async_unordered_storage,
|
|
9
15
|
)
|
|
10
16
|
from datasketch.lsh import _optimal_param
|
|
17
|
+
from datasketch.minhash import _check_scheme_consistency
|
|
11
18
|
from datasketch.storage import _random_name, unordered_storage
|
|
12
19
|
|
|
13
20
|
|
|
@@ -34,8 +41,6 @@ class AsyncMinHashLSH:
|
|
|
34
41
|
MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
|
|
35
42
|
|
|
36
43
|
.. note::
|
|
37
|
-
* The module supports Python version >=3.6, and is currently experimental.
|
|
38
|
-
So the interface may change slightly in the future.
|
|
39
44
|
* For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
|
|
40
45
|
* For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
|
|
41
46
|
"""
|
|
@@ -84,6 +89,10 @@ class AsyncMinHashLSH:
|
|
|
84
89
|
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
85
90
|
self.hashtables = None
|
|
86
91
|
self.keys = None
|
|
92
|
+
# The permutation scheme of the indexed MinHash, learned from the
|
|
93
|
+
# first insert. Note that an index attached to pre-existing external
|
|
94
|
+
# storage re-learns the scheme on its first insert.
|
|
95
|
+
self._minhash_scheme: Optional[str] = None
|
|
87
96
|
|
|
88
97
|
self._lock = asyncio.Lock()
|
|
89
98
|
self._initialized = False
|
|
@@ -129,7 +138,7 @@ class AsyncMinHashLSH:
|
|
|
129
138
|
if self.keys is not None:
|
|
130
139
|
self.keys.batch_size = value
|
|
131
140
|
else:
|
|
132
|
-
raise AttributeError("
|
|
141
|
+
raise AttributeError("AsyncMinHashLSH is not initialized.")
|
|
133
142
|
|
|
134
143
|
for t in self.hashtables:
|
|
135
144
|
t.batch_size = value
|
|
@@ -163,12 +172,6 @@ class AsyncMinHashLSH:
|
|
|
163
172
|
if self.keys is None:
|
|
164
173
|
await self._create_storages()
|
|
165
174
|
|
|
166
|
-
if not self.keys.initialized:
|
|
167
|
-
await self.keys
|
|
168
|
-
|
|
169
|
-
fs = (ht for ht in self.hashtables if not ht.initialized)
|
|
170
|
-
await asyncio.gather(*fs)
|
|
171
|
-
|
|
172
175
|
async def close(self):
|
|
173
176
|
"""Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
|
|
174
177
|
async with self._lock:
|
|
@@ -189,41 +192,26 @@ class AsyncMinHashLSH:
|
|
|
189
192
|
|
|
190
193
|
:param int batch_size: the size of chunks to use in insert_session mode (default=10000).
|
|
191
194
|
|
|
192
|
-
:return: datasketch.
|
|
195
|
+
:return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
|
|
193
196
|
|
|
194
197
|
Example:
|
|
195
198
|
.. code-block:: python
|
|
196
199
|
|
|
197
|
-
|
|
200
|
+
import asyncio
|
|
201
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
198
202
|
from datasketch import MinHash
|
|
199
203
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
_chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
|
|
207
|
-
seq = frozenset(
|
|
208
|
-
chain(
|
|
209
|
-
("".join(s) for s in _chunked_str),
|
|
210
|
-
("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
|
|
211
|
-
)
|
|
212
|
-
)
|
|
213
|
-
objs = [MinHash(16) for _ in range(len(seq))]
|
|
214
|
-
for e, obj in zip(seq, objs):
|
|
215
|
-
for i in e:
|
|
216
|
-
obj.update(i.encode("utf-8"))
|
|
217
|
-
data = [(e, m) for e, m in zip(seq, objs)]
|
|
218
|
-
|
|
219
|
-
_storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
async def func():
|
|
223
|
-
async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
|
|
204
|
+
async def main():
|
|
205
|
+
storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
206
|
+
async with AsyncMinHashLSH(
|
|
207
|
+
storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
|
|
208
|
+
) as lsh:
|
|
224
209
|
async with lsh.insertion_session(batch_size=1000) as session:
|
|
225
|
-
|
|
226
|
-
|
|
210
|
+
m = MinHash(num_perm=16)
|
|
211
|
+
m.update(b"data")
|
|
212
|
+
await session.insert("key", m)
|
|
213
|
+
|
|
214
|
+
asyncio.run(main())
|
|
227
215
|
|
|
228
216
|
"""
|
|
229
217
|
return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
|
|
@@ -232,47 +220,32 @@ class AsyncMinHashLSH:
|
|
|
232
220
|
"""Create a asynchronous context manager for fast removal of keys
|
|
233
221
|
from index.
|
|
234
222
|
|
|
235
|
-
:param int batch_size: the size of chunks to use in
|
|
223
|
+
:param int batch_size: the size of chunks to use in delete_session mode (default=10000).
|
|
236
224
|
|
|
237
|
-
:return: datasketch.
|
|
225
|
+
:return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
|
|
238
226
|
|
|
239
227
|
Example:
|
|
240
228
|
.. code-block:: python
|
|
241
229
|
|
|
242
|
-
|
|
230
|
+
import asyncio
|
|
231
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
243
232
|
from datasketch import MinHash
|
|
244
233
|
|
|
234
|
+
async def main():
|
|
235
|
+
storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
236
|
+
async with AsyncMinHashLSH(
|
|
237
|
+
storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
|
|
238
|
+
) as lsh:
|
|
239
|
+
# Insert some data first
|
|
240
|
+
m = MinHash(num_perm=16)
|
|
241
|
+
m.update(b"data")
|
|
242
|
+
await lsh.insert("key1", m)
|
|
245
243
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
244
|
+
# Delete using session
|
|
245
|
+
async with lsh.delete_session(batch_size=100) as session:
|
|
246
|
+
await session.remove("key1")
|
|
249
247
|
|
|
250
|
-
|
|
251
|
-
_chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
|
|
252
|
-
seq = frozenset(
|
|
253
|
-
chain(
|
|
254
|
-
("".join(s) for s in _chunked_str),
|
|
255
|
-
("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
|
|
256
|
-
)
|
|
257
|
-
)
|
|
258
|
-
objs = [MinHash(16) for _ in range(len(seq))]
|
|
259
|
-
for e, obj in zip(seq, objs):
|
|
260
|
-
for i in e:
|
|
261
|
-
obj.update(i.encode("utf-8"))
|
|
262
|
-
data = [(e, m) for e, m in zip(seq, objs)]
|
|
263
|
-
|
|
264
|
-
_storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
async def func():
|
|
268
|
-
async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
|
|
269
|
-
async with lsh.insertion_session(batch_size=1000) as session:
|
|
270
|
-
fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
|
|
271
|
-
await asyncio.gather(*fs)
|
|
272
|
-
|
|
273
|
-
async with lsh.delete_session(batch_size=3) as session:
|
|
274
|
-
fs = (session.remove(key) for key in keys_to_remove)
|
|
275
|
-
await asyncio.gather(*fs)
|
|
248
|
+
asyncio.run(main())
|
|
276
249
|
|
|
277
250
|
"""
|
|
278
251
|
return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
|
|
@@ -280,6 +253,7 @@ class AsyncMinHashLSH:
|
|
|
280
253
|
async def _insert(self, key, minhash, check_duplication=True, buffer=False):
|
|
281
254
|
if len(minhash) != self.h:
|
|
282
255
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
256
|
+
self._minhash_scheme = _check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
283
257
|
if self._require_bytes_keys and not isinstance(key, bytes):
|
|
284
258
|
raise TypeError(
|
|
285
259
|
f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
|
|
@@ -288,7 +262,9 @@ class AsyncMinHashLSH:
|
|
|
288
262
|
if self.prepickle:
|
|
289
263
|
key = pickle.dumps(key)
|
|
290
264
|
|
|
291
|
-
|
|
265
|
+
# `key` is already pickled at this point under prepickle=True; call the
|
|
266
|
+
# storage primitive directly so we don't re-pickle through has_key().
|
|
267
|
+
if check_duplication and await self.keys.has_key(key):
|
|
292
268
|
raise ValueError("The given key already exists")
|
|
293
269
|
Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
|
|
294
270
|
|
|
@@ -302,6 +278,7 @@ class AsyncMinHashLSH:
|
|
|
302
278
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
303
279
|
if len(minhash) != self.h:
|
|
304
280
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
281
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
305
282
|
|
|
306
283
|
fs = (
|
|
307
284
|
hashtable.get(self._H(minhash.hashvalues[start:end]))
|
|
@@ -314,6 +291,8 @@ class AsyncMinHashLSH:
|
|
|
314
291
|
|
|
315
292
|
async def has_key(self, key):
|
|
316
293
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
294
|
+
if self.prepickle:
|
|
295
|
+
key = pickle.dumps(key)
|
|
317
296
|
return await self.keys.has_key(key)
|
|
318
297
|
|
|
319
298
|
async def remove(self, key):
|
|
@@ -321,7 +300,12 @@ class AsyncMinHashLSH:
|
|
|
321
300
|
await self._remove(key, buffer=False)
|
|
322
301
|
|
|
323
302
|
async def _remove(self, key, buffer=False):
|
|
324
|
-
if
|
|
303
|
+
if self.prepickle:
|
|
304
|
+
key = pickle.dumps(key)
|
|
305
|
+
|
|
306
|
+
# `key` is already pickled here; call storage primitives directly so
|
|
307
|
+
# the existence check, lookup, and deletes all use the stored form.
|
|
308
|
+
if not await self.keys.has_key(key):
|
|
325
309
|
raise ValueError("The given key does not exist")
|
|
326
310
|
|
|
327
311
|
for H, hashtable in zip(await self.keys.get(key), self.hashtables):
|
|
@@ -345,6 +329,7 @@ class AsyncMinHashLSH:
|
|
|
345
329
|
async def _query_b(self, minhash, b):
|
|
346
330
|
if len(minhash) != self.h:
|
|
347
331
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
332
|
+
_check_scheme_consistency(getattr(self, "_minhash_scheme", None), minhash)
|
|
348
333
|
if b > len(self.hashtables):
|
|
349
334
|
raise ValueError("b must be less or equal to the number of hash tables")
|
|
350
335
|
fs = []
|
|
@@ -352,7 +337,10 @@ class AsyncMinHashLSH:
|
|
|
352
337
|
H = self._H(minhash.hashvalues[start:end])
|
|
353
338
|
if await hashtable.has_key(H):
|
|
354
339
|
fs.append(hashtable.get(H))
|
|
355
|
-
|
|
340
|
+
candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
|
|
341
|
+
if self.prepickle:
|
|
342
|
+
return {pickle.loads(key) for key in candidates}
|
|
343
|
+
return candidates
|
|
356
344
|
|
|
357
345
|
async def get_counts(self):
|
|
358
346
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
@@ -361,6 +349,10 @@ class AsyncMinHashLSH:
|
|
|
361
349
|
|
|
362
350
|
async def get_subset_counts(self, *keys):
|
|
363
351
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
352
|
+
# Keys in storage are pickled when prepickle is enabled, so we have to
|
|
353
|
+
# pickle the query keys to match the stored representation.
|
|
354
|
+
if self.prepickle:
|
|
355
|
+
keys = tuple(pickle.dumps(key) for key in keys)
|
|
364
356
|
key_set = list(set(keys))
|
|
365
357
|
hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
|
|
366
358
|
Hss = await self.keys.getmany(*key_set)
|
|
@@ -1,9 +1,23 @@
|
|
|
1
|
+
"""Async storage backends for MinHash LSH.
|
|
2
|
+
|
|
3
|
+
This module provides async storage implementations for use with AsyncMinHashLSH:
|
|
4
|
+
- AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
|
|
5
|
+
- AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import asyncio
|
|
2
9
|
import os
|
|
3
10
|
from abc import ABCMeta
|
|
4
11
|
from itertools import chain
|
|
5
12
|
|
|
6
|
-
from datasketch.storage import OrderedStorage,
|
|
13
|
+
from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
|
|
14
|
+
|
|
15
|
+
# RedisStorage is only available when redis package is installed (optional dependency)
|
|
16
|
+
# Import it conditionally to avoid ImportError when redis is not installed
|
|
17
|
+
try:
|
|
18
|
+
from datasketch.storage import RedisStorage
|
|
19
|
+
except ImportError:
|
|
20
|
+
RedisStorage = None
|
|
7
21
|
|
|
8
22
|
ABC = ABCMeta("ABC", (object,), {})
|
|
9
23
|
|
|
@@ -24,6 +38,12 @@ except ImportError:
|
|
|
24
38
|
redis = None
|
|
25
39
|
|
|
26
40
|
|
|
41
|
+
__all__ = [
|
|
42
|
+
"async_ordered_storage",
|
|
43
|
+
"async_unordered_storage",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
27
47
|
async def async_ordered_storage(config, name=None):
|
|
28
48
|
tp = config["type"]
|
|
29
49
|
if tp == "aiomongo":
|
|
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
55
75
|
class AsyncMongoBuffer:
|
|
56
76
|
def __init__(self, aio_mongo_collection, batch_size):
|
|
57
77
|
self._batch_size = batch_size
|
|
58
|
-
self._insert_documents_stack =
|
|
59
|
-
self._delete_by_key_documents_stack =
|
|
60
|
-
self._delete_by_val_documents_stack =
|
|
78
|
+
self._insert_documents_stack = []
|
|
79
|
+
self._delete_by_key_documents_stack = []
|
|
80
|
+
self._delete_by_val_documents_stack = []
|
|
61
81
|
self._mongo_coll = aio_mongo_collection
|
|
62
82
|
|
|
63
83
|
@property
|
|
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
|
|
|
73
93
|
if command == "insert":
|
|
74
94
|
if len(self._insert_documents_stack) >= self.batch_size:
|
|
75
95
|
await self.execute(command)
|
|
76
|
-
self._insert_documents_stack
|
|
96
|
+
self._insert_documents_stack.append(kwargs["obj"])
|
|
77
97
|
elif command == "delete_by_key":
|
|
78
98
|
if len(self._delete_by_key_documents_stack) >= self.batch_size:
|
|
79
99
|
await self.execute(command)
|
|
80
|
-
self._delete_by_key_documents_stack
|
|
100
|
+
self._delete_by_key_documents_stack.append(kwargs["key"])
|
|
81
101
|
elif command == "delete_by_val":
|
|
82
102
|
if len(self._delete_by_val_documents_stack) >= self.batch_size:
|
|
83
103
|
await self.execute(command)
|
|
84
|
-
self._delete_by_val_documents_stack
|
|
104
|
+
self._delete_by_val_documents_stack.append(kwargs["val"])
|
|
85
105
|
|
|
86
106
|
async def execute(self, command):
|
|
87
107
|
if command == "insert" and self._insert_documents_stack:
|
|
88
108
|
buffer = self._insert_documents_stack
|
|
89
|
-
self._insert_documents_stack =
|
|
109
|
+
self._insert_documents_stack = []
|
|
90
110
|
await self._mongo_coll.insert_many(buffer, ordered=False)
|
|
91
111
|
elif command == "delete_by_key" and self._delete_by_key_documents_stack:
|
|
92
112
|
buffer = self._delete_by_key_documents_stack
|
|
93
|
-
self._delete_by_key_documents_stack =
|
|
113
|
+
self._delete_by_key_documents_stack = []
|
|
94
114
|
await self._mongo_coll.delete_many({"key": {"$in": buffer}})
|
|
95
115
|
elif command == "delete_by_val" and self._delete_by_val_documents_stack:
|
|
96
116
|
buffer = self._delete_by_val_documents_stack
|
|
97
|
-
self._delete_by_val_documents_stack =
|
|
117
|
+
self._delete_by_val_documents_stack = []
|
|
98
118
|
await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
|
|
99
119
|
|
|
100
120
|
async def insert_one(self, **kwargs):
|
|
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
254
274
|
async def has_key(self, key):
|
|
255
275
|
return bool(await self._collection.find_one({"key": key}))
|
|
256
276
|
|
|
277
|
+
async def getmany(self, *keys):
|
|
278
|
+
return await asyncio.gather(*(self.get(key) for key in keys))
|
|
279
|
+
|
|
257
280
|
async def status(self):
|
|
258
281
|
status = self._parse_config(self.config["mongo"])
|
|
259
282
|
status.update({"keyspace_size": await self.size()})
|
|
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
285
308
|
await self._collection.find_one_and_delete({"key": key, "vals": val})
|
|
286
309
|
|
|
287
310
|
|
|
288
|
-
|
|
311
|
+
# Redis-based async storage classes are only defined when both redis package
|
|
312
|
+
# and RedisStorage are available (optional dependencies)
|
|
313
|
+
if redis is not None and RedisStorage is not None:
|
|
289
314
|
|
|
290
315
|
class AsyncRedisBuffer(redis.client.Pipeline):
|
|
291
316
|
def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
|
|
@@ -304,7 +329,7 @@ if redis is not None:
|
|
|
304
329
|
|
|
305
330
|
async def execute_command(self, *args, **kwargs):
|
|
306
331
|
if len(self.command_stack) >= self._buffer_size:
|
|
307
|
-
self.execute()
|
|
332
|
+
await self.execute()
|
|
308
333
|
await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
|
|
309
334
|
|
|
310
335
|
class AsyncRedisStorage(RedisStorage):
|
|
@@ -347,9 +372,8 @@ if redis is not None:
|
|
|
347
372
|
|
|
348
373
|
async def getmany(self, *keys):
|
|
349
374
|
pipe = self._redis.pipeline()
|
|
350
|
-
pipe.multi()
|
|
351
375
|
for key in keys:
|
|
352
|
-
|
|
376
|
+
pipe.lrange(self.redis_key(key), 0, -1)
|
|
353
377
|
return await pipe.execute()
|
|
354
378
|
|
|
355
379
|
@staticmethod
|
|
@@ -422,6 +446,12 @@ if redis is not None:
|
|
|
422
446
|
async def _get_items(r, k):
|
|
423
447
|
return await r.smembers(k)
|
|
424
448
|
|
|
449
|
+
async def getmany(self, *keys):
|
|
450
|
+
pipe = self._redis.pipeline()
|
|
451
|
+
for key in keys:
|
|
452
|
+
pipe.smembers(self.redis_key(key))
|
|
453
|
+
return await pipe.execute()
|
|
454
|
+
|
|
425
455
|
async def remove_val(self, key, val, **kwargs):
|
|
426
456
|
buffer = kwargs.pop("buffer", False)
|
|
427
457
|
redis_key = self.redis_key(key)
|