datasketch 1.8.0__tar.gz → 1.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.8.0 → datasketch-1.10.0}/PKG-INFO +17 -3
- {datasketch-1.8.0 → datasketch-1.10.0}/README.rst +3 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/__init__.py +2 -1
- datasketch-1.10.0/datasketch/aio/__init__.py +44 -0
- {datasketch-1.8.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/lsh.py +66 -74
- {datasketch-1.8.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/storage.py +77 -29
- datasketch-1.10.0/datasketch/experimental/__init__.py +49 -0
- datasketch-1.10.0/datasketch/experimental/aio/__init__.py +50 -0
- datasketch-1.10.0/datasketch/experimental/aio/lsh.py +49 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lsh.py +28 -12
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lsh_bloom.py +2 -2
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshensemble.py +3 -2
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshforest.py +16 -9
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/minhash.py +8 -5
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/storage.py +21 -11
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/weighted_minhash.py +5 -5
- {datasketch-1.8.0 → datasketch-1.10.0}/pyproject.toml +51 -5
- datasketch-1.8.0/datasketch/experimental/__init__.py +0 -15
- datasketch-1.8.0/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/.gitignore +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/LICENSE +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/b_bit_minhash.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hyperloglog.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lean_minhash.py +0 -0
- {datasketch-1.8.0 → datasketch-1.10.0}/datasketch/lshensemble_partition.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.8.0
|
|
3
|
+
Version: 1.10.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
@@ -17,15 +17,22 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
21
|
Classifier: Topic :: Database
|
|
21
22
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
23
|
Requires-Python: >=3.9
|
|
23
24
|
Requires-Dist: numpy>=1.11
|
|
24
25
|
Requires-Dist: scipy>=1.0.0
|
|
26
|
+
Provides-Extra: aio
|
|
27
|
+
Requires-Dist: aiounittest; extra == 'aio'
|
|
28
|
+
Requires-Dist: motor>3.6.0; extra == 'aio'
|
|
25
29
|
Provides-Extra: benchmark
|
|
30
|
+
Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
|
|
26
31
|
Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
|
|
27
|
-
Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
|
|
32
|
+
Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
|
|
33
|
+
Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
|
|
28
34
|
Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
|
|
35
|
+
Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
|
|
29
36
|
Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
|
|
30
37
|
Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
|
|
31
38
|
Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
|
|
@@ -47,9 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
|
|
|
47
54
|
Requires-Dist: mockredispy; extra == 'test'
|
|
48
55
|
Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
49
56
|
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
57
|
+
Requires-Dist: pygments>=2.20.0; extra == 'test'
|
|
50
58
|
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
51
|
-
Requires-Dist: pytest; extra == 'test'
|
|
59
|
+
Requires-Dist: pytest-asyncio; extra == 'test'
|
|
60
|
+
Requires-Dist: pytest-cov; extra == 'test'
|
|
52
61
|
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
62
|
+
Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
|
|
63
|
+
Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
|
|
53
64
|
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
54
65
|
Description-Content-Type: text/x-rst
|
|
55
66
|
|
|
@@ -62,6 +73,9 @@ datasketch: Big Data Looks Small
|
|
|
62
73
|
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
63
74
|
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
64
75
|
|
|
76
|
+
.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
|
|
77
|
+
:target: https://codecov.io/gh/ekzhu/datasketch
|
|
78
|
+
|
|
65
79
|
datasketch gives you probabilistic data structures that can process and
|
|
66
80
|
search very large amount of data super fast, with little loss of
|
|
67
81
|
accuracy.
|
|
@@ -7,6 +7,9 @@ datasketch: Big Data Looks Small
|
|
|
7
7
|
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
8
8
|
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
9
9
|
|
|
10
|
+
.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
|
|
11
|
+
:target: https://codecov.io/gh/ekzhu/datasketch
|
|
12
|
+
|
|
10
13
|
datasketch gives you probabilistic data structures that can process and
|
|
11
14
|
search very large amount of data super fast, with little loss of
|
|
12
15
|
accuracy.
|
|
@@ -7,6 +7,7 @@ except importlib.metadata.PackageNotFoundError:
|
|
|
7
7
|
_version = "0.0.0" # Fallback for development mode
|
|
8
8
|
__version__: Final[str] = _version
|
|
9
9
|
|
|
10
|
+
from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
|
|
10
11
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
11
12
|
from datasketch.hashfunc import sha1_hash32
|
|
12
13
|
from datasketch.hnsw import HNSW
|
|
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
|
|
|
23
24
|
WeightedMinHashLSH = MinHashLSH
|
|
24
25
|
WeightedMinHashLSHForest = MinHashLSHForest
|
|
25
26
|
|
|
26
|
-
|
|
27
27
|
__all__ = [
|
|
28
28
|
"HNSW",
|
|
29
|
+
"AsyncMinHashLSH",
|
|
29
30
|
"HyperLogLog",
|
|
30
31
|
"HyperLogLogPlusPlus",
|
|
31
32
|
"LeanMinHash",
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Async MinHash LSH module.
|
|
2
|
+
|
|
3
|
+
This module provides asynchronous implementations of MinHash LSH for use with
|
|
4
|
+
async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
|
|
5
|
+
|
|
6
|
+
Example:
|
|
7
|
+
.. code-block:: python
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
|
|
11
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
12
|
+
from datasketch import MinHash
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def main():
|
|
16
|
+
# prepickle=True lets you use non-bytes keys (e.g. str). With the
|
|
17
|
+
# default prepickle=False, keys passed to insert() must be bytes.
|
|
18
|
+
async with AsyncMinHashLSH(
|
|
19
|
+
storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
|
|
20
|
+
threshold=0.5,
|
|
21
|
+
num_perm=128,
|
|
22
|
+
prepickle=True,
|
|
23
|
+
) as lsh:
|
|
24
|
+
m = MinHash(num_perm=128)
|
|
25
|
+
m.update(b"data")
|
|
26
|
+
await lsh.insert("key", m)
|
|
27
|
+
result = await lsh.query(m)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
asyncio.run(main())
|
|
31
|
+
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from datasketch.aio.lsh import (
|
|
35
|
+
AsyncMinHashLSH,
|
|
36
|
+
AsyncMinHashLSHDeleteSession,
|
|
37
|
+
AsyncMinHashLSHInsertionSession,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"AsyncMinHashLSH",
|
|
42
|
+
"AsyncMinHashLSHDeleteSession",
|
|
43
|
+
"AsyncMinHashLSHInsertionSession",
|
|
44
|
+
]
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
+
"""Asynchronous MinHash LSH implementation.
|
|
2
|
+
|
|
3
|
+
This module provides AsyncMinHashLSH for use with async storage backends
|
|
4
|
+
like MongoDB (via motor) and Redis (via redis.asyncio).
|
|
5
|
+
"""
|
|
6
|
+
|
|
1
7
|
import asyncio
|
|
2
8
|
import pickle
|
|
3
9
|
from itertools import chain
|
|
4
10
|
from typing import Optional
|
|
5
11
|
|
|
6
|
-
from datasketch.experimental.aio.storage import (
|
|
12
|
+
from datasketch.aio.storage import (
|
|
7
13
|
async_ordered_storage,
|
|
8
14
|
async_unordered_storage,
|
|
9
15
|
)
|
|
@@ -34,8 +40,6 @@ class AsyncMinHashLSH:
|
|
|
34
40
|
MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
|
|
35
41
|
|
|
36
42
|
.. note::
|
|
37
|
-
* The module supports Python version >=3.6, and is currently experimental.
|
|
38
|
-
So the interface may change slightly in the future.
|
|
39
43
|
* For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
|
|
40
44
|
* For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
|
|
41
45
|
"""
|
|
@@ -60,6 +64,7 @@ class AsyncMinHashLSH:
|
|
|
60
64
|
self._weights = weights
|
|
61
65
|
self._params = params
|
|
62
66
|
self.prepickle = storage_config["type"] == "aioredis" if prepickle is None else prepickle
|
|
67
|
+
self._require_bytes_keys = not self.prepickle
|
|
63
68
|
|
|
64
69
|
if self._threshold > 1.0 or self._threshold < 0.0:
|
|
65
70
|
raise ValueError("threshold must be in [0.0, 1.0]")
|
|
@@ -115,7 +120,9 @@ class AsyncMinHashLSH:
|
|
|
115
120
|
def __setstate__(self, state):
|
|
116
121
|
state["_lock"] = asyncio.Lock()
|
|
117
122
|
self.__dict__ = state
|
|
118
|
-
self.__init__(
|
|
123
|
+
self.__init__(
|
|
124
|
+
self._threshold, self._num_perm, self._weights, self._params, self._storage_config, self.prepickle
|
|
125
|
+
)
|
|
119
126
|
|
|
120
127
|
@property
|
|
121
128
|
def batch_size(self):
|
|
@@ -126,7 +133,7 @@ class AsyncMinHashLSH:
|
|
|
126
133
|
if self.keys is not None:
|
|
127
134
|
self.keys.batch_size = value
|
|
128
135
|
else:
|
|
129
|
-
raise AttributeError("
|
|
136
|
+
raise AttributeError("AsyncMinHashLSH is not initialized.")
|
|
130
137
|
|
|
131
138
|
for t in self.hashtables:
|
|
132
139
|
t.batch_size = value
|
|
@@ -160,12 +167,6 @@ class AsyncMinHashLSH:
|
|
|
160
167
|
if self.keys is None:
|
|
161
168
|
await self._create_storages()
|
|
162
169
|
|
|
163
|
-
if not self.keys.initialized:
|
|
164
|
-
await self.keys
|
|
165
|
-
|
|
166
|
-
fs = (ht for ht in self.hashtables if not ht.initialized)
|
|
167
|
-
await asyncio.gather(*fs)
|
|
168
|
-
|
|
169
170
|
async def close(self):
|
|
170
171
|
"""Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
|
|
171
172
|
async with self._lock:
|
|
@@ -186,41 +187,26 @@ class AsyncMinHashLSH:
|
|
|
186
187
|
|
|
187
188
|
:param int batch_size: the size of chunks to use in insert_session mode (default=10000).
|
|
188
189
|
|
|
189
|
-
:return: datasketch.experimental.aio.lsh.AsyncMinHashLSHInsertionSession
|
|
190
|
+
:return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
|
|
190
191
|
|
|
191
192
|
Example:
|
|
192
193
|
.. code-block:: python
|
|
193
194
|
|
|
194
|
-
|
|
195
|
+
import asyncio
|
|
196
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
195
197
|
from datasketch import MinHash
|
|
196
198
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
_chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
|
|
204
|
-
seq = frozenset(
|
|
205
|
-
chain(
|
|
206
|
-
("".join(s) for s in _chunked_str),
|
|
207
|
-
("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
|
|
208
|
-
)
|
|
209
|
-
)
|
|
210
|
-
objs = [MinHash(16) for _ in range(len(seq))]
|
|
211
|
-
for e, obj in zip(seq, objs):
|
|
212
|
-
for i in e:
|
|
213
|
-
obj.update(i.encode("utf-8"))
|
|
214
|
-
data = [(e, m) for e, m in zip(seq, objs)]
|
|
215
|
-
|
|
216
|
-
_storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
async def func():
|
|
220
|
-
async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
|
|
199
|
+
async def main():
|
|
200
|
+
storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
201
|
+
async with AsyncMinHashLSH(
|
|
202
|
+
storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
|
|
203
|
+
) as lsh:
|
|
221
204
|
async with lsh.insertion_session(batch_size=1000) as session:
|
|
222
|
-
|
|
223
|
-
|
|
205
|
+
m = MinHash(num_perm=16)
|
|
206
|
+
m.update(b"data")
|
|
207
|
+
await session.insert("key", m)
|
|
208
|
+
|
|
209
|
+
asyncio.run(main())
|
|
224
210
|
|
|
225
211
|
"""
|
|
226
212
|
return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
|
|
@@ -229,47 +215,32 @@ class AsyncMinHashLSH:
|
|
|
229
215
|
"""Create a asynchronous context manager for fast removal of keys
|
|
230
216
|
from index.
|
|
231
217
|
|
|
232
|
-
:param int batch_size: the size of chunks to use in
|
|
218
|
+
:param int batch_size: the size of chunks to use in delete_session mode (default=10000).
|
|
233
219
|
|
|
234
|
-
:return: datasketch.experimental.aio.lsh.AsyncMinHashLSHDeleteSession
|
|
220
|
+
:return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
|
|
235
221
|
|
|
236
222
|
Example:
|
|
237
223
|
.. code-block:: python
|
|
238
224
|
|
|
239
|
-
|
|
225
|
+
import asyncio
|
|
226
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
240
227
|
from datasketch import MinHash
|
|
241
228
|
|
|
229
|
+
async def main():
|
|
230
|
+
storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
231
|
+
async with AsyncMinHashLSH(
|
|
232
|
+
storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
|
|
233
|
+
) as lsh:
|
|
234
|
+
# Insert some data first
|
|
235
|
+
m = MinHash(num_perm=16)
|
|
236
|
+
m.update(b"data")
|
|
237
|
+
await lsh.insert("key1", m)
|
|
242
238
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
239
|
+
# Delete using session
|
|
240
|
+
async with lsh.delete_session(batch_size=100) as session:
|
|
241
|
+
await session.remove("key1")
|
|
246
242
|
|
|
247
|
-
|
|
248
|
-
_chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
|
|
249
|
-
seq = frozenset(
|
|
250
|
-
chain(
|
|
251
|
-
("".join(s) for s in _chunked_str),
|
|
252
|
-
("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
|
|
253
|
-
)
|
|
254
|
-
)
|
|
255
|
-
objs = [MinHash(16) for _ in range(len(seq))]
|
|
256
|
-
for e, obj in zip(seq, objs):
|
|
257
|
-
for i in e:
|
|
258
|
-
obj.update(i.encode("utf-8"))
|
|
259
|
-
data = [(e, m) for e, m in zip(seq, objs)]
|
|
260
|
-
|
|
261
|
-
_storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
async def func():
|
|
265
|
-
async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
|
|
266
|
-
async with lsh.insertion_session(batch_size=1000) as session:
|
|
267
|
-
fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
|
|
268
|
-
await asyncio.gather(*fs)
|
|
269
|
-
|
|
270
|
-
async with lsh.delete_session(batch_size=3) as session:
|
|
271
|
-
fs = (session.remove(key) for key in keys_to_remove)
|
|
272
|
-
await asyncio.gather(*fs)
|
|
243
|
+
asyncio.run(main())
|
|
273
244
|
|
|
274
245
|
"""
|
|
275
246
|
return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
|
|
@@ -277,10 +248,17 @@ class AsyncMinHashLSH:
|
|
|
277
248
|
async def _insert(self, key, minhash, check_duplication=True, buffer=False):
|
|
278
249
|
if len(minhash) != self.h:
|
|
279
250
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
251
|
+
if self._require_bytes_keys and not isinstance(key, bytes):
|
|
252
|
+
raise TypeError(
|
|
253
|
+
f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
|
|
254
|
+
"Either pass bytes keys or use prepickle=True for automatic serialization."
|
|
255
|
+
)
|
|
280
256
|
if self.prepickle:
|
|
281
257
|
key = pickle.dumps(key)
|
|
282
258
|
|
|
283
|
-
|
|
259
|
+
# `key` is already pickled at this point under prepickle=True; call the
|
|
260
|
+
# storage primitive directly so we don't re-pickle through has_key().
|
|
261
|
+
if check_duplication and await self.keys.has_key(key):
|
|
284
262
|
raise ValueError("The given key already exists")
|
|
285
263
|
Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
|
|
286
264
|
|
|
@@ -306,6 +284,8 @@ class AsyncMinHashLSH:
|
|
|
306
284
|
|
|
307
285
|
async def has_key(self, key):
|
|
308
286
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
287
|
+
if self.prepickle:
|
|
288
|
+
key = pickle.dumps(key)
|
|
309
289
|
return await self.keys.has_key(key)
|
|
310
290
|
|
|
311
291
|
async def remove(self, key):
|
|
@@ -313,7 +293,12 @@ class AsyncMinHashLSH:
|
|
|
313
293
|
await self._remove(key, buffer=False)
|
|
314
294
|
|
|
315
295
|
async def _remove(self, key, buffer=False):
|
|
316
|
-
if
|
|
296
|
+
if self.prepickle:
|
|
297
|
+
key = pickle.dumps(key)
|
|
298
|
+
|
|
299
|
+
# `key` is already pickled here; call storage primitives directly so
|
|
300
|
+
# the existence check, lookup, and deletes all use the stored form.
|
|
301
|
+
if not await self.keys.has_key(key):
|
|
317
302
|
raise ValueError("The given key does not exist")
|
|
318
303
|
|
|
319
304
|
for H, hashtable in zip(await self.keys.get(key), self.hashtables):
|
|
@@ -344,7 +329,10 @@ class AsyncMinHashLSH:
|
|
|
344
329
|
H = self._H(minhash.hashvalues[start:end])
|
|
345
330
|
if await hashtable.has_key(H):
|
|
346
331
|
fs.append(hashtable.get(H))
|
|
347
|
-
|
|
332
|
+
candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
|
|
333
|
+
if self.prepickle:
|
|
334
|
+
return {pickle.loads(key) for key in candidates}
|
|
335
|
+
return candidates
|
|
348
336
|
|
|
349
337
|
async def get_counts(self):
|
|
350
338
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
@@ -353,6 +341,10 @@ class AsyncMinHashLSH:
|
|
|
353
341
|
|
|
354
342
|
async def get_subset_counts(self, *keys):
|
|
355
343
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
344
|
+
# Keys in storage are pickled when prepickle is enabled, so we have to
|
|
345
|
+
# pickle the query keys to match the stored representation.
|
|
346
|
+
if self.prepickle:
|
|
347
|
+
keys = tuple(pickle.dumps(key) for key in keys)
|
|
356
348
|
key_set = list(set(keys))
|
|
357
349
|
hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
|
|
358
350
|
Hss = await self.keys.getmany(*key_set)
|
|
@@ -1,9 +1,23 @@
|
|
|
1
|
+
"""Async storage backends for MinHash LSH.
|
|
2
|
+
|
|
3
|
+
This module provides async storage implementations for use with AsyncMinHashLSH:
|
|
4
|
+
- AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
|
|
5
|
+
- AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import asyncio
|
|
2
9
|
import os
|
|
3
10
|
from abc import ABCMeta
|
|
4
11
|
from itertools import chain
|
|
5
12
|
|
|
6
|
-
from datasketch.storage import OrderedStorage,
|
|
13
|
+
from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
|
|
14
|
+
|
|
15
|
+
# RedisStorage is only available when redis package is installed (optional dependency)
|
|
16
|
+
# Import it conditionally to avoid ImportError when redis is not installed
|
|
17
|
+
try:
|
|
18
|
+
from datasketch.storage import RedisStorage
|
|
19
|
+
except ImportError:
|
|
20
|
+
RedisStorage = None
|
|
7
21
|
|
|
8
22
|
ABC = ABCMeta("ABC", (object,), {})
|
|
9
23
|
|
|
@@ -24,6 +38,12 @@ except ImportError:
|
|
|
24
38
|
redis = None
|
|
25
39
|
|
|
26
40
|
|
|
41
|
+
__all__ = [
|
|
42
|
+
"async_ordered_storage",
|
|
43
|
+
"async_unordered_storage",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
27
47
|
async def async_ordered_storage(config, name=None):
|
|
28
48
|
tp = config["type"]
|
|
29
49
|
if tp == "aiomongo":
|
|
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
55
75
|
class AsyncMongoBuffer:
|
|
56
76
|
def __init__(self, aio_mongo_collection, batch_size):
|
|
57
77
|
self._batch_size = batch_size
|
|
58
|
-
self._insert_documents_stack =
|
|
59
|
-
self._delete_by_key_documents_stack =
|
|
60
|
-
self._delete_by_val_documents_stack =
|
|
78
|
+
self._insert_documents_stack = []
|
|
79
|
+
self._delete_by_key_documents_stack = []
|
|
80
|
+
self._delete_by_val_documents_stack = []
|
|
61
81
|
self._mongo_coll = aio_mongo_collection
|
|
62
82
|
|
|
63
83
|
@property
|
|
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
|
|
|
73
93
|
if command == "insert":
|
|
74
94
|
if len(self._insert_documents_stack) >= self.batch_size:
|
|
75
95
|
await self.execute(command)
|
|
76
|
-
self._insert_documents_stack
|
|
96
|
+
self._insert_documents_stack.append(kwargs["obj"])
|
|
77
97
|
elif command == "delete_by_key":
|
|
78
98
|
if len(self._delete_by_key_documents_stack) >= self.batch_size:
|
|
79
99
|
await self.execute(command)
|
|
80
|
-
self._delete_by_key_documents_stack
|
|
100
|
+
self._delete_by_key_documents_stack.append(kwargs["key"])
|
|
81
101
|
elif command == "delete_by_val":
|
|
82
102
|
if len(self._delete_by_val_documents_stack) >= self.batch_size:
|
|
83
103
|
await self.execute(command)
|
|
84
|
-
self._delete_by_val_documents_stack
|
|
104
|
+
self._delete_by_val_documents_stack.append(kwargs["val"])
|
|
85
105
|
|
|
86
106
|
async def execute(self, command):
|
|
87
107
|
if command == "insert" and self._insert_documents_stack:
|
|
88
108
|
buffer = self._insert_documents_stack
|
|
89
|
-
self._insert_documents_stack =
|
|
109
|
+
self._insert_documents_stack = []
|
|
90
110
|
await self._mongo_coll.insert_many(buffer, ordered=False)
|
|
91
111
|
elif command == "delete_by_key" and self._delete_by_key_documents_stack:
|
|
92
112
|
buffer = self._delete_by_key_documents_stack
|
|
93
|
-
self._delete_by_key_documents_stack =
|
|
113
|
+
self._delete_by_key_documents_stack = []
|
|
94
114
|
await self._mongo_coll.delete_many({"key": {"$in": buffer}})
|
|
95
115
|
elif command == "delete_by_val" and self._delete_by_val_documents_stack:
|
|
96
116
|
buffer = self._delete_by_val_documents_stack
|
|
97
|
-
self._delete_by_val_documents_stack =
|
|
117
|
+
self._delete_by_val_documents_stack = []
|
|
98
118
|
await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
|
|
99
119
|
|
|
100
120
|
async def insert_one(self, **kwargs):
|
|
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
254
274
|
async def has_key(self, key):
|
|
255
275
|
return bool(await self._collection.find_one({"key": key}))
|
|
256
276
|
|
|
277
|
+
async def getmany(self, *keys):
|
|
278
|
+
return await asyncio.gather(*(self.get(key) for key in keys))
|
|
279
|
+
|
|
257
280
|
async def status(self):
|
|
258
281
|
status = self._parse_config(self.config["mongo"])
|
|
259
282
|
status.update({"keyspace_size": await self.size()})
|
|
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
285
308
|
await self._collection.find_one_and_delete({"key": key, "vals": val})
|
|
286
309
|
|
|
287
310
|
|
|
288
|
-
|
|
311
|
+
# Redis-based async storage classes are only defined when both redis package
|
|
312
|
+
# and RedisStorage are available (optional dependencies)
|
|
313
|
+
if redis is not None and RedisStorage is not None:
|
|
289
314
|
|
|
290
315
|
class AsyncRedisBuffer(redis.client.Pipeline):
|
|
291
316
|
def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
|
|
@@ -304,7 +329,7 @@ if redis is not None:
|
|
|
304
329
|
|
|
305
330
|
async def execute_command(self, *args, **kwargs):
|
|
306
331
|
if len(self.command_stack) >= self._buffer_size:
|
|
307
|
-
self.execute()
|
|
332
|
+
await self.execute()
|
|
308
333
|
await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
|
|
309
334
|
|
|
310
335
|
class AsyncRedisStorage(RedisStorage):
|
|
@@ -323,16 +348,19 @@ if redis is not None:
|
|
|
323
348
|
)
|
|
324
349
|
self._initialized = True
|
|
325
350
|
|
|
351
|
+
async def close(self):
|
|
352
|
+
await self._redis.aclose()
|
|
353
|
+
|
|
326
354
|
@property
|
|
327
355
|
def initialized(self):
|
|
328
356
|
return self._initialized
|
|
329
357
|
|
|
330
358
|
class AsyncRedisListStorage(OrderedStorage, AsyncRedisStorage):
|
|
331
359
|
async def keys(self):
|
|
332
|
-
return await self._redis.hkeys(self._name)
|
|
360
|
+
return await self._redis.hkeys(self._name) # type: ignore
|
|
333
361
|
|
|
334
362
|
async def redis_keys(self):
|
|
335
|
-
return await self._redis.hvals(self._name)
|
|
363
|
+
return await self._redis.hvals(self._name) # type: ignore
|
|
336
364
|
|
|
337
365
|
def status(self):
|
|
338
366
|
status = self._parse_config(self.config["redis"])
|
|
@@ -344,24 +372,34 @@ if redis is not None:
|
|
|
344
372
|
|
|
345
373
|
async def getmany(self, *keys):
|
|
346
374
|
pipe = self._redis.pipeline()
|
|
347
|
-
pipe.multi()
|
|
348
375
|
for key in keys:
|
|
349
|
-
|
|
376
|
+
pipe.lrange(self.redis_key(key), 0, -1)
|
|
350
377
|
return await pipe.execute()
|
|
351
378
|
|
|
352
379
|
@staticmethod
|
|
353
380
|
async def _get_items(r, k):
|
|
354
381
|
return await r.lrange(k, 0, -1)
|
|
355
382
|
|
|
356
|
-
async def remove(self, *keys):
|
|
357
|
-
|
|
358
|
-
|
|
383
|
+
async def remove(self, *keys, **kwargs):
|
|
384
|
+
buffer = kwargs.pop("buffer", False)
|
|
385
|
+
if buffer:
|
|
386
|
+
await self._remove(self._buffer, *keys)
|
|
387
|
+
else:
|
|
388
|
+
await self._remove(self._redis, *keys)
|
|
389
|
+
|
|
390
|
+
async def _remove(self, r, *keys):
|
|
391
|
+
await r.hdel(self._name, *keys)
|
|
392
|
+
await r.delete(*[self.redis_key(key) for key in keys])
|
|
359
393
|
|
|
360
|
-
async def remove_val(self, key, val):
|
|
394
|
+
async def remove_val(self, key, val, **kwargs):
|
|
395
|
+
buffer = kwargs.pop("buffer", False)
|
|
361
396
|
redis_key = self.redis_key(key)
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
397
|
+
if buffer:
|
|
398
|
+
await self._buffer.lrem(redis_key, val)
|
|
399
|
+
else:
|
|
400
|
+
await self._redis.lrem(redis_key, val)
|
|
401
|
+
if not await self._redis.exists(redis_key): # type: ignore
|
|
402
|
+
await self._redis.hdel(self._name, redis_key) # type: ignore
|
|
365
403
|
|
|
366
404
|
async def insert(self, key, *vals, **kwargs):
|
|
367
405
|
# Using buffer=True outside of an `insertion_session`
|
|
@@ -380,7 +418,7 @@ if redis is not None:
|
|
|
380
418
|
await r.rpush(redis_key, *values)
|
|
381
419
|
|
|
382
420
|
async def size(self):
|
|
383
|
-
return await self._redis.hlen(self._name)
|
|
421
|
+
return await self._redis.hlen(self._name) # type: ignore
|
|
384
422
|
|
|
385
423
|
async def itemcounts(self):
|
|
386
424
|
pipe = self._redis.pipeline()
|
|
@@ -395,7 +433,7 @@ if redis is not None:
|
|
|
395
433
|
return await r.llen(k)
|
|
396
434
|
|
|
397
435
|
async def has_key(self, key):
|
|
398
|
-
return await self._redis.hexists(self._name, key)
|
|
436
|
+
return await self._redis.hexists(self._name, key) # type: ignore
|
|
399
437
|
|
|
400
438
|
async def empty_buffer(self):
|
|
401
439
|
await self._buffer.execute()
|
|
@@ -408,11 +446,21 @@ if redis is not None:
|
|
|
408
446
|
async def _get_items(r, k):
|
|
409
447
|
return await r.smembers(k)
|
|
410
448
|
|
|
411
|
-
async def
|
|
449
|
+
async def getmany(self, *keys):
|
|
450
|
+
pipe = self._redis.pipeline()
|
|
451
|
+
for key in keys:
|
|
452
|
+
pipe.smembers(self.redis_key(key))
|
|
453
|
+
return await pipe.execute()
|
|
454
|
+
|
|
455
|
+
async def remove_val(self, key, val, **kwargs):
|
|
456
|
+
buffer = kwargs.pop("buffer", False)
|
|
412
457
|
redis_key = self.redis_key(key)
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
458
|
+
if buffer:
|
|
459
|
+
await self._buffer.srem(redis_key, val)
|
|
460
|
+
else:
|
|
461
|
+
await self._redis.srem(redis_key, val)
|
|
462
|
+
if not await self._redis.exists(redis_key): # type: ignore
|
|
463
|
+
await self._redis.hdel(self._name, redis_key) # type: ignore
|
|
416
464
|
|
|
417
465
|
async def _insert(self, r, key, *values):
|
|
418
466
|
redis_key = self.redis_key(key)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Deprecated experimental module.
|
|
2
|
+
|
|
3
|
+
.. deprecated::
|
|
4
|
+
The `datasketch.experimental` module is deprecated and will be removed in a future version.
|
|
5
|
+
Please use `datasketch.aio` instead:
|
|
6
|
+
|
|
7
|
+
Old: ``from datasketch.experimental import AsyncMinHashLSH``
|
|
8
|
+
New: ``from datasketch.aio import AsyncMinHashLSH``
|
|
9
|
+
|
|
10
|
+
Or simply: ``from datasketch import AsyncMinHashLSH``
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
# Visible to static analyzers so they know `__all__` is satisfied.
|
|
17
|
+
# Not imported at runtime - the real dispatch happens in __getattr__.
|
|
18
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
19
|
+
|
|
20
|
+
__all__ = ["AsyncMinHashLSH"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def __getattr__(name):
|
|
24
|
+
# PEP 562: only emit the DeprecationWarning when the user actually pulls a
|
|
25
|
+
# symbol out of this package, not on every `import datasketch.experimental`.
|
|
26
|
+
# This avoids the noisy triple-warning that fired when each intermediate
|
|
27
|
+
# __init__.py warned eagerly.
|
|
28
|
+
#
|
|
29
|
+
# We cache the resolved symbol back into globals() so subsequent accesses
|
|
30
|
+
# bypass __getattr__. This matters for two reasons:
|
|
31
|
+
# 1. `from pkg import x` internally performs both `hasattr(pkg, x)` and
|
|
32
|
+
# `getattr(pkg, x)`, so without caching __getattr__ fires twice.
|
|
33
|
+
# 2. It makes the warning a one-shot per process, which is the normal
|
|
34
|
+
# expectation for deprecation warnings.
|
|
35
|
+
if name == "AsyncMinHashLSH":
|
|
36
|
+
import warnings
|
|
37
|
+
|
|
38
|
+
warnings.warn(
|
|
39
|
+
"datasketch.experimental is deprecated. "
|
|
40
|
+
"Use 'from datasketch.aio import AsyncMinHashLSH' or "
|
|
41
|
+
"'from datasketch import AsyncMinHashLSH' instead.",
|
|
42
|
+
DeprecationWarning,
|
|
43
|
+
stacklevel=2,
|
|
44
|
+
)
|
|
45
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
46
|
+
|
|
47
|
+
globals()[name] = AsyncMinHashLSH
|
|
48
|
+
return AsyncMinHashLSH
|
|
49
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Deprecated experimental aio module.
|
|
2
|
+
|
|
3
|
+
.. deprecated::
|
|
4
|
+
The `datasketch.experimental.aio` module is deprecated and will be removed in a future version.
|
|
5
|
+
Please use `datasketch.aio` instead:
|
|
6
|
+
|
|
7
|
+
Old: ``from datasketch.experimental.aio import AsyncMinHashLSH``
|
|
8
|
+
New: ``from datasketch.aio import AsyncMinHashLSH``
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
# Visible to static analyzers so they know `__all__` is satisfied.
|
|
15
|
+
# Not imported at runtime - the real dispatch happens in __getattr__.
|
|
16
|
+
from datasketch.aio import (
|
|
17
|
+
AsyncMinHashLSH,
|
|
18
|
+
AsyncMinHashLSHDeleteSession,
|
|
19
|
+
AsyncMinHashLSHInsertionSession,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AsyncMinHashLSH",
|
|
24
|
+
"AsyncMinHashLSHDeleteSession",
|
|
25
|
+
"AsyncMinHashLSHInsertionSession",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
_DEPRECATED = frozenset(__all__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __getattr__(name):
    """Lazily resolve a deprecated re-export (PEP 562 module ``__getattr__``).

    For a name listed in ``_DEPRECATED`` this emits a ``DeprecationWarning``,
    fetches the object of the same name from ``datasketch.aio`` and stores it
    in this module's ``globals()`` so that later lookups of that name find it
    directly and do not warn again.

    Args:
        name: The attribute being looked up on this module.

    Returns:
        The object called ``name`` in ``datasketch.aio``.

    Raises:
        AttributeError: If ``name`` is not one of the deprecated re-exports.
    """
    if name not in _DEPRECATED:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Warn on attribute access rather than at import time, so that importing
    # this package as an intermediate step of
    # `from datasketch.experimental.aio.lsh import ...` does not produce an
    # additional, redundant warning.
    import warnings

    # Point the user at the symbol they actually asked for. The hint used to
    # name AsyncMinHashLSH unconditionally, which was misleading for the
    # session classes. For AsyncMinHashLSH the text is unchanged.
    warnings.warn(
        f"datasketch.experimental.aio is deprecated. Use 'from datasketch.aio import {name}' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    import datasketch.aio as _new

    value = getattr(_new, name)
    # Cache the resolved object: the next access to `name` is served from the
    # module namespace and never reaches this hook.
    globals()[name] = value
    return value
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Deprecated experimental aio lsh module.
|
|
2
|
+
|
|
3
|
+
.. deprecated::
|
|
4
|
+
The `datasketch.experimental.aio.lsh` module is deprecated and will be removed in a future version.
|
|
5
|
+
Please use `datasketch.aio.lsh` instead:
|
|
6
|
+
|
|
7
|
+
Old: ``from datasketch.experimental.aio.lsh import AsyncMinHashLSH``
|
|
8
|
+
New: ``from datasketch.aio import AsyncMinHashLSH``
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
# Visible to static analyzers so they know `__all__` is satisfied.
|
|
15
|
+
# Not imported at runtime - the real dispatch happens in __getattr__.
|
|
16
|
+
from datasketch.aio.lsh import (
|
|
17
|
+
AsyncMinHashLSH,
|
|
18
|
+
AsyncMinHashLSHDeleteSession,
|
|
19
|
+
AsyncMinHashLSHInsertionSession,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AsyncMinHashLSH",
|
|
24
|
+
"AsyncMinHashLSHDeleteSession",
|
|
25
|
+
"AsyncMinHashLSHInsertionSession",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
_DEPRECATED = frozenset(__all__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __getattr__(name):
    """Lazily resolve a deprecated re-export (PEP 562 module ``__getattr__``).

    For a name listed in ``_DEPRECATED`` this emits a ``DeprecationWarning``,
    fetches the object of the same name from ``datasketch.aio.lsh`` and stores
    it in this module's ``globals()`` so that later lookups of that name find
    it directly and do not warn again.

    Args:
        name: The attribute being looked up on this module.

    Returns:
        The object called ``name`` in ``datasketch.aio.lsh``.

    Raises:
        AttributeError: If ``name`` is not one of the deprecated re-exports.
    """
    if name not in _DEPRECATED:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # The warning is raised on first access of each deprecated symbol. Because
    # the result is cached below, it is emitted at most once per symbol per
    # process, not once per access.
    import warnings

    # Point the user at the symbol they actually asked for. The hint used to
    # name AsyncMinHashLSH unconditionally, which was misleading for the
    # session classes. For AsyncMinHashLSH the text is unchanged.
    warnings.warn(
        f"datasketch.experimental.aio.lsh is deprecated. Use 'from datasketch.aio import {name}' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    import datasketch.aio.lsh as _new

    value = getattr(_new, name)
    # Cache the resolved object so the next access bypasses this hook.
    globals()[name] = value
    return value
|
|
@@ -3,12 +3,18 @@ from __future__ import annotations
|
|
|
3
3
|
import pickle
|
|
4
4
|
import struct
|
|
5
5
|
from collections.abc import Hashable
|
|
6
|
-
from typing import Callable, Optional, Union
|
|
6
|
+
from typing import Callable, List, Optional, Union
|
|
7
7
|
|
|
8
8
|
from scipy.integrate import quad as integrate
|
|
9
9
|
|
|
10
10
|
from datasketch.minhash import MinHash
|
|
11
|
-
from datasketch.storage import
|
|
11
|
+
from datasketch.storage import (
|
|
12
|
+
OrderedStorage,
|
|
13
|
+
UnorderedStorage,
|
|
14
|
+
_random_name,
|
|
15
|
+
ordered_storage,
|
|
16
|
+
unordered_storage,
|
|
17
|
+
)
|
|
12
18
|
from datasketch.weighted_minhash import WeightedMinHash
|
|
13
19
|
|
|
14
20
|
|
|
@@ -183,7 +189,7 @@ class MinHashLSH:
|
|
|
183
189
|
self._H = self._byteswap
|
|
184
190
|
|
|
185
191
|
basename = storage_config.get("basename", _random_name(11))
|
|
186
|
-
self.hashtables = [
|
|
192
|
+
self.hashtables: List[UnorderedStorage] = [
|
|
187
193
|
unordered_storage(
|
|
188
194
|
storage_config,
|
|
189
195
|
name=b"".join([basename, b"_bucket_", struct.pack(">H", i)]),
|
|
@@ -191,7 +197,7 @@ class MinHashLSH:
|
|
|
191
197
|
for i in range(self.b)
|
|
192
198
|
]
|
|
193
199
|
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
194
|
-
self.keys = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
|
|
200
|
+
self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
|
|
195
201
|
|
|
196
202
|
@property
|
|
197
203
|
def buffer_size(self) -> int:
|
|
@@ -347,7 +353,7 @@ class MinHashLSH:
|
|
|
347
353
|
"""
|
|
348
354
|
return type(self) is type(other) and self.h == other.h and self.b == other.b and self.r == other.r
|
|
349
355
|
|
|
350
|
-
def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) ->
|
|
356
|
+
def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
|
|
351
357
|
if self.__equivalent(other):
|
|
352
358
|
if check_overlap and set(self.keys).intersection(set(other.keys)):
|
|
353
359
|
raise ValueError("The keys are overlapping, duplicate key exists.")
|
|
@@ -457,16 +463,24 @@ class MinHashLSH:
|
|
|
457
463
|
list: a list of unique keys.
|
|
458
464
|
|
|
459
465
|
"""
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
466
|
+
collected_result_lists = [hashtable.collect_select_buffer() for hashtable in self.hashtables]
|
|
467
|
+
if not any(collected_result_lists):
|
|
468
|
+
return []
|
|
469
|
+
|
|
470
|
+
# Each buffered query contributes one result list per hashtable. We first
|
|
471
|
+
# union candidates across bands for each query, then intersect across the
|
|
472
|
+
# buffered queries to match repeated calls to `query()`.
|
|
473
|
+
per_query_result_sets = [
|
|
474
|
+
set().union(*query_result_lists)
|
|
475
|
+
for query_result_lists in zip(*collected_result_lists)
|
|
464
476
|
]
|
|
465
|
-
if not
|
|
477
|
+
if not per_query_result_sets:
|
|
466
478
|
return []
|
|
479
|
+
|
|
480
|
+
candidates = set.intersection(*per_query_result_sets)
|
|
467
481
|
if self.prepickle:
|
|
468
|
-
return [pickle.loads(key) for key in
|
|
469
|
-
return list(
|
|
482
|
+
return [pickle.loads(key) for key in candidates]
|
|
483
|
+
return list(candidates)
|
|
470
484
|
|
|
471
485
|
def __contains__(self, key: Hashable) -> bool:
|
|
472
486
|
"""Args:
|
|
@@ -524,6 +538,8 @@ class MinHashLSH:
|
|
|
524
538
|
return bytes(hs.byteswap().data)
|
|
525
539
|
|
|
526
540
|
def _hashed_byteswap(self, hs):
|
|
541
|
+
if self.hashfunc is None:
|
|
542
|
+
raise RuntimeError("Hash function not configured.")
|
|
527
543
|
return self.hashfunc(bytes(hs.byteswap().data))
|
|
528
544
|
|
|
529
545
|
def _query_b(self, minhash, b):
|
|
@@ -252,9 +252,9 @@ class MinHashLSHBloom:
|
|
|
252
252
|
raise ValueError("threshold must be in [0.0, 1.0]")
|
|
253
253
|
if num_perm < 2:
|
|
254
254
|
raise ValueError("Too few permutation functions")
|
|
255
|
-
if n <= 0:
|
|
255
|
+
if n is None or n <= 0:
|
|
256
256
|
raise ValueError("n for LSHBloom must be >= 0")
|
|
257
|
-
if fp >= 1.0 or fp <= 0.0:
|
|
257
|
+
if fp is None or fp >= 1.0 or fp <= 0.0:
|
|
258
258
|
raise ValueError("fp must be in (0.0, 1.0)")
|
|
259
259
|
if save_dir is None:
|
|
260
260
|
warnings.warn(
|
|
@@ -204,7 +204,7 @@ class MinHashLSHEnsemble:
|
|
|
204
204
|
if not self.is_empty():
|
|
205
205
|
raise ValueError("Cannot call index again on a non-empty index")
|
|
206
206
|
if not isinstance(entries, list):
|
|
207
|
-
queue = deque(
|
|
207
|
+
queue = deque()
|
|
208
208
|
for key, minhash, size in entries:
|
|
209
209
|
if size <= 0:
|
|
210
210
|
raise ValueError("Set size must be positive")
|
|
@@ -221,7 +221,8 @@ class MinHashLSHEnsemble:
|
|
|
221
221
|
entries.sort(key=lambda e: e[2])
|
|
222
222
|
curr_part = 0
|
|
223
223
|
for key, minhash, size in entries:
|
|
224
|
-
|
|
224
|
+
u = self.uppers[curr_part]
|
|
225
|
+
if size > u:
|
|
225
226
|
curr_part += 1
|
|
226
227
|
for r in self.indexes[curr_part]:
|
|
227
228
|
self.indexes[curr_part][r].insert(key, minhash)
|
|
@@ -9,8 +9,8 @@ from datasketch.minhash import MinHash
|
|
|
9
9
|
class MinHashLSHForest:
|
|
10
10
|
"""The LSH Forest for MinHash. It supports top-k query in Jaccard
|
|
11
11
|
similarity.
|
|
12
|
-
Instead of using prefix trees as the
|
|
13
|
-
|
|
12
|
+
Instead of using prefix trees as described in the original LSH Forest
|
|
13
|
+
paper by Bawa et al. (WWW 2005),
|
|
14
14
|
I use a sorted array to store the hash values in every
|
|
15
15
|
hash table.
|
|
16
16
|
|
|
@@ -37,7 +37,8 @@ class MinHashLSHForest:
|
|
|
37
37
|
# Maximum depth of the prefix tree
|
|
38
38
|
self.k = int(num_perm / l)
|
|
39
39
|
self.hashtables = [defaultdict(list) for _ in range(self.l)]
|
|
40
|
-
self.hashranges = [(i * self.k, (i + 1) * self.k)
|
|
40
|
+
self.hashranges = [(i * self.k, (i + 1) * self.k)
|
|
41
|
+
for i in range(self.l)]
|
|
41
42
|
self.keys = dict()
|
|
42
43
|
# This is the sorted array implementation for the prefix trees
|
|
43
44
|
self.sorted_hashtables = [[] for _ in range(self.l)]
|
|
@@ -59,7 +60,8 @@ class MinHashLSHForest:
|
|
|
59
60
|
raise ValueError("The num_perm of MinHash out of range")
|
|
60
61
|
if key in self.keys:
|
|
61
62
|
raise ValueError("The given key has already been added")
|
|
62
|
-
self.keys[key] = [self._H(minhash.hashvalues[start:end])
|
|
63
|
+
self.keys[key] = [self._H(minhash.hashvalues[start:end])
|
|
64
|
+
for start, end in self.hashranges]
|
|
63
65
|
for H, hashtable in zip(self.keys[key], self.hashtables):
|
|
64
66
|
hashtable[H].append(key)
|
|
65
67
|
|
|
@@ -73,11 +75,13 @@ class MinHashLSHForest:
|
|
|
73
75
|
if r > self.k or r <= 0 or b > self.l or b <= 0:
|
|
74
76
|
raise ValueError("parameter outside range")
|
|
75
77
|
# Generate prefixes of concatenated hash values
|
|
76
|
-
hps = [self._H(minhash.hashvalues[start
|
|
78
|
+
hps = [self._H(minhash.hashvalues[start: start + r])
|
|
79
|
+
for start, _ in self.hashranges]
|
|
77
80
|
# Set the prefix length for look-ups in the sorted hash values list
|
|
78
81
|
prefix_size = len(hps[0])
|
|
79
82
|
for ht, hp, hashtable in zip(self.sorted_hashtables, hps, self.hashtables):
|
|
80
|
-
i = self._binary_search(
|
|
83
|
+
i = self._binary_search(
|
|
84
|
+
len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
|
|
81
85
|
if i < len(ht) and ht[i][:prefix_size] == hp:
|
|
82
86
|
j = i
|
|
83
87
|
while j < len(ht) and ht[j][:prefix_size] == hp:
|
|
@@ -137,14 +141,17 @@ class MinHashLSHForest:
|
|
|
137
141
|
"""
|
|
138
142
|
byteslist = self.keys.get(key, None)
|
|
139
143
|
if byteslist is None:
|
|
140
|
-
raise KeyError(
|
|
144
|
+
raise KeyError(
|
|
145
|
+
f"The provided key does not exist in the LSHForest: {key}")
|
|
141
146
|
hashvalue_byte_size = len(byteslist[0]) // 8
|
|
142
|
-
hashvalues = np.empty(
|
|
147
|
+
hashvalues = np.empty(
|
|
148
|
+
len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
|
|
143
149
|
for index, item in enumerate(byteslist):
|
|
144
150
|
# unswap the bytes, as their representation is flipped during storage
|
|
145
151
|
hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
|
|
146
152
|
curr_index = index * hashvalue_byte_size
|
|
147
|
-
hashvalues[curr_index
|
|
153
|
+
hashvalues[curr_index: curr_index +
|
|
154
|
+
hashvalue_byte_size] = hv_segment
|
|
148
155
|
return hashvalues
|
|
149
156
|
|
|
150
157
|
def _binary_search(self, n, func):
|
|
@@ -3,15 +3,18 @@ from __future__ import annotations
|
|
|
3
3
|
import copy
|
|
4
4
|
import warnings
|
|
5
5
|
from collections.abc import Generator, Iterable
|
|
6
|
-
from typing import Callable, Optional
|
|
6
|
+
from typing import TYPE_CHECKING, Callable, Optional, Union
|
|
7
7
|
|
|
8
8
|
try:
|
|
9
9
|
from typing import Literal # py3.8+; if older, you can fallback to typing_extensions
|
|
10
|
-
except
|
|
10
|
+
except ImportError:
|
|
11
11
|
from typing_extensions import Literal
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from numpy.typing import ArrayLike
|
|
17
|
+
|
|
15
18
|
# GPU backend
|
|
16
19
|
try:
|
|
17
20
|
import cupy as cp
|
|
@@ -114,8 +117,8 @@ class MinHash:
|
|
|
114
117
|
gpu_mode: Literal["disable", "detect", "always"] = "disable",
|
|
115
118
|
hashfunc: Callable = sha1_hash32,
|
|
116
119
|
hashobj: Optional[object] = None, # Deprecated.
|
|
117
|
-
hashvalues: Optional[
|
|
118
|
-
permutations: Optional[tuple[
|
|
120
|
+
hashvalues: Optional[ArrayLike] = None,
|
|
121
|
+
permutations: Optional[Union[tuple[ArrayLike, ArrayLike], ArrayLike]] = None,
|
|
119
122
|
) -> None:
|
|
120
123
|
if hashvalues is not None:
|
|
121
124
|
num_perm = len(hashvalues)
|
|
@@ -180,7 +183,7 @@ class MinHash:
|
|
|
180
183
|
dtype=np.uint64,
|
|
181
184
|
).T
|
|
182
185
|
|
|
183
|
-
def _parse_hashvalues(self, hashvalues):
|
|
186
|
+
def _parse_hashvalues(self, hashvalues) -> np.ndarray:
|
|
184
187
|
return np.array(hashvalues, dtype=np.uint64)
|
|
185
188
|
|
|
186
189
|
def update(self, b) -> None:
|
|
@@ -26,7 +26,7 @@ except ImportError:
|
|
|
26
26
|
c_concurrent = None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def ordered_storage(config, name=None):
|
|
29
|
+
def ordered_storage(config, name=None) -> "OrderedStorage":
|
|
30
30
|
"""Return ordered storage system based on the specified config.
|
|
31
31
|
|
|
32
32
|
The canonical example of such a storage container is
|
|
@@ -62,10 +62,10 @@ def ordered_storage(config, name=None):
|
|
|
62
62
|
return RedisListStorage(config, name=name)
|
|
63
63
|
if tp == "cassandra":
|
|
64
64
|
return CassandraListStorage(config, name=name)
|
|
65
|
-
|
|
65
|
+
raise ValueError(f"Unknown storage type: {tp}")
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def unordered_storage(config, name=None):
|
|
68
|
+
def unordered_storage(config, name=None) -> "UnorderedStorage":
|
|
69
69
|
"""Return an unordered storage system based on the specified config.
|
|
70
70
|
|
|
71
71
|
The canonical example of such a storage container is
|
|
@@ -100,7 +100,7 @@ def unordered_storage(config, name=None):
|
|
|
100
100
|
return RedisSetStorage(config, name=name)
|
|
101
101
|
if tp == "cassandra":
|
|
102
102
|
return CassandraSetStorage(config, name=name)
|
|
103
|
-
|
|
103
|
+
raise ValueError(f"Unknown storage type: {tp}")
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
class Storage(ABC):
|
|
@@ -144,7 +144,7 @@ class Storage(ABC):
|
|
|
144
144
|
pass
|
|
145
145
|
|
|
146
146
|
@abstractmethod
|
|
147
|
-
def remove(self, *keys):
|
|
147
|
+
def remove(self, *keys, **kwargs):
|
|
148
148
|
"""Remove `keys` from storage."""
|
|
149
149
|
pass
|
|
150
150
|
|
|
@@ -154,12 +154,12 @@ class Storage(ABC):
|
|
|
154
154
|
pass
|
|
155
155
|
|
|
156
156
|
@abstractmethod
|
|
157
|
-
def size(self):
|
|
157
|
+
def size(self) -> int:
|
|
158
158
|
"""Return size of storage with respect to number of keys."""
|
|
159
159
|
pass
|
|
160
160
|
|
|
161
161
|
@abstractmethod
|
|
162
|
-
def itemcounts(self, **kwargs):
|
|
162
|
+
def itemcounts(self, **kwargs) -> dict:
|
|
163
163
|
"""Returns the number of items stored under each key."""
|
|
164
164
|
pass
|
|
165
165
|
|
|
@@ -168,6 +168,14 @@ class Storage(ABC):
|
|
|
168
168
|
"""Determines whether the key is in the storage or not."""
|
|
169
169
|
pass
|
|
170
170
|
|
|
171
|
+
@property
def buffer_size(self) -> int:
    """Return the buffer size stored on this instance, or 50000 if unset."""
    try:
        return self._buffer_size
    except AttributeError:
        # No `_buffer_size` attribute has been assigned; use the default.
        return 50000

@buffer_size.setter
def buffer_size(self, value: int):
    """Record ``value`` as this instance's buffer size."""
    self._buffer_size = value
|
|
178
|
+
|
|
171
179
|
def status(self):
|
|
172
180
|
return {"keyspace_size": len(self)}
|
|
173
181
|
|
|
@@ -595,12 +603,14 @@ if cassandra is not None:
|
|
|
595
603
|
del self._select_statements_and_parameters_with_decoders[:]
|
|
596
604
|
statements_and_parameters, decoders = zip(*buffer)
|
|
597
605
|
|
|
598
|
-
ret = collections.defaultdict(list)
|
|
599
606
|
query_results = self._select(statements_and_parameters)
|
|
600
|
-
|
|
607
|
+
ret = []
|
|
608
|
+
for rows, (_key_decoder, val_decoder) in zip(query_results, decoders):
|
|
609
|
+
values = []
|
|
601
610
|
for row in rows:
|
|
602
|
-
|
|
603
|
-
|
|
611
|
+
values.append((val_decoder(row.value), row.ts))
|
|
612
|
+
ret.append([x[0] for x in sorted(values, key=operator.itemgetter(1))])
|
|
613
|
+
return ret
|
|
604
614
|
|
|
605
615
|
def select(self, keys):
|
|
606
616
|
"""Select all values for the given keys.
|
|
@@ -133,14 +133,15 @@ class WeightedMinHashGenerator:
|
|
|
133
133
|
WeightedMinHash: The weighted MinHash.
|
|
134
134
|
|
|
135
135
|
"""
|
|
136
|
-
if not isinstance(v, collections.abc.
|
|
137
|
-
raise TypeError("Input vector must be
|
|
136
|
+
if not isinstance(v, collections.abc.Sized):
|
|
137
|
+
raise TypeError("Input vector must be sized")
|
|
138
138
|
if not len(v) == self.dim:
|
|
139
139
|
raise ValueError("Input dimension mismatch, expecting %d" % self.dim)
|
|
140
140
|
if not isinstance(v, np.ndarray):
|
|
141
141
|
v = np.array(v, dtype=np.float32)
|
|
142
142
|
elif v.dtype != np.float32:
|
|
143
143
|
v = v.astype(np.float32)
|
|
144
|
+
v: np.ndarray = v
|
|
144
145
|
hashvalues = np.zeros((self.sample_size, 2), dtype=int)
|
|
145
146
|
vzeros = v == 0
|
|
146
147
|
if vzeros.all():
|
|
@@ -226,9 +227,8 @@ class WeightedMinHashGenerator:
|
|
|
226
227
|
doc_argmin = np.argmin(doc_ln_a, axis=1)
|
|
227
228
|
doc_k = doc_cidx[doc_argmin]
|
|
228
229
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
hashvalues = all_hashvalues[it_doc]
|
|
230
|
+
hashvalues = np.zeros((self.sample_size, 2), dtype=int)
|
|
231
|
+
all_hashvalues[it_doc] = hashvalues
|
|
232
232
|
hashvalues[:, 0], hashvalues[:, 1] = (
|
|
233
233
|
doc_k,
|
|
234
234
|
t[np.arange(self.sample_size), doc_begin + doc_argmin],
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "datasketch"
|
|
7
|
-
version = "1.
|
|
7
|
+
version = "1.10.0"
|
|
8
8
|
description = "Probabilistic data structures for processing and searching very large datasets"
|
|
9
9
|
readme = "README.rst"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -21,6 +21,7 @@ classifiers = [
|
|
|
21
21
|
"Programming Language :: Python :: 3.10",
|
|
22
22
|
"Programming Language :: Python :: 3.11",
|
|
23
23
|
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
24
25
|
]
|
|
25
26
|
dependencies = ["numpy>=1.11", "scipy>=1.0.0"]
|
|
26
27
|
|
|
@@ -36,7 +37,11 @@ benchmark = [
|
|
|
36
37
|
"pandas>=0.25.3",
|
|
37
38
|
"SetSimilaritySearch>=0.1.7",
|
|
38
39
|
"pyfarmhash>=0.2.2",
|
|
39
|
-
"nltk>=3.4.5",
|
|
40
|
+
"nltk>=3.4.5; python_version < '3.10'",
|
|
41
|
+
"nltk>=3.9.4; python_version >= '3.10'",
|
|
42
|
+
# Transitive deps of matplotlib listed to avoid dependabot uv.lock-only PRs.
|
|
43
|
+
"pillow>=12.2.0; python_version >= '3.10'",
|
|
44
|
+
"fonttools>=4.60.2",
|
|
40
45
|
]
|
|
41
46
|
test = [
|
|
42
47
|
"cassandra-driver>=3.20",
|
|
@@ -44,12 +49,21 @@ test = [
|
|
|
44
49
|
"mock>=2.0.0",
|
|
45
50
|
"mockredispy",
|
|
46
51
|
"coverage",
|
|
52
|
+
"pytest-cov",
|
|
47
53
|
"pymongo>=3.9.0",
|
|
48
54
|
"nose>=1.3.7",
|
|
49
55
|
"nose-exclude>=0.5.0",
|
|
50
|
-
"pytest",
|
|
56
|
+
"pytest; python_version < '3.10'",
|
|
57
|
+
"pytest>=9.0.3; python_version >= '3.10'",
|
|
51
58
|
"pytest-rerunfailures",
|
|
59
|
+
"pytest-asyncio",
|
|
60
|
+
# Transitive dep of pytest listed to avoid dependabot uv.lock-only PRs.
|
|
61
|
+
"pygments>=2.20.0",
|
|
52
62
|
]
|
|
63
|
+
aio = ["aiounittest", "motor>3.6.0"]
|
|
64
|
+
# KEEP IN SYNC WITH `aio` ABOVE. Deprecated alias retained for backwards compat;
|
|
65
|
+
# PEP 621 does not support referencing one optional-dependency group from
|
|
66
|
+
# another, so the dependency list must be duplicated verbatim.
|
|
53
67
|
experimental_aio = ["aiounittest", "motor>3.6.0"]
|
|
54
68
|
|
|
55
69
|
[project.urls]
|
|
@@ -91,7 +105,6 @@ exclude = [
|
|
|
91
105
|
"dist",
|
|
92
106
|
"docs",
|
|
93
107
|
"examples",
|
|
94
|
-
"travis",
|
|
95
108
|
"datasketch/hyperloglog_const.py",
|
|
96
109
|
]
|
|
97
110
|
|
|
@@ -158,5 +171,38 @@ include = ["pyproject.toml", "README.rst", "LICENSE", "datasketch/**"]
|
|
|
158
171
|
|
|
159
172
|
[tool.pytest.ini_options]
|
|
160
173
|
minversion = "6.0"
|
|
161
|
-
addopts = ["--strict-markers", "--color=yes"]
|
|
174
|
+
addopts = ["--strict-markers", "--color=yes", "--cov-report=xml"]
|
|
162
175
|
testpaths = ["test"]
|
|
176
|
+
asyncio_mode = "auto"
|
|
177
|
+
|
|
178
|
+
[tool.pyright]
|
|
179
|
+
include = ["datasketch"]
|
|
180
|
+
exclude = [
|
|
181
|
+
"benchmark",
|
|
182
|
+
"docs",
|
|
183
|
+
"examples",
|
|
184
|
+
"test",
|
|
185
|
+
"travis",
|
|
186
|
+
"**/.venv/**",
|
|
187
|
+
"**/__pycache__",
|
|
188
|
+
]
|
|
189
|
+
pythonVersion = "3.9"
|
|
190
|
+
typeCheckingMode = "basic" # todo: change to "strict" in future
|
|
191
|
+
|
|
192
|
+
reportMissingImports = "none"
|
|
193
|
+
reportUnusedVariable = "warning"
|
|
194
|
+
reportAttributeAccessIssue = "none"
|
|
195
|
+
reportOptionalMemberAccess = "none"
|
|
196
|
+
reportGeneralTypeIssues = "none"
|
|
197
|
+
reportArgumentType = "none"
|
|
198
|
+
reportOptionalIterable = "none"
|
|
199
|
+
reportReturnType = "none"
|
|
200
|
+
reportRedeclaration = "none"
|
|
201
|
+
reportOperatorIssue = "none"
|
|
202
|
+
reportAssignmentType = "none"
|
|
203
|
+
reportOptionalSubscript = "none"
|
|
204
|
+
reportCallIssue = "none"
|
|
205
|
+
|
|
206
|
+
[tool.coverage.run]
|
|
207
|
+
source = ["datasketch"]
|
|
208
|
+
omit = ["*/tests/*", "*/test/*"]
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
"""Warning.
|
|
2
|
-
|
|
3
|
-
datasketch.experimental is dedicated to new modules that are to be merged into
|
|
4
|
-
the stable interface of datasketch. So their interfaces may change in future
|
|
5
|
-
versions.
|
|
6
|
-
|
|
7
|
-
To add a new class or function, register it here in this file. For example:
|
|
8
|
-
|
|
9
|
-
from new_module import NewModuleClass
|
|
10
|
-
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
from datasketch.experimental.aio.lsh import AsyncMinHashLSH
|
|
14
|
-
|
|
15
|
-
__all__ = ["AsyncMinHashLSH"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|