datasketch 1.9.0__tar.gz → 1.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.9.0 → datasketch-1.10.0}/PKG-INFO +11 -3
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/__init__.py +2 -1
- datasketch-1.10.0/datasketch/aio/__init__.py +44 -0
- {datasketch-1.9.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/lsh.py +57 -73
- {datasketch-1.9.0/datasketch/experimental → datasketch-1.10.0/datasketch}/aio/storage.py +44 -14
- datasketch-1.10.0/datasketch/experimental/__init__.py +49 -0
- datasketch-1.10.0/datasketch/experimental/aio/__init__.py +50 -0
- datasketch-1.10.0/datasketch/experimental/aio/lsh.py +49 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lsh.py +15 -7
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshensemble.py +1 -1
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshforest.py +16 -9
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/storage.py +6 -4
- {datasketch-1.9.0 → datasketch-1.10.0}/pyproject.toml +15 -4
- datasketch-1.9.0/datasketch/experimental/__init__.py +0 -15
- datasketch-1.9.0/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/.gitignore +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/LICENSE +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/README.rst +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/b_bit_minhash.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hyperloglog.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lean_minhash.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lsh_bloom.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/lshensemble_partition.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/minhash.py +0 -0
- {datasketch-1.9.0 → datasketch-1.10.0}/datasketch/weighted_minhash.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.10.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
@@ -23,10 +23,16 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
23
23
|
Requires-Python: >=3.9
|
|
24
24
|
Requires-Dist: numpy>=1.11
|
|
25
25
|
Requires-Dist: scipy>=1.0.0
|
|
26
|
+
Provides-Extra: aio
|
|
27
|
+
Requires-Dist: aiounittest; extra == 'aio'
|
|
28
|
+
Requires-Dist: motor>3.6.0; extra == 'aio'
|
|
26
29
|
Provides-Extra: benchmark
|
|
30
|
+
Requires-Dist: fonttools>=4.60.2; extra == 'benchmark'
|
|
27
31
|
Requires-Dist: matplotlib>=3.1.2; extra == 'benchmark'
|
|
28
|
-
Requires-Dist: nltk>=3.4.5; extra == 'benchmark'
|
|
32
|
+
Requires-Dist: nltk>=3.4.5; (python_version < '3.10') and extra == 'benchmark'
|
|
33
|
+
Requires-Dist: nltk>=3.9.4; (python_version >= '3.10') and extra == 'benchmark'
|
|
29
34
|
Requires-Dist: pandas>=0.25.3; extra == 'benchmark'
|
|
35
|
+
Requires-Dist: pillow>=12.2.0; (python_version >= '3.10') and extra == 'benchmark'
|
|
30
36
|
Requires-Dist: pyfarmhash>=0.2.2; extra == 'benchmark'
|
|
31
37
|
Requires-Dist: pyhash>=0.9.3; extra == 'benchmark'
|
|
32
38
|
Requires-Dist: scikit-learn>=0.21.3; extra == 'benchmark'
|
|
@@ -48,11 +54,13 @@ Requires-Dist: mock>=2.0.0; extra == 'test'
|
|
|
48
54
|
Requires-Dist: mockredispy; extra == 'test'
|
|
49
55
|
Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
50
56
|
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
57
|
+
Requires-Dist: pygments>=2.20.0; extra == 'test'
|
|
51
58
|
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
52
|
-
Requires-Dist: pytest; extra == 'test'
|
|
53
59
|
Requires-Dist: pytest-asyncio; extra == 'test'
|
|
54
60
|
Requires-Dist: pytest-cov; extra == 'test'
|
|
55
61
|
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
62
|
+
Requires-Dist: pytest; (python_version < '3.10') and extra == 'test'
|
|
63
|
+
Requires-Dist: pytest>=9.0.3; (python_version >= '3.10') and extra == 'test'
|
|
56
64
|
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
57
65
|
Description-Content-Type: text/x-rst
|
|
58
66
|
|
|
@@ -7,6 +7,7 @@ except importlib.metadata.PackageNotFoundError:
|
|
|
7
7
|
_version = "0.0.0" # Fallback for development mode
|
|
8
8
|
__version__: Final[str] = _version
|
|
9
9
|
|
|
10
|
+
from datasketch.aio import AsyncMinHashLSH # Instantiation requires motor/redis.asyncio; import itself is always safe.
|
|
10
11
|
from datasketch.b_bit_minhash import bBitMinHash
|
|
11
12
|
from datasketch.hashfunc import sha1_hash32
|
|
12
13
|
from datasketch.hnsw import HNSW
|
|
@@ -23,9 +24,9 @@ from datasketch.weighted_minhash import WeightedMinHash, WeightedMinHashGenerato
|
|
|
23
24
|
WeightedMinHashLSH = MinHashLSH
|
|
24
25
|
WeightedMinHashLSHForest = MinHashLSHForest
|
|
25
26
|
|
|
26
|
-
|
|
27
27
|
__all__ = [
|
|
28
28
|
"HNSW",
|
|
29
|
+
"AsyncMinHashLSH",
|
|
29
30
|
"HyperLogLog",
|
|
30
31
|
"HyperLogLogPlusPlus",
|
|
31
32
|
"LeanMinHash",
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Async MinHash LSH module.
|
|
2
|
+
|
|
3
|
+
This module provides asynchronous implementations of MinHash LSH for use with
|
|
4
|
+
async storage backends like MongoDB (via motor) and Redis (via redis.asyncio).
|
|
5
|
+
|
|
6
|
+
Example:
|
|
7
|
+
.. code-block:: python
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
|
|
11
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
12
|
+
from datasketch import MinHash
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def main():
|
|
16
|
+
# prepickle=True lets you use non-bytes keys (e.g. str). With the
|
|
17
|
+
# default prepickle=False, keys passed to insert() must be bytes.
|
|
18
|
+
async with AsyncMinHashLSH(
|
|
19
|
+
storage_config={"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}},
|
|
20
|
+
threshold=0.5,
|
|
21
|
+
num_perm=128,
|
|
22
|
+
prepickle=True,
|
|
23
|
+
) as lsh:
|
|
24
|
+
m = MinHash(num_perm=128)
|
|
25
|
+
m.update(b"data")
|
|
26
|
+
await lsh.insert("key", m)
|
|
27
|
+
result = await lsh.query(m)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
asyncio.run(main())
|
|
31
|
+
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from datasketch.aio.lsh import (
|
|
35
|
+
AsyncMinHashLSH,
|
|
36
|
+
AsyncMinHashLSHDeleteSession,
|
|
37
|
+
AsyncMinHashLSHInsertionSession,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
"AsyncMinHashLSH",
|
|
42
|
+
"AsyncMinHashLSHDeleteSession",
|
|
43
|
+
"AsyncMinHashLSHInsertionSession",
|
|
44
|
+
]
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
+
"""Asynchronous MinHash LSH implementation.
|
|
2
|
+
|
|
3
|
+
This module provides AsyncMinHashLSH for use with async storage backends
|
|
4
|
+
like MongoDB (via motor) and Redis (via redis.asyncio).
|
|
5
|
+
"""
|
|
6
|
+
|
|
1
7
|
import asyncio
|
|
2
8
|
import pickle
|
|
3
9
|
from itertools import chain
|
|
4
10
|
from typing import Optional
|
|
5
11
|
|
|
6
|
-
from datasketch.
|
|
12
|
+
from datasketch.aio.storage import (
|
|
7
13
|
async_ordered_storage,
|
|
8
14
|
async_unordered_storage,
|
|
9
15
|
)
|
|
@@ -34,8 +40,6 @@ class AsyncMinHashLSH:
|
|
|
34
40
|
MONGO = {"type": "aiomongo", "basename": "base_name_1", "mongo": {"host": "localhost", "port": 27017}}
|
|
35
41
|
|
|
36
42
|
.. note::
|
|
37
|
-
* The module supports Python version >=3.6, and is currently experimental.
|
|
38
|
-
So the interface may change slightly in the future.
|
|
39
43
|
* For main functionality of LSH algorithm see :class:`datasketch.MinHashLSH`.
|
|
40
44
|
* For additional information see :ref:`minhash_lsh_at_scale` and :ref:`minhash_lsh_async`
|
|
41
45
|
"""
|
|
@@ -129,7 +133,7 @@ class AsyncMinHashLSH:
|
|
|
129
133
|
if self.keys is not None:
|
|
130
134
|
self.keys.batch_size = value
|
|
131
135
|
else:
|
|
132
|
-
raise AttributeError("
|
|
136
|
+
raise AttributeError("AsyncMinHashLSH is not initialized.")
|
|
133
137
|
|
|
134
138
|
for t in self.hashtables:
|
|
135
139
|
t.batch_size = value
|
|
@@ -163,12 +167,6 @@ class AsyncMinHashLSH:
|
|
|
163
167
|
if self.keys is None:
|
|
164
168
|
await self._create_storages()
|
|
165
169
|
|
|
166
|
-
if not self.keys.initialized:
|
|
167
|
-
await self.keys
|
|
168
|
-
|
|
169
|
-
fs = (ht for ht in self.hashtables if not ht.initialized)
|
|
170
|
-
await asyncio.gather(*fs)
|
|
171
|
-
|
|
172
170
|
async def close(self):
|
|
173
171
|
"""Cleanup client resources and disconnect from AsyncMinHashLSH storage."""
|
|
174
172
|
async with self._lock:
|
|
@@ -189,41 +187,26 @@ class AsyncMinHashLSH:
|
|
|
189
187
|
|
|
190
188
|
:param int batch_size: the size of chunks to use in insert_session mode (default=10000).
|
|
191
189
|
|
|
192
|
-
:return: datasketch.
|
|
190
|
+
:return: datasketch.aio.lsh.AsyncMinHashLSHInsertionSession
|
|
193
191
|
|
|
194
192
|
Example:
|
|
195
193
|
.. code-block:: python
|
|
196
194
|
|
|
197
|
-
|
|
195
|
+
import asyncio
|
|
196
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
198
197
|
from datasketch import MinHash
|
|
199
198
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
_chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
|
|
207
|
-
seq = frozenset(
|
|
208
|
-
chain(
|
|
209
|
-
("".join(s) for s in _chunked_str),
|
|
210
|
-
("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
|
|
211
|
-
)
|
|
212
|
-
)
|
|
213
|
-
objs = [MinHash(16) for _ in range(len(seq))]
|
|
214
|
-
for e, obj in zip(seq, objs):
|
|
215
|
-
for i in e:
|
|
216
|
-
obj.update(i.encode("utf-8"))
|
|
217
|
-
data = [(e, m) for e, m in zip(seq, objs)]
|
|
218
|
-
|
|
219
|
-
_storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
async def func():
|
|
223
|
-
async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
|
|
199
|
+
async def main():
|
|
200
|
+
storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
201
|
+
async with AsyncMinHashLSH(
|
|
202
|
+
storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
|
|
203
|
+
) as lsh:
|
|
224
204
|
async with lsh.insertion_session(batch_size=1000) as session:
|
|
225
|
-
|
|
226
|
-
|
|
205
|
+
m = MinHash(num_perm=16)
|
|
206
|
+
m.update(b"data")
|
|
207
|
+
await session.insert("key", m)
|
|
208
|
+
|
|
209
|
+
asyncio.run(main())
|
|
227
210
|
|
|
228
211
|
"""
|
|
229
212
|
return AsyncMinHashLSHInsertionSession(self, batch_size=batch_size)
|
|
@@ -232,47 +215,32 @@ class AsyncMinHashLSH:
|
|
|
232
215
|
"""Create a asynchronous context manager for fast removal of keys
|
|
233
216
|
from index.
|
|
234
217
|
|
|
235
|
-
:param int batch_size: the size of chunks to use in
|
|
218
|
+
:param int batch_size: the size of chunks to use in delete_session mode (default=10000).
|
|
236
219
|
|
|
237
|
-
:return: datasketch.
|
|
220
|
+
:return: datasketch.aio.lsh.AsyncMinHashLSHDeleteSession
|
|
238
221
|
|
|
239
222
|
Example:
|
|
240
223
|
.. code-block:: python
|
|
241
224
|
|
|
242
|
-
|
|
225
|
+
import asyncio
|
|
226
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
243
227
|
from datasketch import MinHash
|
|
244
228
|
|
|
229
|
+
async def main():
|
|
230
|
+
storage_config = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
231
|
+
async with AsyncMinHashLSH(
|
|
232
|
+
storage_config=storage_config, threshold=0.5, num_perm=16, prepickle=True
|
|
233
|
+
) as lsh:
|
|
234
|
+
# Insert some data first
|
|
235
|
+
m = MinHash(num_perm=16)
|
|
236
|
+
m.update(b"data")
|
|
237
|
+
await lsh.insert("key1", m)
|
|
245
238
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
239
|
+
# Delete using session
|
|
240
|
+
async with lsh.delete_session(batch_size=100) as session:
|
|
241
|
+
await session.remove("key1")
|
|
249
242
|
|
|
250
|
-
|
|
251
|
-
_chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
|
|
252
|
-
seq = frozenset(
|
|
253
|
-
chain(
|
|
254
|
-
("".join(s) for s in _chunked_str),
|
|
255
|
-
("aahhb", "aahh", "aahhc", "aac", "kld", "bhg", "kkd", "yow", "ppi", "eer"),
|
|
256
|
-
)
|
|
257
|
-
)
|
|
258
|
-
objs = [MinHash(16) for _ in range(len(seq))]
|
|
259
|
-
for e, obj in zip(seq, objs):
|
|
260
|
-
for i in e:
|
|
261
|
-
obj.update(i.encode("utf-8"))
|
|
262
|
-
data = [(e, m) for e, m in zip(seq, objs)]
|
|
263
|
-
|
|
264
|
-
_storage_config_redis = {"type": "aiomongo", "mongo": {"host": "localhost", "port": 27017}}
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
async def func():
|
|
268
|
-
async with AsyncMinHashLSH(storage_config=_storage_config_redis, threshold=0.5, num_perm=16) as lsh:
|
|
269
|
-
async with lsh.insertion_session(batch_size=1000) as session:
|
|
270
|
-
fs = (session.insert(key, minhash, check_duplication=True) for key, minhash in data)
|
|
271
|
-
await asyncio.gather(*fs)
|
|
272
|
-
|
|
273
|
-
async with lsh.delete_session(batch_size=3) as session:
|
|
274
|
-
fs = (session.remove(key) for key in keys_to_remove)
|
|
275
|
-
await asyncio.gather(*fs)
|
|
243
|
+
asyncio.run(main())
|
|
276
244
|
|
|
277
245
|
"""
|
|
278
246
|
return AsyncMinHashLSHDeleteSession(self, batch_size=batch_size)
|
|
@@ -288,7 +256,9 @@ class AsyncMinHashLSH:
|
|
|
288
256
|
if self.prepickle:
|
|
289
257
|
key = pickle.dumps(key)
|
|
290
258
|
|
|
291
|
-
|
|
259
|
+
# `key` is already pickled at this point under prepickle=True; call the
|
|
260
|
+
# storage primitive directly so we don't re-pickle through has_key().
|
|
261
|
+
if check_duplication and await self.keys.has_key(key):
|
|
292
262
|
raise ValueError("The given key already exists")
|
|
293
263
|
Hs = [self._H(minhash.hashvalues[start:end]) for start, end in self.hashranges]
|
|
294
264
|
|
|
@@ -314,6 +284,8 @@ class AsyncMinHashLSH:
|
|
|
314
284
|
|
|
315
285
|
async def has_key(self, key):
|
|
316
286
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
287
|
+
if self.prepickle:
|
|
288
|
+
key = pickle.dumps(key)
|
|
317
289
|
return await self.keys.has_key(key)
|
|
318
290
|
|
|
319
291
|
async def remove(self, key):
|
|
@@ -321,7 +293,12 @@ class AsyncMinHashLSH:
|
|
|
321
293
|
await self._remove(key, buffer=False)
|
|
322
294
|
|
|
323
295
|
async def _remove(self, key, buffer=False):
|
|
324
|
-
if
|
|
296
|
+
if self.prepickle:
|
|
297
|
+
key = pickle.dumps(key)
|
|
298
|
+
|
|
299
|
+
# `key` is already pickled here; call storage primitives directly so
|
|
300
|
+
# the existence check, lookup, and deletes all use the stored form.
|
|
301
|
+
if not await self.keys.has_key(key):
|
|
325
302
|
raise ValueError("The given key does not exist")
|
|
326
303
|
|
|
327
304
|
for H, hashtable in zip(await self.keys.get(key), self.hashtables):
|
|
@@ -352,7 +329,10 @@ class AsyncMinHashLSH:
|
|
|
352
329
|
H = self._H(minhash.hashvalues[start:end])
|
|
353
330
|
if await hashtable.has_key(H):
|
|
354
331
|
fs.append(hashtable.get(H))
|
|
355
|
-
|
|
332
|
+
candidates = set(chain.from_iterable(await asyncio.gather(*fs)))
|
|
333
|
+
if self.prepickle:
|
|
334
|
+
return {pickle.loads(key) for key in candidates}
|
|
335
|
+
return candidates
|
|
356
336
|
|
|
357
337
|
async def get_counts(self):
|
|
358
338
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
@@ -361,6 +341,10 @@ class AsyncMinHashLSH:
|
|
|
361
341
|
|
|
362
342
|
async def get_subset_counts(self, *keys):
|
|
363
343
|
"""See :class:`datasketch.MinHashLSH`."""
|
|
344
|
+
# Keys in storage are pickled when prepickle is enabled, so we have to
|
|
345
|
+
# pickle the query keys to match the stored representation.
|
|
346
|
+
if self.prepickle:
|
|
347
|
+
keys = tuple(pickle.dumps(key) for key in keys)
|
|
364
348
|
key_set = list(set(keys))
|
|
365
349
|
hashtables = [unordered_storage({"type": "dict"}) for _ in range(self.b)]
|
|
366
350
|
Hss = await self.keys.getmany(*key_set)
|
|
@@ -1,9 +1,23 @@
|
|
|
1
|
+
"""Async storage backends for MinHash LSH.
|
|
2
|
+
|
|
3
|
+
This module provides async storage implementations for use with AsyncMinHashLSH:
|
|
4
|
+
- AsyncMongoListStorage / AsyncMongoSetStorage: MongoDB storage via motor
|
|
5
|
+
- AsyncRedisListStorage / AsyncRedisSetStorage: Redis storage via redis.asyncio
|
|
6
|
+
"""
|
|
7
|
+
|
|
1
8
|
import asyncio
|
|
2
9
|
import os
|
|
3
10
|
from abc import ABCMeta
|
|
4
11
|
from itertools import chain
|
|
5
12
|
|
|
6
|
-
from datasketch.storage import OrderedStorage,
|
|
13
|
+
from datasketch.storage import OrderedStorage, Storage, UnorderedStorage, _random_name
|
|
14
|
+
|
|
15
|
+
# RedisStorage is only available when redis package is installed (optional dependency)
|
|
16
|
+
# Import it conditionally to avoid ImportError when redis is not installed
|
|
17
|
+
try:
|
|
18
|
+
from datasketch.storage import RedisStorage
|
|
19
|
+
except ImportError:
|
|
20
|
+
RedisStorage = None
|
|
7
21
|
|
|
8
22
|
ABC = ABCMeta("ABC", (object,), {})
|
|
9
23
|
|
|
@@ -24,6 +38,12 @@ except ImportError:
|
|
|
24
38
|
redis = None
|
|
25
39
|
|
|
26
40
|
|
|
41
|
+
__all__ = [
|
|
42
|
+
"async_ordered_storage",
|
|
43
|
+
"async_unordered_storage",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
27
47
|
async def async_ordered_storage(config, name=None):
|
|
28
48
|
tp = config["type"]
|
|
29
49
|
if tp == "aiomongo":
|
|
@@ -55,9 +75,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
55
75
|
class AsyncMongoBuffer:
|
|
56
76
|
def __init__(self, aio_mongo_collection, batch_size):
|
|
57
77
|
self._batch_size = batch_size
|
|
58
|
-
self._insert_documents_stack =
|
|
59
|
-
self._delete_by_key_documents_stack =
|
|
60
|
-
self._delete_by_val_documents_stack =
|
|
78
|
+
self._insert_documents_stack = []
|
|
79
|
+
self._delete_by_key_documents_stack = []
|
|
80
|
+
self._delete_by_val_documents_stack = []
|
|
61
81
|
self._mongo_coll = aio_mongo_collection
|
|
62
82
|
|
|
63
83
|
@property
|
|
@@ -73,28 +93,28 @@ if motor is not None and ReturnDocument is not None:
|
|
|
73
93
|
if command == "insert":
|
|
74
94
|
if len(self._insert_documents_stack) >= self.batch_size:
|
|
75
95
|
await self.execute(command)
|
|
76
|
-
self._insert_documents_stack
|
|
96
|
+
self._insert_documents_stack.append(kwargs["obj"])
|
|
77
97
|
elif command == "delete_by_key":
|
|
78
98
|
if len(self._delete_by_key_documents_stack) >= self.batch_size:
|
|
79
99
|
await self.execute(command)
|
|
80
|
-
self._delete_by_key_documents_stack
|
|
100
|
+
self._delete_by_key_documents_stack.append(kwargs["key"])
|
|
81
101
|
elif command == "delete_by_val":
|
|
82
102
|
if len(self._delete_by_val_documents_stack) >= self.batch_size:
|
|
83
103
|
await self.execute(command)
|
|
84
|
-
self._delete_by_val_documents_stack
|
|
104
|
+
self._delete_by_val_documents_stack.append(kwargs["val"])
|
|
85
105
|
|
|
86
106
|
async def execute(self, command):
|
|
87
107
|
if command == "insert" and self._insert_documents_stack:
|
|
88
108
|
buffer = self._insert_documents_stack
|
|
89
|
-
self._insert_documents_stack =
|
|
109
|
+
self._insert_documents_stack = []
|
|
90
110
|
await self._mongo_coll.insert_many(buffer, ordered=False)
|
|
91
111
|
elif command == "delete_by_key" and self._delete_by_key_documents_stack:
|
|
92
112
|
buffer = self._delete_by_key_documents_stack
|
|
93
|
-
self._delete_by_key_documents_stack =
|
|
113
|
+
self._delete_by_key_documents_stack = []
|
|
94
114
|
await self._mongo_coll.delete_many({"key": {"$in": buffer}})
|
|
95
115
|
elif command == "delete_by_val" and self._delete_by_val_documents_stack:
|
|
96
116
|
buffer = self._delete_by_val_documents_stack
|
|
97
|
-
self._delete_by_val_documents_stack =
|
|
117
|
+
self._delete_by_val_documents_stack = []
|
|
98
118
|
await self._mongo_coll.delete_many({"vals": {"$in": buffer}})
|
|
99
119
|
|
|
100
120
|
async def insert_one(self, **kwargs):
|
|
@@ -254,6 +274,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
254
274
|
async def has_key(self, key):
|
|
255
275
|
return bool(await self._collection.find_one({"key": key}))
|
|
256
276
|
|
|
277
|
+
async def getmany(self, *keys):
|
|
278
|
+
return await asyncio.gather(*(self.get(key) for key in keys))
|
|
279
|
+
|
|
257
280
|
async def status(self):
|
|
258
281
|
status = self._parse_config(self.config["mongo"])
|
|
259
282
|
status.update({"keyspace_size": await self.size()})
|
|
@@ -285,7 +308,9 @@ if motor is not None and ReturnDocument is not None:
|
|
|
285
308
|
await self._collection.find_one_and_delete({"key": key, "vals": val})
|
|
286
309
|
|
|
287
310
|
|
|
288
|
-
|
|
311
|
+
# Redis-based async storage classes are only defined when both redis package
|
|
312
|
+
# and RedisStorage are available (optional dependencies)
|
|
313
|
+
if redis is not None and RedisStorage is not None:
|
|
289
314
|
|
|
290
315
|
class AsyncRedisBuffer(redis.client.Pipeline):
|
|
291
316
|
def __init__(self, connection_pool, response_callbacks, transaction, buffer_size, shard_hint=None):
|
|
@@ -304,7 +329,7 @@ if redis is not None:
|
|
|
304
329
|
|
|
305
330
|
async def execute_command(self, *args, **kwargs):
|
|
306
331
|
if len(self.command_stack) >= self._buffer_size:
|
|
307
|
-
self.execute()
|
|
332
|
+
await self.execute()
|
|
308
333
|
await super(AsyncRedisBuffer, self).execute_command(*args, **kwargs)
|
|
309
334
|
|
|
310
335
|
class AsyncRedisStorage(RedisStorage):
|
|
@@ -347,9 +372,8 @@ if redis is not None:
|
|
|
347
372
|
|
|
348
373
|
async def getmany(self, *keys):
|
|
349
374
|
pipe = self._redis.pipeline()
|
|
350
|
-
pipe.multi()
|
|
351
375
|
for key in keys:
|
|
352
|
-
|
|
376
|
+
pipe.lrange(self.redis_key(key), 0, -1)
|
|
353
377
|
return await pipe.execute()
|
|
354
378
|
|
|
355
379
|
@staticmethod
|
|
@@ -422,6 +446,12 @@ if redis is not None:
|
|
|
422
446
|
async def _get_items(r, k):
|
|
423
447
|
return await r.smembers(k)
|
|
424
448
|
|
|
449
|
+
async def getmany(self, *keys):
|
|
450
|
+
pipe = self._redis.pipeline()
|
|
451
|
+
for key in keys:
|
|
452
|
+
pipe.smembers(self.redis_key(key))
|
|
453
|
+
return await pipe.execute()
|
|
454
|
+
|
|
425
455
|
async def remove_val(self, key, val, **kwargs):
|
|
426
456
|
buffer = kwargs.pop("buffer", False)
|
|
427
457
|
redis_key = self.redis_key(key)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Deprecated experimental module.
|
|
2
|
+
|
|
3
|
+
.. deprecated::
|
|
4
|
+
The `datasketch.experimental` module is deprecated and will be removed in a future version.
|
|
5
|
+
Please use `datasketch.aio` instead:
|
|
6
|
+
|
|
7
|
+
Old: ``from datasketch.experimental import AsyncMinHashLSH``
|
|
8
|
+
New: ``from datasketch.aio import AsyncMinHashLSH``
|
|
9
|
+
|
|
10
|
+
Or simply: ``from datasketch import AsyncMinHashLSH``
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
# Visible to static analyzers so they know `__all__` is satisfied.
|
|
17
|
+
# Not imported at runtime - the real dispatch happens in __getattr__.
|
|
18
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
19
|
+
|
|
20
|
+
__all__ = ["AsyncMinHashLSH"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def __getattr__(name):
|
|
24
|
+
# PEP 562: only emit the DeprecationWarning when the user actually pulls a
|
|
25
|
+
# symbol out of this package, not on every `import datasketch.experimental`.
|
|
26
|
+
# This avoids the noisy triple-warning that fired when each intermediate
|
|
27
|
+
# __init__.py warned eagerly.
|
|
28
|
+
#
|
|
29
|
+
# We cache the resolved symbol back into globals() so subsequent accesses
|
|
30
|
+
# bypass __getattr__. This matters for two reasons:
|
|
31
|
+
# 1. `from pkg import x` internally performs both `hasattr(pkg, x)` and
|
|
32
|
+
# `getattr(pkg, x)`, so without caching __getattr__ fires twice.
|
|
33
|
+
# 2. It makes the warning a one-shot per process, which is the normal
|
|
34
|
+
# expectation for deprecation warnings.
|
|
35
|
+
if name == "AsyncMinHashLSH":
|
|
36
|
+
import warnings
|
|
37
|
+
|
|
38
|
+
warnings.warn(
|
|
39
|
+
"datasketch.experimental is deprecated. "
|
|
40
|
+
"Use 'from datasketch.aio import AsyncMinHashLSH' or "
|
|
41
|
+
"'from datasketch import AsyncMinHashLSH' instead.",
|
|
42
|
+
DeprecationWarning,
|
|
43
|
+
stacklevel=2,
|
|
44
|
+
)
|
|
45
|
+
from datasketch.aio import AsyncMinHashLSH
|
|
46
|
+
|
|
47
|
+
globals()[name] = AsyncMinHashLSH
|
|
48
|
+
return AsyncMinHashLSH
|
|
49
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Deprecated experimental aio module.
|
|
2
|
+
|
|
3
|
+
.. deprecated::
|
|
4
|
+
The `datasketch.experimental.aio` module is deprecated and will be removed in a future version.
|
|
5
|
+
Please use `datasketch.aio` instead:
|
|
6
|
+
|
|
7
|
+
Old: ``from datasketch.experimental.aio import AsyncMinHashLSH``
|
|
8
|
+
New: ``from datasketch.aio import AsyncMinHashLSH``
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
# Visible to static analyzers so they know `__all__` is satisfied.
|
|
15
|
+
# Not imported at runtime - the real dispatch happens in __getattr__.
|
|
16
|
+
from datasketch.aio import (
|
|
17
|
+
AsyncMinHashLSH,
|
|
18
|
+
AsyncMinHashLSHDeleteSession,
|
|
19
|
+
AsyncMinHashLSHInsertionSession,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AsyncMinHashLSH",
|
|
24
|
+
"AsyncMinHashLSHDeleteSession",
|
|
25
|
+
"AsyncMinHashLSHInsertionSession",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
_DEPRECATED = frozenset(__all__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __getattr__(name):
|
|
32
|
+
# PEP 562: emit the warning lazily on attribute access so that merely
|
|
33
|
+
# importing the parent package (e.g. as an intermediate step of
|
|
34
|
+
# `from datasketch.experimental.aio.lsh import ...`) does not fire
|
|
35
|
+
# a second, redundant warning. See the long comment in
|
|
36
|
+
# datasketch/experimental/__init__.py for why we cache into globals().
|
|
37
|
+
if name in _DEPRECATED:
|
|
38
|
+
import warnings
|
|
39
|
+
|
|
40
|
+
warnings.warn(
|
|
41
|
+
"datasketch.experimental.aio is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
|
|
42
|
+
DeprecationWarning,
|
|
43
|
+
stacklevel=2,
|
|
44
|
+
)
|
|
45
|
+
import datasketch.aio as _new
|
|
46
|
+
|
|
47
|
+
value = getattr(_new, name)
|
|
48
|
+
globals()[name] = value
|
|
49
|
+
return value
|
|
50
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Deprecated experimental aio lsh module.
|
|
2
|
+
|
|
3
|
+
.. deprecated::
|
|
4
|
+
The `datasketch.experimental.aio.lsh` module is deprecated and will be removed in a future version.
|
|
5
|
+
Please use `datasketch.aio.lsh` instead:
|
|
6
|
+
|
|
7
|
+
Old: ``from datasketch.experimental.aio.lsh import AsyncMinHashLSH``
|
|
8
|
+
New: ``from datasketch.aio import AsyncMinHashLSH``
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
# Visible to static analyzers so they know `__all__` is satisfied.
|
|
15
|
+
# Not imported at runtime - the real dispatch happens in __getattr__.
|
|
16
|
+
from datasketch.aio.lsh import (
|
|
17
|
+
AsyncMinHashLSH,
|
|
18
|
+
AsyncMinHashLSHDeleteSession,
|
|
19
|
+
AsyncMinHashLSHInsertionSession,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"AsyncMinHashLSH",
|
|
24
|
+
"AsyncMinHashLSHDeleteSession",
|
|
25
|
+
"AsyncMinHashLSHInsertionSession",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
_DEPRECATED = frozenset(__all__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __getattr__(name):
|
|
32
|
+
# Lazy warning via PEP 562: fires exactly once per attribute access on the
|
|
33
|
+
# deprecated module, and we cache the resolved symbol back into globals()
|
|
34
|
+
# so the warning is emitted once per process (see the long comment in
|
|
35
|
+
# datasketch/experimental/__init__.py for rationale).
|
|
36
|
+
if name in _DEPRECATED:
|
|
37
|
+
import warnings
|
|
38
|
+
|
|
39
|
+
warnings.warn(
|
|
40
|
+
"datasketch.experimental.aio.lsh is deprecated. Use 'from datasketch.aio import AsyncMinHashLSH' instead.",
|
|
41
|
+
DeprecationWarning,
|
|
42
|
+
stacklevel=2,
|
|
43
|
+
)
|
|
44
|
+
import datasketch.aio.lsh as _new
|
|
45
|
+
|
|
46
|
+
value = getattr(_new, name)
|
|
47
|
+
globals()[name] = value
|
|
48
|
+
return value
|
|
49
|
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
@@ -463,16 +463,24 @@ class MinHashLSH:
|
|
|
463
463
|
list: a list of unique keys.
|
|
464
464
|
|
|
465
465
|
"""
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
466
|
+
collected_result_lists = [hashtable.collect_select_buffer() for hashtable in self.hashtables]
|
|
467
|
+
if not any(collected_result_lists):
|
|
468
|
+
return []
|
|
469
|
+
|
|
470
|
+
# Each buffered query contributes one result list per hashtable. We first
|
|
471
|
+
# union candidates across bands for each query, then intersect across the
|
|
472
|
+
# buffered queries to match repeated calls to `query()`.
|
|
473
|
+
per_query_result_sets = [
|
|
474
|
+
set().union(*query_result_lists)
|
|
475
|
+
for query_result_lists in zip(*collected_result_lists)
|
|
470
476
|
]
|
|
471
|
-
if not
|
|
477
|
+
if not per_query_result_sets:
|
|
472
478
|
return []
|
|
479
|
+
|
|
480
|
+
candidates = set.intersection(*per_query_result_sets)
|
|
473
481
|
if self.prepickle:
|
|
474
|
-
return [pickle.loads(key) for key in
|
|
475
|
-
return list(
|
|
482
|
+
return [pickle.loads(key) for key in candidates]
|
|
483
|
+
return list(candidates)
|
|
476
484
|
|
|
477
485
|
def __contains__(self, key: Hashable) -> bool:
|
|
478
486
|
"""Args:
|
|
@@ -204,7 +204,7 @@ class MinHashLSHEnsemble:
|
|
|
204
204
|
if not self.is_empty():
|
|
205
205
|
raise ValueError("Cannot call index again on a non-empty index")
|
|
206
206
|
if not isinstance(entries, list):
|
|
207
|
-
queue = deque(
|
|
207
|
+
queue = deque()
|
|
208
208
|
for key, minhash, size in entries:
|
|
209
209
|
if size <= 0:
|
|
210
210
|
raise ValueError("Set size must be positive")
|
|
@@ -9,8 +9,8 @@ from datasketch.minhash import MinHash
|
|
|
9
9
|
class MinHashLSHForest:
|
|
10
10
|
"""The LSH Forest for MinHash. It supports top-k query in Jaccard
|
|
11
11
|
similarity.
|
|
12
|
-
Instead of using prefix trees as the
|
|
13
|
-
|
|
12
|
+
Instead of using prefix trees as described in the original LSH Forest
|
|
13
|
+
paper by Bawa et al. (WWW 2005),
|
|
14
14
|
I use a sorted array to store the hash values in every
|
|
15
15
|
hash table.
|
|
16
16
|
|
|
@@ -37,7 +37,8 @@ class MinHashLSHForest:
|
|
|
37
37
|
# Maximum depth of the prefix tree
|
|
38
38
|
self.k = int(num_perm / l)
|
|
39
39
|
self.hashtables = [defaultdict(list) for _ in range(self.l)]
|
|
40
|
-
self.hashranges = [(i * self.k, (i + 1) * self.k)
|
|
40
|
+
self.hashranges = [(i * self.k, (i + 1) * self.k)
|
|
41
|
+
for i in range(self.l)]
|
|
41
42
|
self.keys = dict()
|
|
42
43
|
# This is the sorted array implementation for the prefix trees
|
|
43
44
|
self.sorted_hashtables = [[] for _ in range(self.l)]
|
|
@@ -59,7 +60,8 @@ class MinHashLSHForest:
|
|
|
59
60
|
raise ValueError("The num_perm of MinHash out of range")
|
|
60
61
|
if key in self.keys:
|
|
61
62
|
raise ValueError("The given key has already been added")
|
|
62
|
-
self.keys[key] = [self._H(minhash.hashvalues[start:end])
|
|
63
|
+
self.keys[key] = [self._H(minhash.hashvalues[start:end])
|
|
64
|
+
for start, end in self.hashranges]
|
|
63
65
|
for H, hashtable in zip(self.keys[key], self.hashtables):
|
|
64
66
|
hashtable[H].append(key)
|
|
65
67
|
|
|
@@ -73,11 +75,13 @@ class MinHashLSHForest:
|
|
|
73
75
|
if r > self.k or r <= 0 or b > self.l or b <= 0:
|
|
74
76
|
raise ValueError("parameter outside range")
|
|
75
77
|
# Generate prefixes of concatenated hash values
|
|
76
|
-
hps = [self._H(minhash.hashvalues[start
|
|
78
|
+
hps = [self._H(minhash.hashvalues[start: start + r])
|
|
79
|
+
for start, _ in self.hashranges]
|
|
77
80
|
# Set the prefix length for look-ups in the sorted hash values list
|
|
78
81
|
prefix_size = len(hps[0])
|
|
79
82
|
for ht, hp, hashtable in zip(self.sorted_hashtables, hps, self.hashtables):
|
|
80
|
-
i = self._binary_search(
|
|
83
|
+
i = self._binary_search(
|
|
84
|
+
len(ht), lambda x, ht=ht, hp=hp: ht[x][:prefix_size] >= hp)
|
|
81
85
|
if i < len(ht) and ht[i][:prefix_size] == hp:
|
|
82
86
|
j = i
|
|
83
87
|
while j < len(ht) and ht[j][:prefix_size] == hp:
|
|
@@ -137,14 +141,17 @@ class MinHashLSHForest:
|
|
|
137
141
|
"""
|
|
138
142
|
byteslist = self.keys.get(key, None)
|
|
139
143
|
if byteslist is None:
|
|
140
|
-
raise KeyError(
|
|
144
|
+
raise KeyError(
|
|
145
|
+
f"The provided key does not exist in the LSHForest: {key}")
|
|
141
146
|
hashvalue_byte_size = len(byteslist[0]) // 8
|
|
142
|
-
hashvalues = np.empty(
|
|
147
|
+
hashvalues = np.empty(
|
|
148
|
+
len(byteslist) * hashvalue_byte_size, dtype=np.uint64)
|
|
143
149
|
for index, item in enumerate(byteslist):
|
|
144
150
|
# unswap the bytes, as their representation is flipped during storage
|
|
145
151
|
hv_segment = np.frombuffer(item, dtype=np.uint64).byteswap()
|
|
146
152
|
curr_index = index * hashvalue_byte_size
|
|
147
|
-
hashvalues[curr_index
|
|
153
|
+
hashvalues[curr_index: curr_index +
|
|
154
|
+
hashvalue_byte_size] = hv_segment
|
|
148
155
|
return hashvalues
|
|
149
156
|
|
|
150
157
|
def _binary_search(self, n, func):
|
|
@@ -603,12 +603,14 @@ if cassandra is not None:
|
|
|
603
603
|
del self._select_statements_and_parameters_with_decoders[:]
|
|
604
604
|
statements_and_parameters, decoders = zip(*buffer)
|
|
605
605
|
|
|
606
|
-
ret = collections.defaultdict(list)
|
|
607
606
|
query_results = self._select(statements_and_parameters)
|
|
608
|
-
|
|
607
|
+
ret = []
|
|
608
|
+
for rows, (_key_decoder, val_decoder) in zip(query_results, decoders):
|
|
609
|
+
values = []
|
|
609
610
|
for row in rows:
|
|
610
|
-
|
|
611
|
-
|
|
611
|
+
values.append((val_decoder(row.value), row.ts))
|
|
612
|
+
ret.append([x[0] for x in sorted(values, key=operator.itemgetter(1))])
|
|
613
|
+
return ret
|
|
612
614
|
|
|
613
615
|
def select(self, keys):
|
|
614
616
|
"""Select all values for the given keys.
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "datasketch"
|
|
7
|
-
version = "1.9.0"
|
|
7
|
+
version = "1.10.0"
|
|
8
8
|
description = "Probabilistic data structures for processing and searching very large datasets"
|
|
9
9
|
readme = "README.rst"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -37,7 +37,11 @@ benchmark = [
|
|
|
37
37
|
"pandas>=0.25.3",
|
|
38
38
|
"SetSimilaritySearch>=0.1.7",
|
|
39
39
|
"pyfarmhash>=0.2.2",
|
|
40
|
-
"nltk>=3.4.5",
|
|
40
|
+
"nltk>=3.4.5; python_version < '3.10'",
|
|
41
|
+
"nltk>=3.9.4; python_version >= '3.10'",
|
|
42
|
+
# Transitive deps of matplotlib listed to avoid dependabot uv.lock-only PRs.
|
|
43
|
+
"pillow>=12.2.0; python_version >= '3.10'",
|
|
44
|
+
"fonttools>=4.60.2",
|
|
41
45
|
]
|
|
42
46
|
test = [
|
|
43
47
|
"cassandra-driver>=3.20",
|
|
@@ -49,10 +53,17 @@ test = [
|
|
|
49
53
|
"pymongo>=3.9.0",
|
|
50
54
|
"nose>=1.3.7",
|
|
51
55
|
"nose-exclude>=0.5.0",
|
|
52
|
-
"pytest",
|
|
56
|
+
"pytest; python_version < '3.10'",
|
|
57
|
+
"pytest>=9.0.3; python_version >= '3.10'",
|
|
53
58
|
"pytest-rerunfailures",
|
|
54
59
|
"pytest-asyncio",
|
|
60
|
+
# Transitive dep of pytest listed to avoid dependabot uv.lock-only PRs.
|
|
61
|
+
"pygments>=2.20.0",
|
|
55
62
|
]
|
|
63
|
+
aio = ["aiounittest", "motor>3.6.0"]
|
|
64
|
+
# KEEP IN SYNC WITH `aio` ABOVE. Deprecated alias retained for backwards compat;
|
|
65
|
+
# PEP 621 does not support referencing one optional-dependency group from
|
|
66
|
+
# another, so the dependency list must be duplicated verbatim.
|
|
56
67
|
experimental_aio = ["aiounittest", "motor>3.6.0"]
|
|
57
68
|
|
|
58
69
|
[project.urls]
|
|
@@ -194,4 +205,4 @@ reportCallIssue = "none"
|
|
|
194
205
|
|
|
195
206
|
[tool.coverage.run]
|
|
196
207
|
source = ["datasketch"]
|
|
197
|
-
omit = ["*/
|
|
208
|
+
omit = ["*/tests/*", "*/test/*"]
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
"""Warning.
|
|
2
|
-
|
|
3
|
-
datasketch.experimental is dedicated to new modules that are to be merged into
|
|
4
|
-
the stable interface of datasketch. So their interfaces may change in future
|
|
5
|
-
versions.
|
|
6
|
-
|
|
7
|
-
To add a new class or function, register it here in this file. For example:
|
|
8
|
-
|
|
9
|
-
from new_module import NewModuleClass
|
|
10
|
-
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
from datasketch.experimental.aio.lsh import AsyncMinHashLSH
|
|
14
|
-
|
|
15
|
-
__all__ = ["AsyncMinHashLSH"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|