datasketch 1.8.0__tar.gz → 1.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datasketch-1.8.0 → datasketch-1.9.0}/PKG-INFO +7 -1
- {datasketch-1.8.0 → datasketch-1.9.0}/README.rst +3 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/aio/lsh.py +9 -1
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/aio/storage.py +33 -15
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lsh.py +13 -5
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lsh_bloom.py +2 -2
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lshensemble.py +2 -1
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/minhash.py +8 -5
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/storage.py +15 -7
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/weighted_minhash.py +5 -5
- {datasketch-1.8.0 → datasketch-1.9.0}/pyproject.toml +38 -3
- {datasketch-1.8.0 → datasketch-1.9.0}/.gitignore +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/LICENSE +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/__init__.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/b_bit_minhash.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/__init__.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/aio/__init__.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hashfunc.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hnsw.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hyperloglog.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hyperloglog_const.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lean_minhash.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lshensemble_partition.py +0 -0
- {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lshforest.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datasketch
|
|
3
|
-
Version: 1.8.0
|
|
3
|
+
Version: 1.9.0
|
|
4
4
|
Summary: Probabilistic data structures for processing and searching very large datasets
|
|
5
5
|
Project-URL: Homepage, https://ekzhu.github.io/datasketch
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
|
|
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
21
|
Classifier: Topic :: Database
|
|
21
22
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
22
23
|
Requires-Python: >=3.9
|
|
@@ -49,6 +50,8 @@ Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
|
|
|
49
50
|
Requires-Dist: nose>=1.3.7; extra == 'test'
|
|
50
51
|
Requires-Dist: pymongo>=3.9.0; extra == 'test'
|
|
51
52
|
Requires-Dist: pytest; extra == 'test'
|
|
53
|
+
Requires-Dist: pytest-asyncio; extra == 'test'
|
|
54
|
+
Requires-Dist: pytest-cov; extra == 'test'
|
|
52
55
|
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
53
56
|
Requires-Dist: redis>=2.10.0; extra == 'test'
|
|
54
57
|
Description-Content-Type: text/x-rst
|
|
@@ -62,6 +65,9 @@ datasketch: Big Data Looks Small
|
|
|
62
65
|
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
63
66
|
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
64
67
|
|
|
68
|
+
.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
|
|
69
|
+
:target: https://codecov.io/gh/ekzhu/datasketch
|
|
70
|
+
|
|
65
71
|
datasketch gives you probabilistic data structures that can process and
|
|
66
72
|
search very large amount of data super fast, with little loss of
|
|
67
73
|
accuracy.
|
|
@@ -7,6 +7,9 @@ datasketch: Big Data Looks Small
|
|
|
7
7
|
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
|
|
8
8
|
:target: https://zenodo.org/doi/10.5281/zenodo.598238
|
|
9
9
|
|
|
10
|
+
.. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
|
|
11
|
+
:target: https://codecov.io/gh/ekzhu/datasketch
|
|
12
|
+
|
|
10
13
|
datasketch gives you probabilistic data structures that can process and
|
|
11
14
|
search very large amount of data super fast, with little loss of
|
|
12
15
|
accuracy.
|
|
@@ -60,6 +60,7 @@ class AsyncMinHashLSH:
|
|
|
60
60
|
self._weights = weights
|
|
61
61
|
self._params = params
|
|
62
62
|
self.prepickle = storage_config["type"] == "aioredis" if prepickle is None else prepickle
|
|
63
|
+
self._require_bytes_keys = not self.prepickle
|
|
63
64
|
|
|
64
65
|
if self._threshold > 1.0 or self._threshold < 0.0:
|
|
65
66
|
raise ValueError("threshold must be in [0.0, 1.0]")
|
|
@@ -115,7 +116,9 @@ class AsyncMinHashLSH:
|
|
|
115
116
|
def __setstate__(self, state):
|
|
116
117
|
state["_lock"] = asyncio.Lock()
|
|
117
118
|
self.__dict__ = state
|
|
118
|
-
self.__init__(
|
|
119
|
+
self.__init__(
|
|
120
|
+
self._threshold, self._num_perm, self._weights, self._params, self._storage_config, self.prepickle
|
|
121
|
+
)
|
|
119
122
|
|
|
120
123
|
@property
|
|
121
124
|
def batch_size(self):
|
|
@@ -277,6 +280,11 @@ class AsyncMinHashLSH:
|
|
|
277
280
|
async def _insert(self, key, minhash, check_duplication=True, buffer=False):
|
|
278
281
|
if len(minhash) != self.h:
|
|
279
282
|
raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
|
|
283
|
+
if self._require_bytes_keys and not isinstance(key, bytes):
|
|
284
|
+
raise TypeError(
|
|
285
|
+
f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
|
|
286
|
+
"Either pass bytes keys or use prepickle=True for automatic serialization."
|
|
287
|
+
)
|
|
280
288
|
if self.prepickle:
|
|
281
289
|
key = pickle.dumps(key)
|
|
282
290
|
|
|
@@ -323,16 +323,19 @@ if redis is not None:
|
|
|
323
323
|
)
|
|
324
324
|
self._initialized = True
|
|
325
325
|
|
|
326
|
+
async def close(self):
|
|
327
|
+
await self._redis.aclose()
|
|
328
|
+
|
|
326
329
|
@property
|
|
327
330
|
def initialized(self):
|
|
328
331
|
return self._initialized
|
|
329
332
|
|
|
330
333
|
class AsyncRedisListStorage(OrderedStorage, AsyncRedisStorage):
|
|
331
334
|
async def keys(self):
|
|
332
|
-
return await self._redis.hkeys(self._name)
|
|
335
|
+
return await self._redis.hkeys(self._name) # type: ignore
|
|
333
336
|
|
|
334
337
|
async def redis_keys(self):
|
|
335
|
-
return await self._redis.hvals(self._name)
|
|
338
|
+
return await self._redis.hvals(self._name) # type: ignore
|
|
336
339
|
|
|
337
340
|
def status(self):
|
|
338
341
|
status = self._parse_config(self.config["redis"])
|
|
@@ -353,15 +356,26 @@ if redis is not None:
|
|
|
353
356
|
async def _get_items(r, k):
|
|
354
357
|
return await r.lrange(k, 0, -1)
|
|
355
358
|
|
|
356
|
-
async def remove(self, *keys):
|
|
357
|
-
|
|
358
|
-
|
|
359
|
+
async def remove(self, *keys, **kwargs):
|
|
360
|
+
buffer = kwargs.pop("buffer", False)
|
|
361
|
+
if buffer:
|
|
362
|
+
await self._remove(self._buffer, *keys)
|
|
363
|
+
else:
|
|
364
|
+
await self._remove(self._redis, *keys)
|
|
365
|
+
|
|
366
|
+
async def _remove(self, r, *keys):
|
|
367
|
+
await r.hdel(self._name, *keys)
|
|
368
|
+
await r.delete(*[self.redis_key(key) for key in keys])
|
|
359
369
|
|
|
360
|
-
async def remove_val(self, key, val):
|
|
370
|
+
async def remove_val(self, key, val, **kwargs):
|
|
371
|
+
buffer = kwargs.pop("buffer", False)
|
|
361
372
|
redis_key = self.redis_key(key)
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
373
|
+
if buffer:
|
|
374
|
+
await self._buffer.lrem(redis_key, val)
|
|
375
|
+
else:
|
|
376
|
+
await self._redis.lrem(redis_key, val)
|
|
377
|
+
if not await self._redis.exists(redis_key): # type: ignore
|
|
378
|
+
await self._redis.hdel(self._name, redis_key) # type: ignore
|
|
365
379
|
|
|
366
380
|
async def insert(self, key, *vals, **kwargs):
|
|
367
381
|
# Using buffer=True outside of an `insertion_session`
|
|
@@ -380,7 +394,7 @@ if redis is not None:
|
|
|
380
394
|
await r.rpush(redis_key, *values)
|
|
381
395
|
|
|
382
396
|
async def size(self):
|
|
383
|
-
return await self._redis.hlen(self._name)
|
|
397
|
+
return await self._redis.hlen(self._name) # type: ignore
|
|
384
398
|
|
|
385
399
|
async def itemcounts(self):
|
|
386
400
|
pipe = self._redis.pipeline()
|
|
@@ -395,7 +409,7 @@ if redis is not None:
|
|
|
395
409
|
return await r.llen(k)
|
|
396
410
|
|
|
397
411
|
async def has_key(self, key):
|
|
398
|
-
return await self._redis.hexists(self._name, key)
|
|
412
|
+
return await self._redis.hexists(self._name, key) # type: ignore
|
|
399
413
|
|
|
400
414
|
async def empty_buffer(self):
|
|
401
415
|
await self._buffer.execute()
|
|
@@ -408,11 +422,15 @@ if redis is not None:
|
|
|
408
422
|
async def _get_items(r, k):
|
|
409
423
|
return await r.smembers(k)
|
|
410
424
|
|
|
411
|
-
async def remove_val(self, key, val):
|
|
425
|
+
async def remove_val(self, key, val, **kwargs):
|
|
426
|
+
buffer = kwargs.pop("buffer", False)
|
|
412
427
|
redis_key = self.redis_key(key)
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
428
|
+
if buffer:
|
|
429
|
+
await self._buffer.srem(redis_key, val)
|
|
430
|
+
else:
|
|
431
|
+
await self._redis.srem(redis_key, val)
|
|
432
|
+
if not await self._redis.exists(redis_key): # type: ignore
|
|
433
|
+
await self._redis.hdel(self._name, redis_key) # type: ignore
|
|
416
434
|
|
|
417
435
|
async def _insert(self, r, key, *values):
|
|
418
436
|
redis_key = self.redis_key(key)
|
|
@@ -3,12 +3,18 @@ from __future__ import annotations
|
|
|
3
3
|
import pickle
|
|
4
4
|
import struct
|
|
5
5
|
from collections.abc import Hashable
|
|
6
|
-
from typing import Callable, Optional, Union
|
|
6
|
+
from typing import Callable, List, Optional, Union
|
|
7
7
|
|
|
8
8
|
from scipy.integrate import quad as integrate
|
|
9
9
|
|
|
10
10
|
from datasketch.minhash import MinHash
|
|
11
|
-
from datasketch.storage import
|
|
11
|
+
from datasketch.storage import (
|
|
12
|
+
OrderedStorage,
|
|
13
|
+
UnorderedStorage,
|
|
14
|
+
_random_name,
|
|
15
|
+
ordered_storage,
|
|
16
|
+
unordered_storage,
|
|
17
|
+
)
|
|
12
18
|
from datasketch.weighted_minhash import WeightedMinHash
|
|
13
19
|
|
|
14
20
|
|
|
@@ -183,7 +189,7 @@ class MinHashLSH:
|
|
|
183
189
|
self._H = self._byteswap
|
|
184
190
|
|
|
185
191
|
basename = storage_config.get("basename", _random_name(11))
|
|
186
|
-
self.hashtables = [
|
|
192
|
+
self.hashtables: List[UnorderedStorage] = [
|
|
187
193
|
unordered_storage(
|
|
188
194
|
storage_config,
|
|
189
195
|
name=b"".join([basename, b"_bucket_", struct.pack(">H", i)]),
|
|
@@ -191,7 +197,7 @@ class MinHashLSH:
|
|
|
191
197
|
for i in range(self.b)
|
|
192
198
|
]
|
|
193
199
|
self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
|
|
194
|
-
self.keys = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
|
|
200
|
+
self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
|
|
195
201
|
|
|
196
202
|
@property
|
|
197
203
|
def buffer_size(self) -> int:
|
|
@@ -347,7 +353,7 @@ class MinHashLSH:
|
|
|
347
353
|
"""
|
|
348
354
|
return type(self) is type(other) and self.h == other.h and self.b == other.b and self.r == other.r
|
|
349
355
|
|
|
350
|
-
def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) ->
|
|
356
|
+
def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
|
|
351
357
|
if self.__equivalent(other):
|
|
352
358
|
if check_overlap and set(self.keys).intersection(set(other.keys)):
|
|
353
359
|
raise ValueError("The keys are overlapping, duplicate key exists.")
|
|
@@ -524,6 +530,8 @@ class MinHashLSH:
|
|
|
524
530
|
return bytes(hs.byteswap().data)
|
|
525
531
|
|
|
526
532
|
def _hashed_byteswap(self, hs):
|
|
533
|
+
if self.hashfunc is None:
|
|
534
|
+
raise RuntimeError("Hash function not configured.")
|
|
527
535
|
return self.hashfunc(bytes(hs.byteswap().data))
|
|
528
536
|
|
|
529
537
|
def _query_b(self, minhash, b):
|
|
@@ -252,9 +252,9 @@ class MinHashLSHBloom:
|
|
|
252
252
|
raise ValueError("threshold must be in [0.0, 1.0]")
|
|
253
253
|
if num_perm < 2:
|
|
254
254
|
raise ValueError("Too few permutation functions")
|
|
255
|
-
if n <= 0:
|
|
255
|
+
if n is None or n <= 0:
|
|
256
256
|
raise ValueError("n for LSHBloom must be >= 0")
|
|
257
|
-
if fp >= 1.0 or fp <= 0.0:
|
|
257
|
+
if fp is None or fp >= 1.0 or fp <= 0.0:
|
|
258
258
|
raise ValueError("fp must be in (0.0, 1.0)")
|
|
259
259
|
if save_dir is None:
|
|
260
260
|
warnings.warn(
|
|
@@ -221,7 +221,8 @@ class MinHashLSHEnsemble:
|
|
|
221
221
|
entries.sort(key=lambda e: e[2])
|
|
222
222
|
curr_part = 0
|
|
223
223
|
for key, minhash, size in entries:
|
|
224
|
-
|
|
224
|
+
u = self.uppers[curr_part]
|
|
225
|
+
if size > u:
|
|
225
226
|
curr_part += 1
|
|
226
227
|
for r in self.indexes[curr_part]:
|
|
227
228
|
self.indexes[curr_part][r].insert(key, minhash)
|
|
@@ -3,15 +3,18 @@ from __future__ import annotations
|
|
|
3
3
|
import copy
|
|
4
4
|
import warnings
|
|
5
5
|
from collections.abc import Generator, Iterable
|
|
6
|
-
from typing import Callable, Optional
|
|
6
|
+
from typing import TYPE_CHECKING, Callable, Optional, Union
|
|
7
7
|
|
|
8
8
|
try:
|
|
9
9
|
from typing import Literal # py3.8+; if older, you can fallback to typing_extensions
|
|
10
|
-
except
|
|
10
|
+
except ImportError:
|
|
11
11
|
from typing_extensions import Literal
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from numpy.typing import ArrayLike
|
|
17
|
+
|
|
15
18
|
# GPU backend
|
|
16
19
|
try:
|
|
17
20
|
import cupy as cp
|
|
@@ -114,8 +117,8 @@ class MinHash:
|
|
|
114
117
|
gpu_mode: Literal["disable", "detect", "always"] = "disable",
|
|
115
118
|
hashfunc: Callable = sha1_hash32,
|
|
116
119
|
hashobj: Optional[object] = None, # Deprecated.
|
|
117
|
-
hashvalues: Optional[
|
|
118
|
-
permutations: Optional[tuple[
|
|
120
|
+
hashvalues: Optional[ArrayLike] = None,
|
|
121
|
+
permutations: Optional[Union[tuple[ArrayLike, ArrayLike], ArrayLike]] = None,
|
|
119
122
|
) -> None:
|
|
120
123
|
if hashvalues is not None:
|
|
121
124
|
num_perm = len(hashvalues)
|
|
@@ -180,7 +183,7 @@ class MinHash:
|
|
|
180
183
|
dtype=np.uint64,
|
|
181
184
|
).T
|
|
182
185
|
|
|
183
|
-
def _parse_hashvalues(self, hashvalues):
|
|
186
|
+
def _parse_hashvalues(self, hashvalues) -> np.ndarray:
|
|
184
187
|
return np.array(hashvalues, dtype=np.uint64)
|
|
185
188
|
|
|
186
189
|
def update(self, b) -> None:
|
|
@@ -26,7 +26,7 @@ except ImportError:
|
|
|
26
26
|
c_concurrent = None
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
def ordered_storage(config, name=None):
|
|
29
|
+
def ordered_storage(config, name=None) -> "OrderedStorage":
|
|
30
30
|
"""Return ordered storage system based on the specified config.
|
|
31
31
|
|
|
32
32
|
The canonical example of such a storage container is
|
|
@@ -62,10 +62,10 @@ def ordered_storage(config, name=None):
|
|
|
62
62
|
return RedisListStorage(config, name=name)
|
|
63
63
|
if tp == "cassandra":
|
|
64
64
|
return CassandraListStorage(config, name=name)
|
|
65
|
-
|
|
65
|
+
raise ValueError(f"Unknown storage type: {tp}")
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def unordered_storage(config, name=None):
|
|
68
|
+
def unordered_storage(config, name=None) -> "UnorderedStorage":
|
|
69
69
|
"""Return an unordered storage system based on the specified config.
|
|
70
70
|
|
|
71
71
|
The canonical example of such a storage container is
|
|
@@ -100,7 +100,7 @@ def unordered_storage(config, name=None):
|
|
|
100
100
|
return RedisSetStorage(config, name=name)
|
|
101
101
|
if tp == "cassandra":
|
|
102
102
|
return CassandraSetStorage(config, name=name)
|
|
103
|
-
|
|
103
|
+
raise ValueError(f"Unknown storage type: {tp}")
|
|
104
104
|
|
|
105
105
|
|
|
106
106
|
class Storage(ABC):
|
|
@@ -144,7 +144,7 @@ class Storage(ABC):
|
|
|
144
144
|
pass
|
|
145
145
|
|
|
146
146
|
@abstractmethod
|
|
147
|
-
def remove(self, *keys):
|
|
147
|
+
def remove(self, *keys, **kwargs):
|
|
148
148
|
"""Remove `keys` from storage."""
|
|
149
149
|
pass
|
|
150
150
|
|
|
@@ -154,12 +154,12 @@ class Storage(ABC):
|
|
|
154
154
|
pass
|
|
155
155
|
|
|
156
156
|
@abstractmethod
|
|
157
|
-
def size(self):
|
|
157
|
+
def size(self) -> int:
|
|
158
158
|
"""Return size of storage with respect to number of keys."""
|
|
159
159
|
pass
|
|
160
160
|
|
|
161
161
|
@abstractmethod
|
|
162
|
-
def itemcounts(self, **kwargs):
|
|
162
|
+
def itemcounts(self, **kwargs) -> dict:
|
|
163
163
|
"""Returns the number of items stored under each key."""
|
|
164
164
|
pass
|
|
165
165
|
|
|
@@ -168,6 +168,14 @@ class Storage(ABC):
|
|
|
168
168
|
"""Determines whether the key is in the storage or not."""
|
|
169
169
|
pass
|
|
170
170
|
|
|
171
|
+
@property
|
|
172
|
+
def buffer_size(self) -> int:
|
|
173
|
+
return getattr(self, "_buffer_size", 50000)
|
|
174
|
+
|
|
175
|
+
@buffer_size.setter
|
|
176
|
+
def buffer_size(self, value: int):
|
|
177
|
+
self._buffer_size = value
|
|
178
|
+
|
|
171
179
|
def status(self):
|
|
172
180
|
return {"keyspace_size": len(self)}
|
|
173
181
|
|
|
@@ -133,14 +133,15 @@ class WeightedMinHashGenerator:
|
|
|
133
133
|
WeightedMinHash: The weighted MinHash.
|
|
134
134
|
|
|
135
135
|
"""
|
|
136
|
-
if not isinstance(v, collections.abc.
|
|
137
|
-
raise TypeError("Input vector must be
|
|
136
|
+
if not isinstance(v, collections.abc.Sized):
|
|
137
|
+
raise TypeError("Input vector must be sized")
|
|
138
138
|
if not len(v) == self.dim:
|
|
139
139
|
raise ValueError("Input dimension mismatch, expecting %d" % self.dim)
|
|
140
140
|
if not isinstance(v, np.ndarray):
|
|
141
141
|
v = np.array(v, dtype=np.float32)
|
|
142
142
|
elif v.dtype != np.float32:
|
|
143
143
|
v = v.astype(np.float32)
|
|
144
|
+
v: np.ndarray = v
|
|
144
145
|
hashvalues = np.zeros((self.sample_size, 2), dtype=int)
|
|
145
146
|
vzeros = v == 0
|
|
146
147
|
if vzeros.all():
|
|
@@ -226,9 +227,8 @@ class WeightedMinHashGenerator:
|
|
|
226
227
|
doc_argmin = np.argmin(doc_ln_a, axis=1)
|
|
227
228
|
doc_k = doc_cidx[doc_argmin]
|
|
228
229
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
hashvalues = all_hashvalues[it_doc]
|
|
230
|
+
hashvalues = np.zeros((self.sample_size, 2), dtype=int)
|
|
231
|
+
all_hashvalues[it_doc] = hashvalues
|
|
232
232
|
hashvalues[:, 0], hashvalues[:, 1] = (
|
|
233
233
|
doc_k,
|
|
234
234
|
t[np.arange(self.sample_size), doc_begin + doc_argmin],
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "datasketch"
|
|
7
|
-
version = "1.8.0"
|
|
7
|
+
version = "1.9.0"
|
|
8
8
|
description = "Probabilistic data structures for processing and searching very large datasets"
|
|
9
9
|
readme = "README.rst"
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -21,6 +21,7 @@ classifiers = [
|
|
|
21
21
|
"Programming Language :: Python :: 3.10",
|
|
22
22
|
"Programming Language :: Python :: 3.11",
|
|
23
23
|
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Programming Language :: Python :: 3.13",
|
|
24
25
|
]
|
|
25
26
|
dependencies = ["numpy>=1.11", "scipy>=1.0.0"]
|
|
26
27
|
|
|
@@ -44,11 +45,13 @@ test = [
|
|
|
44
45
|
"mock>=2.0.0",
|
|
45
46
|
"mockredispy",
|
|
46
47
|
"coverage",
|
|
48
|
+
"pytest-cov",
|
|
47
49
|
"pymongo>=3.9.0",
|
|
48
50
|
"nose>=1.3.7",
|
|
49
51
|
"nose-exclude>=0.5.0",
|
|
50
52
|
"pytest",
|
|
51
53
|
"pytest-rerunfailures",
|
|
54
|
+
"pytest-asyncio",
|
|
52
55
|
]
|
|
53
56
|
experimental_aio = ["aiounittest", "motor>3.6.0"]
|
|
54
57
|
|
|
@@ -91,7 +94,6 @@ exclude = [
|
|
|
91
94
|
"dist",
|
|
92
95
|
"docs",
|
|
93
96
|
"examples",
|
|
94
|
-
"travis",
|
|
95
97
|
"datasketch/hyperloglog_const.py",
|
|
96
98
|
]
|
|
97
99
|
|
|
@@ -158,5 +160,38 @@ include = ["pyproject.toml", "README.rst", "LICENSE", "datasketch/**"]
|
|
|
158
160
|
|
|
159
161
|
[tool.pytest.ini_options]
|
|
160
162
|
minversion = "6.0"
|
|
161
|
-
addopts = ["--strict-markers", "--color=yes"]
|
|
163
|
+
addopts = ["--strict-markers", "--color=yes", "--cov-report=xml"]
|
|
162
164
|
testpaths = ["test"]
|
|
165
|
+
asyncio_mode = "auto"
|
|
166
|
+
|
|
167
|
+
[tool.pyright]
|
|
168
|
+
include = ["datasketch"]
|
|
169
|
+
exclude = [
|
|
170
|
+
"benchmark",
|
|
171
|
+
"docs",
|
|
172
|
+
"examples",
|
|
173
|
+
"test",
|
|
174
|
+
"travis",
|
|
175
|
+
"**/.venv/**",
|
|
176
|
+
"**/__pycache__",
|
|
177
|
+
]
|
|
178
|
+
pythonVersion = "3.9"
|
|
179
|
+
typeCheckingMode = "basic" # todo: change to "strict" in future
|
|
180
|
+
|
|
181
|
+
reportMissingImports = "none"
|
|
182
|
+
reportUnusedVariable = "warning"
|
|
183
|
+
reportAttributeAccessIssue = "none"
|
|
184
|
+
reportOptionalMemberAccess = "none"
|
|
185
|
+
reportGeneralTypeIssues = "none"
|
|
186
|
+
reportArgumentType = "none"
|
|
187
|
+
reportOptionalIterable = "none"
|
|
188
|
+
reportReturnType = "none"
|
|
189
|
+
reportRedeclaration = "none"
|
|
190
|
+
reportOperatorIssue = "none"
|
|
191
|
+
reportAssignmentType = "none"
|
|
192
|
+
reportOptionalSubscript = "none"
|
|
193
|
+
reportCallIssue = "none"
|
|
194
|
+
|
|
195
|
+
[tool.coverage.run]
|
|
196
|
+
source = ["datasketch"]
|
|
197
|
+
omit = ["*/experimental/*", "*/tests/*", "*/test/*"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|