datasketch 1.8.0__tar.gz → 1.9.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {datasketch-1.8.0 → datasketch-1.9.0}/PKG-INFO +7 -1
  2. {datasketch-1.8.0 → datasketch-1.9.0}/README.rst +3 -0
  3. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/aio/lsh.py +9 -1
  4. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/aio/storage.py +33 -15
  5. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lsh.py +13 -5
  6. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lsh_bloom.py +2 -2
  7. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lshensemble.py +2 -1
  8. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/minhash.py +8 -5
  9. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/storage.py +15 -7
  10. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/weighted_minhash.py +5 -5
  11. {datasketch-1.8.0 → datasketch-1.9.0}/pyproject.toml +38 -3
  12. {datasketch-1.8.0 → datasketch-1.9.0}/.gitignore +0 -0
  13. {datasketch-1.8.0 → datasketch-1.9.0}/LICENSE +0 -0
  14. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/__init__.py +0 -0
  15. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/b_bit_minhash.py +0 -0
  16. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/__init__.py +0 -0
  17. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/experimental/aio/__init__.py +0 -0
  18. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hashfunc.py +0 -0
  19. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hnsw.py +0 -0
  20. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hyperloglog.py +0 -0
  21. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/hyperloglog_const.py +0 -0
  22. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lean_minhash.py +0 -0
  23. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lshensemble_partition.py +0 -0
  24. {datasketch-1.8.0 → datasketch-1.9.0}/datasketch/lshforest.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasketch
3
- Version: 1.8.0
3
+ Version: 1.9.0
4
4
  Summary: Probabilistic data structures for processing and searching very large datasets
5
5
  Project-URL: Homepage, https://ekzhu.github.io/datasketch
6
6
  Project-URL: Bug Tracker, https://github.com/ekzhu/datasketch/issues
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.9
17
17
  Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.11
19
19
  Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
20
21
  Classifier: Topic :: Database
21
22
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
22
23
  Requires-Python: >=3.9
@@ -49,6 +50,8 @@ Requires-Dist: nose-exclude>=0.5.0; extra == 'test'
49
50
  Requires-Dist: nose>=1.3.7; extra == 'test'
50
51
  Requires-Dist: pymongo>=3.9.0; extra == 'test'
51
52
  Requires-Dist: pytest; extra == 'test'
53
+ Requires-Dist: pytest-asyncio; extra == 'test'
54
+ Requires-Dist: pytest-cov; extra == 'test'
52
55
  Requires-Dist: pytest-rerunfailures; extra == 'test'
53
56
  Requires-Dist: redis>=2.10.0; extra == 'test'
54
57
  Description-Content-Type: text/x-rst
@@ -62,6 +65,9 @@ datasketch: Big Data Looks Small
62
65
  .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
63
66
  :target: https://zenodo.org/doi/10.5281/zenodo.598238
64
67
 
68
+ .. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
69
+ :target: https://codecov.io/gh/ekzhu/datasketch
70
+
65
71
  datasketch gives you probabilistic data structures that can process and
66
72
  search very large amount of data super fast, with little loss of
67
73
  accuracy.
@@ -7,6 +7,9 @@ datasketch: Big Data Looks Small
7
7
  .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.598238.svg
8
8
  :target: https://zenodo.org/doi/10.5281/zenodo.598238
9
9
 
10
+ .. image:: https://codecov.io/gh/ekzhu/datasketch/branch/master/graph/badge.svg
11
+ :target: https://codecov.io/gh/ekzhu/datasketch
12
+
10
13
  datasketch gives you probabilistic data structures that can process and
11
14
  search very large amount of data super fast, with little loss of
12
15
  accuracy.
@@ -60,6 +60,7 @@ class AsyncMinHashLSH:
60
60
  self._weights = weights
61
61
  self._params = params
62
62
  self.prepickle = storage_config["type"] == "aioredis" if prepickle is None else prepickle
63
+ self._require_bytes_keys = not self.prepickle
63
64
 
64
65
  if self._threshold > 1.0 or self._threshold < 0.0:
65
66
  raise ValueError("threshold must be in [0.0, 1.0]")
@@ -115,7 +116,9 @@ class AsyncMinHashLSH:
115
116
  def __setstate__(self, state):
116
117
  state["_lock"] = asyncio.Lock()
117
118
  self.__dict__ = state
118
- self.__init__(self._threshold, self._num_perm, self._weights, self._params, self._storage_config)
119
+ self.__init__(
120
+ self._threshold, self._num_perm, self._weights, self._params, self._storage_config, self.prepickle
121
+ )
119
122
 
120
123
  @property
121
124
  def batch_size(self):
@@ -277,6 +280,11 @@ class AsyncMinHashLSH:
277
280
  async def _insert(self, key, minhash, check_duplication=True, buffer=False):
278
281
  if len(minhash) != self.h:
279
282
  raise ValueError("Expecting minhash with length %d, got %d" % (self.h, len(minhash)))
283
+ if self._require_bytes_keys and not isinstance(key, bytes):
284
+ raise TypeError(
285
+ f"prepickle=False requires bytes keys for non-dict storage, got {type(key).__name__}. "
286
+ "Either pass bytes keys or use prepickle=True for automatic serialization."
287
+ )
280
288
  if self.prepickle:
281
289
  key = pickle.dumps(key)
282
290
 
@@ -323,16 +323,19 @@ if redis is not None:
323
323
  )
324
324
  self._initialized = True
325
325
 
326
+ async def close(self):
327
+ await self._redis.aclose()
328
+
326
329
  @property
327
330
  def initialized(self):
328
331
  return self._initialized
329
332
 
330
333
  class AsyncRedisListStorage(OrderedStorage, AsyncRedisStorage):
331
334
  async def keys(self):
332
- return await self._redis.hkeys(self._name)
335
+ return await self._redis.hkeys(self._name) # type: ignore
333
336
 
334
337
  async def redis_keys(self):
335
- return await self._redis.hvals(self._name)
338
+ return await self._redis.hvals(self._name) # type: ignore
336
339
 
337
340
  def status(self):
338
341
  status = self._parse_config(self.config["redis"])
@@ -353,15 +356,26 @@ if redis is not None:
353
356
  async def _get_items(r, k):
354
357
  return await r.lrange(k, 0, -1)
355
358
 
356
- async def remove(self, *keys):
357
- await self._redis.hdel(self._name, *keys)
358
- await self._redis.delete(*[self.redis_key(key) for key in keys])
359
+ async def remove(self, *keys, **kwargs):
360
+ buffer = kwargs.pop("buffer", False)
361
+ if buffer:
362
+ await self._remove(self._buffer, *keys)
363
+ else:
364
+ await self._remove(self._redis, *keys)
365
+
366
+ async def _remove(self, r, *keys):
367
+ await r.hdel(self._name, *keys)
368
+ await r.delete(*[self.redis_key(key) for key in keys])
359
369
 
360
- async def remove_val(self, key, val):
370
+ async def remove_val(self, key, val, **kwargs):
371
+ buffer = kwargs.pop("buffer", False)
361
372
  redis_key = self.redis_key(key)
362
- await self._redis.lrem(redis_key, val)
363
- if not await self._redis.exists(redis_key):
364
- await self._redis.hdel(self._name, redis_key)
373
+ if buffer:
374
+ await self._buffer.lrem(redis_key, val)
375
+ else:
376
+ await self._redis.lrem(redis_key, val)
377
+ if not await self._redis.exists(redis_key): # type: ignore
378
+ await self._redis.hdel(self._name, redis_key) # type: ignore
365
379
 
366
380
  async def insert(self, key, *vals, **kwargs):
367
381
  # Using buffer=True outside of an `insertion_session`
@@ -380,7 +394,7 @@ if redis is not None:
380
394
  await r.rpush(redis_key, *values)
381
395
 
382
396
  async def size(self):
383
- return await self._redis.hlen(self._name)
397
+ return await self._redis.hlen(self._name) # type: ignore
384
398
 
385
399
  async def itemcounts(self):
386
400
  pipe = self._redis.pipeline()
@@ -395,7 +409,7 @@ if redis is not None:
395
409
  return await r.llen(k)
396
410
 
397
411
  async def has_key(self, key):
398
- return await self._redis.hexists(self._name, key)
412
+ return await self._redis.hexists(self._name, key) # type: ignore
399
413
 
400
414
  async def empty_buffer(self):
401
415
  await self._buffer.execute()
@@ -408,11 +422,15 @@ if redis is not None:
408
422
  async def _get_items(r, k):
409
423
  return await r.smembers(k)
410
424
 
411
- async def remove_val(self, key, val):
425
+ async def remove_val(self, key, val, **kwargs):
426
+ buffer = kwargs.pop("buffer", False)
412
427
  redis_key = self.redis_key(key)
413
- await self._redis.srem(redis_key, val)
414
- if not await self._redis.exists(redis_key):
415
- await self._redis.hdel(self._name, redis_key)
428
+ if buffer:
429
+ await self._buffer.srem(redis_key, val)
430
+ else:
431
+ await self._redis.srem(redis_key, val)
432
+ if not await self._redis.exists(redis_key): # type: ignore
433
+ await self._redis.hdel(self._name, redis_key) # type: ignore
416
434
 
417
435
  async def _insert(self, r, key, *values):
418
436
  redis_key = self.redis_key(key)
@@ -3,12 +3,18 @@ from __future__ import annotations
3
3
  import pickle
4
4
  import struct
5
5
  from collections.abc import Hashable
6
- from typing import Callable, Optional, Union
6
+ from typing import Callable, List, Optional, Union
7
7
 
8
8
  from scipy.integrate import quad as integrate
9
9
 
10
10
  from datasketch.minhash import MinHash
11
- from datasketch.storage import _random_name, ordered_storage, unordered_storage
11
+ from datasketch.storage import (
12
+ OrderedStorage,
13
+ UnorderedStorage,
14
+ _random_name,
15
+ ordered_storage,
16
+ unordered_storage,
17
+ )
12
18
  from datasketch.weighted_minhash import WeightedMinHash
13
19
 
14
20
 
@@ -183,7 +189,7 @@ class MinHashLSH:
183
189
  self._H = self._byteswap
184
190
 
185
191
  basename = storage_config.get("basename", _random_name(11))
186
- self.hashtables = [
192
+ self.hashtables: List[UnorderedStorage] = [
187
193
  unordered_storage(
188
194
  storage_config,
189
195
  name=b"".join([basename, b"_bucket_", struct.pack(">H", i)]),
@@ -191,7 +197,7 @@ class MinHashLSH:
191
197
  for i in range(self.b)
192
198
  ]
193
199
  self.hashranges = [(i * self.r, (i + 1) * self.r) for i in range(self.b)]
194
- self.keys = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
200
+ self.keys: OrderedStorage = ordered_storage(storage_config, name=b"".join([basename, b"_keys"]))
195
201
 
196
202
  @property
197
203
  def buffer_size(self) -> int:
@@ -347,7 +353,7 @@ class MinHashLSH:
347
353
  """
348
354
  return type(self) is type(other) and self.h == other.h and self.b == other.b and self.r == other.r
349
355
 
350
- def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> MinHashLSH:
356
+ def _merge(self, other: MinHashLSH, check_overlap: bool = False, buffer: bool = False) -> None:
351
357
  if self.__equivalent(other):
352
358
  if check_overlap and set(self.keys).intersection(set(other.keys)):
353
359
  raise ValueError("The keys are overlapping, duplicate key exists.")
@@ -524,6 +530,8 @@ class MinHashLSH:
524
530
  return bytes(hs.byteswap().data)
525
531
 
526
532
  def _hashed_byteswap(self, hs):
533
+ if self.hashfunc is None:
534
+ raise RuntimeError("Hash function not configured.")
527
535
  return self.hashfunc(bytes(hs.byteswap().data))
528
536
 
529
537
  def _query_b(self, minhash, b):
@@ -252,9 +252,9 @@ class MinHashLSHBloom:
252
252
  raise ValueError("threshold must be in [0.0, 1.0]")
253
253
  if num_perm < 2:
254
254
  raise ValueError("Too few permutation functions")
255
- if n <= 0:
255
+ if n is None or n <= 0:
256
256
  raise ValueError("n for LSHBloom must be >= 0")
257
- if fp >= 1.0 or fp <= 0.0:
257
+ if fp is None or fp >= 1.0 or fp <= 0.0:
258
258
  raise ValueError("fp must be in (0.0, 1.0)")
259
259
  if save_dir is None:
260
260
  warnings.warn(
@@ -221,7 +221,8 @@ class MinHashLSHEnsemble:
221
221
  entries.sort(key=lambda e: e[2])
222
222
  curr_part = 0
223
223
  for key, minhash, size in entries:
224
- if size > self.uppers[curr_part]:
224
+ u = self.uppers[curr_part]
225
+ if size > u:
225
226
  curr_part += 1
226
227
  for r in self.indexes[curr_part]:
227
228
  self.indexes[curr_part][r].insert(key, minhash)
@@ -3,15 +3,18 @@ from __future__ import annotations
3
3
  import copy
4
4
  import warnings
5
5
  from collections.abc import Generator, Iterable
6
- from typing import Callable, Optional
6
+ from typing import TYPE_CHECKING, Callable, Optional, Union
7
7
 
8
8
  try:
9
9
  from typing import Literal # py3.8+; if older, you can fallback to typing_extensions
10
- except Exception:
10
+ except ImportError:
11
11
  from typing_extensions import Literal
12
12
 
13
13
  import numpy as np
14
14
 
15
+ if TYPE_CHECKING:
16
+ from numpy.typing import ArrayLike
17
+
15
18
  # GPU backend
16
19
  try:
17
20
  import cupy as cp
@@ -114,8 +117,8 @@ class MinHash:
114
117
  gpu_mode: Literal["disable", "detect", "always"] = "disable",
115
118
  hashfunc: Callable = sha1_hash32,
116
119
  hashobj: Optional[object] = None, # Deprecated.
117
- hashvalues: Optional[Iterable] = None,
118
- permutations: Optional[tuple[Iterable, Iterable]] = None,
120
+ hashvalues: Optional[ArrayLike] = None,
121
+ permutations: Optional[Union[tuple[ArrayLike, ArrayLike], ArrayLike]] = None,
119
122
  ) -> None:
120
123
  if hashvalues is not None:
121
124
  num_perm = len(hashvalues)
@@ -180,7 +183,7 @@ class MinHash:
180
183
  dtype=np.uint64,
181
184
  ).T
182
185
 
183
- def _parse_hashvalues(self, hashvalues):
186
+ def _parse_hashvalues(self, hashvalues) -> np.ndarray:
184
187
  return np.array(hashvalues, dtype=np.uint64)
185
188
 
186
189
  def update(self, b) -> None:
@@ -26,7 +26,7 @@ except ImportError:
26
26
  c_concurrent = None
27
27
 
28
28
 
29
- def ordered_storage(config, name=None):
29
+ def ordered_storage(config, name=None) -> "OrderedStorage":
30
30
  """Return ordered storage system based on the specified config.
31
31
 
32
32
  The canonical example of such a storage container is
@@ -62,10 +62,10 @@ def ordered_storage(config, name=None):
62
62
  return RedisListStorage(config, name=name)
63
63
  if tp == "cassandra":
64
64
  return CassandraListStorage(config, name=name)
65
- return None
65
+ raise ValueError(f"Unknown storage type: {tp}")
66
66
 
67
67
 
68
- def unordered_storage(config, name=None):
68
+ def unordered_storage(config, name=None) -> "UnorderedStorage":
69
69
  """Return an unordered storage system based on the specified config.
70
70
 
71
71
  The canonical example of such a storage container is
@@ -100,7 +100,7 @@ def unordered_storage(config, name=None):
100
100
  return RedisSetStorage(config, name=name)
101
101
  if tp == "cassandra":
102
102
  return CassandraSetStorage(config, name=name)
103
- return None
103
+ raise ValueError(f"Unknown storage type: {tp}")
104
104
 
105
105
 
106
106
  class Storage(ABC):
@@ -144,7 +144,7 @@ class Storage(ABC):
144
144
  pass
145
145
 
146
146
  @abstractmethod
147
- def remove(self, *keys):
147
+ def remove(self, *keys, **kwargs):
148
148
  """Remove `keys` from storage."""
149
149
  pass
150
150
 
@@ -154,12 +154,12 @@ class Storage(ABC):
154
154
  pass
155
155
 
156
156
  @abstractmethod
157
- def size(self):
157
+ def size(self) -> int:
158
158
  """Return size of storage with respect to number of keys."""
159
159
  pass
160
160
 
161
161
  @abstractmethod
162
- def itemcounts(self, **kwargs):
162
+ def itemcounts(self, **kwargs) -> dict:
163
163
  """Returns the number of items stored under each key."""
164
164
  pass
165
165
 
@@ -168,6 +168,14 @@ class Storage(ABC):
168
168
  """Determines whether the key is in the storage or not."""
169
169
  pass
170
170
 
171
+ @property
172
+ def buffer_size(self) -> int:
173
+ return getattr(self, "_buffer_size", 50000)
174
+
175
+ @buffer_size.setter
176
+ def buffer_size(self, value: int):
177
+ self._buffer_size = value
178
+
171
179
  def status(self):
172
180
  return {"keyspace_size": len(self)}
173
181
 
@@ -133,14 +133,15 @@ class WeightedMinHashGenerator:
133
133
  WeightedMinHash: The weighted MinHash.
134
134
 
135
135
  """
136
- if not isinstance(v, collections.abc.Iterable):
137
- raise TypeError("Input vector must be an iterable")
136
+ if not isinstance(v, collections.abc.Sized):
137
+ raise TypeError("Input vector must be sized")
138
138
  if not len(v) == self.dim:
139
139
  raise ValueError("Input dimension mismatch, expecting %d" % self.dim)
140
140
  if not isinstance(v, np.ndarray):
141
141
  v = np.array(v, dtype=np.float32)
142
142
  elif v.dtype != np.float32:
143
143
  v = v.astype(np.float32)
144
+ v: np.ndarray = v
144
145
  hashvalues = np.zeros((self.sample_size, 2), dtype=int)
145
146
  vzeros = v == 0
146
147
  if vzeros.all():
@@ -226,9 +227,8 @@ class WeightedMinHashGenerator:
226
227
  doc_argmin = np.argmin(doc_ln_a, axis=1)
227
228
  doc_k = doc_cidx[doc_argmin]
228
229
 
229
- all_hashvalues[it_doc] = np.zeros((self.sample_size, 2), dtype=int)
230
-
231
- hashvalues = all_hashvalues[it_doc]
230
+ hashvalues = np.zeros((self.sample_size, 2), dtype=int)
231
+ all_hashvalues[it_doc] = hashvalues
232
232
  hashvalues[:, 0], hashvalues[:, 1] = (
233
233
  doc_k,
234
234
  t[np.arange(self.sample_size), doc_begin + doc_argmin],
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "datasketch"
7
- version = "1.8.0"
7
+ version = "1.9.0"
8
8
  description = "Probabilistic data structures for processing and searching very large datasets"
9
9
  readme = "README.rst"
10
10
  requires-python = ">=3.9"
@@ -21,6 +21,7 @@ classifiers = [
21
21
  "Programming Language :: Python :: 3.10",
22
22
  "Programming Language :: Python :: 3.11",
23
23
  "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
24
25
  ]
25
26
  dependencies = ["numpy>=1.11", "scipy>=1.0.0"]
26
27
 
@@ -44,11 +45,13 @@ test = [
44
45
  "mock>=2.0.0",
45
46
  "mockredispy",
46
47
  "coverage",
48
+ "pytest-cov",
47
49
  "pymongo>=3.9.0",
48
50
  "nose>=1.3.7",
49
51
  "nose-exclude>=0.5.0",
50
52
  "pytest",
51
53
  "pytest-rerunfailures",
54
+ "pytest-asyncio",
52
55
  ]
53
56
  experimental_aio = ["aiounittest", "motor>3.6.0"]
54
57
 
@@ -91,7 +94,6 @@ exclude = [
91
94
  "dist",
92
95
  "docs",
93
96
  "examples",
94
- "travis",
95
97
  "datasketch/hyperloglog_const.py",
96
98
  ]
97
99
 
@@ -158,5 +160,38 @@ include = ["pyproject.toml", "README.rst", "LICENSE", "datasketch/**"]
158
160
 
159
161
  [tool.pytest.ini_options]
160
162
  minversion = "6.0"
161
- addopts = ["--strict-markers", "--color=yes"]
163
+ addopts = ["--strict-markers", "--color=yes", "--cov-report=xml"]
162
164
  testpaths = ["test"]
165
+ asyncio_mode = "auto"
166
+
167
+ [tool.pyright]
168
+ include = ["datasketch"]
169
+ exclude = [
170
+ "benchmark",
171
+ "docs",
172
+ "examples",
173
+ "test",
174
+ "travis",
175
+ "**/.venv/**",
176
+ "**/__pycache__",
177
+ ]
178
+ pythonVersion = "3.9"
179
+ typeCheckingMode = "basic" # todo: change to "strict" in future
180
+
181
+ reportMissingImports = "none"
182
+ reportUnusedVariable = "warning"
183
+ reportAttributeAccessIssue = "none"
184
+ reportOptionalMemberAccess = "none"
185
+ reportGeneralTypeIssues = "none"
186
+ reportArgumentType = "none"
187
+ reportOptionalIterable = "none"
188
+ reportReturnType = "none"
189
+ reportRedeclaration = "none"
190
+ reportOperatorIssue = "none"
191
+ reportAssignmentType = "none"
192
+ reportOptionalSubscript = "none"
193
+ reportCallIssue = "none"
194
+
195
+ [tool.coverage.run]
196
+ source = ["datasketch"]
197
+ omit = ["*/experimental/*", "*/tests/*", "*/test/*"]
File without changes
File without changes