minhashlib 0.2.0__tar.gz → 0.2.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: minhashlib
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A fast and minimal minhashing based similarity checking library.
5
5
  Project-URL: Repository, https://github.com/ssavutu/minhashlib
6
6
  Project-URL: Bug Tracker, https://github.com/ssavutu/minhashlib/issues
@@ -1,4 +1,5 @@
1
1
  from collections import defaultdict
2
+ from functools import lru_cache
2
3
 
3
4
  import numpy as np
4
5
  from xxhash import xxh3_64_intdigest
@@ -28,10 +29,16 @@ def _false_negative_probability(threshold: float, bands: int, rows: int) -> floa
28
29
  return _integrate(prob, threshold, 1.0)
29
30
 
30
31
 
32
+ @lru_cache(maxsize=None)
31
33
  def _optimal_bands_rows(
32
34
  threshold: float, num_perm: int, fp_weight: float = 0.5, fn_weight: float = 0.5
33
35
  ) -> tuple[int, int]:
34
- """Pick ``(bands, rows)`` minimising weighted false-positive/false-negative error."""
36
+ """Pick ``(bands, rows)`` minimising weighted false-positive/false-negative error.
37
+
38
+ The search is a pure function of its arguments and costs ~tens of ms, so it is
39
+ memoised: constructing many LSH indexes with the same threshold/num_perm
40
+ (common in batch pipelines) recomputes nothing after the first call.
41
+ """
35
42
  best = (num_perm, 1)
36
43
  min_error = float("inf")
37
44
  for bands in range(1, num_perm + 1):
@@ -57,18 +57,30 @@ class MinHash:
57
57
  n = len(document)
58
58
  if n == 0:
59
59
  return np.array([], dtype=np.int64)
60
- if n < self.k:
60
+ p, k = self.p, self.k
61
+ hasher = xxh3_64_intdigest
62
+ if n < k:
61
63
  # Hash the whole string as a single shingle so documents shorter than
62
64
  # k remain distinguishable instead of collapsing to one signature.
63
- return np.array([xxh3_64_intdigest(document.encode("utf-8")) % self.p], dtype=np.int64)
64
-
65
- shingles = np.fromiter(
66
- (
67
- xxh3_64_intdigest(document[i : i + self.k].encode("utf-8")) % self.p
68
- for i in range(n - self.k + 1)
69
- ),
70
- dtype=np.int64,
71
- )
65
+ return np.array([hasher(document.encode("utf-8")) % p], dtype=np.int64)
66
+
67
+ count = n - k + 1
68
+ if document.isascii():
69
+ # ASCII: one byte per char, so encode once and slice the buffer instead
70
+ # of re-encoding each overlapping window. Byte slices are identical to
71
+ # ``document[i:i+k].encode()`` here, so signatures are unchanged.
72
+ data = document.encode("ascii")
73
+ shingles = np.fromiter(
74
+ (hasher(data[i : i + k]) % p for i in range(count)),
75
+ dtype=np.int64,
76
+ count=count,
77
+ )
78
+ else:
79
+ shingles = np.fromiter(
80
+ (hasher(document[i : i + k].encode("utf-8")) % p for i in range(count)),
81
+ dtype=np.int64,
82
+ count=count,
83
+ )
72
84
  return np.unique(shingles)
73
85
 
74
86
  def _generate_hash_parameters(self) -> NDArray[np.int64]:
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "minhashlib"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  dependencies = [
9
9
  "numpy",
10
10
  "numba",
File without changes
File without changes
File without changes