misakanet-core 2.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- misakanet_core-2.7.0/PKG-INFO +68 -0
- misakanet_core-2.7.0/README.md +55 -0
- misakanet_core-2.7.0/pyproject.toml +23 -0
- misakanet_core-2.7.0/setup.cfg +4 -0
- misakanet_core-2.7.0/src/misakanet_core/__init__.py +202 -0
- misakanet_core-2.7.0/src/misakanet_core.egg-info/PKG-INFO +68 -0
- misakanet_core-2.7.0/src/misakanet_core.egg-info/SOURCES.txt +8 -0
- misakanet_core-2.7.0/src/misakanet_core.egg-info/dependency_links.txt +1 -0
- misakanet_core-2.7.0/src/misakanet_core.egg-info/top_level.txt +1 -0
- misakanet_core-2.7.0/tests/test_core.py +126 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: misakanet-core
|
|
3
|
+
Version: 2.7.0
|
|
4
|
+
Summary: The zero-dependency core protocol engine for MisakaNet Swarm Knowledge Network.
|
|
5
|
+
Author-email: Ikalus1988 <sheldonisspark@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/Ikalus1988/MisakaNet
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/Ikalus1988/MisakaNet/issues
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# misakanet-core
|
|
15
|
+
|
|
16
|
+
**Zero-dependency BM25 search engine with RRF fusion** — extracted from [MisakaNet](https://github.com/Ikalus1988/MisakaNet).
|
|
17
|
+
|
|
18
|
+
- Pure Python, stdlib only
|
|
19
|
+
- BM25 ranking with configurable k1/b
|
|
20
|
+
- Metadata-weighted scoring
|
|
21
|
+
- RRF (Reciprocal Rank Fusion) for multi-query fusion
|
|
22
|
+
- CJK-aware tokenization
|
|
23
|
+
- Works in air-gapped environments
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install misakanet-core
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from misakanet_core import BM25, ScoredDocument, tokenize, rrf
|
|
35
|
+
|
|
36
|
+
# Prepare corpus
|
|
37
|
+
docs = [
|
|
38
|
+
ScoredDocument("doc1", tokenize("the cat sat on the mat")),
|
|
39
|
+
ScoredDocument("doc2", tokenize("the dog sat on the log")),
|
|
40
|
+
ScoredDocument("doc3", tokenize("cats and dogs are friends")),
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
# Build index and search
|
|
44
|
+
engine = BM25(docs)
|
|
45
|
+
results = engine.search("cat dog", top_k=5)
|
|
46
|
+
|
|
47
|
+
for result in results:
|
|
48
|
+
print(f"{result.doc_id}: {result.score:.4f}")
|
|
49
|
+
|
|
50
|
+
# Multi-query fusion with RRF
|
|
51
|
+
from misakanet_core import SearchResult, rrf
|
|
52
|
+
query1 = engine.search("cat")
|
|
53
|
+
query2 = engine.search("dog")
|
|
54
|
+
fused = rrf([query1, query2], top_k=3)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Why not use elasticsearch / tantivy / whoosh?
|
|
58
|
+
|
|
59
|
+
| | misakanet-core | elasticsearch | tantivy | whoosh |
|
|
60
|
+
|---|---|---|---|---|
|
|
61
|
+
| Dependencies | **Zero** | JVM | Rust toolchain | Pure Python |
|
|
62
|
+
| Install time | 0.5s | 5min+ | 2min+ | 2s |
|
|
63
|
+
| Air-gapped | ✅ | ❌ | ❌ | ✅ |
|
|
64
|
+
| CJK support | ✅ | ✅ | ⚠️ | ⚠️ |
|
|
65
|
+
|
|
66
|
+
## License
|
|
67
|
+
|
|
68
|
+
MIT
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# misakanet-core
|
|
2
|
+
|
|
3
|
+
**Zero-dependency BM25 search engine with RRF fusion** — extracted from [MisakaNet](https://github.com/Ikalus1988/MisakaNet).
|
|
4
|
+
|
|
5
|
+
- Pure Python, stdlib only
|
|
6
|
+
- BM25 ranking with configurable k1/b
|
|
7
|
+
- Metadata-weighted scoring
|
|
8
|
+
- RRF (Reciprocal Rank Fusion) for multi-query fusion
|
|
9
|
+
- CJK-aware tokenization
|
|
10
|
+
- Works in air-gapped environments
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install misakanet-core
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from misakanet_core import BM25, ScoredDocument, tokenize, rrf
|
|
22
|
+
|
|
23
|
+
# Prepare corpus
|
|
24
|
+
docs = [
|
|
25
|
+
ScoredDocument("doc1", tokenize("the cat sat on the mat")),
|
|
26
|
+
ScoredDocument("doc2", tokenize("the dog sat on the log")),
|
|
27
|
+
ScoredDocument("doc3", tokenize("cats and dogs are friends")),
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
# Build index and search
|
|
31
|
+
engine = BM25(docs)
|
|
32
|
+
results = engine.search("cat dog", top_k=5)
|
|
33
|
+
|
|
34
|
+
for result in results:
|
|
35
|
+
print(f"{result.doc_id}: {result.score:.4f}")
|
|
36
|
+
|
|
37
|
+
# Multi-query fusion with RRF
|
|
38
|
+
from misakanet_core import SearchResult, rrf
|
|
39
|
+
query1 = engine.search("cat")
|
|
40
|
+
query2 = engine.search("dog")
|
|
41
|
+
fused = rrf([query1, query2], top_k=3)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Why not use elasticsearch / tantivy / whoosh?
|
|
45
|
+
|
|
46
|
+
| | misakanet-core | elasticsearch | tantivy | whoosh |
|
|
47
|
+
|---|---|---|---|---|
|
|
48
|
+
| Dependencies | **Zero** | JVM | Rust toolchain | Pure Python |
|
|
49
|
+
| Install time | 0.5s | 5min+ | 2min+ | 2s |
|
|
50
|
+
| Air-gapped | ✅ | ❌ | ❌ | ✅ |
|
|
51
|
+
| CJK support | ✅ | ✅ | ⚠️ | ⚠️ |
|
|
52
|
+
|
|
53
|
+
## License
|
|
54
|
+
|
|
55
|
+
MIT
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "misakanet-core"
|
|
7
|
+
version = "2.7.0"
|
|
8
|
+
description = "The zero-dependency core protocol engine for MisakaNet Swarm Knowledge Network."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name = "Ikalus1988", email = "sheldonisspark@gmail.com" }
|
|
12
|
+
]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
18
|
+
requires-python = ">=3.10"
|
|
19
|
+
dependencies = []
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
"Homepage" = "https://github.com/Ikalus1988/MisakaNet"
|
|
23
|
+
"Bug Tracker" = "https://github.com/Ikalus1988/MisakaNet/issues"
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""misakanet-core: Zero-dependency BM25 search engine.
|
|
2
|
+
Extracted from MisakaNet — see https://github.com/Ikalus1988/MisakaNet
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import math
|
|
7
|
+
import re
|
|
8
|
+
import json
|
|
9
|
+
from typing import NamedTuple, Sequence
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
|
|
13
|
+
# Public API. Every name listed here must exist in this module; a stale entry
# makes ``from misakanet_core import *`` fail with AttributeError.
__all__ = [
    "BM25", "ScoredDocument", "SearchResult",
    "tokenize", "compute_idf", "compute_bm25", "rrf",
]

# Default BM25 parameters, used as the defaults of compute_bm25() and BM25().
# K1 scales the term-frequency part of the score; B weights the
# document-length normalisation (dl / avg_dl).
K1 = 1.5
B = 0.75
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ── Types ──
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SearchResult(NamedTuple):
    """One ranked hit: a document id, its score, and the document metadata."""

    # Identifier of the matched document.
    doc_id: str
    # Ranking score (BM25-based from BM25.search, fused score from rrf).
    score: float
    # Metadata dict carried over from the source document.
    metadata: dict
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class ScoredDocument:
    """A corpus entry: document id, pre-tokenized text and optional metadata."""

    # Identifier reported back in search results.
    doc_id: str
    # Tokens of the document body (e.g. the output of tokenize()).
    tokens: list[str]
    # Free-form metadata; a fresh dict is created per instance.
    metadata: dict = field(default_factory=dict)
    # Score slot, defaults to 0.0.
    score: float = 0.0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ── Tokenization ──
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def tokenize(text: str, min_len: int = 1) -> list[str]:
    """Split *text* into lower-case tokens.

    Runs of ASCII letters and digits form one token each; every CJK Unified
    Ideograph (U+4E00-U+9FFF) becomes its own single-character token. Any
    other character acts as a separator and is discarded.

    Args:
        text: Raw input text.
        min_len: Minimum token length to keep; shorter tokens are dropped.

    Returns:
        The tokens in order of appearance.
    """
    # The second alternative matches exactly one ideograph at a time, so each
    # ideograph is already emitted separately; no space-padding pre-pass over
    # the text is required.
    matches = re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", text.lower())
    return [tok for tok in matches if len(tok) >= min_len]
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ── IDF Computation ──
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def compute_idf(
    docs: Sequence[ScoredDocument],
    smoothing: bool = True,
) -> dict[str, float]:
    """Return the inverse document frequency of every token found in *docs*.

    For a token present in ``freq`` of the ``n`` documents the value is
    ``log((n - freq + 0.5) / (freq + 0.5) + eps)``, where ``eps`` is 1.0 when
    *smoothing* is true and 0.0 otherwise.

    Args:
        docs: Corpus documents; only their ``tokens`` are read.
        smoothing: Add 1.0 inside the logarithm.

    Returns:
        Mapping from token to its IDF value.
    """
    total_docs = len(docs)

    # Document frequency: a token counts once per document, however often it
    # repeats inside that document.
    doc_freq: Counter[str] = Counter()
    for document in docs:
        doc_freq.update(set(document.tokens))

    offset = 0.0
    if smoothing:
        offset = 1.0

    idf: dict[str, float] = {}
    for term, count in doc_freq.items():
        ratio = (total_docs - count + 0.5) / (count + 0.5)
        idf[term] = math.log(ratio + offset)
    return idf
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── BM25 Scoring ──
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def compute_bm25(
    query_tokens: list[str],
    doc_tokens: list[str],
    idf: dict[str, float],
    avg_dl: float,
    k1: float = K1,
    b: float = B,
) -> float:
    """Compute the BM25 score of one document against a query.

    Each distinct query token is counted once. Tokens missing from *idf* or
    absent from the document contribute nothing.

    Args:
        query_tokens: Tokens of the query.
        doc_tokens: Tokens of the document being scored.
        idf: Token -> IDF mapping (see compute_idf).
        avg_dl: Average document length of the corpus, in tokens. When it is
            not positive, the document is treated as average-length instead
            of dividing by zero.
        k1: Term-frequency scaling parameter.
        b: Weight of the document-length normalisation.

    Returns:
        The summed score; 0.0 when nothing matches.
    """
    dl = len(doc_tokens)
    # A non-positive average (e.g. a corpus of empty documents, or a direct
    # call with avg_dl=0) would divide by zero; use a neutral length ratio of
    # 1 in that case, i.e. b * 1.
    scaled_len = b * dl / avg_dl if avg_dl > 0 else b
    # Identical for every query term, so computed once outside the loop.
    length_norm = k1 * (1 - b + scaled_len)

    tf = Counter(doc_tokens)
    score = 0.0
    for term in set(query_tokens):
        if term not in idf:
            continue
        freq = tf.get(term, 0)
        if freq == 0:
            continue
        numerator = freq * (k1 + 1)
        denominator = freq + length_norm
        score += idf[term] * numerator / denominator
    return score
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ── BM25 Engine ──
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class BM25:
    """BM25 retrieval engine over a fixed, pre-tokenized corpus.

    The average document length and the IDF table are computed once at
    construction time from the corpus passed in.

    Usage::

        docs = [
            ScoredDocument("doc1", tokenize("the cat sat on the mat")),
            ScoredDocument("doc2", tokenize("the dog sat on the log")),
        ]
        engine = BM25(docs)
        results = engine.search("cat mat", top_k=5)
    """

    def __init__(
        self,
        corpus: Sequence[ScoredDocument],
        k1: float = K1,
        b: float = B,
    ):
        """Index *corpus*.

        Args:
            corpus: Documents to search; copied into a list.
            k1: BM25 term-frequency scaling parameter.
            b: BM25 length-normalisation weight.
        """
        self.corpus = list(corpus)
        self.k1 = k1
        self.b = b
        # max(..., 1) keeps an empty corpus from dividing by zero.
        self.avg_dl = (
            sum(len(d.tokens) for d in self.corpus) / max(len(self.corpus), 1)
        )
        self.idf = compute_idf(self.corpus)

    def search(
        self,
        query: str,
        top_k: int = 10,
        metadata_weight: float = 0.0,
    ) -> list[SearchResult]:
        """Search the corpus and return ranked results.

        Every document in the corpus is scored, so documents that do not
        match the query are still returned (with their non-BM25 score only)
        when *top_k* leaves room for them.

        Args:
            query: Raw query string (will be tokenized).
            top_k: Maximum number of results to return; values <= 0 yield an
                empty list.
            metadata_weight: Multiplier applied to the sum of a document's
                metadata values and added to its BM25 score. With the default
                of 0 the metadata values are not read at all, so they may be
                of any type.

        Returns:
            List of SearchResult ordered by descending score.
        """
        query_tokens = tokenize(query)
        # A negative top_k would otherwise slice results off the tail.
        if not query_tokens or top_k <= 0:
            return []

        # Only touch metadata values when they can affect the score; in
        # BM25-only mode non-numeric metadata must not raise TypeError.
        use_metadata = metadata_weight != 0

        scored: list[SearchResult] = []
        for doc in self.corpus:
            total = compute_bm25(
                query_tokens, doc.tokens, self.idf, self.avg_dl,
                k1=self.k1, b=self.b,
            )
            if use_metadata and doc.metadata:
                total += sum(doc.metadata.values()) * metadata_weight
            scored.append(SearchResult(doc.doc_id, total, doc.metadata))

        scored.sort(key=lambda hit: hit.score, reverse=True)
        return scored[:top_k]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ── RRF (Reciprocal Rank Fusion) ──
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def rrf(
    result_lists: list[list[SearchResult]],
    k: int = 60,
    top_k: int = 10,
) -> list[SearchResult]:
    """Merge several ranked lists with Reciprocal Rank Fusion.

    A result at 1-based position ``rank`` of a list adds ``1 / (k + rank)``
    to the fused score of its document. The metadata kept for a document is
    the one from its first occurrence.

    Args:
        result_lists: Ranked result lists, best hit first.
        k: RRF constant added to every rank.
        top_k: Maximum number of fused results.

    Returns:
        Fused results ordered by descending RRF score.
    """
    fused_scores: dict[str, float] = {}
    first_seen_meta: dict[str, dict] = {}

    for ranking in result_lists:
        position = 0
        for hit in ranking:
            position += 1
            doc = hit.doc_id
            fused_scores[doc] = fused_scores.get(doc, 0.0) + 1.0 / (k + position)
            # Keeps the metadata of the first occurrence only.
            first_seen_meta.setdefault(doc, hit.metadata)

    ordered = sorted(fused_scores.items(), key=lambda pair: -pair[1])

    output = []
    for doc, total in ordered[:top_k]:
        output.append(SearchResult(doc, total, first_seen_meta.get(doc, {})))
    return output
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: misakanet-core
|
|
3
|
+
Version: 2.7.0
|
|
4
|
+
Summary: The zero-dependency core protocol engine for MisakaNet Swarm Knowledge Network.
|
|
5
|
+
Author-email: Ikalus1988 <sheldonisspark@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/Ikalus1988/MisakaNet
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/Ikalus1988/MisakaNet/issues
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# misakanet-core
|
|
15
|
+
|
|
16
|
+
**Zero-dependency BM25 search engine with RRF fusion** — extracted from [MisakaNet](https://github.com/Ikalus1988/MisakaNet).
|
|
17
|
+
|
|
18
|
+
- Pure Python, stdlib only
|
|
19
|
+
- BM25 ranking with configurable k1/b
|
|
20
|
+
- Metadata-weighted scoring
|
|
21
|
+
- RRF (Reciprocal Rank Fusion) for multi-query fusion
|
|
22
|
+
- CJK-aware tokenization
|
|
23
|
+
- Works in air-gapped environments
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install misakanet-core
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from misakanet_core import BM25, ScoredDocument, tokenize, rrf
|
|
35
|
+
|
|
36
|
+
# Prepare corpus
|
|
37
|
+
docs = [
|
|
38
|
+
ScoredDocument("doc1", tokenize("the cat sat on the mat")),
|
|
39
|
+
ScoredDocument("doc2", tokenize("the dog sat on the log")),
|
|
40
|
+
ScoredDocument("doc3", tokenize("cats and dogs are friends")),
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
# Build index and search
|
|
44
|
+
engine = BM25(docs)
|
|
45
|
+
results = engine.search("cat dog", top_k=5)
|
|
46
|
+
|
|
47
|
+
for result in results:
|
|
48
|
+
print(f"{result.doc_id}: {result.score:.4f}")
|
|
49
|
+
|
|
50
|
+
# Multi-query fusion with RRF
|
|
51
|
+
from misakanet_core import SearchResult, rrf
|
|
52
|
+
query1 = engine.search("cat")
|
|
53
|
+
query2 = engine.search("dog")
|
|
54
|
+
fused = rrf([query1, query2], top_k=3)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Why not use elasticsearch / tantivy / whoosh?
|
|
58
|
+
|
|
59
|
+
| | misakanet-core | elasticsearch | tantivy | whoosh |
|
|
60
|
+
|---|---|---|---|---|
|
|
61
|
+
| Dependencies | **Zero** | JVM | Rust toolchain | Pure Python |
|
|
62
|
+
| Install time | 0.5s | 5min+ | 2min+ | 2s |
|
|
63
|
+
| Air-gapped | ✅ | ❌ | ❌ | ✅ |
|
|
64
|
+
| CJK support | ✅ | ✅ | ⚠️ | ⚠️ |
|
|
65
|
+
|
|
66
|
+
## License
|
|
67
|
+
|
|
68
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
misakanet_core
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Tests for misakanet-core."""
|
|
2
|
+
from misakanet_core import (
|
|
3
|
+
BM25, ScoredDocument, SearchResult,
|
|
4
|
+
tokenize, compute_idf, compute_bm25, rrf,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_tokenize_basic():
    """Latin text is lower-cased and split; ideographs become single tokens."""
    expectations = {
        "Hello World": ["hello", "world"],
        "BM25检索": ["bm25", "检", "索"],
    }
    for raw, expected in expectations.items():
        assert tokenize(raw) == expected
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_tokenize_cjk():
    """CJK characters are kept as individual tokens for BM25 matching."""
    tokens = tokenize("中文测试")
    assert len(tokens) == 4
    for char in "中文测试":
        assert char in tokens
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_compute_idf():
    """Rarer tokens get a higher IDF; equally frequent tokens get the same."""
    docs = [
        ScoredDocument("d1", tokenize("cat sat on mat")),
        ScoredDocument("d2", tokenize("dog sat on log")),
        ScoredDocument("d3", tokenize("cat sat on log")),
    ]
    idf = compute_idf(docs)
    # Document frequencies: mat=1, cat=2, on=3, sat=3.
    assert idf["mat"] > idf["cat"]  # 1 doc vs 2 docs
    assert idf["cat"] > idf["on"]  # 2 docs vs all 3
    assert idf["sat"] == idf["on"]  # same document frequency
    assert set(idf) == {"cat", "sat", "on", "mat", "dog", "log"}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_bm25_simple():
    """The only document containing the query term ranks first."""
    texts = (
        ("d1", "cat sat on the mat"),
        ("d2", "the dog sat on the log"),
    )
    corpus = [ScoredDocument(doc_id, tokenize(text)) for doc_id, text in texts]
    hits = BM25(corpus).search("cat", top_k=5)
    # Non-matching documents are still returned, behind the matching one.
    assert len(hits) == 2
    assert hits[0].doc_id == "d1"  # d1 has "cat"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_bm25_top_k():
    """top_k truncates the ranking to the best-scoring documents."""
    docs = []
    for i in range(10):
        docs.append(ScoredDocument(f"d{i}", tokenize(f"word{i} " * 3)))
    top_two = BM25(docs).search("word3 word7", top_k=2)
    assert [hit.doc_id for hit in top_two] == ["d3", "d7"]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_bm25_empty_query():
    """A query with no tokens yields no results."""
    corpus = [ScoredDocument("d1", tokenize("hello world"))]
    assert BM25(corpus).search("") == []
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_bm25_empty_corpus():
    """Searching an empty corpus yields no results and does not raise."""
    empty_engine = BM25([])
    assert empty_engine.search("anything") == []
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_rrf_basic():
    """A document ranked first in both lists wins the fusion."""
    r1 = [
        SearchResult("a", 1.0, {}),
        SearchResult("b", 0.5, {}),
        SearchResult("c", 0.3, {}),
    ]
    r2 = [
        SearchResult("a", 0.9, {}),
        SearchResult("d", 0.4, {}),
        SearchResult("e", 0.2, {}),
    ]
    fused = rrf([r1, r2], k=60, top_k=3)
    # Five distinct documents are available, so top_k=3 must return exactly
    # three; "<= 3" would also pass on an empty result.
    assert len(fused) == 3
    assert fused[0].doc_id == "a"  # rank 1 in both lists
    assert fused[0].score > fused[1].score
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_rrf_empty():
    """Empty input, and empty lists among the input, are handled."""
    assert rrf([], top_k=5) == []
    only_hit = SearchResult("a", 1.0, {})
    expected = [SearchResult("a", 1.0 / 61, {})]
    assert rrf([[only_hit], []], top_k=5) == expected
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def test_rrf_top_k_limits():
    """top_k caps the fused ranking even when more documents are available."""
    r1 = [SearchResult(chr(ord('a') + i), 1.0 - i * 0.1, {}) for i in range(5)]
    r2 = [SearchResult(chr(ord('z') - i), 1.0 - i * 0.1, {}) for i in range(5)]
    fused = rrf([r1, r2], top_k=3)
    # Ten distinct documents go in, so exactly three must come out.
    assert len(fused) == 3
    # The two rank-1 documents outrank everything else.
    assert {fused[0].doc_id, fused[1].doc_id} == {"a", "z"}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_metadata_scoring():
    """A metadata boost can overturn the pure BM25 ranking."""
    docs = [
        ScoredDocument("d1", tokenize("cat mat sat"), metadata={"priority": 1.0}),
        ScoredDocument("d2", tokenize("dog cat"), metadata={"priority": 0.0}),
    ]
    engine = BM25(docs)
    results_no_meta = engine.search("cat", top_k=5, metadata_weight=0.0)
    results_with_meta = engine.search("cat", top_k=5, metadata_weight=0.5)

    # BM25 only: both contain "cat" once, but d2 is shorter, so length
    # normalisation ranks it first.
    assert results_no_meta[0].doc_id == "d2"
    # With the weight applied, d1's priority of 1.0 lifts it above d2.
    assert results_with_meta[0].doc_id == "d1"

    # Compare like with like: d1's score must grow, d2's (priority 0.0) must not.
    no_meta = {r.doc_id: r.score for r in results_no_meta}
    with_meta = {r.doc_id: r.score for r in results_with_meta}
    assert with_meta["d1"] > no_meta["d1"]
    assert with_meta["d2"] == no_meta["d2"]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_compute_bm25_score():
    """compute_bm25 matches a hand-computed value and ignores missing terms."""
    idf = {"cat": 0.5, "dog": 0.3}
    avg_dl = 3.0
    doc = ["cat", "dog", "bird"]
    score = compute_bm25(["cat"], doc, idf, avg_dl)
    # The document is exactly average length, so with the default k1=1.5 and
    # b=0.75 the score is 0.5 * (1 * 2.5) / (1 + 1.5 * 1.0) = 0.5.
    assert abs(score - 0.5) < 1e-12
    # A repeated query token is counted once.
    assert compute_bm25(["cat", "cat"], doc, idf, avg_dl) == score
    # "bird" is in the document but has no IDF entry; "fish" is in neither.
    assert compute_bm25(["bird", "fish"], doc, idf, avg_dl) == 0.0
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_multiple_queries_rrf():
    """RRF across different query aspects should handle overlap."""
    rankings = [
        [SearchResult("d1", 0.9, {}), SearchResult("d3", 0.7, {})],  # "cat"
        [SearchResult("d2", 0.8, {}), SearchResult("d3", 0.6, {})],  # "dog"
    ]
    fused = rrf(rankings, top_k=3)
    assert len(fused) >= 2
    fused_ids = [hit.doc_id for hit in fused]
    assert fused_ids.count("d3") == 1  # no duplicates
|