misakanet-core 2.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: misakanet-core
3
+ Version: 2.7.0
4
+ Summary: The zero-dependency core protocol engine for MisakaNet Swarm Knowledge Network.
5
+ Author-email: Ikalus1988 <sheldonisspark@gmail.com>
6
+ Project-URL: Homepage, https://github.com/Ikalus1988/MisakaNet
7
+ Project-URL: Bug Tracker, https://github.com/Ikalus1988/MisakaNet/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+
14
+ # misakanet-core
15
+
16
+ **Zero-dependency BM25 search engine with RRF fusion** — extracted from [MisakaNet](https://github.com/Ikalus1988/MisakaNet).
17
+
18
+ - Pure Python, stdlib only
19
+ - BM25 ranking with configurable k1/b
20
+ - Metadata-weighted scoring
21
+ - RRF (Reciprocal Rank Fusion) for multi-query fusion
22
+ - CJK-aware tokenization
23
+ - Works in air-gapped environments
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install misakanet-core
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ```python
34
+ from misakanet_core import BM25, ScoredDocument, tokenize, rrf
35
+
36
+ # Prepare corpus
37
+ docs = [
38
+ ScoredDocument("doc1", tokenize("the cat sat on the mat")),
39
+ ScoredDocument("doc2", tokenize("the dog sat on the log")),
40
+ ScoredDocument("doc3", tokenize("cats and dogs are friends")),
41
+ ]
42
+
43
+ # Build index and search
44
+ engine = BM25(docs)
45
+ results = engine.search("cat dog", top_k=5)
46
+
47
+ for result in results:
48
+ print(f"{result.doc_id}: {result.score:.4f}")
49
+
50
+ # Multi-query fusion with RRF
51
+ from misakanet_core import SearchResult, rrf
52
+ query1 = engine.search("cat")
53
+ query2 = engine.search("dog")
54
+ fused = rrf([query1, query2], top_k=3)
55
+ ```
56
+
57
+ ## Why not use elasticsearch / tantivy / whoosh?
58
+
59
+ | | misakanet-core | elasticsearch | tantivy | whoosh |
60
+ |---|---|---|---|---|
61
+ | Dependencies | **Zero** | JVM | Rust toolchain | Pure Python |
62
+ | Install time | 0.5s | 5min+ | 2min+ | 2s |
63
+ | Air-gapped | ✅ | ❌ | ❌ | ✅ |
64
+ | CJK support | ✅ | ✅ | ⚠️ | ⚠️ |
65
+
66
+ ## License
67
+
68
+ MIT
@@ -0,0 +1,55 @@
1
+ # misakanet-core
2
+
3
+ **Zero-dependency BM25 search engine with RRF fusion** — extracted from [MisakaNet](https://github.com/Ikalus1988/MisakaNet).
4
+
5
+ - Pure Python, stdlib only
6
+ - BM25 ranking with configurable k1/b
7
+ - Metadata-weighted scoring
8
+ - RRF (Reciprocal Rank Fusion) for multi-query fusion
9
+ - CJK-aware tokenization
10
+ - Works in air-gapped environments
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install misakanet-core
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+ from misakanet_core import BM25, ScoredDocument, tokenize, rrf
22
+
23
+ # Prepare corpus
24
+ docs = [
25
+ ScoredDocument("doc1", tokenize("the cat sat on the mat")),
26
+ ScoredDocument("doc2", tokenize("the dog sat on the log")),
27
+ ScoredDocument("doc3", tokenize("cats and dogs are friends")),
28
+ ]
29
+
30
+ # Build index and search
31
+ engine = BM25(docs)
32
+ results = engine.search("cat dog", top_k=5)
33
+
34
+ for result in results:
35
+ print(f"{result.doc_id}: {result.score:.4f}")
36
+
37
+ # Multi-query fusion with RRF
38
+ from misakanet_core import SearchResult, rrf
39
+ query1 = engine.search("cat")
40
+ query2 = engine.search("dog")
41
+ fused = rrf([query1, query2], top_k=3)
42
+ ```
43
+
44
+ ## Why not use elasticsearch / tantivy / whoosh?
45
+
46
+ | | misakanet-core | elasticsearch | tantivy | whoosh |
47
+ |---|---|---|---|---|
48
+ | Dependencies | **Zero** | JVM | Rust toolchain | Pure Python |
49
+ | Install time | 0.5s | 5min+ | 2min+ | 2s |
50
+ | Air-gapped | ✅ | ❌ | ❌ | ✅ |
51
+ | CJK support | ✅ | ✅ | ⚠️ | ⚠️ |
52
+
53
+ ## License
54
+
55
+ MIT
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "misakanet-core"
7
+ version = "2.7.0"
8
+ description = "The zero-dependency core protocol engine for MisakaNet Swarm Knowledge Network."
9
+ readme = "README.md"
10
+ authors = [
11
+ { name = "Ikalus1988", email = "sheldonisspark@gmail.com" }
12
+ ]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ ]
18
+ requires-python = ">=3.10"
19
+ dependencies = []
20
+
21
+ [project.urls]
22
+ "Homepage" = "https://github.com/Ikalus1988/MisakaNet"
23
+ "Bug Tracker" = "https://github.com/Ikalus1988/MisakaNet/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,202 @@
1
+ """misakanet-core: Zero-dependency BM25 search engine.
2
+ Extracted from MisakaNet — see https://github.com/Ikalus1988/MisakaNet
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import math
7
+ import re
8
+ import json
9
+ from typing import NamedTuple, Sequence
10
+ from collections import Counter
11
+ from dataclasses import dataclass, field
12
+
13
+ __all__ = [
14
+ "BM25", "RRF", "SearchResult",
15
+ "tokenize", "compute_idf", "compute_bm25",
16
+ ]
17
+
18
+ K1 = 1.5
19
+ B = 0.75
20
+
21
+
22
+ # ── Types ──
23
+
24
+
25
class SearchResult(NamedTuple):
    """A single ranked hit: document id, its score, and the document metadata."""

    doc_id: str
    score: float
    metadata: dict
29
+
30
+
31
@dataclass
class ScoredDocument:
    """A corpus entry: a document id, its token list, and optional metadata.

    ``metadata`` defaults to a fresh empty dict per instance and ``score``
    defaults to 0.0.
    """

    doc_id: str
    tokens: list[str]
    # default_factory gives every instance its own dict instead of a shared one.
    metadata: dict = field(default_factory=dict)
    score: float = 0.0
38
+
39
+
40
+ # ── Tokenization ──
41
+
42
+
43
# A token is either a run of ASCII letters/digits or one single character in
# the range U+4E00-U+9FFF. Compiled once at import time.
_TOKEN_RE = re.compile(r"[a-z0-9]+|[\u4e00-\u9fff]")


def tokenize(text: str, min_len: int = 1) -> list[str]:
    """Split text into lower-case tokens, dropping tokens shorter than min_len.

    Handles mixed Latin/CJK text: runs of ``a-z``/``0-9`` become one token
    each, while every character in U+4E00-U+9FFF becomes its own token.
    Anything else (whitespace, punctuation, other scripts) acts as a
    separator and is discarded.

    Args:
        text: Raw input string; it is lower-cased before matching.
        min_len: Minimum token length to keep. Because CJK tokens are always
            one character long, any value above 1 removes them as well.

    Returns:
        The tokens in order of appearance.
    """
    # The pattern already yields each CJK character separately, so no
    # pre-pass that pads those characters with spaces is needed.
    return [tok for tok in _TOKEN_RE.findall(text.lower()) if len(tok) >= min_len]
54
+
55
+
56
+ # ── IDF Computation ──
57
+
58
+
59
def compute_idf(
    docs: Sequence[ScoredDocument],
    smoothing: bool = True,
) -> dict[str, float]:
    """Return the inverse document frequency of every token in the corpus.

    For a token found in ``freq`` of the ``n`` documents the value is
    ``log((n - freq + 0.5) / (freq + 0.5) + eps)``, where ``eps`` is 1.0 when
    ``smoothing`` is true and 0.0 otherwise.

    Args:
        docs: Corpus; only each document's ``tokens`` attribute is read.
        smoothing: Whether to add 1.0 inside the logarithm.

    Returns:
        Mapping from token to its IDF value (empty for an empty corpus).
    """
    total_docs = len(docs)

    # Count documents containing each token, not total occurrences.
    doc_freq: Counter[str] = Counter()
    for entry in docs:
        doc_freq.update(set(entry.tokens))

    offset = 1.0 if smoothing else 0.0
    idf_by_token: dict[str, float] = {}
    for term, count in doc_freq.items():
        ratio = (total_docs - count + 0.5) / (count + 0.5)
        idf_by_token[term] = math.log(ratio + offset)
    return idf_by_token
75
+
76
+
77
+ # ── BM25 Scoring ──
78
+
79
+
80
def compute_bm25(
    query_tokens: list[str],
    doc_tokens: list[str],
    idf: dict[str, float],
    avg_dl: float,
    k1: float = K1,
    b: float = B,
) -> float:
    """Compute the BM25 score of a single document against a query.

    Each distinct query token that occurs in the document and has an entry
    in ``idf`` contributes
    ``idf * freq * (k1 + 1) / (freq + k1 * (1 - b + b * dl / avg_dl))``.

    Args:
        query_tokens: Tokenized query; duplicates are counted once.
        doc_tokens: Tokenized document.
        idf: Token -> IDF mapping; tokens missing from it are ignored.
        avg_dl: Average document length of the corpus. When it is not
            positive the document is treated as having average length
            instead of raising ZeroDivisionError.
        k1: Term-frequency parameter of the formula above.
        b: Length-normalisation parameter of the formula above.

    Returns:
        The summed score; 0.0 when no query token matches.
    """
    dl = len(doc_tokens)
    term_freq = Counter(doc_tokens)

    # The length term does not depend on the query token, so compute it once.
    if avg_dl > 0:
        length_norm = k1 * (1 - b + b * dl / avg_dl)
    else:
        # Degenerate corpus statistics: behave as if dl == avg_dl.
        length_norm = k1

    score = 0.0
    for term in set(query_tokens):
        freq = term_freq.get(term, 0)
        if freq == 0 or term not in idf:
            continue
        numerator = freq * (k1 + 1)
        score += idf[term] * numerator / (freq + length_norm)
    return score
102
+
103
+
104
+ # ── BM25 Engine ──
105
+
106
+
107
class BM25:
    """BM25 retrieval engine with configurable scoring.

    The corpus is copied into a list, and the average document length and
    the IDF table are computed once in the constructor.

    Usage::

        docs = [
            ScoredDocument("doc1", tokenize("the cat sat on the mat")),
            ScoredDocument("doc2", tokenize("the dog sat on the log")),
        ]
        engine = BM25(docs)
        results = engine.search("cat mat", top_k=5)
    """

    def __init__(
        self,
        corpus: Sequence[ScoredDocument],
        k1: float = K1,
        b: float = B,
    ):
        """Index *corpus*.

        Args:
            corpus: Documents to search over.
            k1: BM25 ``k1`` parameter passed to compute_bm25().
            b: BM25 ``b`` parameter passed to compute_bm25().
        """
        self.corpus = list(corpus)
        self.k1 = k1
        self.b = b
        # max(..., 1) keeps an empty corpus from dividing by zero.
        self.avg_dl = (
            sum(len(d.tokens) for d in self.corpus) / max(len(self.corpus), 1)
        )
        self.idf = compute_idf(self.corpus)

    @staticmethod
    def _metadata_score(metadata: dict) -> float:
        """Sum the numeric values of *metadata*, ignoring all other values."""
        return sum(v for v in metadata.values() if isinstance(v, (int, float)))

    def search(
        self,
        query: str,
        top_k: int = 10,
        metadata_weight: float = 0.0,
    ) -> list[SearchResult]:
        """Search the corpus and return ranked results.

        Every document in the corpus is scored, so documents that match no
        query token are still returned (with a BM25 score of 0) when
        ``top_k`` leaves room for them.

        Args:
            query: Raw query string (will be tokenized).
            top_k: Maximum number of results to return; a value of zero or
                less yields an empty list.
            metadata_weight: Weight for metadata score (0 = BM25 only). The
                metadata score is the sum of the numeric values in a
                document's metadata; non-numeric values are ignored.

        Returns:
            List of SearchResult ordered by descending score.
        """
        query_tokens = tokenize(query)
        if not query_tokens or top_k <= 0:
            return []

        scored: list[SearchResult] = []
        for doc in self.corpus:
            total = compute_bm25(
                query_tokens, doc.tokens, self.idf, self.avg_dl,
                k1=self.k1, b=self.b,
            )
            # Only touch metadata when it can influence the ranking, so
            # descriptive (non-numeric) metadata never breaks a plain search.
            if metadata_weight and doc.metadata:
                total += self._metadata_score(doc.metadata) * metadata_weight
            scored.append(SearchResult(doc.doc_id, total, doc.metadata))

        # list.sort is stable, so equally scored documents keep corpus order.
        scored.sort(key=lambda hit: hit.score, reverse=True)
        return scored[:top_k]
169
+
170
+
171
+ # ── RRF (Reciprocal Rank Fusion) ──
172
+
173
+
174
def rrf(
    result_lists: list[list[SearchResult]],
    k: int = 60,
    top_k: int = 10,
) -> list[SearchResult]:
    """Merge several ranked lists with Reciprocal Rank Fusion.

    A document at 1-based position ``rank`` in a list earns
    ``1 / (k + rank)``; its fused score is the sum over all lists. The
    metadata attached to a fused result is taken from the first occurrence
    of that document id.

    Args:
        result_lists: Ranked result lists, e.g. one per query.
        k: RRF constant added to every rank (default 60).
        top_k: Maximum number of fused results.

    Returns:
        SearchResult items carrying the fused score, highest first.
    """
    fused_scores: dict[str, float] = {}
    first_seen_meta: dict[str, dict] = {}

    for ranking in result_lists:
        for position, hit in enumerate(ranking, start=1):
            doc_id = hit.doc_id
            fused_scores[doc_id] = fused_scores.get(doc_id, 0.0) + 1.0 / (k + position)
            first_seen_meta.setdefault(doc_id, hit.metadata)

    # Stable sort: documents with equal fused scores keep first-seen order.
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)

    fused = []
    for doc_id, total in ranked[:top_k]:
        fused.append(SearchResult(doc_id, total, first_seen_meta.get(doc_id, {})))
    return fused
@@ -0,0 +1,68 @@
1
+ Metadata-Version: 2.4
2
+ Name: misakanet-core
3
+ Version: 2.7.0
4
+ Summary: The zero-dependency core protocol engine for MisakaNet Swarm Knowledge Network.
5
+ Author-email: Ikalus1988 <sheldonisspark@gmail.com>
6
+ Project-URL: Homepage, https://github.com/Ikalus1988/MisakaNet
7
+ Project-URL: Bug Tracker, https://github.com/Ikalus1988/MisakaNet/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+
14
+ # misakanet-core
15
+
16
+ **Zero-dependency BM25 search engine with RRF fusion** — extracted from [MisakaNet](https://github.com/Ikalus1988/MisakaNet).
17
+
18
+ - Pure Python, stdlib only
19
+ - BM25 ranking with configurable k1/b
20
+ - Metadata-weighted scoring
21
+ - RRF (Reciprocal Rank Fusion) for multi-query fusion
22
+ - CJK-aware tokenization
23
+ - Works in air-gapped environments
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install misakanet-core
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ```python
34
+ from misakanet_core import BM25, ScoredDocument, tokenize, rrf
35
+
36
+ # Prepare corpus
37
+ docs = [
38
+ ScoredDocument("doc1", tokenize("the cat sat on the mat")),
39
+ ScoredDocument("doc2", tokenize("the dog sat on the log")),
40
+ ScoredDocument("doc3", tokenize("cats and dogs are friends")),
41
+ ]
42
+
43
+ # Build index and search
44
+ engine = BM25(docs)
45
+ results = engine.search("cat dog", top_k=5)
46
+
47
+ for result in results:
48
+ print(f"{result.doc_id}: {result.score:.4f}")
49
+
50
+ # Multi-query fusion with RRF
51
+ from misakanet_core import SearchResult, rrf
52
+ query1 = engine.search("cat")
53
+ query2 = engine.search("dog")
54
+ fused = rrf([query1, query2], top_k=3)
55
+ ```
56
+
57
+ ## Why not use elasticsearch / tantivy / whoosh?
58
+
59
+ | | misakanet-core | elasticsearch | tantivy | whoosh |
60
+ |---|---|---|---|---|
61
+ | Dependencies | **Zero** | JVM | Rust toolchain | Pure Python |
62
+ | Install time | 0.5s | 5min+ | 2min+ | 2s |
63
+ | Air-gapped | ✅ | ❌ | ❌ | ✅ |
64
+ | CJK support | ✅ | ✅ | ⚠️ | ⚠️ |
65
+
66
+ ## License
67
+
68
+ MIT
@@ -0,0 +1,8 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/misakanet_core/__init__.py
4
+ src/misakanet_core.egg-info/PKG-INFO
5
+ src/misakanet_core.egg-info/SOURCES.txt
6
+ src/misakanet_core.egg-info/dependency_links.txt
7
+ src/misakanet_core.egg-info/top_level.txt
8
+ tests/test_core.py
@@ -0,0 +1 @@
1
+ misakanet_core
@@ -0,0 +1,126 @@
1
+ """Tests for misakanet-core."""
2
+ from misakanet_core import (
3
+ BM25, ScoredDocument, SearchResult,
4
+ tokenize, compute_idf, compute_bm25, rrf,
5
+ )
6
+
7
+
8
def test_tokenize_basic():
    """Latin words are lower-cased; CJK characters are split one per token."""
    latin = tokenize("Hello World")
    assert latin == ["hello", "world"]
    mixed = tokenize("BM25检索")
    assert mixed == ["bm25", "检", "索"]
11
+
12
+
13
def test_tokenize_cjk():
    """Each CJK character becomes its own token so BM25 can match it."""
    tokens = tokenize("中文测试")
    for char in ("中", "文", "测", "试"):
        assert char in tokens
    assert len(tokens) == 4
21
+
22
+
23
def test_compute_idf():
    """Tokens that occur in fewer documents receive a higher IDF."""
    docs = [
        ScoredDocument("d1", tokenize("cat sat on mat")),
        ScoredDocument("d2", tokenize("dog sat on log")),
        ScoredDocument("d3", tokenize("cat sat on log")),
    ]
    idf = compute_idf(docs)
    assert idf["mat"] > idf["cat"]  # 1 document vs 2: rarer = higher IDF
    assert idf["cat"] > idf["on"]  # "on" appears in all 3, lowest IDF
    assert "sat" in idf
33
+
34
+
35
def test_bm25_simple():
    """The document containing the query term ranks first."""
    texts = (
        ("d1", "cat sat on the mat"),
        ("d2", "the dog sat on the log"),
    )
    corpus = [ScoredDocument(doc_id, tokenize(text)) for doc_id, text in texts]
    hits = BM25(corpus).search("cat", top_k=5)
    # Non-matching documents are returned too, so both come back.
    assert len(hits) == 2
    assert hits[0].doc_id == "d1"  # only d1 contains "cat"
44
+
45
+
46
def test_bm25_top_k():
    """top_k caps the result count and the two matching documents lead."""
    corpus = []
    for i in range(10):
        corpus.append(ScoredDocument(f"d{i}", tokenize(f"word{i} " * 3)))
    hits = BM25(corpus).search("word3 word7", top_k=2)
    assert len(hits) == 2
    assert [hit.doc_id for hit in hits] == ["d3", "d7"]
53
+
54
+
55
def test_bm25_empty_query():
    """An empty query string yields no results."""
    corpus = [ScoredDocument("d1", tokenize("hello world"))]
    hits = BM25(corpus).search("")
    assert hits == []
60
+
61
+
62
def test_bm25_empty_corpus():
    """Searching an engine built from no documents yields no results."""
    hits = BM25([]).search("anything")
    assert hits == []
66
+
67
+
68
def test_rrf_basic():
    """A document ranked first in both lists wins the fusion."""
    first = [
        SearchResult(doc_id, score, {})
        for doc_id, score in (("a", 1.0), ("b", 0.5), ("c", 0.3))
    ]
    second = [
        SearchResult(doc_id, score, {})
        for doc_id, score in (("a", 0.9), ("d", 0.4), ("e", 0.2))
    ]
    fused = rrf([first, second], k=60, top_k=3)
    assert len(fused) <= 3
    assert fused[0].doc_id == "a"  # rank 1 in both lists
83
+
84
+
85
def test_rrf_empty():
    """No lists give no results; an empty list next to a real one is harmless."""
    assert rrf([], top_k=5) == []

    only_hit = SearchResult("a", 1.0, {})
    expected = [SearchResult("a", 1.0 / 61, {})]  # rank 1 with k=60
    assert rrf([[only_hit], []], top_k=5) == expected
88
+
89
+
90
def test_rrf_top_k_limits():
    """Fusing two disjoint five-item lists returns at most top_k results."""
    forward = [
        SearchResult(name, 1.0 - i * 0.1, {}) for i, name in enumerate("abcde")
    ]
    backward = [
        SearchResult(name, 1.0 - i * 0.1, {}) for i, name in enumerate("zyxwv")
    ]
    fused = rrf([forward, backward], top_k=3)
    assert len(fused) <= 3
95
+
96
+
97
def test_metadata_scoring():
    """A metadata boost can overturn the BM25-only ranking."""
    docs = [
        ScoredDocument("d1", tokenize("cat mat sat"), metadata={"priority": 1.0}),
        ScoredDocument("d2", tokenize("dog cat"), metadata={"priority": 0.0}),
    ]
    engine = BM25(docs)
    results_no_meta = engine.search("cat", top_k=5, metadata_weight=0.0)
    results_with_meta = engine.search("cat", top_k=5, metadata_weight=0.5)
    # BM25 only: both contain "cat" once, but d2 is shorter and so ranks first.
    assert results_no_meta[0].doc_id == "d2"
    # With the weight applied, d1's priority of 1.0 lifts it above d2.
    assert results_with_meta[0].doc_id == "d1"
    assert results_no_meta[0].score <= results_with_meta[0].score
110
+
111
+
112
def test_compute_bm25_score():
    """A document containing the query term gets a positive score."""
    idf_table = {"cat": 0.5, "dog": 0.3}
    document = ["cat", "dog", "bird"]
    score = compute_bm25(["cat"], document, idf_table, 3.0)
    assert score > 0
117
+
118
+
119
def test_multiple_queries_rrf():
    """A document present in several input lists appears once in the fusion."""
    cat_hits = [SearchResult("d1", 0.9, {}), SearchResult("d3", 0.7, {})]
    dog_hits = [SearchResult("d2", 0.8, {}), SearchResult("d3", 0.6, {})]
    fused = rrf([cat_hits, dog_hits], top_k=3)
    assert len(fused) >= 2
    fused_ids = [hit.doc_id for hit in fused]
    assert fused_ids.count("d3") == 1  # no duplicates