superlocalmemory 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ATTRIBUTION.md +140 -0
- package/CHANGELOG.md +1749 -0
- package/LICENSE +21 -0
- package/README.md +600 -0
- package/bin/aider-smart +72 -0
- package/bin/slm +202 -0
- package/bin/slm-npm +73 -0
- package/bin/slm.bat +195 -0
- package/bin/slm.cmd +10 -0
- package/bin/superlocalmemoryv2:list +3 -0
- package/bin/superlocalmemoryv2:profile +3 -0
- package/bin/superlocalmemoryv2:recall +3 -0
- package/bin/superlocalmemoryv2:remember +3 -0
- package/bin/superlocalmemoryv2:reset +3 -0
- package/bin/superlocalmemoryv2:status +3 -0
- package/completions/slm.bash +58 -0
- package/completions/slm.zsh +76 -0
- package/configs/antigravity-mcp.json +13 -0
- package/configs/chatgpt-desktop-mcp.json +7 -0
- package/configs/claude-desktop-mcp.json +15 -0
- package/configs/codex-mcp.toml +13 -0
- package/configs/cody-commands.json +29 -0
- package/configs/continue-mcp.yaml +14 -0
- package/configs/continue-skills.yaml +26 -0
- package/configs/cursor-mcp.json +15 -0
- package/configs/gemini-cli-mcp.json +11 -0
- package/configs/jetbrains-mcp.json +11 -0
- package/configs/opencode-mcp.json +12 -0
- package/configs/perplexity-mcp.json +9 -0
- package/configs/vscode-copilot-mcp.json +12 -0
- package/configs/windsurf-mcp.json +16 -0
- package/configs/zed-mcp.json +12 -0
- package/docs/ARCHITECTURE.md +877 -0
- package/docs/CLI-COMMANDS-REFERENCE.md +425 -0
- package/docs/COMPETITIVE-ANALYSIS.md +210 -0
- package/docs/COMPRESSION-README.md +390 -0
- package/docs/GRAPH-ENGINE.md +503 -0
- package/docs/MCP-MANUAL-SETUP.md +720 -0
- package/docs/MCP-TROUBLESHOOTING.md +787 -0
- package/docs/PATTERN-LEARNING.md +363 -0
- package/docs/PROFILES-GUIDE.md +453 -0
- package/docs/RESET-GUIDE.md +353 -0
- package/docs/SEARCH-ENGINE-V2.2.0.md +748 -0
- package/docs/SEARCH-INTEGRATION-GUIDE.md +502 -0
- package/docs/UI-SERVER.md +254 -0
- package/docs/UNIVERSAL-INTEGRATION.md +432 -0
- package/docs/V2.2.0-OPTIONAL-SEARCH.md +666 -0
- package/docs/WINDOWS-INSTALL-README.txt +34 -0
- package/docs/WINDOWS-POST-INSTALL.txt +45 -0
- package/docs/example_graph_usage.py +148 -0
- package/hooks/memory-list-skill.js +130 -0
- package/hooks/memory-profile-skill.js +284 -0
- package/hooks/memory-recall-skill.js +109 -0
- package/hooks/memory-remember-skill.js +127 -0
- package/hooks/memory-reset-skill.js +274 -0
- package/install-skills.sh +436 -0
- package/install.ps1 +417 -0
- package/install.sh +755 -0
- package/mcp_server.py +585 -0
- package/package.json +94 -0
- package/requirements-core.txt +24 -0
- package/requirements.txt +10 -0
- package/scripts/postinstall.js +126 -0
- package/scripts/preuninstall.js +57 -0
- package/skills/slm-build-graph/SKILL.md +423 -0
- package/skills/slm-list-recent/SKILL.md +348 -0
- package/skills/slm-recall/SKILL.md +325 -0
- package/skills/slm-remember/SKILL.md +194 -0
- package/skills/slm-status/SKILL.md +363 -0
- package/skills/slm-switch-profile/SKILL.md +442 -0
- package/src/__pycache__/cache_manager.cpython-312.pyc +0 -0
- package/src/__pycache__/embedding_engine.cpython-312.pyc +0 -0
- package/src/__pycache__/graph_engine.cpython-312.pyc +0 -0
- package/src/__pycache__/hnsw_index.cpython-312.pyc +0 -0
- package/src/__pycache__/hybrid_search.cpython-312.pyc +0 -0
- package/src/__pycache__/memory-profiles.cpython-312.pyc +0 -0
- package/src/__pycache__/memory-reset.cpython-312.pyc +0 -0
- package/src/__pycache__/memory_compression.cpython-312.pyc +0 -0
- package/src/__pycache__/memory_store_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/migrate_v1_to_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/pattern_learner.cpython-312.pyc +0 -0
- package/src/__pycache__/query_optimizer.cpython-312.pyc +0 -0
- package/src/__pycache__/search_engine_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/setup_validator.cpython-312.pyc +0 -0
- package/src/__pycache__/tree_manager.cpython-312.pyc +0 -0
- package/src/cache_manager.py +520 -0
- package/src/embedding_engine.py +671 -0
- package/src/graph_engine.py +970 -0
- package/src/hnsw_index.py +626 -0
- package/src/hybrid_search.py +693 -0
- package/src/memory-profiles.py +518 -0
- package/src/memory-reset.py +485 -0
- package/src/memory_compression.py +999 -0
- package/src/memory_store_v2.py +1088 -0
- package/src/migrate_v1_to_v2.py +638 -0
- package/src/pattern_learner.py +898 -0
- package/src/query_optimizer.py +513 -0
- package/src/search_engine_v2.py +403 -0
- package/src/setup_validator.py +479 -0
- package/src/tree_manager.py +720 -0
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SuperLocalMemory V2 - BM25 Search Engine
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Varun Pratap Bhardwaj
|
|
6
|
+
Solution Architect & Original Creator
|
|
7
|
+
|
|
8
|
+
Licensed under MIT License (see LICENSE file)
|
|
9
|
+
Repository: https://github.com/varun369/SuperLocalMemoryV2
|
|
10
|
+
|
|
11
|
+
ATTRIBUTION REQUIRED: This notice must be preserved in all copies.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
BM25 Search Engine - Pure Python Implementation
|
|
16
|
+
|
|
17
|
+
Implements Okapi BM25 ranking function for relevance scoring without external dependencies.
|
|
18
|
+
BM25 (Best Match 25) is a probabilistic retrieval function that ranks documents based on
|
|
19
|
+
query term frequency with diminishing returns and document length normalization.
|
|
20
|
+
|
|
21
|
+
Algorithm: score(D,Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D| / avgdl))
|
|
22
|
+
|
|
23
|
+
Where:
|
|
24
|
+
- f(qi,D) = term frequency of query term qi in document D
|
|
25
|
+
- |D| = document length (number of tokens)
|
|
26
|
+
- avgdl = average document length in the collection
|
|
27
|
+
- k1 = term frequency saturation parameter (default: 1.5)
|
|
28
|
+
- b = document length normalization parameter (default: 0.75)
|
|
29
|
+
- IDF(qi) = log((N - df(qi) + 0.5) / (df(qi) + 0.5) + 1)
|
|
30
|
+
where N = total documents, df(qi) = document frequency of term qi
|
|
31
|
+
|
|
32
|
+
Performance Target: <30ms for 1K memories
|
|
33
|
+
|
|
34
|
+
Usage:
|
|
35
|
+
engine = BM25SearchEngine()
|
|
36
|
+
engine.index_documents(docs, doc_ids)
|
|
37
|
+
results = engine.search("query string", limit=10)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
import math
|
|
41
|
+
import re
|
|
42
|
+
from collections import defaultdict, Counter
|
|
43
|
+
from typing import List, Dict, Tuple, Any, Optional
|
|
44
|
+
import time
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class BM25SearchEngine:
    """
    Pure Python Okapi BM25 search engine with no external dependencies.

    BM25 is the industry standard for keyword-based retrieval and outperforms
    simple TF-IDF in most scenarios due to better term saturation handling.

    Ranking function:
        score(D,Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) /
                     (f(qi,D) + k1 × (1 - b + b × |D| / avgdl))

    Usage:
        engine = BM25SearchEngine()
        engine.index_documents(docs, doc_ids)
        results = engine.search("query string", limit=10)
    """

    # Compiled once at class level so repeated _tokenize() calls skip the
    # regex-cache lookup. Tokens are lowercase alphanumerics with '_' and
    # '-' preserved (snake_case / kebab-case identifiers stay whole).
    _TOKEN_RE = re.compile(r'\b[a-z0-9_-]+\b')

    # Minimal stopword set (most common English words that add no value).
    # Kept deliberately small for performance - full stopword lists slow
    # down tokenization. Built once here instead of per _tokenize() call.
    _STOPWORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
        'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
        'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do',
        'does', 'did', 'will', 'would', 'could', 'should', 'this',
        'that', 'these', 'those', 'it', 'its'
    })

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 search engine.

        Args:
            k1: Term frequency saturation parameter (1.2-2.0 typical).
                Higher values = more weight on term frequency.
                Default 1.5 is optimal for most use cases.
            b: Document length normalization (0.0-1.0).
                0 = no normalization, 1 = full normalization.
                Default 0.75 balances short vs long documents.
        """
        self.k1 = k1
        self.b = b

        # Index structures
        self.doc_ids: List[Any] = []        # Document IDs in index order
        self.doc_lengths: List[int] = []    # Token count per document
        self.avg_doc_length: float = 0.0
        self.num_docs: int = 0

        # Inverted index: term -> [(doc_idx, term_freq), ...]
        self.inverted_index: Dict[str, List[Tuple[int, int]]] = defaultdict(list)

        # Document frequency: term -> count of documents containing term
        self.doc_freq: Dict[str, int] = defaultdict(int)

        # Performance tracking (seconds; exposed as ms via get_stats())
        self.index_time: float = 0.0
        self.last_search_time: float = 0.0

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize text into normalized terms.

        Applies lowercase normalization, alphanumeric + underscore/hyphen
        extraction, and minimal stopword filtering. Single-character tokens
        are dropped as noise.

        Args:
            text: Input text to tokenize

        Returns:
            List of normalized tokens
        """
        tokens = self._TOKEN_RE.findall(text.lower())
        return [t for t in tokens if len(t) > 1 and t not in self._STOPWORDS]

    def index_documents(self, documents: List[str], doc_ids: List[Any]) -> None:
        """
        Build BM25 index from documents, replacing any existing index.

        Time complexity: O(n × m) where n = num_docs, m = avg_tokens_per_doc
        Space complexity: O(v × d) where v = vocabulary size, d = avg postings per term

        Args:
            documents: List of document texts to index
            doc_ids: List of document identifiers (must match documents length)

        Raises:
            ValueError: If documents and doc_ids length mismatch
        """
        if len(documents) != len(doc_ids):
            raise ValueError("documents and doc_ids must have same length")

        start_time = time.time()

        # Reset index
        self.doc_ids = doc_ids
        self.doc_lengths = []
        self.inverted_index = defaultdict(list)
        self.doc_freq = defaultdict(int)
        self.num_docs = len(documents)

        # Build inverted index
        for doc_idx, doc_text in enumerate(documents):
            tokens = self._tokenize(doc_text)
            self.doc_lengths.append(len(tokens))

            # Each distinct term in the document contributes one posting
            # and one document-frequency increment.
            for term, freq in Counter(tokens).items():
                self.inverted_index[term].append((doc_idx, freq))
                self.doc_freq[term] += 1

        # Average document length feeds the BM25 length-normalization term.
        if self.num_docs > 0:
            self.avg_doc_length = sum(self.doc_lengths) / self.num_docs
        else:
            self.avg_doc_length = 0.0

        self.index_time = time.time() - start_time

    def _calculate_idf(self, term: str) -> float:
        """
        Calculate Inverse Document Frequency (IDF) for a term.

        IDF formula: log((N - df + 0.5) / (df + 0.5) + 1)

        Intuition: rare terms (low df) score high, common terms (high df)
        score low, preventing over-weighting of ubiquitous words. The +1
        inside the log keeps the result non-negative even when a term
        appears in more than half the documents.

        Args:
            term: Query term

        Returns:
            IDF score (higher = more discriminative term)
        """
        df = self.doc_freq.get(term, 0)
        return math.log((self.num_docs - df + 0.5) / (df + 0.5) + 1.0)

    def _calculate_bm25_score(self, doc_idx: int, query_term_freqs: Dict[str, int]) -> float:
        """
        Calculate the BM25 score of one document against query terms.

        NOTE: retained for backward compatibility and spot-checking single
        documents. It linearly scans each term's posting list to find this
        document's frequency, so it is O(terms × postings) per call —
        search() uses a faster term-at-a-time accumulation instead and no
        longer calls this method.

        Args:
            doc_idx: Document index in corpus
            query_term_freqs: Query term frequencies

        Returns:
            BM25 relevance score
        """
        score = 0.0
        doc_len = self.doc_lengths[doc_idx]

        # Document length normalization factor:
        # short docs penalized less, long docs penalized more.
        norm_factor = 1 - self.b + self.b * (doc_len / self.avg_doc_length)

        for term in query_term_freqs:
            if term not in self.inverted_index:
                continue

            # Find this document's term frequency in the posting list.
            term_freq = 0
            for idx, freq in self.inverted_index[term]:
                if idx == doc_idx:
                    term_freq = freq
                    break

            if term_freq == 0:
                continue

            # BM25 term score with saturation: as term_freq grows the
            # contribution has diminishing returns.
            idf = self._calculate_idf(term)
            numerator = term_freq * (self.k1 + 1)
            denominator = term_freq + self.k1 * norm_factor
            score += idf * (numerator / denominator)

        return score

    def search(
        self,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.0
    ) -> List[Tuple[Any, float]]:
        """
        Search indexed documents using BM25 ranking.

        Uses term-at-a-time scoring: each query term's posting list is
        walked exactly once, accumulating per-document scores. This is
        O(q × p) where q = query terms, p = avg postings per term — the
        previous candidate-then-rescan approach re-walked posting lists
        per candidate document, which was effectively quadratic.
        Target: <30ms for 1K documents.

        Args:
            query: Search query string
            limit: Maximum number of results to return
            score_threshold: Minimum BM25 score threshold (default: 0.0)

        Returns:
            List of (doc_id, score) tuples, sorted by score descending
        """
        start_time = time.time()

        if self.num_docs == 0:
            self.last_search_time = time.time() - start_time
            return []

        # Tokenize and count query terms
        query_tokens = self._tokenize(query)
        if not query_tokens:
            self.last_search_time = time.time() - start_time
            return []

        query_term_freqs = Counter(query_tokens)

        # Accumulate BM25 contributions term-at-a-time. Hoist constants
        # out of the inner loop; avg_doc_length > 0 is guaranteed here
        # because a matching posting implies at least one non-empty doc.
        k1 = self.k1
        b = self.b
        k1_plus_1 = k1 + 1
        avgdl = self.avg_doc_length
        accumulated: Dict[int, float] = defaultdict(float)

        for term in query_term_freqs:
            postings = self.inverted_index.get(term)
            if not postings:
                continue
            idf = self._calculate_idf(term)
            for doc_idx, term_freq in postings:
                norm_factor = 1 - b + b * (self.doc_lengths[doc_idx] / avgdl)
                accumulated[doc_idx] += idf * (term_freq * k1_plus_1) / (term_freq + k1 * norm_factor)

        # Apply threshold, sort by score descending, truncate to limit.
        scores = [
            (self.doc_ids[doc_idx], score)
            for doc_idx, score in accumulated.items()
            if score >= score_threshold
        ]
        scores.sort(key=lambda x: x[1], reverse=True)
        results = scores[:limit]

        self.last_search_time = time.time() - start_time

        return results

    def search_with_details(
        self,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.0
    ) -> Dict[str, Any]:
        """
        Search with detailed performance metrics and match information.

        Useful for debugging and performance analysis.

        Args:
            query: Search query string
            limit: Maximum number of results to return
            score_threshold: Minimum score threshold

        Returns:
            Dictionary with results and metadata
        """
        query_tokens = self._tokenize(query)
        results = self.search(query, limit, score_threshold)

        return {
            'results': results,
            'query_terms': query_tokens,
            'num_results': len(results),
            'search_time_ms': self.last_search_time * 1000,
            'index_size': self.num_docs,
            'avg_doc_length': self.avg_doc_length
        }

    def get_stats(self) -> Dict[str, Any]:
        """
        Get search engine statistics.

        Returns:
            Dictionary with index statistics
        """
        return {
            'num_documents': self.num_docs,
            'vocabulary_size': len(self.inverted_index),
            'avg_doc_length': self.avg_doc_length,
            'total_tokens': sum(self.doc_lengths),
            'index_time_ms': self.index_time * 1000,
            'last_search_time_ms': self.last_search_time * 1000,
            'k1': self.k1,
            'b': self.b
        }
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
# CLI interface for testing
if __name__ == "__main__":
    # Demo: index a handful of sample documents, run a few queries, and
    # report timing. (Removed an unused `import sys` from the original.)
    print("BM25 Search Engine - Demo")
    print("=" * 60)

    # Sample documents
    documents = [
        "Python is a high-level programming language with dynamic typing",
        "JavaScript is widely used for web development and frontend applications",
        "Machine learning uses Python libraries like scikit-learn and TensorFlow",
        "React is a JavaScript framework for building user interfaces",
        "Django is a Python web framework that follows MVC architecture",
        "Neural networks are a key component of deep learning systems",
    ]

    doc_ids = [f"doc_{i}" for i in range(len(documents))]

    # Index documents
    engine = BM25SearchEngine()
    print(f"\nIndexing {len(documents)} documents...")
    engine.index_documents(documents, doc_ids)

    stats = engine.get_stats()
    print(f"✓ Indexed in {stats['index_time_ms']:.2f}ms")
    print(f" Vocabulary: {stats['vocabulary_size']} unique terms")
    print(f" Avg doc length: {stats['avg_doc_length']:.1f} tokens")

    # Test queries
    test_queries = [
        "Python programming",
        "web development",
        "machine learning",
        "JavaScript framework"
    ]

    print("\n" + "=" * 60)
    print("Search Results:")
    print("=" * 60)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        results = engine.search_with_details(query, limit=3)

        print(f" Found: {results['num_results']} results in {results['search_time_ms']:.2f}ms")
        print(f" Query terms: {results['query_terms']}")

        for doc_id, score in results['results']:
            doc_idx = doc_ids.index(doc_id)
            print(f" [{score:.3f}] {doc_id}: {documents[doc_idx][:60]}...")

    # BUGFIX: re-fetch stats after running the queries. The snapshot taken
    # right after indexing predates every search, so last_search_time_ms
    # would always print as 0.00. Also relabel: the value is the *last*
    # query's time, not an average.
    stats = engine.get_stats()
    print("\n" + "=" * 60)
    print("Performance Summary:")
    print(f" Last search time: {stats['last_search_time_ms']:.2f}ms")
    print(f" Target: <30ms for 1K documents ✓")
|