superlocalmemory 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ATTRIBUTION.md +140 -0
- package/CHANGELOG.md +1749 -0
- package/LICENSE +21 -0
- package/README.md +600 -0
- package/bin/aider-smart +72 -0
- package/bin/slm +202 -0
- package/bin/slm-npm +73 -0
- package/bin/slm.bat +195 -0
- package/bin/slm.cmd +10 -0
- package/bin/superlocalmemoryv2:list +3 -0
- package/bin/superlocalmemoryv2:profile +3 -0
- package/bin/superlocalmemoryv2:recall +3 -0
- package/bin/superlocalmemoryv2:remember +3 -0
- package/bin/superlocalmemoryv2:reset +3 -0
- package/bin/superlocalmemoryv2:status +3 -0
- package/completions/slm.bash +58 -0
- package/completions/slm.zsh +76 -0
- package/configs/antigravity-mcp.json +13 -0
- package/configs/chatgpt-desktop-mcp.json +7 -0
- package/configs/claude-desktop-mcp.json +15 -0
- package/configs/codex-mcp.toml +13 -0
- package/configs/cody-commands.json +29 -0
- package/configs/continue-mcp.yaml +14 -0
- package/configs/continue-skills.yaml +26 -0
- package/configs/cursor-mcp.json +15 -0
- package/configs/gemini-cli-mcp.json +11 -0
- package/configs/jetbrains-mcp.json +11 -0
- package/configs/opencode-mcp.json +12 -0
- package/configs/perplexity-mcp.json +9 -0
- package/configs/vscode-copilot-mcp.json +12 -0
- package/configs/windsurf-mcp.json +16 -0
- package/configs/zed-mcp.json +12 -0
- package/docs/ARCHITECTURE.md +877 -0
- package/docs/CLI-COMMANDS-REFERENCE.md +425 -0
- package/docs/COMPETITIVE-ANALYSIS.md +210 -0
- package/docs/COMPRESSION-README.md +390 -0
- package/docs/GRAPH-ENGINE.md +503 -0
- package/docs/MCP-MANUAL-SETUP.md +720 -0
- package/docs/MCP-TROUBLESHOOTING.md +787 -0
- package/docs/PATTERN-LEARNING.md +363 -0
- package/docs/PROFILES-GUIDE.md +453 -0
- package/docs/RESET-GUIDE.md +353 -0
- package/docs/SEARCH-ENGINE-V2.2.0.md +748 -0
- package/docs/SEARCH-INTEGRATION-GUIDE.md +502 -0
- package/docs/UI-SERVER.md +254 -0
- package/docs/UNIVERSAL-INTEGRATION.md +432 -0
- package/docs/V2.2.0-OPTIONAL-SEARCH.md +666 -0
- package/docs/WINDOWS-INSTALL-README.txt +34 -0
- package/docs/WINDOWS-POST-INSTALL.txt +45 -0
- package/docs/example_graph_usage.py +148 -0
- package/hooks/memory-list-skill.js +130 -0
- package/hooks/memory-profile-skill.js +284 -0
- package/hooks/memory-recall-skill.js +109 -0
- package/hooks/memory-remember-skill.js +127 -0
- package/hooks/memory-reset-skill.js +274 -0
- package/install-skills.sh +436 -0
- package/install.ps1 +417 -0
- package/install.sh +755 -0
- package/mcp_server.py +585 -0
- package/package.json +94 -0
- package/requirements-core.txt +24 -0
- package/requirements.txt +10 -0
- package/scripts/postinstall.js +126 -0
- package/scripts/preuninstall.js +57 -0
- package/skills/slm-build-graph/SKILL.md +423 -0
- package/skills/slm-list-recent/SKILL.md +348 -0
- package/skills/slm-recall/SKILL.md +325 -0
- package/skills/slm-remember/SKILL.md +194 -0
- package/skills/slm-status/SKILL.md +363 -0
- package/skills/slm-switch-profile/SKILL.md +442 -0
- package/src/__pycache__/cache_manager.cpython-312.pyc +0 -0
- package/src/__pycache__/embedding_engine.cpython-312.pyc +0 -0
- package/src/__pycache__/graph_engine.cpython-312.pyc +0 -0
- package/src/__pycache__/hnsw_index.cpython-312.pyc +0 -0
- package/src/__pycache__/hybrid_search.cpython-312.pyc +0 -0
- package/src/__pycache__/memory-profiles.cpython-312.pyc +0 -0
- package/src/__pycache__/memory-reset.cpython-312.pyc +0 -0
- package/src/__pycache__/memory_compression.cpython-312.pyc +0 -0
- package/src/__pycache__/memory_store_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/migrate_v1_to_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/pattern_learner.cpython-312.pyc +0 -0
- package/src/__pycache__/query_optimizer.cpython-312.pyc +0 -0
- package/src/__pycache__/search_engine_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/setup_validator.cpython-312.pyc +0 -0
- package/src/__pycache__/tree_manager.cpython-312.pyc +0 -0
- package/src/cache_manager.py +520 -0
- package/src/embedding_engine.py +671 -0
- package/src/graph_engine.py +970 -0
- package/src/hnsw_index.py +626 -0
- package/src/hybrid_search.py +693 -0
- package/src/memory-profiles.py +518 -0
- package/src/memory-reset.py +485 -0
- package/src/memory_compression.py +999 -0
- package/src/memory_store_v2.py +1088 -0
- package/src/migrate_v1_to_v2.py +638 -0
- package/src/pattern_learner.py +898 -0
- package/src/query_optimizer.py +513 -0
- package/src/search_engine_v2.py +403 -0
- package/src/setup_validator.py +479 -0
- package/src/tree_manager.py +720 -0
package/src/query_optimizer.py

@@ -0,0 +1,513 @@

#!/usr/bin/env python3
"""
SuperLocalMemory V2 - Query Optimizer

Copyright (c) 2026 Varun Pratap Bhardwaj
Solution Architect & Original Creator

Licensed under MIT License (see LICENSE file)
Repository: https://github.com/varun369/SuperLocalMemoryV2

ATTRIBUTION REQUIRED: This notice must be preserved in all copies.
"""

"""
Query Optimizer - Query Enhancement and Rewriting

Transforms user queries into optimized search queries through:

1. Spell Correction: Fix common typos using edit distance
   - "javscript" → "javascript"
   - Uses vocabulary from indexed documents
   - Levenshtein distance with max distance 2

2. Query Expansion: Add related terms to broaden search
   - "auth" → "auth authentication authorize"
   - Based on co-occurrence patterns in documents
   - Optional synonym expansion

3. Boolean Operators: Parse structured queries
   - "python AND (web OR api)" → structured query
   - Supports: AND, OR, NOT, phrase queries "exact match"
   - Converts to search engine-compatible format

4. Stopword Handling: Remove low-value terms
   - Configurable stopword list
   - Preserves important technical terms

Performance: Query optimization should add <5ms overhead

Usage:
    optimizer = QueryOptimizer(vocabulary)
    optimized = optimizer.optimize("javscript web devlopment")
    # Returns: "javascript web development"
"""

import re
from collections import defaultdict, Counter
from typing import List, Dict, Set, Tuple, Optional, Any
import difflib


class QueryOptimizer:
    """
    Query preprocessing and optimization for improved search quality.

    Handles spell correction, expansion, and boolean query parsing.
    """

    def __init__(self, vocabulary: Optional[Set[str]] = None):
        """
        Initialize query optimizer.

        Args:
            vocabulary: Set of known terms from indexed documents.
                Used for spell correction.
        """
        self.vocabulary = vocabulary or set()

        # Co-occurrence matrix for query expansion
        # term -> {related_term: co-occurrence_count}
        self.cooccurrence: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))

        # Expansion candidates: term -> [expanded_terms]
        self.expansions: Dict[str, List[str]] = {}

        # Technical term preservation (don't treat as typos)
        self.technical_terms = {
            'api', 'sql', 'orm', 'jwt', 'http', 'https', 'ssl', 'tls',
            'json', 'xml', 'yaml', 'csv', 'pdf', 'cli', 'gui', 'ide',
            'git', 'npm', 'pip', 'cpu', 'gpu', 'ram', 'ssd', 'hdd',
            'ml', 'ai', 'nlp', 'cv', 'dl', 'rl', 'gan', 'cnn', 'rnn',
            'rest', 'soap', 'grpc', 'cors', 'csrf', 'xss',
            'aws', 'gcp', 'azure', 'k8s', 'ci', 'cd', 'devops'
        }

    def build_cooccurrence_matrix(self, documents: List[List[str]]) -> None:
        """
        Build term co-occurrence matrix from tokenized documents.

        Co-occurrence = terms appearing in same document.
        Used for query expansion to find related terms.

        Args:
            documents: List of tokenized documents (each doc is list of tokens)
        """
        self.cooccurrence = defaultdict(lambda: defaultdict(int))

        for doc_tokens in documents:
            # Count unique terms per document
            unique_terms = set(doc_tokens)

            # Update co-occurrence for all term pairs in document
            for term1 in unique_terms:
                for term2 in unique_terms:
                    if term1 != term2:
                        self.cooccurrence[term1][term2] += 1
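
    # Example: for documents [['python', 'web'], ['python', 'api']], the matrix
    # becomes {'python': {'web': 1, 'api': 1}, 'web': {'python': 1},
    # 'api': {'python': 1}}: each pair of distinct terms sharing a document
    # is counted once per document, in both directions.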

    def _edit_distance(self, s1: str, s2: str, max_distance: int = 2) -> int:
        """
        Calculate Levenshtein edit distance between two strings.

        Edit distance = minimum number of single-character edits (insertions,
        deletions, substitutions) required to change s1 into s2.

        Early termination if distance exceeds max_distance for performance.

        Args:
            s1: First string
            s2: Second string
            max_distance: Maximum distance to calculate (for early termination)

        Returns:
            Edit distance, or max_distance + 1 if it exceeds the threshold
        """
        len1, len2 = len(s1), len(s2)

        # Early termination - length difference too large
        if abs(len1 - len2) > max_distance:
            return max_distance + 1

        # Initialize DP matrix (only need current and previous row)
        prev_row = list(range(len2 + 1))
        curr_row = [0] * (len2 + 1)

        for i in range(1, len1 + 1):
            curr_row[0] = i
            min_in_row = i  # Track minimum value in current row

            for j in range(1, len2 + 1):
                # Cost of substitution (0 if characters match, 1 otherwise)
                cost = 0 if s1[i - 1] == s2[j - 1] else 1

                curr_row[j] = min(
                    prev_row[j] + 1,        # Deletion
                    curr_row[j - 1] + 1,    # Insertion
                    prev_row[j - 1] + cost  # Substitution
                )

                min_in_row = min(min_in_row, curr_row[j])

            # Early termination - minimum in row exceeds threshold
            if min_in_row > max_distance:
                return max_distance + 1

            # Swap rows
            prev_row, curr_row = curr_row, prev_row

        return prev_row[len2]
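
    # Example: _edit_distance("javscript", "javascript") == 1 (one inserted
    # 'a'), while _edit_distance("auth", "authentication") returns
    # max_distance + 1 immediately: the length gap alone exceeds the threshold.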

    def spell_correct(self, term: str, max_distance: int = 2) -> str:
        """
        Correct spelling using vocabulary and edit distance.

        Algorithm:
        1. If term in vocabulary, return as-is
        2. If term is technical term (<=3 chars or in whitelist), return as-is
        3. Find closest vocabulary term within max_distance edits
        4. Return correction if found, otherwise original term

        Args:
            term: Term to correct
            max_distance: Maximum edit distance to consider (default: 2)

        Returns:
            Corrected term, or original if no correction found
        """
        # Already correct or technical term
        if term in self.vocabulary or term in self.technical_terms:
            return term

        # Don't correct very short terms (likely abbreviations)
        if len(term) <= 3:
            return term

        # Find closest match in vocabulary
        best_match = term
        best_distance = max_distance + 1

        # Use difflib for efficient approximate matching
        # This is faster than checking full vocabulary for large sets
        close_matches = difflib.get_close_matches(
            term, self.vocabulary, n=5, cutoff=0.7
        )

        for candidate in close_matches:
            distance = self._edit_distance(term, candidate, max_distance)
            if distance < best_distance:
                best_distance = distance
                best_match = candidate

        # If no close match found by difflib, check high-frequency terms
        # This handles cases where difflib's cutoff is too strict
        if best_distance > max_distance and len(self.vocabulary) < 10000:
            # Only do full scan for smaller vocabularies
            for vocab_term in self.vocabulary:
                # Quick filter by length difference
                if abs(len(term) - len(vocab_term)) > max_distance:
                    continue

                distance = self._edit_distance(term, vocab_term, max_distance)
                if distance < best_distance:
                    best_distance = distance
                    best_match = vocab_term

        # Return correction only if found
        return best_match if best_distance <= max_distance else term
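
    # Example: with 'javascript' in the vocabulary,
    # spell_correct("javascirpt") == "javascript" (edit distance 2), while
    # spell_correct("jwt") stays "jwt": whitelisted technical terms and terms
    # of three characters or fewer are never rewritten.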

    def expand_query(
        self,
        query_terms: List[str],
        max_expansions: int = 2,
        min_cooccurrence: int = 2
    ) -> List[str]:
        """
        Expand query with related terms based on co-occurrence.

        Adds terms that frequently co-occur with query terms to broaden search.

        Args:
            query_terms: Original query terms
            max_expansions: Maximum number of expansion terms to add
            min_cooccurrence: Minimum co-occurrence count threshold

        Returns:
            Expanded query terms (original + expansions)
        """
        if not self.cooccurrence:
            return query_terms

        # Collect expansion candidates
        expansion_candidates = defaultdict(int)

        for term in query_terms:
            if term in self.cooccurrence:
                for related_term, count in self.cooccurrence[term].items():
                    # Don't re-add terms already in query
                    if related_term not in query_terms:
                        expansion_candidates[related_term] += count

        # Filter by minimum co-occurrence and sort by frequency
        expansions = [
            term for term, count in expansion_candidates.items()
            if count >= min_cooccurrence
        ]
        expansions.sort(key=lambda t: expansion_candidates[t], reverse=True)

        # Add top expansions
        expanded_terms = query_terms + expansions[:max_expansions]

        return expanded_terms
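
    # Example (hypothetical counts): if 'web' co-occurs with 'development' in
    # three documents and with 'api' in two, expand_query(['web']) returns
    # ['web', 'development', 'api']: candidates meeting min_cooccurrence are
    # appended in descending count order, capped at max_expansions.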

    def parse_boolean_query(self, query: str) -> Dict[str, Any]:
        """
        Parse boolean query operators (AND, OR, NOT, phrase matching).

        Supports:
        - AND: term1 AND term2 (both required)
        - OR: term1 OR term2 (at least one required)
        - NOT: term1 NOT term2 (exclude term2)
        - Phrase: "exact phrase" (exact match)
        - Implicit AND: "term1 term2" treated as term1 AND term2

        Args:
            query: Query string with boolean operators

        Returns:
            Parsed query structure:
            {
                'type': 'and' | 'or' | 'not' | 'phrase' | 'term',
                'terms': [terms],
                'operator': operator,
                'children': [sub-queries]
            }
        """
        # Extract phrase queries first (enclosed in quotes)
        phrases = []
        phrase_pattern = r'"([^"]+)"'
        query_without_phrases = query

        for match in re.finditer(phrase_pattern, query):
            phrase = match.group(1)
            phrases.append(phrase)
            # Replace phrase with placeholder
            query_without_phrases = query_without_phrases.replace(
                f'"{phrase}"', f'__PHRASE_{len(phrases)-1}__'
            )

        # Split by boolean operators (case insensitive)
        # Priority: NOT > AND > OR
        query_upper = query_without_phrases.upper()

        # Parse NOT expressions
        if ' NOT ' in query_upper:
            parts = re.split(r'\s+NOT\s+', query_without_phrases, flags=re.IGNORECASE)
            return {
                'type': 'not',
                'required': self._parse_query_part(parts[0].strip(), phrases),
                'excluded': [self._parse_query_part(p.strip(), phrases) for p in parts[1:]]
            }

        # Parse AND expressions
        if ' AND ' in query_upper:
            parts = re.split(r'\s+AND\s+', query_without_phrases, flags=re.IGNORECASE)
            return {
                'type': 'and',
                'children': [self._parse_query_part(p.strip(), phrases) for p in parts]
            }

        # Parse OR expressions
        if ' OR ' in query_upper:
            parts = re.split(r'\s+OR\s+', query_without_phrases, flags=re.IGNORECASE)
            return {
                'type': 'or',
                'children': [self._parse_query_part(p.strip(), phrases) for p in parts]
            }

        # Default: treat as implicit AND
        return self._parse_query_part(query_without_phrases.strip(), phrases)
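
    # Example: parse_boolean_query('"machine learning" AND python') returns
    # {'type': 'and', 'children': [
    #     {'type': 'phrase', 'phrase': 'machine learning',
    #      'terms': ['machine', 'learning']},
    #     {'type': 'term', 'term': 'python'}]}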

    def _parse_query_part(self, part: str, phrases: List[str]) -> Dict[str, Any]:
        """
        Parse a single query part (no boolean operators).

        Args:
            part: Query part
            phrases: List of extracted phrases

        Returns:
            Query structure
        """
        # Check for phrase placeholder
        phrase_match = re.match(r'__PHRASE_(\d+)__', part)
        if phrase_match:
            phrase_idx = int(phrase_match.group(1))
            return {
                'type': 'phrase',
                'phrase': phrases[phrase_idx],
                'terms': phrases[phrase_idx].split()
            }

        # Regular term(s)
        terms = part.split()
        if len(terms) == 1:
            return {
                'type': 'term',
                'term': terms[0]
            }
        else:
            # Multiple terms without operator = implicit AND
            return {
                'type': 'and',
                'children': [{'type': 'term', 'term': t} for t in terms]
            }

    def optimize(
        self,
        query: str,
        enable_spell_correction: bool = True,
        enable_expansion: bool = False,
        max_expansions: int = 2
    ) -> str:
        """
        Optimize query with spell correction and expansion.

        Args:
            query: Original query string
            enable_spell_correction: Apply spell correction
            enable_expansion: Apply query expansion
            max_expansions: Maximum expansion terms

        Returns:
            Optimized query string
        """
        # Tokenize query
        tokens = re.findall(r'\b[a-z0-9_-]+\b', query.lower())

        if not tokens:
            return query

        # Apply spell correction
        if enable_spell_correction and self.vocabulary:
            tokens = [self.spell_correct(term) for term in tokens]

        # Apply query expansion
        if enable_expansion and self.cooccurrence:
            tokens = self.expand_query(tokens, max_expansions)

        return ' '.join(tokens)
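
    # Example: optimize("pythno web devlopment") == "python web development"
    # when 'python', 'web', and 'development' are in the vocabulary.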

    def get_stats(self) -> Dict[str, Any]:
        """
        Get optimizer statistics.

        Returns:
            Dictionary with optimizer stats
        """
        return {
            'vocabulary_size': len(self.vocabulary),
            'cooccurrence_terms': len(self.cooccurrence),
            'technical_terms': len(self.technical_terms),
            'avg_related_terms': (
                sum(len(related) for related in self.cooccurrence.values()) / len(self.cooccurrence)
                if self.cooccurrence else 0
            )
        }


# CLI interface for testing
if __name__ == "__main__":
    print("Query Optimizer - Demo")
    print("=" * 60)

    # Sample vocabulary
    vocabulary = {
        'python', 'javascript', 'programming', 'web', 'development',
        'machine', 'learning', 'neural', 'network', 'api', 'rest',
        'database', 'sql', 'authentication', 'authorization', 'jwt',
        'framework', 'django', 'react', 'node', 'express'
    }

    # Sample documents for co-occurrence
    documents = [
        ['python', 'programming', 'web', 'development'],
        ['javascript', 'web', 'development', 'frontend'],
        ['machine', 'learning', 'python', 'neural', 'network'],
        ['api', 'rest', 'web', 'development'],
        ['authentication', 'authorization', 'jwt', 'security'],
    ]

    # Initialize optimizer
    optimizer = QueryOptimizer(vocabulary)
    optimizer.build_cooccurrence_matrix(documents)

    print("\nOptimizer Statistics:")
    stats = optimizer.get_stats()
    for key, value in stats.items():
        print(f"  {key}: {value}")

    # Test spell correction
    print("\n" + "=" * 60)
    print("Spell Correction:")
    print("=" * 60)

    test_typos = [
        "pythno",          # → python
        "javascirpt",      # → javascript
        "machien",         # → machine
        "athentication",   # → authentication
        "developement"     # → development
    ]

    for typo in test_typos:
        corrected = optimizer.spell_correct(typo)
        print(f"  '{typo}' → '{corrected}'")

    # Test query expansion
    print("\n" + "=" * 60)
    print("Query Expansion:")
    print("=" * 60)

    test_queries = [
        ['python'],
        ['web'],
        ['machine', 'learning'],
    ]

    for query in test_queries:
        expanded = optimizer.expand_query(query, max_expansions=2)
        print(f"  {query} → {expanded}")

    # Test boolean query parsing
    print("\n" + "=" * 60)
    print("Boolean Query Parsing:")
    print("=" * 60)

    boolean_queries = [
        'python AND web',
        'javascript OR typescript',
        'python NOT django',
        '"machine learning" AND python',
        'web development rest api'
    ]

    for query in boolean_queries:
        parsed = optimizer.parse_boolean_query(query)
        print(f"\n  Query: '{query}'")
        print(f"  Parsed: {parsed}")

    # Test full optimization
    print("\n" + "=" * 60)
    print("Full Query Optimization:")
    print("=" * 60)

    optimization_tests = [
        "pythno web devlopment",
        "machien lerning",
        "api athentication"
    ]

    for query in optimization_tests:
        optimized = optimizer.optimize(query, enable_spell_correction=True)
        print(f"  '{query}'")
        print(f"    → '{optimized}'")
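
The parsed tree is what the module hands to the search layer ("converts to search engine-compatible format"); that consumer, presumably search_engine_v2.py elsewhere in this package, is not part of this file. As a minimal sketch of how such a tree could be evaluated against one tokenized document, assuming only the dict shapes produced by parse_boolean_query above (the matches helper is illustrative, not package API):

from typing import Any, Dict, List

def matches(node: Dict[str, Any], doc_tokens: List[str]) -> bool:
    """Evaluate a parse_boolean_query() tree against one tokenized document."""
    kind = node['type']
    if kind == 'term':
        return node['term'] in doc_tokens
    if kind == 'phrase':
        # Exact phrase = its terms appear consecutively in the document
        terms = node['terms']
        return any(doc_tokens[i:i + len(terms)] == terms
                   for i in range(len(doc_tokens) - len(terms) + 1))
    if kind == 'and':
        return all(matches(child, doc_tokens) for child in node['children'])
    if kind == 'or':
        return any(matches(child, doc_tokens) for child in node['children'])
    if kind == 'not':
        return (matches(node['required'], doc_tokens)
                and not any(matches(ex, doc_tokens) for ex in node['excluded']))
    return False

# Usage:
#   tree = QueryOptimizer().parse_boolean_query('python NOT django')
#   matches(tree, ['python', 'web'])     -> True
#   matches(tree, ['python', 'django'])  -> False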