sirchmunk 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/api/__init__.py +1 -0
- sirchmunk/api/chat.py +1123 -0
- sirchmunk/api/components/__init__.py +0 -0
- sirchmunk/api/components/history_storage.py +402 -0
- sirchmunk/api/components/monitor_tracker.py +518 -0
- sirchmunk/api/components/settings_storage.py +353 -0
- sirchmunk/api/history.py +254 -0
- sirchmunk/api/knowledge.py +411 -0
- sirchmunk/api/main.py +120 -0
- sirchmunk/api/monitor.py +219 -0
- sirchmunk/api/run_server.py +54 -0
- sirchmunk/api/search.py +230 -0
- sirchmunk/api/settings.py +309 -0
- sirchmunk/api/tools.py +315 -0
- sirchmunk/cli/__init__.py +11 -0
- sirchmunk/cli/cli.py +789 -0
- sirchmunk/learnings/knowledge_base.py +5 -2
- sirchmunk/llm/prompts.py +12 -1
- sirchmunk/retrieve/text_retriever.py +186 -2
- sirchmunk/scan/file_scanner.py +2 -2
- sirchmunk/schema/knowledge.py +119 -35
- sirchmunk/search.py +384 -26
- sirchmunk/storage/__init__.py +2 -2
- sirchmunk/storage/{knowledge_manager.py → knowledge_storage.py} +265 -60
- sirchmunk/utils/constants.py +7 -5
- sirchmunk/utils/embedding_util.py +217 -0
- sirchmunk/utils/tokenizer_util.py +36 -1
- sirchmunk/version.py +1 -1
- {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/METADATA +196 -14
- sirchmunk-0.0.2.dist-info/RECORD +69 -0
- {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.2.dist-info/top_level.txt +2 -0
- sirchmunk_mcp/__init__.py +25 -0
- sirchmunk_mcp/cli.py +478 -0
- sirchmunk_mcp/config.py +276 -0
- sirchmunk_mcp/server.py +355 -0
- sirchmunk_mcp/service.py +327 -0
- sirchmunk_mcp/setup.py +15 -0
- sirchmunk_mcp/tools.py +410 -0
- sirchmunk-0.0.1.dist-info/RECORD +0 -45
- sirchmunk-0.0.1.dist-info/top_level.txt +0 -1
- {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/licenses/LICENSE +0 -0
sirchmunk/search.py
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
2
|
import ast
|
|
3
3
|
import json
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from datetime import datetime
|
|
4
7
|
from pathlib import Path
|
|
5
8
|
from typing import Any, Dict, List, Literal, Optional, Union
|
|
6
9
|
|
|
7
10
|
from sirchmunk.base import BaseSearch
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
8
13
|
from sirchmunk.learnings.knowledge_base import KnowledgeBase
|
|
9
14
|
from sirchmunk.llm.openai_chat import OpenAIChat
|
|
10
15
|
from sirchmunk.llm.prompts import (
|
|
@@ -14,8 +19,8 @@ from sirchmunk.llm.prompts import (
|
|
|
14
19
|
from sirchmunk.retrieve.text_retriever import GrepRetriever
|
|
15
20
|
from sirchmunk.schema.knowledge import KnowledgeCluster
|
|
16
21
|
from sirchmunk.schema.request import ContentItem, ImageURL, Message, Request
|
|
17
|
-
from sirchmunk.storage.
|
|
18
|
-
from sirchmunk.utils.constants import LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME,
|
|
22
|
+
from sirchmunk.storage.knowledge_storage import KnowledgeStorage
|
|
23
|
+
from sirchmunk.utils.constants import LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME, SIRCHMUNK_WORK_PATH
|
|
19
24
|
from sirchmunk.utils.deps import check_dependencies
|
|
20
25
|
from sirchmunk.utils.file_utils import get_fast_hash
|
|
21
26
|
from sirchmunk.utils import create_logger, LogCallback
|
|
@@ -35,12 +40,14 @@ class AgenticSearch(BaseSearch):
|
|
|
35
40
|
work_path: Optional[Union[str, Path]] = None,
|
|
36
41
|
verbose: bool = False,
|
|
37
42
|
log_callback: LogCallback = None,
|
|
43
|
+
reuse_knowledge: bool = True,
|
|
38
44
|
**kwargs,
|
|
39
45
|
):
|
|
40
46
|
super().__init__(**kwargs)
|
|
41
47
|
|
|
42
|
-
work_path = work_path or
|
|
43
|
-
|
|
48
|
+
work_path = work_path or SIRCHMUNK_WORK_PATH
|
|
49
|
+
# Ensure path is expanded (handle ~ and environment variables)
|
|
50
|
+
self.work_path: Path = Path(work_path).expanduser().resolve()
|
|
44
51
|
|
|
45
52
|
self.llm: OpenAIChat = llm or OpenAIChat(
|
|
46
53
|
base_url=LLM_BASE_URL,
|
|
@@ -62,7 +69,7 @@ class AgenticSearch(BaseSearch):
|
|
|
62
69
|
)
|
|
63
70
|
|
|
64
71
|
# Initialize KnowledgeManager for persistent storage
|
|
65
|
-
self.knowledge_manager =
|
|
72
|
+
self.knowledge_manager = KnowledgeStorage(work_path=str(self.work_path))
|
|
66
73
|
|
|
67
74
|
# Load historical knowledge clusters from cache
|
|
68
75
|
self._load_historical_knowledge()
|
|
@@ -71,6 +78,30 @@ class AgenticSearch(BaseSearch):
|
|
|
71
78
|
|
|
72
79
|
self.llm_usages: List[Dict[str, Any]] = []
|
|
73
80
|
|
|
81
|
+
# Maximum number of queries to keep per cluster (FIFO strategy)
|
|
82
|
+
self.max_queries_per_cluster: int = 5
|
|
83
|
+
|
|
84
|
+
# Initialize embedding client for cluster reuse
|
|
85
|
+
self.embedding_client = None
|
|
86
|
+
# Similarity threshold for cluster reuse
|
|
87
|
+
self.cluster_sim_threshold: float = kwargs.pop('cluster_sim_threshold', 0.85)
|
|
88
|
+
self.cluster_sim_top_k: int = kwargs.pop('cluster_sim_top_k', 3)
|
|
89
|
+
if reuse_knowledge:
|
|
90
|
+
try:
|
|
91
|
+
from sirchmunk.utils.embedding_util import EmbeddingUtil
|
|
92
|
+
|
|
93
|
+
self.embedding_client = EmbeddingUtil(
|
|
94
|
+
cache_dir=str(self.work_path / ".cache" / "models")
|
|
95
|
+
)
|
|
96
|
+
logger.debug(
|
|
97
|
+
f"Embedding client initialized: {self.embedding_client.get_model_info()}"
|
|
98
|
+
)
|
|
99
|
+
except Exception as e:
|
|
100
|
+
logger.warning(
|
|
101
|
+
f"Failed to initialize embedding client: {e}. Cluster reuse disabled."
|
|
102
|
+
)
|
|
103
|
+
self.embedding_client = None
|
|
104
|
+
|
|
74
105
|
if not check_dependencies():
|
|
75
106
|
print("Installing rga (ripgrep-all) and rg (ripgrep)...", flush=True)
|
|
76
107
|
install_rga()
|
|
@@ -84,6 +115,289 @@ class AgenticSearch(BaseSearch):
|
|
|
84
115
|
print(f"Loaded {cluster_count} historical knowledge clusters from cache")
|
|
85
116
|
except Exception as e:
|
|
86
117
|
print(f"[WARNING] Failed to load historical knowledge: {e}")
|
|
118
|
+
|
|
119
|
+
async def _try_reuse_cluster(
|
|
120
|
+
self,
|
|
121
|
+
query: str,
|
|
122
|
+
return_cluster: bool = False
|
|
123
|
+
) -> Optional[Union[str, KnowledgeCluster]]:
|
|
124
|
+
"""
|
|
125
|
+
Try to reuse existing knowledge cluster based on semantic similarity.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
query: Search query string
|
|
129
|
+
return_cluster: Whether to return the full cluster object or just content string
|
|
130
|
+
|
|
131
|
+
Returns:
|
|
132
|
+
Cluster content string or KnowledgeCluster object if found, None otherwise
|
|
133
|
+
"""
|
|
134
|
+
if not self.embedding_client:
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
try:
|
|
138
|
+
await self._logger.info("Searching for similar knowledge clusters...")
|
|
139
|
+
|
|
140
|
+
# Compute query embedding
|
|
141
|
+
query_embedding = (await self.embedding_client.embed([query]))[0]
|
|
142
|
+
|
|
143
|
+
# Search for similar clusters
|
|
144
|
+
similar_clusters = await self.knowledge_manager.search_similar_clusters(
|
|
145
|
+
query_embedding=query_embedding,
|
|
146
|
+
top_k=self.cluster_sim_top_k,
|
|
147
|
+
similarity_threshold=self.cluster_sim_threshold,
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
if not similar_clusters:
|
|
151
|
+
await self._logger.info("No similar clusters found, performing new search...")
|
|
152
|
+
return None
|
|
153
|
+
|
|
154
|
+
# Found similar cluster - process reuse
|
|
155
|
+
best_match = similar_clusters[0]
|
|
156
|
+
await self._logger.success(
|
|
157
|
+
f"♻️ Found similar cluster: {best_match['name']} "
|
|
158
|
+
f"(similarity: {best_match['similarity']:.3f})"
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
# Retrieve full cluster object
|
|
162
|
+
existing_cluster = await self.knowledge_manager.get(best_match["id"])
|
|
163
|
+
|
|
164
|
+
if not existing_cluster:
|
|
165
|
+
await self._logger.warning("Failed to retrieve cluster, falling back to new search")
|
|
166
|
+
return None
|
|
167
|
+
|
|
168
|
+
# Add current query to queries list with FIFO strategy
|
|
169
|
+
self._add_query_to_cluster(existing_cluster, query)
|
|
170
|
+
|
|
171
|
+
# Update hotness and timestamp for reused cluster
|
|
172
|
+
existing_cluster.hotness = min(1.0, (existing_cluster.hotness or 0.5) + 0.1)
|
|
173
|
+
existing_cluster.last_modified = datetime.now()
|
|
174
|
+
|
|
175
|
+
# Recompute embedding with new query (before update to avoid double save)
|
|
176
|
+
if self.embedding_client:
|
|
177
|
+
try:
|
|
178
|
+
from sirchmunk.utils.embedding_util import compute_text_hash
|
|
179
|
+
|
|
180
|
+
combined_text = self.knowledge_manager.combine_cluster_fields(
|
|
181
|
+
existing_cluster.queries
|
|
182
|
+
)
|
|
183
|
+
text_hash = compute_text_hash(combined_text)
|
|
184
|
+
embedding_vector = (await self.embedding_client.embed([combined_text]))[0]
|
|
185
|
+
|
|
186
|
+
# Update embedding fields in database without triggering save
|
|
187
|
+
self.knowledge_manager.db.execute(
|
|
188
|
+
f"""
|
|
189
|
+
UPDATE {self.knowledge_manager.table_name}
|
|
190
|
+
SET
|
|
191
|
+
embedding_vector = ?::FLOAT[384],
|
|
192
|
+
embedding_model = ?,
|
|
193
|
+
embedding_timestamp = CURRENT_TIMESTAMP,
|
|
194
|
+
embedding_text_hash = ?
|
|
195
|
+
WHERE id = ?
|
|
196
|
+
""",
|
|
197
|
+
[embedding_vector, self.embedding_client.model_id, text_hash, existing_cluster.id]
|
|
198
|
+
)
|
|
199
|
+
await self._logger.debug(f"Updated embedding for cluster {existing_cluster.id}")
|
|
200
|
+
except Exception as emb_error:
|
|
201
|
+
await self._logger.warning(f"Failed to update embedding: {emb_error}")
|
|
202
|
+
|
|
203
|
+
# Single update call - saves cluster data and embedding together
|
|
204
|
+
await self.knowledge_manager.update(existing_cluster)
|
|
205
|
+
|
|
206
|
+
await self._logger.success("Reused existing knowledge cluster")
|
|
207
|
+
|
|
208
|
+
# Return based on return_cluster flag
|
|
209
|
+
if return_cluster:
|
|
210
|
+
return existing_cluster
|
|
211
|
+
else:
|
|
212
|
+
# Format and return cluster content as string
|
|
213
|
+
content = existing_cluster.content
|
|
214
|
+
if isinstance(content, list):
|
|
215
|
+
content = "\n".join(content)
|
|
216
|
+
return str(content) if content else "Knowledge cluster found but content is empty"
|
|
217
|
+
|
|
218
|
+
except Exception as e:
|
|
219
|
+
await self._logger.warning(
|
|
220
|
+
f"Failed to search similar clusters: {e}. Falling back to full search."
|
|
221
|
+
)
|
|
222
|
+
return None
|
|
223
|
+
|
|
224
|
+
def _add_query_to_cluster(self, cluster: KnowledgeCluster, query: str) -> None:
|
|
225
|
+
"""
|
|
226
|
+
Add query to cluster's queries list with FIFO strategy.
|
|
227
|
+
Keeps only the most recent N queries (where N = max_queries_per_cluster).
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
cluster: KnowledgeCluster to update
|
|
231
|
+
query: New query to add
|
|
232
|
+
"""
|
|
233
|
+
# Add query if not already present
|
|
234
|
+
if query not in cluster.queries:
|
|
235
|
+
cluster.queries.append(query)
|
|
236
|
+
|
|
237
|
+
# Apply FIFO strategy: keep only the most recent N queries
|
|
238
|
+
if len(cluster.queries) > self.max_queries_per_cluster:
|
|
239
|
+
# Remove oldest queries (from the beginning)
|
|
240
|
+
cluster.queries = cluster.queries[-self.max_queries_per_cluster:]
|
|
241
|
+
|
|
242
|
+
async def _save_cluster_with_embedding(self, cluster: KnowledgeCluster) -> None:
|
|
243
|
+
"""
|
|
244
|
+
Save knowledge cluster to persistent storage and compute embedding.
|
|
245
|
+
|
|
246
|
+
Args:
|
|
247
|
+
cluster: KnowledgeCluster to save
|
|
248
|
+
"""
|
|
249
|
+
# Save knowledge cluster to persistent storage
|
|
250
|
+
try:
|
|
251
|
+
await self.knowledge_manager.insert(cluster)
|
|
252
|
+
await self._logger.info(f"Saved knowledge cluster {cluster.id} to cache")
|
|
253
|
+
except Exception as e:
|
|
254
|
+
# If cluster exists, update it instead
|
|
255
|
+
try:
|
|
256
|
+
await self.knowledge_manager.update(cluster)
|
|
257
|
+
await self._logger.info(f"Updated knowledge cluster {cluster.id} in cache")
|
|
258
|
+
except Exception as update_error:
|
|
259
|
+
await self._logger.warning(f"Failed to save knowledge cluster: {update_error}")
|
|
260
|
+
return
|
|
261
|
+
|
|
262
|
+
# Compute and store embedding for the cluster
|
|
263
|
+
if self.embedding_client:
|
|
264
|
+
try:
|
|
265
|
+
from sirchmunk.utils.embedding_util import compute_text_hash
|
|
266
|
+
|
|
267
|
+
# Combine queries for embedding
|
|
268
|
+
combined_text = self.knowledge_manager.combine_cluster_fields(
|
|
269
|
+
cluster.queries
|
|
270
|
+
)
|
|
271
|
+
text_hash = compute_text_hash(combined_text)
|
|
272
|
+
|
|
273
|
+
# Compute embedding
|
|
274
|
+
embedding_vector = (await self.embedding_client.embed([combined_text]))[0]
|
|
275
|
+
|
|
276
|
+
# Store embedding
|
|
277
|
+
await self.knowledge_manager.store_embedding(
|
|
278
|
+
cluster_id=cluster.id,
|
|
279
|
+
embedding_vector=embedding_vector,
|
|
280
|
+
embedding_model=self.embedding_client.model_id,
|
|
281
|
+
embedding_text_hash=text_hash
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
await self._logger.debug(f"Computed and stored embedding for cluster {cluster.id}")
|
|
285
|
+
|
|
286
|
+
except Exception as e:
|
|
287
|
+
await self._logger.warning(f"Failed to compute embedding for cluster {cluster.id}: {e}")
|
|
288
|
+
|
|
289
|
+
async def _search_by_filename(
|
|
290
|
+
self,
|
|
291
|
+
query: str,
|
|
292
|
+
search_paths: Union[str, Path, List[str], List[Path]],
|
|
293
|
+
max_depth: Optional[int] = 5,
|
|
294
|
+
include: Optional[List[str]] = None,
|
|
295
|
+
exclude: Optional[List[str]] = None,
|
|
296
|
+
grep_timeout: Optional[float] = 60.0,
|
|
297
|
+
top_k: Optional[int] = 10,
|
|
298
|
+
) -> List[Dict[str, Any]]:
|
|
299
|
+
"""
|
|
300
|
+
Perform filename-only search without LLM keyword extraction.
|
|
301
|
+
|
|
302
|
+
Args:
|
|
303
|
+
query: Search query (used as filename pattern)
|
|
304
|
+
search_paths: Paths to search in
|
|
305
|
+
max_depth: Maximum directory depth
|
|
306
|
+
include: File patterns to include
|
|
307
|
+
exclude: File patterns to exclude
|
|
308
|
+
grep_timeout: Timeout for grep operations
|
|
309
|
+
top_k: Maximum number of results to return
|
|
310
|
+
|
|
311
|
+
Returns:
|
|
312
|
+
List of file matches with metadata
|
|
313
|
+
"""
|
|
314
|
+
await self._logger.info("Performing filename-only search...")
|
|
315
|
+
|
|
316
|
+
# Extract potential filename patterns from query
|
|
317
|
+
patterns = []
|
|
318
|
+
|
|
319
|
+
# Check if query looks like a file pattern (contains file extensions or wildcards)
|
|
320
|
+
if any(char in query for char in ['*', '?', '[', ']']):
|
|
321
|
+
# Treat as direct glob/regex pattern
|
|
322
|
+
patterns = [query]
|
|
323
|
+
await self._logger.info(f"Using direct pattern: {query}")
|
|
324
|
+
else:
|
|
325
|
+
# Split into words and create flexible patterns
|
|
326
|
+
words = [w.strip() for w in query.strip().split() if w.strip()]
|
|
327
|
+
|
|
328
|
+
if not words:
|
|
329
|
+
await self._logger.warning("No valid words in query")
|
|
330
|
+
return []
|
|
331
|
+
|
|
332
|
+
# Strategy: Create patterns for each word that match anywhere in filename
|
|
333
|
+
# Use non-greedy matching and case-insensitive by default
|
|
334
|
+
for word in words:
|
|
335
|
+
# Escape special regex characters in the word
|
|
336
|
+
escaped_word = re.escape(word)
|
|
337
|
+
# Match word anywhere in filename (case-insensitive handled in retrieve_by_filename)
|
|
338
|
+
pattern = f".*{escaped_word}.*"
|
|
339
|
+
patterns.append(pattern)
|
|
340
|
+
await self._logger.debug(f"Created pattern for word '{word}': {pattern}")
|
|
341
|
+
|
|
342
|
+
if not patterns:
|
|
343
|
+
await self._logger.warning("No valid filename patterns extracted from query")
|
|
344
|
+
return []
|
|
345
|
+
|
|
346
|
+
await self._logger.info(f"Searching with {len(patterns)} pattern(s): {patterns}")
|
|
347
|
+
|
|
348
|
+
try:
|
|
349
|
+
# Use GrepRetriever's filename search
|
|
350
|
+
await self._logger.debug(f"Calling retrieve_by_filename with {len(patterns)} patterns")
|
|
351
|
+
results = await self.grep_retriever.retrieve_by_filename(
|
|
352
|
+
patterns=patterns,
|
|
353
|
+
path=search_paths,
|
|
354
|
+
case_sensitive=False,
|
|
355
|
+
max_depth=max_depth,
|
|
356
|
+
include=include,
|
|
357
|
+
exclude=exclude or ["*.pyc", "*.log"],
|
|
358
|
+
timeout=grep_timeout,
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
if results:
|
|
362
|
+
results = results[:top_k]
|
|
363
|
+
await self._logger.success(f" ✓ Found {len(results)} matching files", flush=True)
|
|
364
|
+
else:
|
|
365
|
+
await self._logger.warning("No files matched the patterns")
|
|
366
|
+
|
|
367
|
+
return results
|
|
368
|
+
|
|
369
|
+
except Exception as e:
|
|
370
|
+
await self._logger.error(f"Filename search failed: {e}")
|
|
371
|
+
import traceback
|
|
372
|
+
await self._logger.error(f"Traceback: {traceback.format_exc()}")
|
|
373
|
+
return []
|
|
374
|
+
|
|
375
|
+
@staticmethod
def _parse_summary_response(llm_response: str) -> tuple[str, bool]:
    """
    Split an LLM reply into its summary text and its save decision.

    Args:
        llm_response: Raw LLM response containing SUMMARY and SHOULD_SAVE tags

    Returns:
        Tuple of (summary_text, should_save_flag). When no summary can be
        extracted, the whole stripped response is returned with the flag
        set to True.
    """
    # The extracted values are read back under the lower-case keys
    # "summary" and "should_save".
    fields = extract_fields(content=llm_response, tags=["SUMMARY", "SHOULD_SAVE"])

    summary_text = fields.get("summary", "").strip()
    # A missing SHOULD_SAVE tag defaults to "true".
    flag_text = fields.get("should_save", "true").strip().lower()

    if summary_text:
        return summary_text, flag_text in ("true", "yes", "1")

    # Extraction failed: fall back to the full response and save it.
    return llm_response.strip(), True
|
|
87
401
|
|
|
88
402
|
@staticmethod
|
|
89
403
|
def _extract_and_validate_keywords(llm_resp: str) -> dict:
|
|
@@ -262,8 +576,8 @@ class AgenticSearch(BaseSearch):
|
|
|
262
576
|
self,
|
|
263
577
|
query: str,
|
|
264
578
|
search_paths: Union[str, Path, List[str], List[Path]],
|
|
265
|
-
mode: Literal["FAST", "DEEP", "FILENAME_ONLY"] = "DEEP", # TODO
|
|
266
579
|
*,
|
|
580
|
+
mode: Literal["DEEP", "FILENAME_ONLY"] = "DEEP",
|
|
267
581
|
images: Optional[list] = None,
|
|
268
582
|
max_depth: Optional[int] = 5,
|
|
269
583
|
top_k_files: Optional[int] = 3,
|
|
@@ -272,17 +586,18 @@ class AgenticSearch(BaseSearch):
|
|
|
272
586
|
exclude: Optional[List[str]] = None,
|
|
273
587
|
verbose: Optional[bool] = True,
|
|
274
588
|
grep_timeout: Optional[float] = 60.0,
|
|
275
|
-
|
|
589
|
+
return_cluster: Optional[bool] = False,
|
|
590
|
+
) -> Union[str, List[Dict[str, Any]], KnowledgeCluster]:
|
|
276
591
|
"""
|
|
277
592
|
Perform intelligent search with multi-level keyword extraction.
|
|
278
593
|
|
|
279
594
|
Args:
|
|
280
595
|
query: Search query string
|
|
281
596
|
search_paths: Paths to search in
|
|
282
|
-
mode: Search mode (
|
|
597
|
+
mode: Search mode (DEEP/FILENAME_ONLY), default is DEEP
|
|
283
598
|
images: Optional image inputs
|
|
284
599
|
max_depth: Maximum directory depth to search
|
|
285
|
-
top_k_files: Number of top files to
|
|
600
|
+
top_k_files: Number of top files to grep-retrieve
|
|
286
601
|
keyword_levels: Number of keyword granularity levels (default: 3)
|
|
287
602
|
- Higher values provide more fallback options
|
|
288
603
|
- Recommended: 3-5 levels
|
|
@@ -290,10 +605,49 @@ class AgenticSearch(BaseSearch):
|
|
|
290
605
|
exclude: File patterns to exclude
|
|
291
606
|
verbose: Enable verbose logging
|
|
292
607
|
grep_timeout: Timeout for grep operations
|
|
608
|
+
return_cluster: Whether to return the full knowledge cluster. Ignore if mode is `FILENAME_ONLY`.
|
|
609
|
+
|
|
610
|
+
Mode behaviors:
|
|
611
|
+
- In FILENAME_ONLY mode, performs fast filename search without LLM involvement. Returns list of matching files.
|
|
612
|
+
Format: {'filename': 'Attention_Is_All_You_Need.pdf', 'match_score': 0.8, 'matched_pattern': '.*Attention.*', 'path': '/path/to/Attention_Is_All_You_Need.pdf', 'type': 'filename_match'}
|
|
613
|
+
|
|
614
|
+
+--------------+------------------+-----------------------+------------------------+
|
|
615
|
+
| Feature | FILENAME_ONLY | FAST (To be designed) | DEEP (Current) |
|
|
616
|
+
+--------------+------------------+-----------------------+------------------------+
|
|
617
|
+
| Speed | Very Fast (<1s) | Fast (<5s) | Slow (5-30s) |
|
|
618
|
+
| LLM Calls | 0 times | 1-2 times | 4-5 times |
|
|
619
|
+
| Return Type | List[Dict] | str / Cluster | str / Cluster |
|
|
620
|
+
| Use Case | File Location | Rapid Content Search | Deep Knowledge Extract |
|
|
621
|
+
+--------------+------------------+-----------------------+------------------------+
|
|
293
622
|
|
|
294
623
|
Returns:
|
|
295
|
-
Search result summary string
|
|
624
|
+
Search result summary string, or KnowledgeCluster if return_cluster is True, or List[Dict[str, Any]] for FILENAME_ONLY mode.
|
|
296
625
|
"""
|
|
626
|
+
# Handle FILENAME_ONLY mode: fast filename search without LLM
|
|
627
|
+
if mode == "FILENAME_ONLY":
|
|
628
|
+
filename_results: List[Dict[str, Any]] = await self._search_by_filename(
|
|
629
|
+
query=query,
|
|
630
|
+
search_paths=search_paths,
|
|
631
|
+
max_depth=max_depth,
|
|
632
|
+
include=include,
|
|
633
|
+
exclude=exclude,
|
|
634
|
+
grep_timeout=grep_timeout,
|
|
635
|
+
top_k=top_k_files,
|
|
636
|
+
)
|
|
637
|
+
|
|
638
|
+
if not filename_results:
|
|
639
|
+
error_msg = f"No files found matching query: '{query}'"
|
|
640
|
+
await self._logger.warning(error_msg)
|
|
641
|
+
return None if return_cluster else error_msg
|
|
642
|
+
|
|
643
|
+
await self._logger.success(f"Retrieved {len(filename_results)} matching files")
|
|
644
|
+
|
|
645
|
+
return filename_results
|
|
646
|
+
|
|
647
|
+
# Try to reuse existing cluster based on semantic similarity
|
|
648
|
+
reused_result = await self._try_reuse_cluster(query, return_cluster=return_cluster)
|
|
649
|
+
if reused_result:
|
|
650
|
+
return reused_result
|
|
297
651
|
|
|
298
652
|
# Build request
|
|
299
653
|
text_items: List[ContentItem] = [ContentItem(type="text", text=query)]
|
|
@@ -411,7 +765,8 @@ class AgenticSearch(BaseSearch):
|
|
|
411
765
|
await self._logger.info(f"Found {len(grep_results)} files, top {len(file_list)}:\n{tmp_sep.join(file_list)}")
|
|
412
766
|
|
|
413
767
|
if len(grep_results) == 0:
|
|
414
|
-
|
|
768
|
+
error_msg = f"No relevant information found for the query: {query}"
|
|
769
|
+
return None if return_cluster else error_msg
|
|
415
770
|
|
|
416
771
|
# Build knowledge cluster
|
|
417
772
|
await self._logger.info("Building knowledge cluster...")
|
|
@@ -429,7 +784,8 @@ class AgenticSearch(BaseSearch):
|
|
|
429
784
|
await self._logger.success(" ✓", flush=True)
|
|
430
785
|
|
|
431
786
|
if cluster is None:
|
|
432
|
-
|
|
787
|
+
error_msg = f"No relevant information found for the query: {query}"
|
|
788
|
+
return None if return_cluster else error_msg
|
|
433
789
|
|
|
434
790
|
if self.verbose:
|
|
435
791
|
await self._logger.info(json.dumps(cluster.to_dict(), ensure_ascii=False, indent=2))
|
|
@@ -451,25 +807,27 @@ class AgenticSearch(BaseSearch):
|
|
|
451
807
|
messages=[{"role": "user", "content": result_sum_prompt}],
|
|
452
808
|
stream=True,
|
|
453
809
|
)
|
|
454
|
-
|
|
810
|
+
llm_response: str = search_result_response.content
|
|
455
811
|
self.llm_usages.append(search_result_response.usage)
|
|
456
812
|
await self._logger.success(" ✓", flush=True)
|
|
457
813
|
await self._logger.success("Search completed successfully!")
|
|
458
814
|
|
|
815
|
+
# Parse LLM response to extract summary and save decision
|
|
816
|
+
search_result, should_save = self._parse_summary_response(llm_response)
|
|
817
|
+
|
|
459
818
|
# Add search results (file paths) to the cluster
|
|
460
819
|
if grep_results:
|
|
461
820
|
cluster.search_results.append(search_result)
|
|
821
|
+
|
|
822
|
+
# Add current query to queries list with FIFO strategy
|
|
823
|
+
self._add_query_to_cluster(cluster, query)
|
|
824
|
+
|
|
825
|
+
# Save cluster based on LLM's quality evaluation
|
|
826
|
+
if should_save:
|
|
827
|
+
await self._save_cluster_with_embedding(cluster)
|
|
828
|
+
else:
|
|
829
|
+
await self._logger.info(
|
|
830
|
+
"Cluster not saved - LLM determined insufficient quality or relevance"
|
|
831
|
+
)
|
|
462
832
|
|
|
463
|
-
|
|
464
|
-
try:
|
|
465
|
-
await self.knowledge_manager.insert(cluster)
|
|
466
|
-
await self._logger.info(f"Saved knowledge cluster {cluster.id} to cache")
|
|
467
|
-
except Exception as e:
|
|
468
|
-
# If cluster exists, update it instead
|
|
469
|
-
try:
|
|
470
|
-
await self.knowledge_manager.update(cluster)
|
|
471
|
-
await self._logger.info(f"Updated knowledge cluster {cluster.id} in cache")
|
|
472
|
-
except Exception as update_error:
|
|
473
|
-
await self._logger.warning(f"Failed to save knowledge cluster: {update_error}")
|
|
474
|
-
|
|
475
|
-
return search_result
|
|
833
|
+
return cluster if return_cluster else search_result
|
sirchmunk/storage/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Copyright (c) ModelScope Contributors. All rights reserved.
|
|
2
2
|
"""Storage package initialization"""
|
|
3
3
|
|
|
4
|
-
from .
|
|
4
|
+
from .knowledge_storage import KnowledgeStorage
|
|
5
5
|
from .duckdb import DuckDBManager
|
|
6
6
|
|
|
7
|
-
__all__ = ["
|
|
7
|
+
__all__ = ["KnowledgeStorage", "DuckDBManager"]
|