sirchmunk 0.0.1.post1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. sirchmunk/api/__init__.py +1 -0
  2. sirchmunk/api/chat.py +1123 -0
  3. sirchmunk/api/components/__init__.py +0 -0
  4. sirchmunk/api/components/history_storage.py +402 -0
  5. sirchmunk/api/components/monitor_tracker.py +518 -0
  6. sirchmunk/api/components/settings_storage.py +353 -0
  7. sirchmunk/api/history.py +254 -0
  8. sirchmunk/api/knowledge.py +411 -0
  9. sirchmunk/api/main.py +120 -0
  10. sirchmunk/api/monitor.py +219 -0
  11. sirchmunk/api/run_server.py +54 -0
  12. sirchmunk/api/search.py +230 -0
  13. sirchmunk/api/settings.py +309 -0
  14. sirchmunk/api/tools.py +315 -0
  15. sirchmunk/cli/__init__.py +11 -0
  16. sirchmunk/cli/cli.py +789 -0
  17. sirchmunk/learnings/knowledge_base.py +5 -2
  18. sirchmunk/llm/prompts.py +12 -1
  19. sirchmunk/retrieve/text_retriever.py +186 -2
  20. sirchmunk/scan/file_scanner.py +2 -2
  21. sirchmunk/schema/knowledge.py +119 -35
  22. sirchmunk/search.py +384 -26
  23. sirchmunk/storage/__init__.py +2 -2
  24. sirchmunk/storage/{knowledge_manager.py → knowledge_storage.py} +265 -60
  25. sirchmunk/utils/constants.py +7 -5
  26. sirchmunk/utils/embedding_util.py +217 -0
  27. sirchmunk/utils/tokenizer_util.py +36 -1
  28. sirchmunk/version.py +1 -1
  29. {sirchmunk-0.0.1.post1.dist-info → sirchmunk-0.0.2.dist-info}/METADATA +124 -9
  30. sirchmunk-0.0.2.dist-info/RECORD +69 -0
  31. {sirchmunk-0.0.1.post1.dist-info → sirchmunk-0.0.2.dist-info}/WHEEL +1 -1
  32. sirchmunk-0.0.2.dist-info/top_level.txt +2 -0
  33. sirchmunk_mcp/__init__.py +25 -0
  34. sirchmunk_mcp/cli.py +478 -0
  35. sirchmunk_mcp/config.py +276 -0
  36. sirchmunk_mcp/server.py +355 -0
  37. sirchmunk_mcp/service.py +327 -0
  38. sirchmunk_mcp/setup.py +15 -0
  39. sirchmunk_mcp/tools.py +410 -0
  40. sirchmunk-0.0.1.post1.dist-info/RECORD +0 -45
  41. sirchmunk-0.0.1.post1.dist-info/top_level.txt +0 -1
  42. {sirchmunk-0.0.1.post1.dist-info → sirchmunk-0.0.2.dist-info}/entry_points.txt +0 -0
  43. {sirchmunk-0.0.1.post1.dist-info → sirchmunk-0.0.2.dist-info}/licenses/LICENSE +0 -0
sirchmunk/search.py CHANGED
@@ -1,10 +1,15 @@
1
1
  # Copyright (c) ModelScope Contributors. All rights reserved.
2
2
  import ast
3
3
  import json
4
+ import logging
5
+ import re
6
+ from datetime import datetime
4
7
  from pathlib import Path
5
8
  from typing import Any, Dict, List, Literal, Optional, Union
6
9
 
7
10
  from sirchmunk.base import BaseSearch
11
+
12
+ logger = logging.getLogger(__name__)
8
13
  from sirchmunk.learnings.knowledge_base import KnowledgeBase
9
14
  from sirchmunk.llm.openai_chat import OpenAIChat
10
15
  from sirchmunk.llm.prompts import (
@@ -14,8 +19,8 @@ from sirchmunk.llm.prompts import (
14
19
  from sirchmunk.retrieve.text_retriever import GrepRetriever
15
20
  from sirchmunk.schema.knowledge import KnowledgeCluster
16
21
  from sirchmunk.schema.request import ContentItem, ImageURL, Message, Request
17
- from sirchmunk.storage.knowledge_manager import KnowledgeManager
18
- from sirchmunk.utils.constants import LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME, WORK_PATH
22
+ from sirchmunk.storage.knowledge_storage import KnowledgeStorage
23
+ from sirchmunk.utils.constants import LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME, SIRCHMUNK_WORK_PATH
19
24
  from sirchmunk.utils.deps import check_dependencies
20
25
  from sirchmunk.utils.file_utils import get_fast_hash
21
26
  from sirchmunk.utils import create_logger, LogCallback
@@ -35,12 +40,14 @@ class AgenticSearch(BaseSearch):
35
40
  work_path: Optional[Union[str, Path]] = None,
36
41
  verbose: bool = False,
37
42
  log_callback: LogCallback = None,
43
+ reuse_knowledge: bool = True,
38
44
  **kwargs,
39
45
  ):
40
46
  super().__init__(**kwargs)
41
47
 
42
- work_path = work_path or WORK_PATH
43
- self.work_path: Path = Path(work_path)
48
+ work_path = work_path or SIRCHMUNK_WORK_PATH
49
+ # Ensure path is expanded (handle ~ and environment variables)
50
+ self.work_path: Path = Path(work_path).expanduser().resolve()
44
51
 
45
52
  self.llm: OpenAIChat = llm or OpenAIChat(
46
53
  base_url=LLM_BASE_URL,
@@ -62,7 +69,7 @@ class AgenticSearch(BaseSearch):
62
69
  )
63
70
 
64
71
  # Initialize KnowledgeStorage for persistent storage
65
- self.knowledge_manager = KnowledgeManager(work_path=str(self.work_path))
72
+ self.knowledge_manager = KnowledgeStorage(work_path=str(self.work_path))
66
73
 
67
74
  # Load historical knowledge clusters from cache
68
75
  self._load_historical_knowledge()
@@ -71,6 +78,30 @@ class AgenticSearch(BaseSearch):
71
78
 
72
79
  self.llm_usages: List[Dict[str, Any]] = []
73
80
 
81
+ # Maximum number of queries to keep per cluster (FIFO strategy)
82
+ self.max_queries_per_cluster: int = 5
83
+
84
+ # Initialize embedding client for cluster reuse
85
+ self.embedding_client = None
86
+ # Similarity threshold for cluster reuse
87
+ self.cluster_sim_threshold: float = kwargs.pop('cluster_sim_threshold', 0.85)
88
+ self.cluster_sim_top_k: int = kwargs.pop('cluster_sim_top_k', 3)
89
+ if reuse_knowledge:
90
+ try:
91
+ from sirchmunk.utils.embedding_util import EmbeddingUtil
92
+
93
+ self.embedding_client = EmbeddingUtil(
94
+ cache_dir=str(self.work_path / ".cache" / "models")
95
+ )
96
+ logger.debug(
97
+ f"Embedding client initialized: {self.embedding_client.get_model_info()}"
98
+ )
99
+ except Exception as e:
100
+ logger.warning(
101
+ f"Failed to initialize embedding client: {e}. Cluster reuse disabled."
102
+ )
103
+ self.embedding_client = None
104
+
74
105
  if not check_dependencies():
75
106
  print("Installing rga (ripgrep-all) and rg (ripgrep)...", flush=True)
76
107
  install_rga()
@@ -84,6 +115,289 @@ class AgenticSearch(BaseSearch):
84
115
  print(f"Loaded {cluster_count} historical knowledge clusters from cache")
85
116
  except Exception as e:
86
117
  print(f"[WARNING] Failed to load historical knowledge: {e}")
118
+
119
async def _try_reuse_cluster(
    self,
    query: str,
    return_cluster: bool = False
) -> Optional[Union[str, KnowledgeCluster]]:
    """
    Try to reuse existing knowledge cluster based on semantic similarity.

    The query is embedded and matched against stored cluster embeddings
    (bounded by ``self.cluster_sim_top_k`` / ``self.cluster_sim_threshold``).
    On a hit, the best-matching cluster is refreshed in place (query list,
    hotness, timestamp, embedding) and returned. Any failure returns None
    so the caller falls back to a full search.

    Args:
        query: Search query string
        return_cluster: Whether to return the full cluster object or just content string

    Returns:
        Cluster content string or KnowledgeCluster object if found, None otherwise
    """
    # Reuse requires a working embedding client (may be None if init failed).
    if not self.embedding_client:
        return None

    try:
        await self._logger.info("Searching for similar knowledge clusters...")

        # Compute query embedding; embed() is batched, so take element 0.
        query_embedding = (await self.embedding_client.embed([query]))[0]

        # Search for similar clusters in persistent storage.
        similar_clusters = await self.knowledge_manager.search_similar_clusters(
            query_embedding=query_embedding,
            top_k=self.cluster_sim_top_k,
            similarity_threshold=self.cluster_sim_threshold,
        )

        if not similar_clusters:
            await self._logger.info("No similar clusters found, performing new search...")
            return None

        # Found similar cluster - process reuse.
        # NOTE(review): assumes results are ordered best-first by similarity
        # — confirm against KnowledgeStorage.search_similar_clusters.
        best_match = similar_clusters[0]
        await self._logger.success(
            f"♻️ Found similar cluster: {best_match['name']} "
            f"(similarity: {best_match['similarity']:.3f})"
        )

        # Retrieve full cluster object by id.
        existing_cluster = await self.knowledge_manager.get(best_match["id"])

        if not existing_cluster:
            await self._logger.warning("Failed to retrieve cluster, falling back to new search")
            return None

        # Add current query to queries list with FIFO strategy.
        self._add_query_to_cluster(existing_cluster, query)

        # Update hotness (capped at 1.0, +0.1 per reuse) and timestamp.
        existing_cluster.hotness = min(1.0, (existing_cluster.hotness or 0.5) + 0.1)
        existing_cluster.last_modified = datetime.now()

        # Recompute embedding with new query (before update to avoid double save)
        if self.embedding_client:
            try:
                from sirchmunk.utils.embedding_util import compute_text_hash

                combined_text = self.knowledge_manager.combine_cluster_fields(
                    existing_cluster.queries
                )
                text_hash = compute_text_hash(combined_text)
                embedding_vector = (await self.embedding_client.embed([combined_text]))[0]

                # Update embedding fields in database without triggering save.
                # NOTE(review): raw SQL assumes a DuckDB backend with a
                # 384-dimension FLOAT[] embedding column — confirm the
                # KnowledgeStorage schema if the embedding model changes.
                self.knowledge_manager.db.execute(
                    f"""
                    UPDATE {self.knowledge_manager.table_name}
                    SET
                        embedding_vector = ?::FLOAT[384],
                        embedding_model = ?,
                        embedding_timestamp = CURRENT_TIMESTAMP,
                        embedding_text_hash = ?
                    WHERE id = ?
                    """,
                    [embedding_vector, self.embedding_client.model_id, text_hash, existing_cluster.id]
                )
                await self._logger.debug(f"Updated embedding for cluster {existing_cluster.id}")
            except Exception as emb_error:
                # Best-effort: a failed embedding refresh must not block reuse.
                await self._logger.warning(f"Failed to update embedding: {emb_error}")

        # Single update call - saves cluster data and embedding together.
        # NOTE(review): presumably update() persists cluster fields without
        # clobbering the embedding columns written above — verify.
        await self.knowledge_manager.update(existing_cluster)

        await self._logger.success("Reused existing knowledge cluster")

        # Return based on return_cluster flag.
        if return_cluster:
            return existing_cluster
        else:
            # Format and return cluster content as string.
            content = existing_cluster.content
            if isinstance(content, list):
                content = "\n".join(content)
            return str(content) if content else "Knowledge cluster found but content is empty"

    except Exception as e:
        # Any unexpected failure degrades gracefully to a full search.
        await self._logger.warning(
            f"Failed to search similar clusters: {e}. Falling back to full search."
        )
        return None
224
+ def _add_query_to_cluster(self, cluster: KnowledgeCluster, query: str) -> None:
225
+ """
226
+ Add query to cluster's queries list with FIFO strategy.
227
+ Keeps only the most recent N queries (where N = max_queries_per_cluster).
228
+
229
+ Args:
230
+ cluster: KnowledgeCluster to update
231
+ query: New query to add
232
+ """
233
+ # Add query if not already present
234
+ if query not in cluster.queries:
235
+ cluster.queries.append(query)
236
+
237
+ # Apply FIFO strategy: keep only the most recent N queries
238
+ if len(cluster.queries) > self.max_queries_per_cluster:
239
+ # Remove oldest queries (from the beginning)
240
+ cluster.queries = cluster.queries[-self.max_queries_per_cluster:]
241
+
242
async def _save_cluster_with_embedding(self, cluster: KnowledgeCluster) -> None:
    """
    Persist a knowledge cluster (insert, falling back to update) and,
    when an embedding client is available, compute and store its
    embedding alongside.

    Args:
        cluster: KnowledgeCluster to save
    """
    # Persist the cluster: try a fresh insert, update on conflict.
    try:
        await self.knowledge_manager.insert(cluster)
        await self._logger.info(f"Saved knowledge cluster {cluster.id} to cache")
    except Exception:
        try:
            await self.knowledge_manager.update(cluster)
            await self._logger.info(f"Updated knowledge cluster {cluster.id} in cache")
        except Exception as update_error:
            # Neither insert nor update worked — skip embedding work too.
            await self._logger.warning(f"Failed to save knowledge cluster: {update_error}")
            return

    # Without an embedding client there is nothing more to store.
    if not self.embedding_client:
        return

    try:
        from sirchmunk.utils.embedding_util import compute_text_hash

        # Embed the cluster's combined query text and persist it.
        query_text = self.knowledge_manager.combine_cluster_fields(
            cluster.queries
        )
        digest = compute_text_hash(query_text)
        vector = (await self.embedding_client.embed([query_text]))[0]

        await self.knowledge_manager.store_embedding(
            cluster_id=cluster.id,
            embedding_vector=vector,
            embedding_model=self.embedding_client.model_id,
            embedding_text_hash=digest
        )

        await self._logger.debug(f"Computed and stored embedding for cluster {cluster.id}")

    except Exception as e:
        # Embedding storage is best-effort; the cluster itself is already saved.
        await self._logger.warning(f"Failed to compute embedding for cluster {cluster.id}: {e}")
288
+
289
async def _search_by_filename(
    self,
    query: str,
    search_paths: Union[str, Path, List[str], List[Path]],
    max_depth: Optional[int] = 5,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    grep_timeout: Optional[float] = 60.0,
    top_k: Optional[int] = 10,
) -> List[Dict[str, Any]]:
    """
    Filename-only search: match the query against file names via the grep
    retriever, with no LLM keyword extraction.

    Args:
        query: Search query (used as filename pattern)
        search_paths: Paths to search in
        max_depth: Maximum directory depth
        include: File patterns to include
        exclude: File patterns to exclude
        grep_timeout: Timeout for grep operations
        top_k: Maximum number of results to return

    Returns:
        List of file matches with metadata
    """
    await self._logger.info("Performing filename-only search...")

    # A query containing glob/regex metacharacters is used verbatim;
    # otherwise each word becomes a permissive substring pattern.
    if any(ch in query for ch in ('*', '?', '[', ']')):
        patterns = [query]
        await self._logger.info(f"Using direct pattern: {query}")
    else:
        patterns = []
        tokens = [t.strip() for t in query.strip().split() if t.strip()]

        if not tokens:
            await self._logger.warning("No valid words in query")
            return []

        # One case-insensitive "match anywhere" pattern per word
        # (case handling is done inside retrieve_by_filename).
        for word in tokens:
            pattern = f".*{re.escape(word)}.*"
            patterns.append(pattern)
            await self._logger.debug(f"Created pattern for word '{word}': {pattern}")

    if not patterns:
        await self._logger.warning("No valid filename patterns extracted from query")
        return []

    await self._logger.info(f"Searching with {len(patterns)} pattern(s): {patterns}")

    try:
        await self._logger.debug(f"Calling retrieve_by_filename with {len(patterns)} patterns")
        results = await self.grep_retriever.retrieve_by_filename(
            patterns=patterns,
            path=search_paths,
            case_sensitive=False,
            max_depth=max_depth,
            include=include,
            exclude=exclude or ["*.pyc", "*.log"],
            timeout=grep_timeout,
        )

        if not results:
            await self._logger.warning("No files matched the patterns")
            return results

        results = results[:top_k]
        await self._logger.success(f" ✓ Found {len(results)} matching files", flush=True)
        return results

    except Exception as e:
        await self._logger.error(f"Filename search failed: {e}")
        import traceback
        await self._logger.error(f"Traceback: {traceback.format_exc()}")
        return []
374
+
375
@staticmethod
def _parse_summary_response(llm_response: str) -> tuple[str, bool]:
    """
    Parse LLM response to extract summary and save decision.

    Fix: previously an empty SUMMARY forced should_save to True even when
    a SHOULD_SAVE tag was successfully parsed as false, so low-quality
    results were saved against the LLM's explicit decision. The override
    now applies only when the SHOULD_SAVE tag was genuinely absent
    (i.e. extraction failed entirely).

    Args:
        llm_response: Raw LLM response containing SUMMARY and SHOULD_SAVE tags

    Returns:
        Tuple of (summary_text, should_save_flag)
    """
    # NOTE(review): assumes extract_fields returns a dict keyed by
    # lowercase tag names with missing tags omitted — matches prior usage.
    summary_fields = extract_fields(content=llm_response, tags=["SUMMARY", "SHOULD_SAVE"])

    summary = summary_fields.get("summary", "").strip()
    raw_flag = summary_fields.get("should_save")  # None => tag not found

    # Default to saving when the tag is absent.
    flag_text = (raw_flag if raw_flag is not None else "true").strip().lower()
    should_save = flag_text in ["true", "yes", "1"]

    if not summary:
        # SUMMARY extraction failed: fall back to the whole response.
        summary = llm_response.strip()
        if raw_flag is None:
            # Nothing was extracted at all; assume the result is worth saving.
            should_save = True

    return summary, should_save
87
401
 
88
402
  @staticmethod
89
403
  def _extract_and_validate_keywords(llm_resp: str) -> dict:
@@ -262,8 +576,8 @@ class AgenticSearch(BaseSearch):
262
576
  self,
263
577
  query: str,
264
578
  search_paths: Union[str, Path, List[str], List[Path]],
265
- mode: Literal["FAST", "DEEP", "FILENAME_ONLY"] = "DEEP", # TODO
266
579
  *,
580
+ mode: Literal["DEEP", "FILENAME_ONLY"] = "DEEP",
267
581
  images: Optional[list] = None,
268
582
  max_depth: Optional[int] = 5,
269
583
  top_k_files: Optional[int] = 3,
@@ -272,17 +586,18 @@ class AgenticSearch(BaseSearch):
272
586
  exclude: Optional[List[str]] = None,
273
587
  verbose: Optional[bool] = True,
274
588
  grep_timeout: Optional[float] = 60.0,
275
- ) -> str:
589
+ return_cluster: Optional[bool] = False,
590
+ ) -> Union[str, List[Dict[str, Any]], KnowledgeCluster]:
276
591
  """
277
592
  Perform intelligent search with multi-level keyword extraction.
278
593
 
279
594
  Args:
280
595
  query: Search query string
281
596
  search_paths: Paths to search in
282
- mode: Search mode (FAST/DEEP/FILENAME_ONLY)
597
+ mode: Search mode (DEEP/FILENAME_ONLY), default is DEEP
283
598
  images: Optional image inputs
284
599
  max_depth: Maximum directory depth to search
285
- top_k_files: Number of top files to return
600
+ top_k_files: Number of top files to grep-retrieve
286
601
  keyword_levels: Number of keyword granularity levels (default: 3)
287
602
  - Higher values provide more fallback options
288
603
  - Recommended: 3-5 levels
@@ -290,10 +605,49 @@ class AgenticSearch(BaseSearch):
290
605
  exclude: File patterns to exclude
291
606
  verbose: Enable verbose logging
292
607
  grep_timeout: Timeout for grep operations
608
+ return_cluster: Whether to return the full knowledge cluster. Ignore if mode is `FILENAME_ONLY`.
609
+
610
+ Mode behaviors:
611
+ - In FILENAME_ONLY mode, performs fast filename search without LLM involvement. Returns list of matching files.
612
+ Format: {'filename': 'Attention_Is_All_You_Need.pdf', 'match_score': 0.8, 'matched_pattern': '.*Attention.*', 'path': '/path/to/Attention_Is_All_You_Need.pdf', 'type': 'filename_match'}
613
+
614
+ +--------------+------------------+-----------------------+------------------------+
615
+ | Feature | FILENAME_ONLY | FAST (To be designed) | DEEP (Current) |
616
+ +--------------+------------------+-----------------------+------------------------+
617
+ | Speed | Very Fast (<1s) | Fast (<5s) | Slow (5-30s) |
618
+ | LLM Calls | 0 times | 1-2 times | 4-5 times |
619
+ | Return Type | List[Dict] | str / Cluster | str / Cluster |
620
+ | Use Case | File Location | Rapid Content Search | Deep Knowledge Extract |
621
+ +--------------+------------------+-----------------------+------------------------+
293
622
 
294
623
  Returns:
295
- Search result summary string
624
+ Search result summary string, or KnowledgeCluster if return_cluster is True, or List[Dict[str, Any]] for FILENAME_ONLY mode.
296
625
  """
626
+ # Handle FILENAME_ONLY mode: fast filename search without LLM
627
+ if mode == "FILENAME_ONLY":
628
+ filename_results: List[Dict[str, Any]] = await self._search_by_filename(
629
+ query=query,
630
+ search_paths=search_paths,
631
+ max_depth=max_depth,
632
+ include=include,
633
+ exclude=exclude,
634
+ grep_timeout=grep_timeout,
635
+ top_k=top_k_files,
636
+ )
637
+
638
+ if not filename_results:
639
+ error_msg = f"No files found matching query: '{query}'"
640
+ await self._logger.warning(error_msg)
641
+ return None if return_cluster else error_msg
642
+
643
+ await self._logger.success(f"Retrieved {len(filename_results)} matching files")
644
+
645
+ return filename_results
646
+
647
+ # Try to reuse existing cluster based on semantic similarity
648
+ reused_result = await self._try_reuse_cluster(query, return_cluster=return_cluster)
649
+ if reused_result:
650
+ return reused_result
297
651
 
298
652
  # Build request
299
653
  text_items: List[ContentItem] = [ContentItem(type="text", text=query)]
@@ -411,7 +765,8 @@ class AgenticSearch(BaseSearch):
411
765
  await self._logger.info(f"Found {len(grep_results)} files, top {len(file_list)}:\n{tmp_sep.join(file_list)}")
412
766
 
413
767
  if len(grep_results) == 0:
414
- return f"No relevant information found for the query: {query}"
768
+ error_msg = f"No relevant information found for the query: {query}"
769
+ return None if return_cluster else error_msg
415
770
 
416
771
  # Build knowledge cluster
417
772
  await self._logger.info("Building knowledge cluster...")
@@ -429,7 +784,8 @@ class AgenticSearch(BaseSearch):
429
784
  await self._logger.success(" ✓", flush=True)
430
785
 
431
786
  if cluster is None:
432
- return f"No relevant information found for the query: {query}"
787
+ error_msg = f"No relevant information found for the query: {query}"
788
+ return None if return_cluster else error_msg
433
789
 
434
790
  if self.verbose:
435
791
  await self._logger.info(json.dumps(cluster.to_dict(), ensure_ascii=False, indent=2))
@@ -451,25 +807,27 @@ class AgenticSearch(BaseSearch):
451
807
  messages=[{"role": "user", "content": result_sum_prompt}],
452
808
  stream=True,
453
809
  )
454
- search_result: str = search_result_response.content
810
+ llm_response: str = search_result_response.content
455
811
  self.llm_usages.append(search_result_response.usage)
456
812
  await self._logger.success(" ✓", flush=True)
457
813
  await self._logger.success("Search completed successfully!")
458
814
 
815
+ # Parse LLM response to extract summary and save decision
816
+ search_result, should_save = self._parse_summary_response(llm_response)
817
+
459
818
  # Add search results (file paths) to the cluster
460
819
  if grep_results:
461
820
  cluster.search_results.append(search_result)
821
+
822
+ # Add current query to queries list with FIFO strategy
823
+ self._add_query_to_cluster(cluster, query)
824
+
825
+ # Save cluster based on LLM's quality evaluation
826
+ if should_save:
827
+ await self._save_cluster_with_embedding(cluster)
828
+ else:
829
+ await self._logger.info(
830
+ "Cluster not saved - LLM determined insufficient quality or relevance"
831
+ )
462
832
 
463
- # Save knowledge cluster to persistent storage
464
- try:
465
- await self.knowledge_manager.insert(cluster)
466
- await self._logger.info(f"Saved knowledge cluster {cluster.id} to cache")
467
- except Exception as e:
468
- # If cluster exists, update it instead
469
- try:
470
- await self.knowledge_manager.update(cluster)
471
- await self._logger.info(f"Updated knowledge cluster {cluster.id} in cache")
472
- except Exception as update_error:
473
- await self._logger.warning(f"Failed to save knowledge cluster: {update_error}")
474
-
475
- return search_result
833
+ return cluster if return_cluster else search_result
@@ -1,7 +1,7 @@
1
1
  # Copyright (c) ModelScope Contributors. All rights reserved.
2
2
  """Storage package initialization"""
3
3
 
4
- from .knowledge_manager import KnowledgeManager
4
+ from .knowledge_storage import KnowledgeStorage
5
5
  from .duckdb import DuckDBManager
6
6
 
7
- __all__ = ["KnowledgeManager", "DuckDBManager"]
7
+ __all__ = ["KnowledgeStorage", "DuckDBManager"]