PyPI - sirchmunk - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl - Mend

sirchmunk 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

sirchmunk/api/__init__.py +1 -0
sirchmunk/api/chat.py +1123 -0
sirchmunk/api/components/__init__.py +0 -0
sirchmunk/api/components/history_storage.py +402 -0
sirchmunk/api/components/monitor_tracker.py +518 -0
sirchmunk/api/components/settings_storage.py +353 -0
sirchmunk/api/history.py +254 -0
sirchmunk/api/knowledge.py +411 -0
sirchmunk/api/main.py +120 -0
sirchmunk/api/monitor.py +219 -0
sirchmunk/api/run_server.py +54 -0
sirchmunk/api/search.py +230 -0
sirchmunk/api/settings.py +309 -0
sirchmunk/api/tools.py +315 -0
sirchmunk/cli/__init__.py +11 -0
sirchmunk/cli/cli.py +789 -0
sirchmunk/learnings/knowledge_base.py +5 -2
sirchmunk/llm/prompts.py +12 -1
sirchmunk/retrieve/text_retriever.py +186 -2
sirchmunk/scan/file_scanner.py +2 -2
sirchmunk/schema/knowledge.py +119 -35
sirchmunk/search.py +384 -26
sirchmunk/storage/__init__.py +2 -2
sirchmunk/storage/{knowledge_manager.py → knowledge_storage.py} +265 -60
sirchmunk/utils/constants.py +7 -5
sirchmunk/utils/embedding_util.py +217 -0
sirchmunk/utils/tokenizer_util.py +36 -1
sirchmunk/version.py +1 -1
{sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/METADATA +196 -14
sirchmunk-0.0.2.dist-info/RECORD +69 -0
{sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/WHEEL +1 -1
sirchmunk-0.0.2.dist-info/top_level.txt +2 -0
sirchmunk_mcp/__init__.py +25 -0
sirchmunk_mcp/cli.py +478 -0
sirchmunk_mcp/config.py +276 -0
sirchmunk_mcp/server.py +355 -0
sirchmunk_mcp/service.py +327 -0
sirchmunk_mcp/setup.py +15 -0
sirchmunk_mcp/tools.py +410 -0
sirchmunk-0.0.1.dist-info/RECORD +0 -45
sirchmunk-0.0.1.dist-info/top_level.txt +0 -1
{sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/entry_points.txt +0 -0
{sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/licenses/LICENSE +0 -0

sirchmunk/learnings/knowledge_base.py CHANGED Viewed

@@ -20,7 +20,7 @@ from sirchmunk.schema.knowledge import (
 )
 from sirchmunk.schema.metadata import FileInfo
 from sirchmunk.schema.request import Request
-from sirchmunk.utils.constants import DEFAULT_WORK_PATH
+from sirchmunk.utils.constants import DEFAULT_SIRCHMUNK_WORK_PATH
 from sirchmunk.utils.file_utils import StorageStructure, fast_extract
 from sirchmunk.utils import create_logger, LogCallback
 from sirchmunk.utils.utils import extract_fields
@@ -51,7 +51,9 @@ class KnowledgeBase:
         self.llm = llm
         self.metadata_map = metadata_map
         self.work_path: Path = (
-            DEFAULT_WORK_PATH if work_path is None else Path(work_path).resolve()
+            Path(DEFAULT_SIRCHMUNK_WORK_PATH).expanduser().resolve()
+            if work_path is None
+            else Path(work_path).expanduser().resolve()
         )
         self.metadata_path: Path = (
             self.work_path / StorageStructure.CACHE_DIR / StorageStructure.METADATA_DIR
@@ -208,6 +210,7 @@ class KnowledgeBase:
         cluster_id = f"C{hashlib.sha256(cluster_text.encode('utf-8')).hexdigest()[:10]}"
+        # TODO: Adapt cluster attributes based on real scenarios
         cluster = KnowledgeCluster(
             id=cluster_id,
             name=cluster_name,

sirchmunk/llm/prompts.py CHANGED Viewed

@@ -169,8 +169,19 @@ Analyze the provided {text_content} and generate a concise summary in the form o
 - **User Input**: {user_input}
 - **Search Result Text**: {text_content}
-### Output
+### Quality Evaluation
+After generating the summary, evaluate whether this knowledge cluster is worth saving to the persistent cache based on:
+1. Does the search result contain substantial, relevant information for the user input?
+2. Is the content meaningful and not just error messages or "no information found"?
+3. Are there sufficient evidences and context to answer the user's query?
+If YES to all above, output "true"; otherwise output "false".
+### Output Format
+<SUMMARY>
 [Generate the Markdown Briefing here]
+</SUMMARY>
+<SHOULD_SAVE>true/false</SHOULD_SAVE>
 """

sirchmunk/retrieve/text_retriever.py CHANGED Viewed

@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Literal, Optional, Union
 from loguru import logger
-from ..utils.constants import GREP_CONCURRENT_LIMIT, DEFAULT_WORK_PATH
+from ..utils.constants import GREP_CONCURRENT_LIMIT, DEFAULT_SIRCHMUNK_WORK_PATH
 from ..utils.file_utils import StorageStructure
 from .base import BaseRetriever
@@ -29,7 +29,7 @@ class GrepRetriever(BaseRetriever):
     def __init__(self, work_path: Union[str, Path] = None, **kwargs):
         super().__init__()
-        self.work_path: Path = Path(work_path or DEFAULT_WORK_PATH)
+        self.work_path: Path = Path(work_path or DEFAULT_SIRCHMUNK_WORK_PATH).expanduser().resolve()
         self.rga_cache: Path = (
             self.work_path / StorageStructure.CACHE_DIR / StorageStructure.GREP_DIR
         )
@@ -688,6 +688,190 @@ class GrepRetriever(BaseRetriever):
         return result["stdout"].strip().splitlines() if result["stdout"].strip() else []
+    async def retrieve_by_filename(
+        self,
+        patterns: Union[str, List[str]],
+        path: Union[str, Path, List[str], List[Path], None] = None,
+        *,
+        case_sensitive: bool = False,
+        max_depth: Optional[int] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+        file_type: Optional[str] = None,
+        rank: bool = True,
+        timeout: float = 60.0,
+    ) -> List[Dict[str, Any]]:
+        """Search for files by filename patterns (fast file name matching).
+        This method performs filename-only search without reading file contents,
+        making it significantly faster than content-based search.
+        Args:
+            patterns: Single pattern (str) or list of patterns (List[str]) to match filenames.
+                     Patterns are treated as regex by default (e.g., "test.*\\.py").
+            path: Single path (str/Path) or multiple paths (List[str]/List[Path]) to search in.
+            case_sensitive: If True, enable case-sensitive filename matching.
+            max_depth: Maximum directory depth to search.
+            include: List of glob patterns to include (e.g., ["*.py", "*.md"]).
+            exclude: List of glob patterns to exclude (e.g., ["*.pyc", "*.log"]).
+            file_type: Search only files of given type (e.g., 'py', 'md').
+            rank: If True, rank results by pattern match quality (e.g., exact match > partial match).
+            timeout: Maximum time in seconds to wait for the search to complete.
+        Returns:
+            List of match objects with structure:
+            [
+                {
+                    'path': '/absolute/path/to/file.py',
+                    'filename': 'file.py',
+                    'match_score': 1.0,  # relevance score (0.0-1.0)
+                    'type': 'filename_match'
+                },
+                ...
+            ]
+        """
+        # Normalize patterns
+        if isinstance(patterns, str):
+            patterns = [patterns]
+        logger.debug(f"retrieve_by_filename called with patterns: {patterns}, path: {path}, "
+                    f"include: {include}, exclude: {exclude}, max_depth: {max_depth}")
+        # Normalize paths
+        if path is None:
+            paths = ["."]
+        elif isinstance(path, (str, Path)):
+            paths = [str(path)]
+        else:
+            paths = [str(p) for p in path]
+        # List all files in the specified paths
+        all_files = []
+        for search_path in paths:
+            try:
+                files = await self.list_files(
+                    path=search_path,
+                    max_depth=max_depth,
+                    include=include,
+                    exclude=exclude,
+                    file_type=file_type,
+                )
+                all_files.extend(files)
+            except Exception as e:
+                logger.warning(f"Failed to list files in {search_path}: {e}")
+                continue
+        if not all_files:
+            logger.debug("No files found to search")
+            return []
+        logger.debug(f"Searching through {len(all_files)} files with patterns: {patterns}")
+        # Filter files by patterns
+        results = []
+        for file_path in all_files:
+            # Get both absolute and relative paths for proper handling
+            file_path_obj = Path(file_path)
+            filename = file_path_obj.name
+            # Check if filename matches any pattern
+            for pattern in patterns:
+                try:
+                    # Compile regex pattern
+                    flags = 0 if case_sensitive else re.IGNORECASE
+                    regex = re.compile(pattern, flags)
+                    match = regex.search(filename)
+                    if match:
+                        logger.debug(f"Pattern '{pattern}' matched file: {filename}")
+                        # Calculate match score
+                        match_score = self._calculate_filename_match_score(
+                            filename=filename,
+                            pattern=pattern,
+                            case_sensitive=case_sensitive
+                        )
+                        # Use absolute path if file exists, otherwise keep original path
+                        try:
+                            abs_path = str(file_path_obj.resolve())
+                        except (OSError, RuntimeError):
+                            abs_path = str(file_path_obj.absolute()) if file_path_obj.is_absolute() else file_path
+                        results.append({
+                            'path': abs_path,
+                            'filename': filename,
+                            'match_score': match_score,
+                            'type': 'filename_match',
+                            'matched_pattern': pattern,
+                        })
+                        break  # Only count each file once (first matching pattern)
+                except re.error as e:
+                    logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+                    continue
+        logger.debug(f"Found {len(results)} matching files")
+        # Rank results by match score if requested
+        if rank and results:
+            results.sort(key=lambda x: x['match_score'], reverse=True)
+        return results
+    @staticmethod
+    def _calculate_filename_match_score(
+        filename: str,
+        pattern: str,
+        case_sensitive: bool = False
+    ) -> float:
+        """Calculate relevance score for filename pattern match.
+        Args:
+            filename: The filename that matched
+            pattern: The regex pattern that was matched
+            case_sensitive: Whether the match was case-sensitive
+        Returns:
+            Score between 0.0 and 1.0, where:
+            - 1.0 = exact match (highest priority)
+            - 0.9 = exact match with different case
+            - 0.7-0.8 = starts with pattern
+            - 0.5-0.6 = contains pattern
+            - 0.3-0.4 = partial regex match
+        """
+        # Normalize for comparison
+        fn_lower = filename.lower()
+        pattern_lower = pattern.lower()
+        # Remove regex special characters for literal comparison
+        pattern_literal = re.sub(r'[.*+?^${}()|[\]\\]', '', pattern)
+        pattern_literal_lower = pattern_literal.lower()
+        # Exact match (case-sensitive)
+        if filename == pattern or filename == pattern_literal:
+            return 1.0
+        # Exact match (case-insensitive)
+        if not case_sensitive and (fn_lower == pattern_lower or fn_lower == pattern_literal_lower):
+            return 0.9
+        # Starts with pattern
+        if filename.startswith(pattern_literal):
+            return 0.8
+        if fn_lower.startswith(pattern_literal_lower):
+            return 0.75
+        # Contains pattern (full)
+        if pattern_literal in filename:
+            return 0.6
+        if pattern_literal_lower in fn_lower:
+            return 0.55
+        # Partial match (proportional to match length)
+        match_ratio = len(pattern_literal) / max(len(filename), 1)
+        return 0.3 + (match_ratio * 0.2)  # Score between 0.3 and 0.5
     def file_types(self) -> Dict[str, List[str]]:
         """List supported file types and their associated globs/extensions.

sirchmunk/scan/file_scanner.py CHANGED Viewed

@@ -58,9 +58,9 @@ class FileScanner(BaseScanner):
             corpus_path = [corpus_path]
         self.corpus_paths: List[Path] = [Path(p).resolve() for p in corpus_path]
-        # Set work and metadata paths
+        # Set work and metadata paths (expand ~ and resolve to absolute path)
         self.work_path: Path = (
-            Path.cwd() if work_path is None else Path(work_path).resolve()
+            Path.cwd() if work_path is None else Path(work_path).expanduser().resolve()
         )
         self.metadata_path: Path = (
             self.work_path / StorageStructure.CACHE_DIR / StorageStructure.METADATA_DIR

sirchmunk/schema/knowledge.py CHANGED Viewed

@@ -230,6 +230,10 @@ class KnowledgeCluster:
     # Used to track which sources contributed to this knowledge cluster
     search_results: List[str] = None
+    # Historical queries: list of original user input queries that led to this cluster
+    # Used for semantic similarity matching and cluster reuse
+    queries: List[str] = None
     def __post_init__(self):
         if self.related_clusters is None:
             self.related_clusters = []
@@ -237,6 +241,9 @@ class KnowledgeCluster:
         if self.search_results is None:
             self.search_results = []
+        if self.queries is None:
+            self.queries = []
         if self.create_time is None:
             self.create_time = datetime.now(timezone.utc)
@@ -246,6 +253,117 @@ class KnowledgeCluster:
         if self.version is None:
             self.version = 0
+    def __repr__(self) -> str:
+        """
+        Return a concise representation for debugging.
+        """
+        # Get content length
+        content_len = 0
+        if isinstance(self.content, str):
+            content_len = len(self.content)
+        elif isinstance(self.content, list):
+            content_len = sum(len(c) for c in self.content)
+        return (
+            f"KnowledgeCluster(id={self.id!r}, name={self.name!r}, "
+            f"version={self.version}, lifecycle={self.lifecycle.value}, "
+            f"evidences={len(self.evidences)}, queries={len(self.queries)}, "
+            f"content_len={content_len}, search_results={len(self.search_results)})"
+        )
+    def __str__(self) -> str:
+        """
+        Return a human-readable string representation.
+        """
+        separator = "─" * 70  # Horizontal separator line
+        # Extract description text
+        desc_text = ""
+        if isinstance(self.description, str):
+            desc_text = self.description
+        elif isinstance(self.description, list):
+            desc_preview = []
+            for i, item in enumerate(self.description, 1):
+                desc_preview.append(f"  [{i}] {item}")
+            desc_text = "\n".join(desc_preview)
+        # Extract content text
+        content_text = ""
+        if isinstance(self.content, str):
+            content_text = self.content
+        elif isinstance(self.content, list):
+            content_text = self.content[0] if self.content else ""  # Preview first item
+        # Build basic info
+        lines = [
+            f"━━━ KnowledgeCluster: {self.name} ━━━",
+            f"ID: {self.id}",
+            f"Description:\n{desc_text}" if desc_text else "Description: N/A",
+            f"Lifecycle: {self.lifecycle.value} | Version: {self.version}",
+            f"Confidence: {self.confidence:.3f}" if self.confidence else "Confidence: N/A",
+        ]
+        # Add content preview
+        if content_text:
+            lines.append(separator)
+            lines.append(f"Content Preview:\n{content_text}")
+        # Add evidences with preview (max 5)
+        if self.evidences:
+            lines.append(separator)
+            lines.append(f"Evidences ({len(self.evidences)} total):")
+            for i, evidence in enumerate(self.evidences[:5], 1):
+                file_path = str(evidence.file_or_url)
+                # Shorten path if too long
+                if len(file_path) > 60:
+                    file_path = "..." + file_path[-57:]
+                summary_preview = evidence.summary[:80] + "..." if len(evidence.summary) > 80 else evidence.summary
+                lines.append(f"  [{i}] {file_path}")
+                lines.append(f"      {summary_preview}")
+                lines.append(f"      Snippets: {len(evidence.snippets)}, Found: {evidence.is_found}")
+            if len(self.evidences) > 5:
+                lines.append(f"  ... (+{len(self.evidences) - 5} more evidences)")
+        # Add optional fields
+        has_optional_fields = False
+        optional_lines = []
+        if self.hotness is not None:
+            optional_lines.append(f"Hotness: {self.hotness:.3f}")
+            has_optional_fields = True
+        if self.abstraction_level:
+            optional_lines.append(f"Abstraction: {self.abstraction_level.name}")
+            has_optional_fields = True
+        if self.queries:
+            queries_preview = ", ".join(f'"{q}"' for q in self.queries[:3])
+            if len(self.queries) > 3:
+                queries_preview += f" (+{len(self.queries) - 3} more)"
+            optional_lines.append(f"Related Queries: {queries_preview}")
+            has_optional_fields = True
+        if has_optional_fields:
+            lines.append(separator)
+            lines.extend(optional_lines)
+        # Add search results
+        if self.search_results:
+            lines.append(separator)
+            lines.append(f"Search Results ({len(self.search_results)} files):")
+            for i, result in enumerate(self.search_results[:5], 1):
+                result_preview = result[:80] + "..." if len(result) > 80 else result
+                lines.append(f"  [{i}] {result_preview}")
+            if len(self.search_results) > 5:
+                lines.append(f"  ... (+{len(self.search_results) - 5} more)")
+        # Add timestamp
+        if self.last_modified:
+            lines.append(separator)
+            lines.append(f"Last Modified: {self.last_modified.strftime('%Y-%m-%d %H:%M:%S')}")
+        return "\n".join(lines)
     @property
     def primary_evidence_files(self) -> Set[str]:
         """Return set of unique file IDs backing this cluster — useful for evidence-layer prefetch."""
@@ -279,40 +397,6 @@ class KnowledgeCluster:
             "version": self.version,
             "related_clusters": [rc.to_dict() for rc in self.related_clusters],
             "search_results": self.search_results,
+            "queries": self.queries,
         }
-if __name__ == "__main__":
-    # Create instance
-    cluster = KnowledgeCluster(
-        id="C1001",
-        name="Test Cluster",
-        description=["A desc from perspective A.", "A desc from perspective B."],
-        content="Detailed content of the knowledge cluster.",
-        scripts=["print('Hello World')"],
-        resources=[
-            {"type": "url", "value": "https://example.com"},
-            {"type": "file", "value": "/data/image1.png"},
-        ],
-        patterns=["pattern A", "pattern B"],
-        constraints=[Constraint("x > 0", "low", "x must be positive")],
-        evidences=[
-            EvidenceUnit(
-                doc_id="doc1",
-                file_or_url=Path("/data/file.txt"),
-                segment={"text": "supporting text", "type": "match", "line_number": 10},
-                score=0.9,
-                extracted_at=datetime(2025, 1, 1),
-            )
-        ],
-        confidence=0.85,
-        abstraction_level=AbstractionLevel.PRINCIPLE,
-        landmark_potential=0.6,
-        hotness=0.4,
-        lifecycle=Lifecycle.STABLE,
-        create_time=datetime(2025, 1, 1),
-        last_modified=datetime(2025, 1, 2),
-    )
-    print(cluster.to_dict())

sirchmunk 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

sirchmunk 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl