sirchmunk 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. sirchmunk/api/__init__.py +1 -0
  2. sirchmunk/api/chat.py +1123 -0
  3. sirchmunk/api/components/__init__.py +0 -0
  4. sirchmunk/api/components/history_storage.py +402 -0
  5. sirchmunk/api/components/monitor_tracker.py +518 -0
  6. sirchmunk/api/components/settings_storage.py +353 -0
  7. sirchmunk/api/history.py +254 -0
  8. sirchmunk/api/knowledge.py +411 -0
  9. sirchmunk/api/main.py +120 -0
  10. sirchmunk/api/monitor.py +219 -0
  11. sirchmunk/api/run_server.py +54 -0
  12. sirchmunk/api/search.py +230 -0
  13. sirchmunk/api/settings.py +309 -0
  14. sirchmunk/api/tools.py +315 -0
  15. sirchmunk/cli/__init__.py +11 -0
  16. sirchmunk/cli/cli.py +789 -0
  17. sirchmunk/learnings/knowledge_base.py +5 -2
  18. sirchmunk/llm/prompts.py +12 -1
  19. sirchmunk/retrieve/text_retriever.py +186 -2
  20. sirchmunk/scan/file_scanner.py +2 -2
  21. sirchmunk/schema/knowledge.py +119 -35
  22. sirchmunk/search.py +384 -26
  23. sirchmunk/storage/__init__.py +2 -2
  24. sirchmunk/storage/{knowledge_manager.py → knowledge_storage.py} +265 -60
  25. sirchmunk/utils/constants.py +7 -5
  26. sirchmunk/utils/embedding_util.py +217 -0
  27. sirchmunk/utils/tokenizer_util.py +36 -1
  28. sirchmunk/version.py +1 -1
  29. {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/METADATA +196 -14
  30. sirchmunk-0.0.2.dist-info/RECORD +69 -0
  31. {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/WHEEL +1 -1
  32. sirchmunk-0.0.2.dist-info/top_level.txt +2 -0
  33. sirchmunk_mcp/__init__.py +25 -0
  34. sirchmunk_mcp/cli.py +478 -0
  35. sirchmunk_mcp/config.py +276 -0
  36. sirchmunk_mcp/server.py +355 -0
  37. sirchmunk_mcp/service.py +327 -0
  38. sirchmunk_mcp/setup.py +15 -0
  39. sirchmunk_mcp/tools.py +410 -0
  40. sirchmunk-0.0.1.dist-info/RECORD +0 -45
  41. sirchmunk-0.0.1.dist-info/top_level.txt +0 -1
  42. {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/entry_points.txt +0 -0
  43. {sirchmunk-0.0.1.dist-info → sirchmunk-0.0.2.dist-info}/licenses/LICENSE +0 -0
@@ -20,7 +20,7 @@ from sirchmunk.schema.knowledge import (
20
20
  )
21
21
  from sirchmunk.schema.metadata import FileInfo
22
22
  from sirchmunk.schema.request import Request
23
- from sirchmunk.utils.constants import DEFAULT_WORK_PATH
23
+ from sirchmunk.utils.constants import DEFAULT_SIRCHMUNK_WORK_PATH
24
24
  from sirchmunk.utils.file_utils import StorageStructure, fast_extract
25
25
  from sirchmunk.utils import create_logger, LogCallback
26
26
  from sirchmunk.utils.utils import extract_fields
@@ -51,7 +51,9 @@ class KnowledgeBase:
51
51
  self.llm = llm
52
52
  self.metadata_map = metadata_map
53
53
  self.work_path: Path = (
54
- DEFAULT_WORK_PATH if work_path is None else Path(work_path).resolve()
54
+ Path(DEFAULT_SIRCHMUNK_WORK_PATH).expanduser().resolve()
55
+ if work_path is None
56
+ else Path(work_path).expanduser().resolve()
55
57
  )
56
58
  self.metadata_path: Path = (
57
59
  self.work_path / StorageStructure.CACHE_DIR / StorageStructure.METADATA_DIR
@@ -208,6 +210,7 @@ class KnowledgeBase:
208
210
 
209
211
  cluster_id = f"C{hashlib.sha256(cluster_text.encode('utf-8')).hexdigest()[:10]}"
210
212
 
213
+ # TODO: Adapt cluster attributes based on real scenarios
211
214
  cluster = KnowledgeCluster(
212
215
  id=cluster_id,
213
216
  name=cluster_name,
sirchmunk/llm/prompts.py CHANGED
@@ -169,8 +169,19 @@ Analyze the provided {text_content} and generate a concise summary in the form o
169
169
  - **User Input**: {user_input}
170
170
  - **Search Result Text**: {text_content}
171
171
 
172
- ### Output
172
+ ### Quality Evaluation
173
+ After generating the summary, evaluate whether this knowledge cluster is worth saving to the persistent cache based on:
174
+ 1. Does the search result contain substantial, relevant information for the user input?
175
+ 2. Is the content meaningful and not just error messages or "no information found"?
176
+ 3. Are there sufficient evidences and context to answer the user's query?
177
+
178
+ If YES to all above, output "true"; otherwise output "false".
179
+
180
+ ### Output Format
181
+ <SUMMARY>
173
182
  [Generate the Markdown Briefing here]
183
+ </SUMMARY>
184
+ <SHOULD_SAVE>true/false</SHOULD_SAVE>
174
185
  """
175
186
 
176
187
 
@@ -9,7 +9,7 @@ from typing import Any, Dict, List, Literal, Optional, Union
9
9
 
10
10
  from loguru import logger
11
11
 
12
- from ..utils.constants import GREP_CONCURRENT_LIMIT, DEFAULT_WORK_PATH
12
+ from ..utils.constants import GREP_CONCURRENT_LIMIT, DEFAULT_SIRCHMUNK_WORK_PATH
13
13
  from ..utils.file_utils import StorageStructure
14
14
  from .base import BaseRetriever
15
15
 
@@ -29,7 +29,7 @@ class GrepRetriever(BaseRetriever):
29
29
  def __init__(self, work_path: Union[str, Path] = None, **kwargs):
30
30
  super().__init__()
31
31
 
32
- self.work_path: Path = Path(work_path or DEFAULT_WORK_PATH)
32
+ self.work_path: Path = Path(work_path or DEFAULT_SIRCHMUNK_WORK_PATH).expanduser().resolve()
33
33
  self.rga_cache: Path = (
34
34
  self.work_path / StorageStructure.CACHE_DIR / StorageStructure.GREP_DIR
35
35
  )
@@ -688,6 +688,190 @@ class GrepRetriever(BaseRetriever):
688
688
 
689
689
  return result["stdout"].strip().splitlines() if result["stdout"].strip() else []
690
690
 
691
async def retrieve_by_filename(
    self,
    patterns: Union[str, List[str]],
    path: Union[str, Path, List[str], List[Path], None] = None,
    *,
    case_sensitive: bool = False,
    max_depth: Optional[int] = None,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    file_type: Optional[str] = None,
    rank: bool = True,
    timeout: float = 60.0,
) -> List[Dict[str, Any]]:
    """Search for files whose names match one or more regex patterns.

    Candidate files are obtained from ``self.list_files`` for every search
    path; each pattern is then applied with ``re.search`` to the file's
    basename only (``Path(file).name``), never to the directory part.

    Args:
        patterns: Single pattern (str) or list of patterns (List[str]).
            Patterns are treated as regular expressions
            (e.g., "test.*\\.py"). Invalid patterns are logged once and skipped.
        path: Single path (str/Path) or multiple paths (List[str]/List[Path])
            to search in. ``None`` searches ".".
        case_sensitive: If True, match filenames case-sensitively; otherwise
            ``re.IGNORECASE`` is used.
        max_depth: Forwarded to ``self.list_files``.
        include: Glob patterns forwarded to ``self.list_files``
            (e.g., ["*.py", "*.md"]).
        exclude: Glob patterns forwarded to ``self.list_files``
            (e.g., ["*.pyc", "*.log"]).
        file_type: Forwarded to ``self.list_files`` (e.g., 'py', 'md').
        rank: If True, sort results by ``match_score`` in descending order.
        timeout: Accepted for interface compatibility.
            NOTE(review): nothing in this method's body enforces it.

    Returns:
        List of match objects, one per file (first matching pattern wins):
        [
            {
                'path': '/absolute/path/to/file.py',
                'filename': 'file.py',
                'match_score': 1.0,
                'type': 'filename_match',
                'matched_pattern': '<the pattern that matched>'
            },
            ...
        ]
        An empty list is returned when no files are listed or nothing matches.
    """
    # Normalize patterns
    if isinstance(patterns, str):
        patterns = [patterns]

    logger.debug(f"retrieve_by_filename called with patterns: {patterns}, path: {path}, "
                 f"include: {include}, exclude: {exclude}, max_depth: {max_depth}")

    # Normalize paths
    if path is None:
        paths = ["."]
    elif isinstance(path, (str, Path)):
        paths = [str(path)]
    else:
        paths = [str(p) for p in path]

    # Compile every pattern exactly once. Doing this inside the per-file loop
    # repeats the work for each file and logs the same "invalid pattern"
    # warning once per file instead of once per pattern.
    flags = 0 if case_sensitive else re.IGNORECASE
    compiled_patterns = []
    for pattern in patterns:
        try:
            compiled_patterns.append((pattern, re.compile(pattern, flags)))
        except re.error as e:
            logger.warning(f"Invalid regex pattern '{pattern}': {e}")

    if not compiled_patterns:
        # Nothing can match, so skip the file listing altogether.
        return []

    # List all files in the specified paths
    all_files = []
    for search_path in paths:
        try:
            files = await self.list_files(
                path=search_path,
                max_depth=max_depth,
                include=include,
                exclude=exclude,
                file_type=file_type,
            )
            all_files.extend(files)
        except Exception as e:
            # Best effort: a failing search path must not abort the others.
            logger.warning(f"Failed to list files in {search_path}: {e}")
            continue

    if not all_files:
        logger.debug("No files found to search")
        return []

    logger.debug(f"Searching through {len(all_files)} files with patterns: {patterns}")

    # Filter files by patterns
    results = []
    for file_path in all_files:
        file_path_obj = Path(file_path)
        filename = file_path_obj.name

        for pattern, regex in compiled_patterns:
            if not regex.search(filename):
                continue

            logger.debug(f"Pattern '{pattern}' matched file: {filename}")

            match_score = self._calculate_filename_match_score(
                filename=filename,
                pattern=pattern,
                case_sensitive=case_sensitive
            )

            # Prefer the resolved absolute path; fall back to the path as
            # listed when resolution fails.
            try:
                abs_path = str(file_path_obj.resolve())
            except (OSError, RuntimeError):
                abs_path = str(file_path_obj.absolute()) if file_path_obj.is_absolute() else file_path

            results.append({
                'path': abs_path,
                'filename': filename,
                'match_score': match_score,
                'type': 'filename_match',
                'matched_pattern': pattern,
            })
            break  # Only count each file once (first matching pattern)

    logger.debug(f"Found {len(results)} matching files")

    # Rank results by match score if requested (sort is stable, so files
    # with equal scores keep their listing order).
    if rank and results:
        results.sort(key=lambda x: x['match_score'], reverse=True)

    return results
821
+
822
@staticmethod
def _calculate_filename_match_score(
    filename: str,
    pattern: str,
    case_sensitive: bool = False
) -> float:
    """Calculate a relevance score for a filename that matched a pattern.

    The score is a heuristic based on the "literal" part of the pattern,
    i.e. the pattern with regex metacharacters removed.

    Args:
        filename: The filename that matched.
        pattern: The regex pattern that was matched.
        case_sensitive: Whether the match was case-sensitive. When True the
            case-insensitive exact-match tier (0.9) is skipped.

    Returns:
        Score between 0.3 and 1.0, where:
        - 1.0 = exact match (highest priority)
        - 0.9 = exact match with different case
        - 0.8 / 0.75 = filename starts with the literal (same / different case)
        - 0.6 / 0.55 = filename contains the literal (same / different case)
        - 0.3-0.5 = any other regex match, scaled by literal length
    """
    # Remove regex special characters for literal comparison
    pattern_literal = re.sub(r'[.*+?^${}()|[\]\\]', '', pattern)

    # Normalize for comparison
    fn_lower = filename.lower()
    pattern_lower = pattern.lower()
    pattern_literal_lower = pattern_literal.lower()

    # Exact match (case-sensitive)
    if filename == pattern or filename == pattern_literal:
        return 1.0

    # Exact match (case-insensitive)
    if not case_sensitive and (fn_lower == pattern_lower or fn_lower == pattern_literal_lower):
        return 0.9

    # The prefix / substring tiers only make sense for a non-empty literal:
    # "".startswith-style checks are always true, which would otherwise give
    # 0.8 to every file for patterns made solely of metacharacters (".*").
    if pattern_literal:
        # Starts with pattern
        if filename.startswith(pattern_literal):
            return 0.8
        if fn_lower.startswith(pattern_literal_lower):
            return 0.75

        # Contains pattern (full)
        if pattern_literal in filename:
            return 0.6
        if pattern_literal_lower in fn_lower:
            return 0.55

    # Partial match (proportional to literal length). The ratio is clamped
    # so a literal longer than the filename (e.g. from an alternation such as
    # "(foo|bar|baz)") cannot push the score above this tier's 0.5 ceiling.
    match_ratio = min(len(pattern_literal) / max(len(filename), 1), 1.0)
    return 0.3 + (match_ratio * 0.2)  # Score between 0.3 and 0.5
874
+
691
875
  def file_types(self) -> Dict[str, List[str]]:
692
876
  """List supported file types and their associated globs/extensions.
693
877
 
@@ -58,9 +58,9 @@ class FileScanner(BaseScanner):
58
58
  corpus_path = [corpus_path]
59
59
  self.corpus_paths: List[Path] = [Path(p).resolve() for p in corpus_path]
60
60
 
61
- # Set work and metadata paths
61
+ # Set work and metadata paths (expand ~ and resolve to absolute path)
62
62
  self.work_path: Path = (
63
- Path.cwd() if work_path is None else Path(work_path).resolve()
63
+ Path.cwd() if work_path is None else Path(work_path).expanduser().resolve()
64
64
  )
65
65
  self.metadata_path: Path = (
66
66
  self.work_path / StorageStructure.CACHE_DIR / StorageStructure.METADATA_DIR
@@ -230,6 +230,10 @@ class KnowledgeCluster:
230
230
  # Used to track which sources contributed to this knowledge cluster
231
231
  search_results: List[str] = None
232
232
 
233
+ # Historical queries: list of original user input queries that led to this cluster
234
+ # Used for semantic similarity matching and cluster reuse
235
+ queries: List[str] = None
236
+
233
237
  def __post_init__(self):
234
238
  if self.related_clusters is None:
235
239
  self.related_clusters = []
@@ -237,6 +241,9 @@ class KnowledgeCluster:
237
241
  if self.search_results is None:
238
242
  self.search_results = []
239
243
 
244
+ if self.queries is None:
245
+ self.queries = []
246
+
240
247
  if self.create_time is None:
241
248
  self.create_time = datetime.now(timezone.utc)
242
249
 
@@ -246,6 +253,117 @@ class KnowledgeCluster:
246
253
  if self.version is None:
247
254
  self.version = 0
248
255
 
256
def __repr__(self) -> str:
    """
    Build a compact, single-line summary of the cluster for debugging.

    Only sizes are reported for the bulky fields (evidences, queries,
    content, search results) rather than their full values.
    """
    # Total character count: a plain string counts as-is, a list counts the
    # sum of its items' lengths, anything else counts as zero.
    body = self.content
    if isinstance(body, str):
        size = len(body)
    elif isinstance(body, list):
        size = sum(map(len, body))
    else:
        size = 0

    parts = (
        f"id={self.id!r}",
        f"name={self.name!r}",
        f"version={self.version}",
        f"lifecycle={self.lifecycle.value}",
        f"evidences={len(self.evidences)}",
        f"queries={len(self.queries)}",
        f"content_len={size}",
        f"search_results={len(self.search_results)}",
    )
    return "KnowledgeCluster(" + ", ".join(parts) + ")"
273
+
274
def __str__(self) -> str:
    """
    Return a human-readable, multi-line description of the cluster.

    Sections are emitted in this order, each preceded by a separator line
    and omitted when empty: header (name, id, description, lifecycle,
    version, confidence), content preview, up to 5 evidences, optional
    fields (hotness, abstraction level, up to 3 queries), up to 5 search
    results, and the last-modified timestamp.
    """
    separator = "─" * 70  # Horizontal separator line

    # Description: a string is used verbatim, a list becomes numbered lines.
    desc_text = ""
    if isinstance(self.description, str):
        desc_text = self.description
    elif isinstance(self.description, list):
        desc_text = "\n".join(
            f" [{i}] {item}" for i, item in enumerate(self.description, 1)
        )

    # Content: a string is used verbatim, a list is previewed by its first item.
    content_text = ""
    if isinstance(self.content, str):
        content_text = self.content
    elif isinstance(self.content, list):
        content_text = self.content[0] if self.content else ""

    # Build basic info
    lines = [
        f"━━━ KnowledgeCluster: {self.name} ━━━",
        f"ID: {self.id}",
        f"Description:\n{desc_text}" if desc_text else "Description: N/A",
        f"Lifecycle: {self.lifecycle.value} | Version: {self.version}",
        # Compare against None explicitly: a confidence of 0.0 is a real
        # value and must not be displayed as "N/A".
        f"Confidence: {self.confidence:.3f}" if self.confidence is not None else "Confidence: N/A",
    ]

    # Add content preview
    if content_text:
        lines.append(separator)
        lines.append(f"Content Preview:\n{content_text}")

    # Add evidences with preview (max 5)
    if self.evidences:
        lines.append(separator)
        lines.append(f"Evidences ({len(self.evidences)} total):")
        for i, evidence in enumerate(self.evidences[:5], 1):
            file_path = str(evidence.file_or_url)
            # Shorten path if too long, keeping the tail (the filename end)
            if len(file_path) > 60:
                file_path = "..." + file_path[-57:]
            # Tolerate a missing summary so printing a cluster never raises.
            summary = evidence.summary or ""
            summary_preview = summary[:80] + "..." if len(summary) > 80 else summary
            lines.append(f" [{i}] {file_path}")
            lines.append(f" {summary_preview}")
            lines.append(f" Snippets: {len(evidence.snippets)}, Found: {evidence.is_found}")
        if len(self.evidences) > 5:
            lines.append(f" ... (+{len(self.evidences) - 5} more evidences)")

    # Add optional fields
    optional_lines = []

    if self.hotness is not None:
        optional_lines.append(f"Hotness: {self.hotness:.3f}")

    if self.abstraction_level:
        optional_lines.append(f"Abstraction: {self.abstraction_level.name}")

    if self.queries:
        queries_preview = ", ".join(f'"{q}"' for q in self.queries[:3])
        if len(self.queries) > 3:
            queries_preview += f" (+{len(self.queries) - 3} more)"
        optional_lines.append(f"Related Queries: {queries_preview}")

    if optional_lines:
        lines.append(separator)
        lines.extend(optional_lines)

    # Add search results
    if self.search_results:
        lines.append(separator)
        lines.append(f"Search Results ({len(self.search_results)} files):")
        for i, result in enumerate(self.search_results[:5], 1):
            result_preview = result[:80] + "..." if len(result) > 80 else result
            lines.append(f" [{i}] {result_preview}")
        if len(self.search_results) > 5:
            lines.append(f" ... (+{len(self.search_results) - 5} more)")

    # Add timestamp
    if self.last_modified:
        lines.append(separator)
        lines.append(f"Last Modified: {self.last_modified.strftime('%Y-%m-%d %H:%M:%S')}")

    return "\n".join(lines)
366
+
249
367
  @property
250
368
  def primary_evidence_files(self) -> Set[str]:
251
369
  """Return set of unique file IDs backing this cluster — useful for evidence-layer prefetch."""
@@ -279,40 +397,6 @@ class KnowledgeCluster:
279
397
  "version": self.version,
280
398
  "related_clusters": [rc.to_dict() for rc in self.related_clusters],
281
399
  "search_results": self.search_results,
400
+ "queries": self.queries,
282
401
  }
283
402
 
284
-
285
- if __name__ == "__main__":
286
-
287
- # Create instance
288
- cluster = KnowledgeCluster(
289
- id="C1001",
290
- name="Test Cluster",
291
- description=["A desc from perspective A.", "A desc from perspective B."],
292
- content="Detailed content of the knowledge cluster.",
293
- scripts=["print('Hello World')"],
294
- resources=[
295
- {"type": "url", "value": "https://example.com"},
296
- {"type": "file", "value": "/data/image1.png"},
297
- ],
298
- patterns=["pattern A", "pattern B"],
299
- constraints=[Constraint("x > 0", "low", "x must be positive")],
300
- evidences=[
301
- EvidenceUnit(
302
- doc_id="doc1",
303
- file_or_url=Path("/data/file.txt"),
304
- segment={"text": "supporting text", "type": "match", "line_number": 10},
305
- score=0.9,
306
- extracted_at=datetime(2025, 1, 1),
307
- )
308
- ],
309
- confidence=0.85,
310
- abstraction_level=AbstractionLevel.PRINCIPLE,
311
- landmark_potential=0.6,
312
- hotness=0.4,
313
- lifecycle=Lifecycle.STABLE,
314
- create_time=datetime(2025, 1, 1),
315
- last_modified=datetime(2025, 1, 2),
316
- )
317
-
318
- print(cluster.to_dict())