sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/search.py ADDED
@@ -0,0 +1,475 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ import ast
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from sirchmunk.base import BaseSearch
+ from sirchmunk.learnings.knowledge_base import KnowledgeBase
+ from sirchmunk.llm.openai_chat import OpenAIChat
+ from sirchmunk.llm.prompts import (
+     generate_keyword_extraction_prompt,
+     SEARCH_RESULT_SUMMARY,
+ )
+ from sirchmunk.retrieve.text_retriever import GrepRetriever
+ from sirchmunk.schema.knowledge import KnowledgeCluster
+ from sirchmunk.schema.request import ContentItem, ImageURL, Message, Request
+ from sirchmunk.storage.knowledge_manager import KnowledgeManager
+ from sirchmunk.utils.constants import LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME, WORK_PATH
+ from sirchmunk.utils.deps import check_dependencies
+ from sirchmunk.utils.file_utils import get_fast_hash
+ from sirchmunk.utils import create_logger, LogCallback
+ from sirchmunk.utils.install_rga import install_rga
+ from sirchmunk.utils.utils import (
+     KeywordValidation,
+     extract_fields,
+     log_tf_norm_penalty,
+ )
+
+
+ class AgenticSearch(BaseSearch):
+
+     def __init__(
+         self,
+         llm: Optional[OpenAIChat] = None,
+         work_path: Optional[Union[str, Path]] = None,
+         verbose: bool = False,
+         log_callback: LogCallback = None,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         work_path = work_path or WORK_PATH
+         self.work_path: Path = Path(work_path)
+
+         self.llm: OpenAIChat = llm or OpenAIChat(
+             base_url=LLM_BASE_URL,
+             api_key=LLM_API_KEY,
+             model=LLM_MODEL_NAME,
+             log_callback=log_callback,
+         )
+
+         self.grep_retriever: GrepRetriever = GrepRetriever(work_path=self.work_path)
+
+         # Create bound logger with callback - returns AsyncLogger instance
+         self._logger = create_logger(log_callback=log_callback, enable_async=True)
+
+         # Pass log_callback to KnowledgeBase so it can also log through the same callback
+         self.knowledge_base = KnowledgeBase(
+             llm=self.llm,
+             work_path=self.work_path,
+             log_callback=log_callback
+         )
+
+         # Initialize KnowledgeManager for persistent storage
+         self.knowledge_manager = KnowledgeManager(work_path=str(self.work_path))
+
+         # Load historical knowledge clusters from cache
+         self._load_historical_knowledge()
+
+         self.verbose: bool = verbose
+
+         self.llm_usages: List[Dict[str, Any]] = []
+
+         if not check_dependencies():
+             print("Installing rga (ripgrep-all) and rg (ripgrep)...", flush=True)
+             install_rga()
+
+     def _load_historical_knowledge(self):
+         """Load historical knowledge clusters from local cache"""
+         try:
+             stats = self.knowledge_manager.get_stats()
+             cluster_count = stats.get('custom_stats', {}).get('total_clusters', 0)
+             # Use synchronous print during initialization
+             print(f"Loaded {cluster_count} historical knowledge clusters from cache")
+         except Exception as e:
+             print(f"[WARNING] Failed to load historical knowledge: {e}")
+
+     @staticmethod
+     def _extract_and_validate_keywords(llm_resp: str) -> dict:
+         """
+         Extract and validate keywords with IDF scores from LLM response.
+         """
+         res: Dict[str, float] = {}
+
+         # Extract JSON-like content within <KEYWORDS></KEYWORDS> tags
+         tag: str = "KEYWORDS"
+         keywords_json: Optional[str] = extract_fields(
+             content=llm_resp,
+             tags=[tag],
+         ).get(tag.lower(), None)
+
+         if not keywords_json:
+             return res
+
+         # Try to parse as dict format
+         try:
+             res = json.loads(keywords_json)
+         except json.JSONDecodeError:
+             try:
+                 res = ast.literal_eval(keywords_json)
+             except Exception as e:
+                 return {}
+
+         # Validate using Pydantic model
+         try:
+             return KeywordValidation(root=res).model_dump()
+         except Exception as e:
+             return {}
+
+     @staticmethod
+     def _extract_and_validate_multi_level_keywords(
+         llm_resp: str,
+         num_levels: int = 3
+     ) -> List[Dict[str, float]]:
+         """
+         Extract and validate multiple sets of keywords from LLM response.
+
+         Args:
+             llm_resp: LLM response containing keyword sets
+             num_levels: Number of keyword granularity levels to extract
+
+         Returns:
+             List of keyword dicts, one for each level: [level1_keywords, level2_keywords, ...]
+         """
+         keyword_sets: List[Dict[str, float]] = []
+
+         # Generate tags dynamically based on num_levels
+         tags = [f"KEYWORDS_LEVEL_{i+1}" for i in range(num_levels)]
+
+         # Extract all fields at once
+         extracted_fields = extract_fields(content=llm_resp, tags=tags)
+
+         for level_idx, tag in enumerate(tags, start=1):
+             keywords_dict: Dict[str, float] = {}
+             keywords_json: Optional[str] = extracted_fields.get(tag.lower(), None)
+
+             if not keywords_json:
+                 keyword_sets.append({})
+                 continue
+
+             # Try to parse as dict format
+             try:
+                 keywords_dict = json.loads(keywords_json)
+             except json.JSONDecodeError:
+                 try:
+                     keywords_dict = ast.literal_eval(keywords_json)
+                 except Exception as e:
+                     keyword_sets.append({})
+                     continue
+
+             # Validate using Pydantic model
+             try:
+                 validated = KeywordValidation(root=keywords_dict).model_dump()
+                 keyword_sets.append(validated)
+             except Exception as e:
+                 keyword_sets.append({})
+
+         return keyword_sets
+
+     @staticmethod
+     def fast_deduplicate_by_content(data: List[dict]):
+         """
+         Deduplicates results based on content fingerprints.
+         Keeps the document with the highest total_score for each unique content.
+
+         Args:
+             data: grep results sorted by the 'total_score' field.
+
+         Returns:
+             deduplicated grep results.
+         """
+         unique_fingerprints = set()
+         deduplicated_results = []
+
+         for item in data:
+             path = item["path"]
+
+             # Generate a fast fingerprint instead of a full MD5
+             fingerprint = get_fast_hash(path)
+
+             # Add to results only if this content hasn't been seen yet
+             if fingerprint and fingerprint not in unique_fingerprints:
+                 unique_fingerprints.add(fingerprint)
+                 deduplicated_results.append(item)
+
+         return deduplicated_results
+
+     def process_grep_results(
+         self, results: List[Dict[str, Any]], keywords_with_idf: Dict[str, float]
+     ) -> List[Dict[str, Any]]:
+         """
+         Process grep results, scoring each document and each matched line using the keywords' IDF weights.
+
+         Args:
+             results: List of grep result dictionaries.
+             keywords_with_idf: Dictionary of keywords with their corresponding IDF scores.
+
+         Returns:
+             Processed and sorted list of grep result dictionaries.
+         """
+         results = [
+             res
+             for res in results
+             if res.get("total_matches", 0) >= len(keywords_with_idf)
+         ]
+
+         for grep_res in results:
+             keywords_tf_in_doc: Dict[str, int] = {
+                 k.lower(): 0 for k, v in keywords_with_idf.items()
+             }
+             matches = grep_res.get("matches", [])
+             for match_item in matches:
+                 keywords_tf_in_line: Dict[str, int] = {
+                     k.lower(): 0 for k, v in keywords_with_idf.items()
+                 }
+                 submatches = match_item.get("data", {}).get("submatches", [])
+                 for submatch_item in submatches:
+                     hit_word: str = submatch_item["match"]["text"].lower()
+                     if hit_word in keywords_tf_in_doc:
+                         keywords_tf_in_doc[hit_word] += 1
+                     if hit_word in keywords_tf_in_line:
+                         keywords_tf_in_line[hit_word] += 1
+                 match_item_score: float = 0.0
+                 for w, idf in keywords_with_idf.items():
+                     match_item_score += idf * log_tf_norm_penalty(
+                         keywords_tf_in_line.get(w.lower(), 0)
+                     )
+                 match_item["score"] = (
+                     match_item["score"]
+                     * match_item_score
+                     * log_tf_norm_penalty(
+                         count=len(match_item["data"]["lines"]["text"]),
+                         ideal_range=(50, 200),
+                     )
+                 )
+             # Calculate total score for current document
+             total_score: float = 0.0
+             for w, idf in keywords_with_idf.items():
+                 total_score += idf * log_tf_norm_penalty(
+                     keywords_tf_in_doc.get(w.lower(), 0)
+                 )
+
+             grep_res["total_score"] = total_score
+             matches.sort(key=lambda x: x["score"], reverse=True)
+
+         results.sort(key=lambda x: x["total_score"], reverse=True)
+         results = self.fast_deduplicate_by_content(results)
+
+         return results
+
+     async def search(
+         self,
+         query: str,
+         search_paths: Union[str, Path, List[str], List[Path]],
+         mode: Literal["FAST", "DEEP", "FILENAME_ONLY"] = "DEEP",  # TODO
+         *,
+         images: Optional[list] = None,
+         max_depth: Optional[int] = 5,
+         top_k_files: Optional[int] = 3,
+         keyword_levels: Optional[int] = 3,
+         include: Optional[List[str]] = None,
+         exclude: Optional[List[str]] = None,
+         verbose: Optional[bool] = True,
+         grep_timeout: Optional[float] = 60.0,
+     ) -> str:
+         """
+         Perform intelligent search with multi-level keyword extraction.
+
+         Args:
+             query: Search query string
+             search_paths: Paths to search in
+             mode: Search mode (FAST/DEEP/FILENAME_ONLY)
+             images: Optional image inputs
+             max_depth: Maximum directory depth to search
+             top_k_files: Number of top files to return
+             keyword_levels: Number of keyword granularity levels (default: 3)
+                 - Higher values provide more fallback options
+                 - Recommended: 3-5 levels
+             include: File patterns to include
+             exclude: File patterns to exclude
+             verbose: Enable verbose logging
+             grep_timeout: Timeout for grep operations
+
+         Returns:
+             Search result summary string
+         """
+
+         # Build request
+         text_items: List[ContentItem] = [ContentItem(type="text", text=query)]
+         image_items: List[ContentItem] = []
+         if images is not None and len(images) > 0:
+             # TODO: to be implemented
+             await self._logger.warning("Image search is not yet implemented.")
+             image_items = [
+                 ContentItem(
+                     type="image_url",
+                     image_url=ImageURL(url=image_url),
+                 )
+                 for image_url in images
+             ]
+
+         request: Request = Request(
+             messages=[
+                 Message(
+                     role="user",
+                     content=text_items + image_items,
+                 ),
+             ],
+         )
+
+         # Extract multi-level keywords in one LLM call
+         await self._logger.info(f"Extracting {keyword_levels}-level query keywords.")
+
+         # Generate dynamic prompt based on keyword_levels
+         dynamic_prompt = generate_keyword_extraction_prompt(num_levels=keyword_levels)
+         keyword_extraction_prompt = dynamic_prompt.format(user_input=request.get_user_input())
+
+         resp_keywords_response = await self.llm.achat(
+             messages=[{"role": "user", "content": keyword_extraction_prompt}],
+             stream=False,
+         )
+         resp_keywords: str = resp_keywords_response.content
+         self.llm_usages.append(resp_keywords_response.usage)
+
+         await self._logger.success(" ✓", flush=True)
+
+         # Parse N sets of keywords
+         keyword_sets: List[Dict[str, float]] = self._extract_and_validate_multi_level_keywords(
+             resp_keywords,
+             num_levels=keyword_levels
+         )
+
+         # Ensure we have keyword_levels sets (even if some are empty)
+         while len(keyword_sets) < keyword_levels:
+             keyword_sets.append({})
+
+         # Log all extracted keyword sets
+         for level_idx, keywords in enumerate(keyword_sets, start=1):
+             specificity = "General" if level_idx == 1 else "Specific" if level_idx == keyword_levels else f"Level {level_idx}"
+             await self._logger.info(f"Level {level_idx} ({specificity}) keywords: {keywords}")
+
+         # Try each keyword set in order (from general to specific) until we get results
+         # Using priority hit principle: stop as soon as we find results
+         grep_results: List[Dict[str, Any]] = []
+         query_keywords: Dict[str, float] = {}
+
+         for level_idx, keywords in enumerate(keyword_sets, start=1):
+             if not keywords:
+                 await self._logger.warning(f"Level {level_idx} keywords set is empty, skipping...")
+                 continue
+
+             specificity = "General" if level_idx == 1 else "Specific" if level_idx == keyword_levels else f"Level {level_idx}"
+             await self._logger.info(f"Searching with Level {level_idx} ({specificity}) keywords.")
+
+             # Perform grep search with current keyword set
+             temp_grep_results: List[Dict[str, Any]] = await self.grep_retriever.retrieve(
+                 terms=list(keywords.keys()),
+                 path=search_paths,
+                 logic="or",
+                 case_sensitive=False,
+                 whole_word=False,
+                 literal=False,
+                 regex=True,
+                 max_depth=max_depth,
+                 include=None,
+                 exclude=["*.pyc", "*.log"],
+                 file_type=None,
+                 invert_match=False,
+                 count_only=False,
+                 line_number=True,
+                 with_filename=True,
+                 rank=True,
+                 rga_no_cache=False,
+                 rga_cache_max_blob_len=10000000,
+                 rga_cache_path=None,
+                 timeout=grep_timeout,
+             )
+
+             # Merge and process results
+             temp_grep_results = self.grep_retriever.merge_results(temp_grep_results)
+             temp_grep_results = self.process_grep_results(
+                 results=temp_grep_results, keywords_with_idf=keywords
+             )
+
+             # Check if we found results
+             if len(temp_grep_results) > 0:
+                 await self._logger.success(f" ✓ (found {len(temp_grep_results)} files)", flush=True)
+                 grep_results = temp_grep_results
+                 query_keywords = keywords
+                 break
+             else:
+                 await self._logger.warning(" ✗ (no results, trying next level)", flush=True)
+
+         # If still no results after all attempts
+         if len(grep_results) == 0:
+             await self._logger.error(f"All {keyword_levels} keyword granularity levels failed to find results")
+
+         if verbose:
+             tmp_sep = "\n"
+             file_list = [str(r['path']) for r in grep_results[:top_k_files]]
+             await self._logger.info(f"Found {len(grep_results)} files, top {len(file_list)}:\n{tmp_sep.join(file_list)}")
+
+         if len(grep_results) == 0:
+             return f"No relevant information found for the query: {query}"
+
+         # Build knowledge cluster
+         await self._logger.info("Building knowledge cluster...")
+         cluster: KnowledgeCluster = await self.knowledge_base.build(
+             request=request,
+             retrieved_infos=grep_results,
+             keywords=query_keywords,
+             top_k_files=top_k_files,
+             top_k_snippets=5,
+             verbose=verbose,
+         )
+
+         self.llm_usages.extend(self.knowledge_base.llm_usages)
+
+         await self._logger.success(" ✓", flush=True)
+
+         if cluster is None:
+             return f"No relevant information found for the query: {query}"
+
+         if self.verbose:
+             await self._logger.info(json.dumps(cluster.to_dict(), ensure_ascii=False, indent=2))
+
+         sep: str = "\n"
+         cluster_text_content: str = (
+             f"{cluster.name}\n\n"
+             f"{sep.join(cluster.description)}\n\n"
+             f"{cluster.content if isinstance(cluster.content, str) else sep.join(cluster.content)}"
+         )
+
+         result_sum_prompt: str = SEARCH_RESULT_SUMMARY.format(
+             user_input=request.get_user_input(),
+             text_content=cluster_text_content,
+         )
+
+         await self._logger.info("Generating search result summary...")
+         search_result_response = await self.llm.achat(
+             messages=[{"role": "user", "content": result_sum_prompt}],
+             stream=True,
+         )
+         search_result: str = search_result_response.content
+         self.llm_usages.append(search_result_response.usage)
+         await self._logger.success(" ✓", flush=True)
+         await self._logger.success("Search completed successfully!")
+
+         # Append the generated search result summary to the cluster
+         if grep_results:
+             cluster.search_results.append(search_result)
+
+         # Save knowledge cluster to persistent storage
+         try:
+             await self.knowledge_manager.insert(cluster)
+             await self._logger.info(f"Saved knowledge cluster {cluster.id} to cache")
+         except Exception as e:
+             # If cluster exists, update it instead
+             try:
+                 await self.knowledge_manager.update(cluster)
+                 await self._logger.info(f"Updated knowledge cluster {cluster.id} in cache")
+             except Exception as update_error:
+                 await self._logger.warning(f"Failed to save knowledge cluster: {update_error}")
+
+         return search_result
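
The new AgenticSearch class above is the main entry point added in this release: it extracts multi-level keywords with the LLM, greps the given paths level by level from general to specific, builds a knowledge cluster from the hits, and returns an LLM-written summary. Below is a minimal usage sketch, not part of the published diff; the endpoint, API key, model name, query, and search path are placeholders.

import asyncio

from sirchmunk.llm.openai_chat import OpenAIChat
from sirchmunk.search import AgenticSearch


async def main() -> None:
    # Explicit LLM configuration; if omitted, AgenticSearch falls back to the
    # LLM_BASE_URL / LLM_API_KEY / LLM_MODEL_NAME constants from sirchmunk.utils.constants.
    llm = OpenAIChat(
        base_url="https://api.example.com/v1",  # placeholder endpoint
        api_key="sk-placeholder",               # placeholder key
        model="some-chat-model",                # placeholder model name
    )
    searcher = AgenticSearch(llm=llm, work_path="./sirchmunk_work", verbose=True)

    # search() is async: it tries each keyword level in turn, stops at the first
    # level that yields grep hits, and returns the summary string.
    summary = await searcher.search(
        query="How is the retry logic configured?",
        search_paths=["./docs"],
        keyword_levels=3,
        top_k_files=3,
    )
    print(summary)


asyncio.run(main())
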
sirchmunk/storage/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ """Storage package initialization"""
+
+ from .knowledge_manager import KnowledgeManager
+ from .duckdb import DuckDBManager
+
+ __all__ = ["KnowledgeManager", "DuckDBManager"]
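
The storage package __init__ above re-exports the two managers, so downstream code can import them from the package root. A one-line consumer sketch, not part of the diff; the work path is a placeholder.

from sirchmunk.storage import DuckDBManager, KnowledgeManager

manager = KnowledgeManager(work_path="./sirchmunk_work")  # same keyword argument as used in search.py above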