sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/__init__.py CHANGED
@@ -0,0 +1,8 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ from .search import AgenticSearch
+ from .version import __version__
+
+ __all__ = [
+     "__version__",
+     "AgenticSearch",
+ ]
sirchmunk/base.py ADDED
@@ -0,0 +1,17 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ from abc import ABC, abstractmethod
+
+
+ class BaseSearch(ABC):
+     """
+     Abstract base class for search implementations.
+     """
+
+     def __init__(self, *args, **kwargs): ...
+
+     @abstractmethod
+     def search(self, *args, **kwargs):
+         """
+         Perform a search based on the given query.
+         """
+         raise NotImplementedError("search method not implemented")
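As a quick illustration of the contract this abstract class defines, a hypothetical subclass (not part of the package) only has to implement search:

    from sirchmunk.base import BaseSearch

    class EchoSearch(BaseSearch):
        """Hypothetical subclass: returns the query itself as the only hit."""

        def search(self, query: str, **kwargs):
            # A real implementation would query an index or corpus here.
            return [query]

    print(EchoSearch().search("hello"))  # ['hello']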
sirchmunk/insight/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ """
+ Data Insights Module to empower agents with corpus analysis capabilities.
+ """
sirchmunk/insight/text_insights.py ADDED
@@ -0,0 +1,292 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ import re
+ from typing import List, Optional, Tuple, Union
+
+ import numpy as np
+ from modelscope import snapshot_download
+
+ from sirchmunk.llm.openai_chat import OpenAIChat
+ from sirchmunk.llm.prompts import (
+     SNAPSHOT_KEYWORDS_EXTRACTION,
+     SNAPSHOT_TOC_EXTRACTION,
+ )
+
+
+ class KeyPhraseExtractor:
+     """
+     Key phrase extraction using sentence-transformers and scikit-learn.
+     """
+
+     def __init__(
+         self,
+         model_id: str = "AI-ModelScope/all-MiniLM-L6-v2",
+         device: Union[str, None] = None,
+         stop_words: Optional[List[str]] = None,
+         ngram_range: Tuple[int, int] = (1, 2),
+     ):
+         """
+         Key phrase extraction using sentence-transformers and scikit-learn.
+
+         Args:
+             model_id (str): Model ID from ModelScope.
+             device (str): Device to load the model on ('cpu', 'cuda', or 'mps').
+             stop_words (Optional[List[str]]): List of stop words to filter out.
+             ngram_range (Tuple[int, int]): N-gram range for candidate generation.
+         """
+         from sentence_transformers import SentenceTransformer
+
+         model_dir: str = snapshot_download(
+             model_id=model_id,
+             ignore_patterns=[
+                 "openvino/*",
+                 "onnx/*",
+                 "pytorch_model.bin",
+                 "rust_model.ot",
+                 "tf_model.h5",
+             ],
+         )
+         self.model = SentenceTransformer(model_dir, device=device)
+
+         self.ngram_range = ngram_range
+         self.stop_words = set(stop_words) if stop_words else set()
+         if not self.stop_words:
+             self.stop_words = {
+                 # Chinese stop words
+                 "的",
+                 "了",
+                 "在",
+                 "是",
+                 "我",
+                 "有",
+                 "和",
+                 "就",
+                 "不",
+                 "人",
+                 "都",
+                 "一",
+                 "一个",
+                 "上",
+                 # English stop words
+                 "the",
+                 "a",
+                 "an",
+                 "and",
+                 "or",
+                 "but",
+                 "in",
+                 "on",
+                 "at",
+                 "to",
+                 "for",
+                 "of",
+                 "with",
+                 "by",
+                 "is",
+                 "are",
+                 "was",
+                 "were",
+                 "be",
+                 "been",
+                 "have",
+                 "has",
+                 "had",
+                 "do",
+                 "does",
+             }
+
+     @staticmethod
+     def _preprocess(text: str) -> str:
+         """
+         Keep only Chinese, English, digits, and spaces; convert to lowercase.
+         """
+         text = re.sub(r"[^\w\s\u4e00-\u9fa5]", " ", text.lower())
+         return " ".join(text.split())
+
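The regex in _preprocess keeps word characters, whitespace, and CJK ideographs (U+4E00 to U+9FA5), replaces everything else with a space, and collapses whitespace runs. A standalone sketch of the same rule (the sample string is illustrative):

    import re

    def preprocess(text: str) -> str:
        # Drop punctuation, keep CJK and word characters, lowercase.
        text = re.sub(r"[^\w\s\u4e00-\u9fa5]", " ", text.lower())
        return " ".join(text.split())

    print(preprocess("Hello, World! 机器学习 v2.0"))
    # -> 'hello world 机器学习 v2 0'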
+     def _generate_candidates(self, docs: List[str]) -> List[str]:
+         """
+         Generate candidate key phrases using CountVectorizer.
+         """
+         from sklearn.feature_extraction.text import CountVectorizer
+
+         n_docs = len(docs)
+         if n_docs == 0:
+             return []
+
+         vectorizer_params = {
+             "ngram_range": self.ngram_range,
+             "stop_words": list(self.stop_words),
+             "token_pattern": r"(?u)\b[\w\u4e00-\u9fa5]+\b",
+             "lowercase": True,
+             "min_df": 1,
+         }
+         if n_docs > 3:
+             vectorizer_params["max_df"] = 0.85
+
+         try:
+             vectorizer = CountVectorizer(**vectorizer_params)
+             vectorizer.fit(docs)
+         except ValueError as e:
+             if "max_df corresponds to" in str(e):
+                 # Fall back to refitting without the max_df constraint
+                 vectorizer_params.pop("max_df", None)
+                 vectorizer = CountVectorizer(**vectorizer_params)
+                 vectorizer.fit(docs)
+             else:
+                 raise
+
+         candidates = vectorizer.get_feature_names_out().tolist()
+         candidates = [c.strip() for c in candidates if len(re.sub(r"\s+", "", c)) > 1]
+
+         return candidates
+
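The try/except above guards a scikit-learn edge case: when max_df is given as a proportion and maps to fewer documents than min_df, CountVectorizer raises a ValueError, and the extractor refits without max_df. A minimal reproduction of the guarded error (assuming scikit-learn is installed):

    from sklearn.feature_extraction.text import CountVectorizer

    # With one document, max_df=0.85 maps to 0.85 documents, below min_df=1.
    try:
        CountVectorizer(min_df=1, max_df=0.85).fit(["alpha beta gamma"])
    except ValueError as e:
        print(e)  # max_df corresponds to < documents than min_df

In the extractor's own path, max_df is only set when more than three documents are passed, so the fallback is defensive.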
+     def extract(
+         self,
+         contents: List[str],
+         top_k: int = 20,
+         use_mmr: bool = True,
+         diversity: float = 0.5,
+         candidates: Optional[List[str]] = None,
+         batch_size: int = 32,
+     ) -> List[Tuple[str, float]]:
+         """
+         Extract key phrases from the given contents.
+
+         Args:
+             contents (List[str]): List of document strings.
+             top_k (int): Number of top key phrases to return.
+             use_mmr (bool): Whether to use MMR for diversity.
+             diversity (float): Diversity factor for MMR (0 to 1).
+             candidates (Optional[List[str]]): Predefined candidate phrases/keywords.
+             batch_size (int): Batch size for encoding.
+         """
+         if not contents:
+             return []
+
+         # Step 1: Merge and preprocess documents
+         doc = " ".join(contents)
+         doc = self._preprocess(doc)
+
+         # Step 2: Generate candidates if not provided
+         if candidates is None:
+             candidates = self._generate_candidates([doc])
+         if not candidates:
+             return []
+
+         # Step 3: Encode document and candidates
+         sentences = [doc] + candidates
+         embeddings = self.model.encode(
+             sentences,
+             batch_size=batch_size,
+             show_progress_bar=False,
+             convert_to_numpy=True,
+             normalize_embeddings=True,
+         )
+         doc_emb = embeddings[0]
+         cand_embs = embeddings[1:]
+
+         # Step 4: Compute similarities
+         similarities = (doc_emb @ cand_embs.T).flatten()  # shape: (n_candidates,)
+
+         if not use_mmr:
+             idx = np.argsort(similarities)[::-1][:top_k]
+             return [(candidates[i], float(similarities[i])) for i in idx]
+
+         # Step 5 (optional): MMR diversity selection
+         selected_idx = [int(np.argmax(similarities))]
+         remaining_idx = list(set(range(len(candidates))) - set(selected_idx))
+
+         while len(selected_idx) < top_k and remaining_idx:
+             candidate_scores = []
+             for i in remaining_idx:
+                 sim_to_doc = similarities[i]
+                 sim_to_selected = np.max(cand_embs[selected_idx] @ cand_embs[i])
+                 mmr_score = diversity * sim_to_doc - (1 - diversity) * sim_to_selected
+                 candidate_scores.append((i, mmr_score))
+
+             # Select the candidate with the highest MMR score
+             next_i = max(candidate_scores, key=lambda x: x[1])[0]
+             selected_idx.append(next_i)
+             remaining_idx.remove(next_i)
+
+         return [(candidates[i], float(similarities[i])) for i in selected_idx]
+
+
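The MMR loop in extract trades relevance against redundancy: each step rewards similarity to the document and penalizes similarity to phrases already selected. A self-contained sketch of the same scoring rule on made-up cosine similarities (all numbers here are illustrative):

    import numpy as np

    sim_to_doc = np.array([0.95, 0.95, 0.80, 0.10])   # candidate-to-document
    sim_between = np.array([                          # candidate-to-candidate
        [1.00, 1.00, 0.60, 0.10],                     # 0 and 1 are near-duplicates
        [1.00, 1.00, 0.60, 0.10],
        [0.60, 0.60, 1.00, 0.10],
        [0.10, 0.10, 0.10, 1.00],
    ])

    diversity = 0.5
    selected = [int(np.argmax(sim_to_doc))]           # seed with the most relevant
    remaining = [i for i in range(4) if i not in selected]

    while remaining and len(selected) < 2:
        # MMR score: diversity * relevance - (1 - diversity) * redundancy
        mmr = {
            i: diversity * sim_to_doc[i]
               - (1 - diversity) * sim_between[i, selected].max()
            for i in remaining
        }
        best = max(mmr, key=mmr.get)
        selected.append(best)
        remaining.remove(best)

    print(selected)  # [0, 2]: the near-duplicate (1) loses to the distinct phrase (2)

With diversity closer to 1 the penalty term shrinks and the selection approaches a plain top-k by similarity.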
+ class TextInsights:
+     """
+     Text insights: key information extraction.
+     """
+
+     def __init__(self, llm: Optional[OpenAIChat] = None, **kwargs):
+         """
+         Initialize TextInsights with an optional LLM instance.
+
+         Args:
+             llm (Optional[OpenAIChat]): An instance of OpenAIChat.
+             **kwargs: Additional keyword arguments for KeyPhraseExtractor.
+         """
+         self._llm = llm
+
+         self._key_phrase_extractor = (
+             KeyPhraseExtractor(
+                 model_id="AI-ModelScope/all-MiniLM-L6-v2",
+                 device=None,
+                 ngram_range=(1, 2),
+             )
+             if self._llm is None
+             else None
+         )
+
+         self._kwargs = kwargs
+
+     def extract_phrase(self, contents: List[str], max_num: int = 20) -> List[str]:
+         """
+         Extract key phrases from the given contents.
+
+         Args:
+             contents (List[str]): List of document strings.
+             max_num (int): Maximum number of key phrases to extract.
+
+         Returns:
+             List[str]: Extracted key phrases.
+         """
+         if self._llm is not None:
+             prompt = SNAPSHOT_KEYWORDS_EXTRACTION.format(
+                 document_content="\n\n".join(contents), max_num=max_num
+             )
+             messages = [{"role": "user", "content": prompt}]
+             response: str = self._llm.chat(
+                 messages=messages,
+                 stream=True,
+             )
+
+             results = [
+                 phrase.strip().lower()
+                 for phrase in response.split(",")
+                 if phrase.strip()
+             ]
+
+         else:
+             extracted_phrases = self._key_phrase_extractor.extract(
+                 contents=contents, top_k=max_num, **self._kwargs
+             )
+             results = [phrase for phrase, _ in extracted_phrases]
+
+         return results
+
+     def extract_toc(self, contents: List[str]) -> str:
+         """
+         Extract the `Table of Contents` from the input document.
+         """
+         if self._llm is None:
+             return ""
+
+         prompt = SNAPSHOT_TOC_EXTRACTION.format(document_content="\n\n".join(contents))
+         messages = [{"role": "user", "content": prompt}]
+
+         response: str = self._llm.chat(
+             messages=messages,
+             stream=True,
+         )
+
+         return response
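A minimal usage sketch of the two paths above. Note that the LLM-free path downloads the embedding model from ModelScope on first use, and extract_toc simply returns an empty string when no LLM is configured:

    from sirchmunk.insight.text_insights import TextInsights

    insights = TextInsights(llm=None)  # no LLM: falls back to KeyPhraseExtractor

    phrases = insights.extract_phrase(
        ["Sentence transformers embed the document and its candidate phrases."],
        max_num=5,
    )
    print(phrases)  # up to 5 extracted key phrases

    print(repr(insights.extract_toc(["..."])))  # '' without an LLM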
sirchmunk/learnings/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.