sirchmunk-0.0.0-py3-none-any.whl → sirchmunk-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.dist-info/METADATA +416 -0
- sirchmunk-0.0.1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.dist-info}/top_level.txt +0 -0
sirchmunk/__init__.py
CHANGED
sirchmunk/base.py
ADDED
@@ -0,0 +1,17 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+from abc import ABC, abstractmethod
+
+
+class BaseSearch(ABC):
+    """
+    Abstract base class for search implementations.
+    """
+
+    def __init__(self, *args, **kwargs): ...
+
+    @abstractmethod
+    def search(self, *args, **kwargs):
+        """
+        Perform a search based on the given query.
+        """
+        raise NotImplementedError("search method not implemented")
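For context, a minimal sketch of how a concrete class might satisfy the BaseSearch contract added above. GrepSearch and its substring-matching internals are hypothetical illustrations, not part of the package:

# Hypothetical subclass of sirchmunk's BaseSearch (illustration only).
from sirchmunk.base import BaseSearch

class GrepSearch(BaseSearch):
    """Toy search: case-insensitive substring match over in-memory documents."""

    def __init__(self, docs):
        self.docs = list(docs)

    def search(self, query: str):
        # Implements the abstract `search` method; returns matching documents.
        return [d for d in self.docs if query.lower() in d.lower()]

searcher = GrepSearch(["Sirchmunk scans local files.", "DuckDB stores the knowledge base."])
print(searcher.search("duckdb"))  # -> ['DuckDB stores the knowledge base.']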
sirchmunk/insight/text_insights.py
ADDED
@@ -0,0 +1,292 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import re
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+from modelscope import snapshot_download
+
+from sirchmunk.llm.openai_chat import OpenAIChat
+from sirchmunk.llm.prompts import (
+    SNAPSHOT_KEYWORDS_EXTRACTION,
+    SNAPSHOT_TOC_EXTRACTION,
+)
+
+
+class KeyPhraseExtractor:
+    """
+    Key phrase extraction using sentence-transformers and scikit-learn.
+    """
+
+    def __init__(
+        self,
+        model_id: str = "AI-ModelScope/all-MiniLM-L6-v2",
+        device: Union[str, None] = None,
+        stop_words: Optional[List[str]] = None,
+        ngram_range: Tuple[int, int] = (1, 2),
+    ):
+        """
+        Key phrase extraction using sentence-transformers and scikit-learn.
+
+        Args:
+            model_id (str): model id from ModelScope.
+            device (str): device to load the model on ('cpu' or 'cuda' or 'mps').
+            stop_words (Optional[List[str]]): list of stop words to filter out.
+            ngram_range (Tuple[int, int]): n-gram range for candidate generation.
+        """
+        from sentence_transformers import SentenceTransformer
+
+        model_dir: str = snapshot_download(
+            model_id=model_id,
+            ignore_patterns=[
+                "openvino/*",
+                "onnx/*",
+                "pytorch_model.bin",
+                "rust_model.ot",
+                "tf_model.h5",
+            ],
+        )
+        self.model = SentenceTransformer(model_dir, device=device)
+
+        self.ngram_range = ngram_range
+        self.stop_words = set(stop_words) if stop_words else set()
+        if not self.stop_words:
+            self.stop_words = {
+                # Chinese stop words
+                "的",
+                "了",
+                "在",
+                "是",
+                "我",
+                "有",
+                "和",
+                "就",
+                "不",
+                "人",
+                "都",
+                "一",
+                "一个",
+                "上",
+                # English stop words
+                "the",
+                "a",
+                "an",
+                "and",
+                "or",
+                "but",
+                "in",
+                "on",
+                "at",
+                "to",
+                "for",
+                "of",
+                "with",
+                "by",
+                "is",
+                "are",
+                "was",
+                "were",
+                "be",
+                "been",
+                "have",
+                "has",
+                "had",
+                "do",
+                "does",
+            }
+
+    @staticmethod
+    def _preprocess(text: str) -> str:
+        """
+        Keep only Chinese, English, digits, and spaces; convert to lowercase.
+        """
+        text = re.sub(r"[^\w\s\u4e00-\u9fa5]", " ", text.lower())
+        return " ".join(text.split())
+
+    def _generate_candidates(self, docs: List[str]) -> List[str]:
+        """
+        Generate candidate key phrases using CountVectorizer.
+        """
+        from sklearn.feature_extraction.text import CountVectorizer
+
+        n_docs = len(docs)
+        if n_docs == 0:
+            return []
+
+        vectorizer_params = {
+            "ngram_range": self.ngram_range,
+            "stop_words": list(self.stop_words),
+            "token_pattern": r"(?u)\b[\w\u4e00-\u9fa5]+\b",
+            "lowercase": True,
+            "min_df": 1,
+        }
+        if n_docs > 3:
+            vectorizer_params["max_df"] = 0.85
+
+        try:
+            vectorizer = CountVectorizer(**vectorizer_params)
+            vectorizer.fit(docs)
+        except ValueError as e:
+            if "max_df corresponds to" in str(e):
+                # fallback
+                vectorizer_params.pop("max_df", None)
+                vectorizer = CountVectorizer(**vectorizer_params)
+                vectorizer.fit(docs)
+            else:
+                raise
+
+        candidates = vectorizer.get_feature_names_out().tolist()
+        candidates = [c.strip() for c in candidates if len(re.sub(r"\s+", "", c)) > 1]
+
+        return candidates
+
+    def extract(
+        self,
+        contents: List[str],
+        top_k: int = 20,
+        use_mmr: bool = True,
+        diversity: float = 0.5,
+        candidates: Optional[List[str]] = None,
+        batch_size: int = 32,
+    ) -> List[Tuple[str, float]]:
+        """
+        Extract key phrases from the given contents.
+
+        contents (List[str]): List of document strings.
+        top_k (int): Number of top key phrases to return.
+        use_mmr (bool): Whether to use MMR for diversity.
+        diversity (float): Diversity factor for MMR (0 to 1).
+        candidates (Optional[List[str]]): Predefined candidate phrases/keywords.
+        batch_size (int): Batch size for encoding.
+        """
+        if not contents:
+            return []
+
+        # Step 1: merge and preprocess documents
+        doc = " ".join(contents)
+        doc = self._preprocess(doc)
+
+        # Step 2: Generate candidates if not provided
+        if candidates is None:
+            candidates = self._generate_candidates([doc])
+        if not candidates:
+            return []
+
+        # Step 3: Encode document and candidates
+        sentences = [doc] + candidates
+        embeddings = self.model.encode(
+            sentences,
+            batch_size=batch_size,
+            show_progress_bar=False,
+            convert_to_numpy=True,
+            normalize_embeddings=True,
+        )
+        doc_emb = embeddings[0]
+        cand_embs = embeddings[1:]
+
+        # Step 4: Compute similarities
+        similarities = (doc_emb @ cand_embs.T).flatten()  # shape: (n_candidates,)
+
+        if not use_mmr:
+            idx = np.argsort(similarities)[::-1][:top_k]
+            return [(candidates[i], float(similarities[i])) for i in idx]
+
+        # Step 5 (Optional): MMR diversity selection
+        selected_idx = [int(np.argmax(similarities))]
+        remaining_idx = list(set(range(len(candidates))) - set(selected_idx))
+
+        while len(selected_idx) < top_k and remaining_idx:
+            candidate_scores = []
+            for i in remaining_idx:
+                sim_to_doc = similarities[i]
+                sim_to_selected = np.max(cand_embs[selected_idx] @ cand_embs[i])
+                mmr_score = diversity * sim_to_doc - (1 - diversity) * sim_to_selected
+                candidate_scores.append((i, mmr_score))
+
+            # Select the candidate with the highest MMR score
+            next_i = max(candidate_scores, key=lambda x: x[1])[0]
+            selected_idx.append(next_i)
+            remaining_idx.remove(next_i)
+
+        return [(candidates[i], float(similarities[i])) for i in selected_idx]
+
+
+class TextInsights:
+    """
+    Text insights: key information extraction.
+    """
+
+    def __init__(self, llm: Optional[OpenAIChat] = None, **kwargs):
+        """
+        Initialize TextInsights with an optional LLM instance.
+
+        Args:
+            llm (Optional[OpenAIChat]): An instance of OpenAIChat.
+            **kwargs: Additional keyword arguments for KeyPhraseExtractor.
+        """
+        self._llm = llm
+
+        self._key_phrase_extractor = (
+            KeyPhraseExtractor(
+                model_id="AI-ModelScope/all-MiniLM-L6-v2",
+                device=None,
+                ngram_range=(1, 2),
+            )
+            if self._llm is None
+            else None
+        )
+
+        self._kwargs = kwargs
+
+    def extract_phrase(self, contents: List[str], max_num: int = 20) -> List[str]:
+        """
+        Extract key phrases from the given contents.
+
+        Args:
+            contents (List[str]): List of document strings.
+            max_num (int): Maximum number of key phrases to extract.
+
+        Returns:
+            List[str]: Extracted key phrases.
+        """
+
+        if self._llm is not None:
+            prompt = SNAPSHOT_KEYWORDS_EXTRACTION.format(
+                document_content="\n\n".join(contents), max_num=max_num
+            )
+            messages = [{"role": "user", "content": prompt}]
+            response: str = self._llm.chat(
+                messages=messages,
+                stream=True,
+            )
+
+            results = [
+                phrase.strip().lower()
+                for phrase in response.split(",")
+                if phrase.strip()
+            ]
+
+        else:
+            extracted_phrases = self._key_phrase_extractor.extract(
+                contents=contents, top_k=max_num, **self._kwargs
+            )
+            results = [phrase for phrase, _ in extracted_phrases]
+
+        return results
+
+    def extract_toc(self, contents: List[str]) -> str:
+        """
+        Extract the table of contents from the input document.
+        """
+
+        if self._llm is None:
+            return ""
+
+        prompt = SNAPSHOT_TOC_EXTRACTION.format(document_content="\n\n".join(contents))
+        messages = [{"role": "user", "content": prompt}]
+
+        response: str = self._llm.chat(
+            messages=messages,
+            stream=True,
+        )
+
+        return response
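A hedged usage sketch for the classes above; the documents and parameter values are illustrative, and it assumes modelscope, sentence-transformers, and scikit-learn are installed so the MiniLM model can be fetched on first use:

# Illustrative driver for KeyPhraseExtractor (no LLM configured).
from sirchmunk.insight.text_insights import KeyPhraseExtractor

extractor = KeyPhraseExtractor(ngram_range=(1, 2))
docs = [
    "DuckDB is an in-process analytical database.",
    "Sirchmunk uses DuckDB for local knowledge storage.",
]
# With use_mmr=True, each pick maximizes
#   diversity * sim(candidate, doc) - (1 - diversity) * max sim(candidate, already selected),
# trading raw relevance for diversity among the returned phrases.
for phrase, score in extractor.extract(docs, top_k=5, use_mmr=True, diversity=0.5):
    print(f"{phrase}: {score:.3f}")

When an OpenAIChat instance is passed to TextInsights instead, extract_phrase delegates to the keyword-extraction prompt and the local extractor is never constructed.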
sirchmunk/learnings/__init__.py
ADDED
@@ -0,0 +1 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.