sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sirchmunk/__init__.py +8 -0
- sirchmunk/base.py +17 -0
- sirchmunk/insight/__init__.py +4 -0
- sirchmunk/insight/text_insights.py +292 -0
- sirchmunk/learnings/__init__.py +1 -0
- sirchmunk/learnings/evidence_processor.py +525 -0
- sirchmunk/learnings/knowledge_base.py +232 -0
- sirchmunk/llm/__init__.py +2 -0
- sirchmunk/llm/openai_chat.py +247 -0
- sirchmunk/llm/prompts.py +216 -0
- sirchmunk/retrieve/__init__.py +1 -0
- sirchmunk/retrieve/base.py +25 -0
- sirchmunk/retrieve/text_retriever.py +1026 -0
- sirchmunk/scan/__init__.py +1 -0
- sirchmunk/scan/base.py +18 -0
- sirchmunk/scan/file_scanner.py +373 -0
- sirchmunk/scan/web_scanner.py +18 -0
- sirchmunk/scheduler/__init__.py +0 -0
- sirchmunk/schema/__init__.py +2 -0
- sirchmunk/schema/cognition.py +106 -0
- sirchmunk/schema/context.py +25 -0
- sirchmunk/schema/knowledge.py +318 -0
- sirchmunk/schema/metadata.py +658 -0
- sirchmunk/schema/request.py +221 -0
- sirchmunk/schema/response.py +20 -0
- sirchmunk/schema/snapshot.py +346 -0
- sirchmunk/search.py +475 -0
- sirchmunk/storage/__init__.py +7 -0
- sirchmunk/storage/duckdb.py +676 -0
- sirchmunk/storage/knowledge_manager.py +720 -0
- sirchmunk/utils/__init__.py +15 -0
- sirchmunk/utils/constants.py +15 -0
- sirchmunk/utils/deps.py +23 -0
- sirchmunk/utils/file_utils.py +70 -0
- sirchmunk/utils/install_rga.py +124 -0
- sirchmunk/utils/log_utils.py +360 -0
- sirchmunk/utils/tokenizer_util.py +55 -0
- sirchmunk/utils/utils.py +108 -0
- sirchmunk/version.py +1 -1
- sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
- sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
- sirchmunk-0.0.0.dist-info/METADATA +0 -26
- sirchmunk-0.0.0.dist-info/RECORD +0 -8
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
- {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/learnings/knowledge_base.py
ADDED

@@ -0,0 +1,232 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
import json
import hashlib
from datetime import datetime
from pathlib import Path
from typing import Any, Awaitable, Callable, Dict, List, Optional, Union


from sirchmunk.learnings.evidence_processor import (
    MonteCarloEvidenceSampling,
    RoiResult,
)
from sirchmunk.llm.openai_chat import OpenAIChat
from sirchmunk.llm.prompts import EVIDENCE_SUMMARY
from sirchmunk.schema.knowledge import (
    AbstractionLevel,
    EvidenceUnit,
    KnowledgeCluster,
    Lifecycle,
)
from sirchmunk.schema.metadata import FileInfo
from sirchmunk.schema.request import Request
from sirchmunk.utils.constants import DEFAULT_WORK_PATH
from sirchmunk.utils.file_utils import StorageStructure, fast_extract
from sirchmunk.utils import create_logger, LogCallback
from sirchmunk.utils.utils import extract_fields


class KnowledgeBase:
    """
    A knowledge base that manages knowledge clusters built dynamically from retrieved information and metadata.
    """

    def __init__(
        self,
        llm: OpenAIChat,
        metadata_map: Dict[str, Any] = None,
        work_path: Union[str, Path] = None,
        log_callback: LogCallback = None,
    ):
        """
        Initialize the KnowledgeBase with an LLM and metadata mapping.

        Args:
            llm (OpenAIChat): An instance of the OpenAIChat LLM for processing text.
            metadata_map (Dict[str, Any]): A mapping of all metadata information.
                k: metadata cache key, refers to `FileInfo.cache_key`
                v: metadata path or content
            work_path: Working directory path.
            log_callback: Optional log callback function for custom logging.
        """
        self.llm = llm
        self.metadata_map = metadata_map
        self.work_path: Path = (
            DEFAULT_WORK_PATH if work_path is None else Path(work_path).resolve()
        )
        self.metadata_path: Path = (
            self.work_path / StorageStructure.CACHE_DIR / StorageStructure.METADATA_DIR
        )

        # Store log_callback for passing to child components
        self.log_callback = log_callback

        # Create bound logger with callback - returns AsyncLogger instance
        self._log = create_logger(log_callback=log_callback)

        self.llm_usages: List[Dict[str, Any]] = []

    @staticmethod
    def _get_file_info(
        file_or_url: str, metadata_path: Union[str, Path]
    ) -> Optional[FileInfo]:

        cache_key: str = FileInfo.get_cache_key(file_or_url=file_or_url)
        meta_file: Path = Path(metadata_path) / f"{cache_key}.json"

        if not meta_file.exists():
            return None

        with open(meta_file, "r", encoding="utf-8") as f:
            metadata_content = json.load(f)

        return FileInfo.from_dict(info=metadata_content)

    @staticmethod
    def _compose_cluster_text(
        name: Optional[str],
        description: Union[List[str], str, None],
        content: Union[List[str], str, None],
    ) -> str:
        """
        Compose a stable text representation of a cluster from name, description, and content.
        This is used for deterministic cluster ID generation.
        """
        parts: List[str] = []
        if name:
            parts.append(str(name).strip())

        if description:
            if isinstance(description, list):
                parts.extend([str(item).strip() for item in description if item])
            else:
                parts.append(str(description).strip())

        if content:
            if isinstance(content, list):
                parts.extend([str(item).strip() for item in content if item])
            else:
                parts.append(str(content).strip())

        return "\n\n".join([part for part in parts if part])

    async def build(
        self,
        request: Request,
        retrieved_infos: List[Dict[str, Any]],
        keywords: Dict[str, float] = None,
        top_k_files: Optional[int] = 3,
        top_k_snippets: Optional[int] = 5,
        confidence_threshold: Optional[float] = 8.0,
        verbose: bool = True,
    ) -> Union[KnowledgeCluster, None]:
        """Build a knowledge cluster dynamically from retrieved information and metadata."""

        if len(retrieved_infos) == 0:
            await self._log.warning(
                "No retrieved information available to build knowledge cluster."
            )
            return None

        retrieved_infos = retrieved_infos[:top_k_files]

        keywords = keywords or {}

        # Get evidence units (regions of interest) from raw retrieved infos
        evidences: List[EvidenceUnit] = []
        for info in retrieved_infos:
            file_path_or_url: str = info["path"]

            # TODO: handle more file types; deal with large files; async adaptive extraction
            extraction_result = await fast_extract(file_path=file_path_or_url)
            doc_content: str = extraction_result.content

            sampler = MonteCarloEvidenceSampling(
                llm=self.llm,
                doc_content=doc_content,
                verbose=verbose,
                log_callback=self.log_callback,
            )
            roi_result: RoiResult = await sampler.get_roi(
                query=request.get_user_input(),
                keywords=keywords,
                confidence_threshold=confidence_threshold,
                top_k=top_k_snippets,
            )

            evidence_unit = EvidenceUnit(
                doc_id=FileInfo.get_cache_key(file_path_or_url),
                file_or_url=Path(file_path_or_url),
                summary=roi_result.summary,
                is_found=roi_result.is_found,
                snippets=roi_result.snippets,
                extracted_at=datetime.now(),
                conflict_group=[],
            )
            self.llm_usages.extend(sampler.llm_usages)
            evidences.append(evidence_unit)

        if len(evidences) == 0:
            await self._log.warning("No evidence units extracted from retrieved information.")
            return None

        # Get `name`, `description` and `content` from the user request and evidences using the LLM
        # TODO: process other types of segments
        evidence_contents: List[str] = [ev.summary for ev in evidences]

        evidence_summary_prompt: str = EVIDENCE_SUMMARY.format(
            user_input=request.get_user_input(),
            evidences="\n\n".join(evidence_contents),
        )

        evidence_summary_llm_response = await self.llm.achat(
            messages=[{"role": "user", "content": evidence_summary_prompt}],
            stream=True,
        )
        evidence_summary_response: str = evidence_summary_llm_response.content
        self.llm_usages.append(evidence_summary_llm_response.usage)

        cluster_infos: Dict[str, Any] = extract_fields(
            content=evidence_summary_response
        )
        if len(cluster_infos) == 0:
            await self._log.warning(
                "Failed to extract knowledge cluster information from LLM response."
            )
            return None

        cluster_name = cluster_infos.get("name")
        cluster_description = cluster_infos.get("description")
        cluster_content = cluster_infos.get("content")

        cluster_text = self._compose_cluster_text(
            name=cluster_name,
            description=cluster_description,
            content=cluster_content,
        )
        if not cluster_text:
            cluster_text = request.get_user_input() or "unknown"

        cluster_id = f"C{hashlib.sha256(cluster_text.encode('utf-8')).hexdigest()[:10]}"

        cluster = KnowledgeCluster(
            id=cluster_id,
            name=cluster_name,
            description=[cluster_description] if cluster_description else [],
            content=cluster_content,
            scripts=[],
            resources=[],
            patterns=[],
            constraints=[],
            evidences=evidences,
            confidence=0.5,
            abstraction_level=AbstractionLevel.TECHNIQUE,
            landmark_potential=0.5,
            hotness=0.5,
            lifecycle=Lifecycle.EMERGING,
            create_time=datetime.now(),
            last_modified=datetime.now(),
            version=1,
            related_clusters=[],
        )

        return cluster
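The cluster ID in `build` is derived deterministically from the composed cluster text. A minimal standalone sketch of that scheme follows; the sample strings are illustrative, not taken from the package.

import hashlib

def compose_cluster_text(name, description, content):
    # Mirrors KnowledgeBase._compose_cluster_text: join non-empty parts with blank lines.
    parts = []
    if name:
        parts.append(str(name).strip())
    for value in (description, content):
        if not value:
            continue
        if isinstance(value, list):
            parts.extend(str(item).strip() for item in value if item)
        else:
            parts.append(str(value).strip())
    return "\n\n".join(part for part in parts if part)

cluster_text = compose_cluster_text(
    name="Gradient checkpointing",  # illustrative sample values
    description=["Trades compute for memory during backprop."],
    content="Recompute activations in the backward pass instead of storing them.",
) or "unknown"
cluster_id = f"C{hashlib.sha256(cluster_text.encode('utf-8')).hexdigest()[:10]}"
print(cluster_id)  # identical inputs always produce the same ID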
sirchmunk/llm/openai_chat.py
ADDED

@@ -0,0 +1,247 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
from typing import TYPE_CHECKING, Any, Dict, List, Optional
from dataclasses import dataclass, field

from openai import AsyncOpenAI, OpenAI
from sirchmunk.utils import create_logger, LogCallback

if TYPE_CHECKING:
    pass


@dataclass
class OpenAIChatResponse:
    """
    Data class representing the response from the OpenAI Chat API.
    """
    content: str
    role: str = "assistant"
    usage: Dict[str, int] = field(default_factory=dict)
    model: str = None
    finish_reason: str = None
    logprobs: Any = None

    def __str__(self):
        return self.content

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the response to a dictionary.

        Returns:
            Dict[str, Any]: The response as a dictionary.
        """
        return {
            "content": self.content,
            "role": self.role,
            "usage": self.usage,
            "model": self.model,
            "finish_reason": self.finish_reason,
            "logprobs": self.logprobs,
        }


class OpenAIChat:
    """
    A client for interacting with OpenAI's chat completion API.
    """

    def __init__(
        self,
        api_key: str = None,
        base_url: str = None,
        model: str = None,
        log_callback: LogCallback = None,
        **kwargs,
    ):
        """
        Initialize the OpenAIChat client.

        Args:
            api_key (str): The API key for OpenAI.
            base_url (str): The base URL for the OpenAI API.
            model (str): The model to use for chat completions.
            log_callback (LogCallback): Optional callback for logging.
            **kwargs: Additional keyword arguments passed to the OpenAI client create method.
        """
        self._client = OpenAI(
            api_key=api_key,
            base_url=base_url,
        )

        self._async_client = AsyncOpenAI(
            api_key=api_key,
            base_url=base_url,
        )

        self._model = model
        self._kwargs = kwargs

        # Initialize synchronous and asynchronous loggers
        self._logger = create_logger(log_callback=log_callback, enable_async=False)
        self._logger_async = create_logger(log_callback=log_callback, enable_async=True)

    def chat(
        self,
        messages: List[Dict[str, Any]],
        stream: bool = True,
    ) -> OpenAIChatResponse:
        """
        Generate a chat completion synchronously.

        Args:
            messages (List[Dict[str, Any]]): A list of messages for the chat.
            stream (bool): Whether to stream the response.

        Returns:
            OpenAIChatResponse: The structured response containing content, usage, etc.
        """
        # Ensure we try to get usage metrics even in streaming mode if supported by the API version
        request_kwargs = self._kwargs.copy()
        if stream and "stream_options" not in request_kwargs:
            request_kwargs["stream_options"] = {"include_usage": True}

        resp = self._client.chat.completions.create(
            model=self._model, messages=messages, stream=stream, **request_kwargs
        )

        res_content: str = ""
        role: str = "assistant"
        usage: Dict[str, int] = {}
        finish_reason: str = None
        response_model: str = self._model

        if stream:
            for chunk in resp:
                # Extract usage if present (usually in the last chunk if stream_options is set)
                if chunk.usage:
                    usage = chunk.usage.model_dump()

                # Update model name if provided in chunks
                if chunk.model:
                    response_model = chunk.model

                if not chunk.choices:
                    continue

                delta = chunk.choices[0].delta

                # Capture role (usually only in the first chunk)
                if delta.role:
                    role = delta.role
                    self._logger.info(f"[role={delta.role}] ", end="", flush=True)

                # Capture content
                if delta.content:
                    self._logger.info(delta.content, end="", flush=True)
                    res_content += delta.content

                # Capture finish reason
                if chunk.choices[0].finish_reason:
                    finish_reason = chunk.choices[0].finish_reason

            # Print a newline at the end of streaming for cleaner logs
            self._logger.info("", end="\n", flush=True)

        else:
            # Non-streaming response
            message = resp.choices[0].message
            res_content = message.content or ""
            role = message.role
            finish_reason = resp.choices[0].finish_reason
            response_model = resp.model
            if resp.usage:
                usage = resp.usage.model_dump()

            # Log the full response content since we didn't stream it
            self._logger.info(f"[role={role}] {res_content}")

        return OpenAIChatResponse(
            content=res_content,
            role=role,
            usage=usage,
            model=response_model,
            finish_reason=finish_reason
        )

    async def achat(
        self,
        messages: List[Dict[str, Any]],
        stream: bool = True,
    ) -> OpenAIChatResponse:
        """
        Generate a chat completion asynchronously.

        Args:
            messages (List[Dict[str, Any]]): A list of messages for the chat.
            stream (bool): Whether to stream the response.

        Returns:
            OpenAIChatResponse: The structured response containing content, usage, etc.
        """
        # Ensure we try to get usage metrics even in streaming mode
        request_kwargs = self._kwargs.copy()
        if stream and "stream_options" not in request_kwargs:
            request_kwargs["stream_options"] = {"include_usage": True}

        resp = await self._async_client.chat.completions.create(
            model=self._model, messages=messages, stream=stream, **request_kwargs
        )

        res_content: str = ""
        role: str = "assistant"
        usage: Dict[str, int] = {}
        finish_reason: str = None
        response_model: str = self._model

        if stream:
            async for chunk in resp:
                # Extract usage if present (usually in the last chunk if stream_options is set)
                if chunk.usage:
                    usage = chunk.usage.model_dump()

                if chunk.model:
                    response_model = chunk.model

                if not chunk.choices:
                    continue

                delta = chunk.choices[0].delta

                # Capture role
                if delta.role:
                    role = delta.role
                    await self._logger_async.info(f"[role={delta.role}] ", end="", flush=True)

                # Capture content
                if delta.content:
                    await self._logger_async.info(delta.content, end="", flush=True)
                    res_content += delta.content

                # Capture finish reason
                if chunk.choices[0].finish_reason:
                    finish_reason = chunk.choices[0].finish_reason

            # Print a newline at the end of streaming for cleaner logs
            await self._logger_async.info("", end="\n", flush=True)

        else:
            # Non-streaming response
            message = resp.choices[0].message
            res_content = message.content or ""
            role = message.role
            finish_reason = resp.choices[0].finish_reason
            response_model = resp.model
            if resp.usage:
                usage = resp.usage.model_dump()

            # Log the full response content since we didn't stream it
            await self._logger_async.info(f"[role={role}] {res_content}")

        return OpenAIChatResponse(
            content=res_content,
            role=role,
            usage=usage,
            model=response_model,
            finish_reason=finish_reason
        )
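A minimal usage sketch of the client above, assuming an OpenAI-compatible endpoint; the key, base URL, model name, and the extra `temperature` kwarg are placeholders rather than package defaults.

import asyncio
from sirchmunk.llm.openai_chat import OpenAIChat

async def main():
    llm = OpenAIChat(
        api_key="sk-...",                       # placeholder credential
        base_url="https://api.example.com/v1",  # placeholder endpoint
        model="gpt-4o-mini",                    # placeholder model name
        temperature=0.2,                        # extra kwargs are forwarded to create()
    )
    resp = await llm.achat(
        messages=[{"role": "user", "content": "Summarize streaming vs. non-streaming chat."}],
        stream=True,
    )
    print(resp.content)
    print(resp.usage)  # filled from stream_options={"include_usage": True} when the endpoint supports it

asyncio.run(main())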
sirchmunk/llm/prompts.py
ADDED
@@ -0,0 +1,216 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
# flake8: noqa
# yapf: disable


SNAPSHOT_KEYWORDS_EXTRACTION = """
Analyze the following document and extract the most relevant and representative key phrases.
Prioritize terms that capture the core topics, central concepts, important entities (e.g., people, organizations, locations), and domain-specific terminology.
Exclude generic words (e.g., "the", "and", "result", "study") unless they are part of a meaningful multi-word phrase.
Limit the output to {max_num} concise key phrases, ranked by significance.
You **MUST** output the key phrases as a comma-separated list without any additional explanation or formatting.
You **MUST** adjust the language of the key phrases to be consistent with the language of the input document.

**Input Document**:
{document_content}
"""


SNAPSHOT_TOC_EXTRACTION = """
Generate a Table of Contents (ToC) from the given document, adapting its depth and content density to the document's inherent complexity.

Requirements:

1. Adaptive Hierarchy Depth: Dynamically set the depth between 3 to 5 levels, based on the document's structural and semantic complexity (e.g., 3 levels for simple notices, 5 for technical specs).
2. Summarized Entries: Each ToC item must concisely summarize the section's core content (10-25 words), not just repeat headings. Capture purpose, key actions, or critical info.
3. Faithfulness: Do not invent sections. Infer headings only from logical paragraph groupings if explicit titles are absent.
4. Format: Use Markdown nested lists with 2-space indents per level (e.g., - → - → -). Output the ToC only; no preamble or commentary.

**Input Document**:
{document_content}
"""


QUERY_KEYWORDS_EXTRACTION = """
### Role: Search Optimization Expert & Information Retrieval Specialist

### Task:
Extract **{num_levels} sets** of keywords from the user query with **different granularities** to maximize search hit rate.

### Multi-Level Keyword Granularity Strategy:

Extract {num_levels} levels of keywords with progressively finer granularity:

{level_descriptions}

### IDF Value Guidelines:
- Estimate the **IDF (Inverse Document Frequency)** for each keyword based on its rarity in a general corpus
- IDF range: **[0-10]** where:
  - 0-3: Very common terms (e.g., "the", "is", "data")
  - 4-6: Moderately common terms (e.g., "algorithm", "network")
  - 7-9: Rare/specific terms (e.g., "backpropagation", "xgboost")
  - 10: Extremely rare/specialized terms
- IDF values are **independent** of keyword level - focus on term rarity, not granularity

### Requirements:
- Each level should have 3-5 keywords
- Keywords must become progressively **finer-grained** from Level 1 to Level {num_levels}
- **Level 1**: Coarse-grained phrases/multi-word expressions
- **Level {num_levels}**: Fine-grained single words or precise technical terms
- ONLY extract from the user query context; do NOT add external information
- Ensure keywords at different levels are complementary, not redundant

### Output Format:
Output {num_levels} separate JSON-like dicts within their respective tags:

{output_format_example}

### User Query:
{{user_input}}

### {num_levels}-Level Keywords (Coarse to Fine):
"""


def generate_keyword_extraction_prompt(num_levels: int = 3) -> str:
    """
    Generate a dynamic keyword extraction prompt template based on the number of levels.

    The returned template still contains the {user_input} placeholder that needs to be
    filled in by the caller.

    Args:
        num_levels: Number of granularity levels (default: 3)

    Returns:
        Prompt template string with the {user_input} placeholder
    """
    # Generate level descriptions with granularity focus
    level_descriptions = []
    for i in range(1, num_levels + 1):
        # Define granularity characteristics
        if i == 1:
            granularity = "Coarse-grained"
            desc_text = "Multi-word phrases, compound expressions, broader concepts"
            examples = '"machine learning algorithms", "data processing pipeline", "neural network training"'
        elif i == num_levels:
            granularity = "Fine-grained"
            desc_text = "Single words, precise terms, atomic concepts"
            examples = '"optimization", "gradient", "tensor", "epoch"'
        else:
            granularity = f"Medium-grained (Level {i})"
            desc_text = "2-3 word phrases or compound terms transitioning to single words"
            examples = '"deep learning", "batch normalization", "learning rate"'

        level_descriptions.append(
            f"**Level {i}** ({granularity}):\n"
            f"  - Granularity: {desc_text}\n"
            f"  - Example keywords: {examples}\n"
            f"  - Note: IDF values should reflect term rarity, not granularity level"
        )

    # Generate output format examples (avoiding f-string interpolation issues)
    output_examples = []
    for i in range(1, num_levels + 1):
        # Use double braces to escape them in the format string
        example_dict = '{{"keyword1": idf_value, "keyword2": idf_value, ...}}'
        output_examples.append(
            f"<KEYWORDS_LEVEL_{i}>\n{example_dict}\n</KEYWORDS_LEVEL_{i}>"
        )

    # Format the template with num_levels, descriptions, and examples
    # Note: {{user_input}} becomes {user_input} after this format call
    return QUERY_KEYWORDS_EXTRACTION.format(
        num_levels=num_levels,
        level_descriptions="\n\n".join(level_descriptions),
        output_format_example="\n\n".join(output_examples)
    )


EVIDENCE_SUMMARY = """
## Role: High-Precision Information Synthesis Expert

## Task:
Synthesize a structured response based on the User Input and the provided Evidences.

### Critical Constraints:
1. **Language Consistency:** All output fields (<DESCRIPTION>, <NAME>, and <CONTENT>) MUST be written in the **same language** as the User Input.
2. **Ignore irrelevant noise:** Focus exclusively on information that directly relates to the User Input. If evidences contain conflicting or redundant data, prioritize accuracy and relevance.

### Input Data:
- **User Input:** {user_input}
- **Retrieved Evidences:** {evidences}

### Output Instructions:
1. **<DESCRIPTION>**: A high-level, concise synthesis of how the evidences address the user input.
   - *Constraint:* Maximum 3 sentences. Written in the language of {user_input}.
2. **<NAME>**: An ultra-short, catchy title or identifier for the description.
   - *Constraint:* Exactly 1 sentence, maximum 30 characters. Written in the language of {user_input}.
3. **<CONTENT>**: A detailed and comprehensive summary of all relevant key points extracted from the evidences.
   - *Constraint:* Written in the language of {user_input}.

### Output Format:
<DESCRIPTION>[Concise synthesis]</DESCRIPTION>
<NAME>[Short title]</NAME>
<CONTENT>[Detailed summary]</CONTENT>
"""


SEARCH_RESULT_SUMMARY = """
### Task
Analyze the provided {text_content} and generate a concise summary in the form of a Markdown Briefing.

### Constraints
1. **Language Continuity**: The output must be in the SAME language as the User Input.
2. **Format**: Use Markdown (headings, bullet points, and bold text) for high readability.
3. **Style**: Keep it professional, objective, and clear. Avoid fluff.

### Input Data
- **User Input**: {user_input}
- **Search Result Text**: {text_content}

### Output
[Generate the Markdown Briefing here]
"""


EVALUATE_EVIDENCE_SAMPLE = """
You are a document retrieval assistant. Please evaluate if the text snippet contains clues to answer the user's question.

### Language Constraint:
Detect the language of the "Query" and provide the "reasoning" and "output" in the same language (e.g., if the query is in Chinese, the reasoning must be in Chinese).

### Inputs:
Query: "{query}"

Text Snippet (Source: {sample_source}):
"...{sample_content}..."

### Output Requirement:
Return JSON:
- score (0-10):
  0-3: Completely irrelevant.
  4-7: Contains relevant keywords or context but no direct answer.
  8-10: Contains exact data, facts, or direct answer.
- reasoning: Short reasoning in the SAME language as the query.

JSON format only.
"""


ROI_RESULT_SUMMARY = """
### Task
Analyze the provided {text_content} and generate a concise summary in the form of a Markdown Briefing.

### Constraints
1. **Language Continuity**: The output must be in the SAME language as the User Input.
2. **Format**: Use Markdown (headings, bullet points, and bold text) for high readability.
3. **Style**: Keep it professional, objective, and clear. Avoid fluff.

### Input Data
- **User Input**: {user_input}
- **Search Result Text**: {text_content}

### Output
[Generate the Markdown Briefing here]
"""
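A small sketch of building the multi-level keyword prompt with `generate_keyword_extraction_prompt`; the final substitution of the remaining `{user_input}` placeholder via `str.replace` is an assumption for illustration, since the package's own call site is not part of this diff.

from sirchmunk.llm.prompts import generate_keyword_extraction_prompt

template = generate_keyword_extraction_prompt(num_levels=3)
# The returned template still contains {user_input} alongside literal JSON braces,
# so a plain str.replace avoids re-interpreting those braces as format fields.
prompt = template.replace("{user_input}", "How does gradient checkpointing reduce GPU memory?")
print(prompt)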
@@ -0,0 +1 @@
# Copyright (c) ModelScope Contributors. All rights reserved.
|