sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
sirchmunk/learnings/knowledge_base.py
@@ -0,0 +1,232 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ import json
+ import hashlib
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, Union
+
+
+ from sirchmunk.learnings.evidence_processor import (
+     MonteCarloEvidenceSampling,
+     RoiResult,
+ )
+ from sirchmunk.llm.openai_chat import OpenAIChat
+ from sirchmunk.llm.prompts import EVIDENCE_SUMMARY
+ from sirchmunk.schema.knowledge import (
+     AbstractionLevel,
+     EvidenceUnit,
+     KnowledgeCluster,
+     Lifecycle,
+ )
+ from sirchmunk.schema.metadata import FileInfo
+ from sirchmunk.schema.request import Request
+ from sirchmunk.utils.constants import DEFAULT_WORK_PATH
+ from sirchmunk.utils.file_utils import StorageStructure, fast_extract
+ from sirchmunk.utils import create_logger, LogCallback
+ from sirchmunk.utils.utils import extract_fields
+
+ class KnowledgeBase:
+     """
+     A knowledge base that dynamically builds and manages knowledge clusters from retrieved information and metadata.
+     """
+
+     def __init__(
+         self,
+         llm: OpenAIChat,
+         metadata_map: Dict[str, Any] = None,
+         work_path: Union[str, Path] = None,
+         log_callback: LogCallback = None,
+     ):
+         """
+         Initialize the KnowledgeBase with an LLM and metadata mapping.
+
+         Args:
+             llm (OpenAIChat): An instance of the OpenAIChat LLM for processing text.
+             metadata_map (Dict[str, Any]): A mapping of all metadata information.
+                 k: metadata cache key, refers to `FileInfo.cache_key`
+                 v: metadata path or content
+             work_path: Working directory path.
+             log_callback: Optional log callback function for custom logging.
+         """
+         self.llm = llm
+         self.metadata_map = metadata_map
+         self.work_path: Path = (
+             DEFAULT_WORK_PATH if work_path is None else Path(work_path).resolve()
+         )
+         self.metadata_path: Path = (
+             self.work_path / StorageStructure.CACHE_DIR / StorageStructure.METADATA_DIR
+         )
+
+         # Store log_callback for passing to child components
+         self.log_callback = log_callback
+
+         # Create bound logger with callback - returns AsyncLogger instance
+         self._log = create_logger(log_callback=log_callback)
+
+         self.llm_usages: List[Dict[str, Any]] = []
+
+     @staticmethod
+     def _get_file_info(
+         file_or_url: str, metadata_path: Union[str, Path]
+     ) -> Optional[FileInfo]:
+
+         cache_key: str = FileInfo.get_cache_key(file_or_url=file_or_url)
+         meta_file: Path = Path(metadata_path) / f"{cache_key}.json"
+
+         if not meta_file.exists():
+             return None
+
+         with open(meta_file, "r", encoding="utf-8") as f:
+             metadata_content = json.load(f)
+
+         return FileInfo.from_dict(info=metadata_content)
+
+     @staticmethod
+     def _compose_cluster_text(
+         name: Optional[str],
+         description: Union[List[str], str, None],
+         content: Union[List[str], str, None],
+     ) -> str:
+         """
+         Compose a stable text representation of a cluster from name, description, and content.
+         This is used for deterministic cluster ID generation.
+         """
+         parts: List[str] = []
+         if name:
+             parts.append(str(name).strip())
+
+         if description:
+             if isinstance(description, list):
+                 parts.extend([str(item).strip() for item in description if item])
+             else:
+                 parts.append(str(description).strip())
+
+         if content:
+             if isinstance(content, list):
+                 parts.extend([str(item).strip() for item in content if item])
+             else:
+                 parts.append(str(content).strip())
+
+         return "\n\n".join([part for part in parts if part])
+
+     async def build(
+         self,
+         request: Request,
+         retrieved_infos: List[Dict[str, Any]],
+         keywords: Dict[str, float] = None,
+         top_k_files: Optional[int] = 3,
+         top_k_snippets: Optional[int] = 5,
+         confidence_threshold: Optional[float] = 8.0,
+         verbose: bool = True,
+     ) -> Union[KnowledgeCluster, None]:
+         """Dynamically build a knowledge cluster from retrieved information and metadata."""
+
+         if len(retrieved_infos) == 0:
+             await self._log.warning(
+                 "No retrieved information available to build knowledge cluster."
+             )
+             return None
+
+         retrieved_infos = retrieved_infos[:top_k_files]
+
+         keywords = keywords or {}
+
+         # Get evidence units (regions of interest) from raw retrieved infos
+         evidences: List[EvidenceUnit] = []
+         for info in retrieved_infos:
+             file_path_or_url: str = info["path"]
+
+             # TODO: handle more file types; deal with large files; async adaptive extraction
+             extraction_result = await fast_extract(file_path=file_path_or_url)
+             doc_content: str = extraction_result.content
+
+             sampler = MonteCarloEvidenceSampling(
+                 llm=self.llm,
+                 doc_content=doc_content,
+                 verbose=verbose,
+                 log_callback=self.log_callback,
+             )
+             roi_result: RoiResult = await sampler.get_roi(
+                 query=request.get_user_input(),
+                 keywords=keywords,
+                 confidence_threshold=confidence_threshold,
+                 top_k=top_k_snippets,
+             )
+
+             evidence_unit = EvidenceUnit(
+                 doc_id=FileInfo.get_cache_key(file_path_or_url),
+                 file_or_url=Path(file_path_or_url),
+                 summary=roi_result.summary,
+                 is_found=roi_result.is_found,
+                 snippets=roi_result.snippets,
+                 extracted_at=datetime.now(),
+                 conflict_group=[],
+             )
+             self.llm_usages.extend(sampler.llm_usages)
+             evidences.append(evidence_unit)
+
+         if len(evidences) == 0:
+             await self._log.warning("No evidence units extracted from retrieved information.")
+             return None
+
+         # Get `name`, `description` and `content` from the user request and evidences using the LLM
+         # TODO: process other segment types as well
+         evidence_contents: List[str] = [ev.summary for ev in evidences]
+
+         evidence_summary_prompt: str = EVIDENCE_SUMMARY.format(
+             user_input=request.get_user_input(),
+             evidences="\n\n".join(evidence_contents),
+         )
+
+         evidence_summary_llm_response = await self.llm.achat(
+             messages=[{"role": "user", "content": evidence_summary_prompt}],
+             stream=True,
+         )
+         evidence_summary_response: str = evidence_summary_llm_response.content
+         self.llm_usages.append(evidence_summary_llm_response.usage)
+
+         cluster_infos: Dict[str, Any] = extract_fields(
+             content=evidence_summary_response
+         )
+         if len(cluster_infos) == 0:
+             await self._log.warning(
+                 "Failed to extract knowledge cluster information from LLM response."
+             )
+             return None
+
+         cluster_name = cluster_infos.get("name")
+         cluster_description = cluster_infos.get("description")
+         cluster_content = cluster_infos.get("content")
+
+         cluster_text = self._compose_cluster_text(
+             name=cluster_name,
+             description=cluster_description,
+             content=cluster_content,
+         )
+         if not cluster_text:
+             cluster_text = request.get_user_input() or "unknown"
+
+         cluster_id = f"C{hashlib.sha256(cluster_text.encode('utf-8')).hexdigest()[:10]}"
+
+         cluster = KnowledgeCluster(
+             id=cluster_id,
+             name=cluster_name,
+             description=[cluster_description] if cluster_description else [],
+             content=cluster_content,
+             scripts=[],
+             resources=[],
+             patterns=[],
+             constraints=[],
+             evidences=evidences,
+             confidence=0.5,
+             abstraction_level=AbstractionLevel.TECHNIQUE,
+             landmark_potential=0.5,
+             hotness=0.5,
+             lifecycle=Lifecycle.EMERGING,
+             create_time=datetime.now(),
+             last_modified=datetime.now(),
+             version=1,
+             related_clusters=[],
+         )
+
+         return cluster
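
The `build()` pipeline above runs retrieval hit → `fast_extract` → Monte Carlo ROI sampling → LLM evidence summary → deterministic `sha256`-based cluster ID. Below is a minimal calling sketch, assuming an OpenAI-compatible endpoint; the `Request` constructor is not part of this diff, so that call is left as a hypothetical placeholder (only `request.get_user_input()` is exercised here), and each retrieved info only needs the `"path"` key that `build()` reads:

```python
import asyncio

from sirchmunk.learnings.knowledge_base import KnowledgeBase
from sirchmunk.llm.openai_chat import OpenAIChat
from sirchmunk.schema.request import Request


async def main() -> None:
    llm = OpenAIChat(
        api_key="sk-...",                      # placeholder credential
        base_url="https://api.openai.com/v1",  # assumption: any OpenAI-compatible endpoint
        model="gpt-4o-mini",                   # assumption: any chat model name
    )
    kb = KnowledgeBase(llm=llm)

    request = Request(...)  # hypothetical: construct per sirchmunk/schema/request.py

    cluster = await kb.build(
        request=request,
        retrieved_infos=[{"path": "docs/report.md"}],  # build() reads info["path"]
        keywords={"monte carlo sampling": 7.0},        # keyword -> estimated IDF
        top_k_files=3,
        top_k_snippets=5,
    )
    if cluster is not None:
        print(cluster.id, cluster.name)


asyncio.run(main())
```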
sirchmunk/llm/__init__.py
@@ -0,0 +1,2 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ from .openai_chat import OpenAIChat
sirchmunk/llm/openai_chat.py
@@ -0,0 +1,247 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
+ from dataclasses import dataclass, field
+
+ from openai import AsyncOpenAI, OpenAI
+ from sirchmunk.utils import create_logger, LogCallback
+
+ if TYPE_CHECKING:
+     pass
+
+
+ @dataclass
+ class OpenAIChatResponse:
+     """
+     Data class representing the response from the OpenAI Chat API.
+     """
+     content: str
+     role: str = "assistant"
+     usage: Dict[str, int] = field(default_factory=dict)
+     model: str = None
+     finish_reason: str = None
+     logprobs: Any = None
+
+     def __str__(self):
+         return self.content
+
+     def to_dict(self) -> Dict[str, Any]:
+         """
+         Convert the response to a dictionary.
+
+         Returns:
+             Dict[str, Any]: The response as a dictionary.
+         """
+         return {
+             "content": self.content,
+             "role": self.role,
+             "usage": self.usage,
+             "model": self.model,
+             "finish_reason": self.finish_reason,
+             "logprobs": self.logprobs,
+         }
+
+
+ class OpenAIChat:
+     """
+     A client for interacting with OpenAI's chat completion API.
+     """
+
+     def __init__(
+         self,
+         api_key: str = None,
+         base_url: str = None,
+         model: str = None,
+         log_callback: LogCallback = None,
+         **kwargs,
+     ):
+         """
+         Initialize the OpenAIChat client.
+
+         Args:
+             api_key (str): The API key for OpenAI.
+             base_url (str): The base URL for the OpenAI API.
+             model (str): The model to use for chat completions.
+             log_callback (LogCallback): Optional callback for logging.
+             **kwargs: Additional keyword arguments passed to the OpenAI client create method.
+         """
+         self._client = OpenAI(
+             api_key=api_key,
+             base_url=base_url,
+         )
+
+         self._async_client = AsyncOpenAI(
+             api_key=api_key,
+             base_url=base_url,
+         )
+
+         self._model = model
+         self._kwargs = kwargs
+
+         # Initialize synchronous and asynchronous loggers
+         self._logger = create_logger(log_callback=log_callback, enable_async=False)
+         self._logger_async = create_logger(log_callback=log_callback, enable_async=True)
+
+     def chat(
+         self,
+         messages: List[Dict[str, Any]],
+         stream: bool = True,
+     ) -> OpenAIChatResponse:
+         """
+         Generate a chat completion synchronously.
+
+         Args:
+             messages (List[Dict[str, Any]]): A list of messages for the chat.
+             stream (bool): Whether to stream the response.
+
+         Returns:
+             OpenAIChatResponse: The structured response containing content, usage, etc.
+         """
+         # Ensure we try to get usage metrics even in streaming mode if supported by the API version
+         request_kwargs = self._kwargs.copy()
+         if stream and "stream_options" not in request_kwargs:
+             request_kwargs["stream_options"] = {"include_usage": True}
+
+         resp = self._client.chat.completions.create(
+             model=self._model, messages=messages, stream=stream, **request_kwargs
+         )
+
+         res_content: str = ""
+         role: str = "assistant"
+         usage: Dict[str, int] = {}
+         finish_reason: str = None
+         response_model: str = self._model
+
+         if stream:
+             for chunk in resp:
+                 # Extract usage if present (usually in the last chunk if stream_options is set)
+                 if chunk.usage:
+                     usage = chunk.usage.model_dump()
+
+                 # Update model name if provided in chunks
+                 if chunk.model:
+                     response_model = chunk.model
+
+                 if not chunk.choices:
+                     continue
+
+                 delta = chunk.choices[0].delta
+
+                 # Capture role (usually only in the first chunk)
+                 if delta.role:
+                     role = delta.role
+                     self._logger.info(f"[role={delta.role}] ", end="", flush=True)
+
+                 # Capture content
+                 if delta.content:
+                     self._logger.info(delta.content, end="", flush=True)
+                     res_content += delta.content
+
+                 # Capture finish reason
+                 if chunk.choices[0].finish_reason:
+                     finish_reason = chunk.choices[0].finish_reason
+
+             # Print a newline at the end of streaming for cleaner logs
+             self._logger.info("", end="\n", flush=True)
+
+         else:
+             # Non-streaming response
+             message = resp.choices[0].message
+             res_content = message.content or ""
+             role = message.role
+             finish_reason = resp.choices[0].finish_reason
+             response_model = resp.model
+             if resp.usage:
+                 usage = resp.usage.model_dump()
+
+             # Log the full response content since we didn't stream it
+             self._logger.info(f"[role={role}] {res_content}")
+
+         return OpenAIChatResponse(
+             content=res_content,
+             role=role,
+             usage=usage,
+             model=response_model,
+             finish_reason=finish_reason,
+         )
+
+     async def achat(
+         self,
+         messages: List[Dict[str, Any]],
+         stream: bool = True,
+     ) -> OpenAIChatResponse:
+         """
+         Generate a chat completion asynchronously.
+
+         Args:
+             messages (List[Dict[str, Any]]): A list of messages for the chat.
+             stream (bool): Whether to stream the response.
+
+         Returns:
+             OpenAIChatResponse: The structured response containing content, usage, etc.
+         """
+         # Ensure we try to get usage metrics even in streaming mode
+         request_kwargs = self._kwargs.copy()
+         if stream and "stream_options" not in request_kwargs:
+             request_kwargs["stream_options"] = {"include_usage": True}
+
+         resp = await self._async_client.chat.completions.create(
+             model=self._model, messages=messages, stream=stream, **request_kwargs
+         )
+
+         res_content: str = ""
+         role: str = "assistant"
+         usage: Dict[str, int] = {}
+         finish_reason: str = None
+         response_model: str = self._model
+
+         if stream:
+             async for chunk in resp:
+                 # Extract usage if present (usually in the last chunk if stream_options is set)
+                 if chunk.usage:
+                     usage = chunk.usage.model_dump()
+
+                 if chunk.model:
+                     response_model = chunk.model
+
+                 if not chunk.choices:
+                     continue
+
+                 delta = chunk.choices[0].delta
+
+                 # Capture role
+                 if delta.role:
+                     role = delta.role
+                     await self._logger_async.info(f"[role={delta.role}] ", end="", flush=True)
+
+                 # Capture content
+                 if delta.content:
+                     await self._logger_async.info(delta.content, end="", flush=True)
+                     res_content += delta.content
+
+                 # Capture finish reason
+                 if chunk.choices[0].finish_reason:
+                     finish_reason = chunk.choices[0].finish_reason
+
+             # Print a newline at the end of streaming for cleaner logs
+             await self._logger_async.info("", end="\n", flush=True)
+
+         else:
+             # Non-streaming response
+             message = resp.choices[0].message
+             res_content = message.content or ""
+             role = message.role
+             finish_reason = resp.choices[0].finish_reason
+             response_model = resp.model
+             if resp.usage:
+                 usage = resp.usage.model_dump()
+
+             # Log the full response content since we didn't stream it
+             await self._logger_async.info(f"[role={role}] {res_content}")
+
+         return OpenAIChatResponse(
+             content=res_content,
+             role=role,
+             usage=usage,
+             model=response_model,
+             finish_reason=finish_reason,
+         )
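
A minimal sketch of driving the async client above; the key, endpoint, and model name are placeholders rather than values shipped with the package. Note that `achat()` defaults to streaming and injects `stream_options={"include_usage": True}` when the caller has not set it, so token usage still arrives in the final chunk:

```python
import asyncio

from sirchmunk.llm.openai_chat import OpenAIChat


async def main() -> None:
    llm = OpenAIChat(
        api_key="sk-...",                      # placeholder credential
        base_url="https://api.openai.com/v1",  # assumption: any OpenAI-compatible endpoint
        model="gpt-4o-mini",                   # assumption: any chat model name
    )
    resp = await llm.achat(
        messages=[{"role": "user", "content": "Summarize BM25 in one sentence."}],
    )
    print(resp.content)        # accumulated streamed text
    print(resp.usage)          # token counts from the final chunk, if reported
    print(resp.finish_reason)  # e.g. "stop"


asyncio.run(main())
```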
sirchmunk/llm/prompts.py
@@ -0,0 +1,216 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.
+ # flake8: noqa
+ # yapf: disable
+
+
+ SNAPSHOT_KEYWORDS_EXTRACTION = """
+ Analyze the following document and extract the most relevant and representative key phrases.
+ Prioritize terms that capture the core topics, central concepts, important entities (e.g., people, organizations, locations), and domain-specific terminology.
+ Exclude generic words (e.g., "the", "and", "result", "study") unless they are part of a meaningful multi-word phrase.
+ Limit the output to {max_num} concise key phrases, ranked by significance.
+ You **MUST** output the key phrases as a comma-separated list without any additional explanation or formatting.
+ You **MUST** adjust the language of the key phrases to be consistent with the language of the input document.
+
+ **Input Document**:
+ {document_content}
+ """
+
+
+ SNAPSHOT_TOC_EXTRACTION = """
+ Generate a Table of Contents (ToC) from the given document, adapting its depth and content density to the document’s inherent complexity.
+
+ Requirements:
+
+ 1. Adaptive Hierarchy Depth: Dynamically set the depth between 3 and 5 levels, based on the document’s structural and semantic complexity (e.g., 3 levels for simple notices, 5 for technical specs).
+ 2. Summarized Entries: Each ToC item must concisely summarize the section’s core content (10–25 words), not just repeat headings. Capture purpose, key actions, or critical info.
+ 3. Faithfulness: Do not invent sections. Infer headings only from logical paragraph groupings if explicit titles are absent.
+ 4. Format: Use Markdown nested lists with 2-space indents per level (e.g., - → - → -). Output ToC only—no preamble or commentary.
+
+ **Input Document**:
+ {document_content}
+ """
+
+
+ QUERY_KEYWORDS_EXTRACTION = """
+ ### Role: Search Optimization Expert & Information Retrieval Specialist
+
+ ### Task:
+ Extract **{num_levels} sets** of keywords from the user query with **different granularities** to maximize search hit rate.
+
+ ### Multi-Level Keyword Granularity Strategy:
+
+ Extract {num_levels} levels of keywords with progressively finer granularity:
+
+ {level_descriptions}
+
+ ### IDF Value Guidelines:
+ - Estimate the **IDF (Inverse Document Frequency)** for each keyword based on its rarity in a general corpus
+ - IDF range: **[0-10]** where:
+   - 0-3: Very common terms (e.g., "the", "is", "data")
+   - 4-6: Moderately common terms (e.g., "algorithm", "network")
+   - 7-9: Rare/specific terms (e.g., "backpropagation", "xgboost")
+   - 10: Extremely rare/specialized terms
+ - IDF values are **independent** of keyword level - focus on term rarity, not granularity
+
+ ### Requirements:
+ - Each level should have 3-5 keywords
+ - Keywords must become progressively **finer-grained** from Level 1 to Level {num_levels}
+ - **Level 1**: Coarse-grained phrases/multi-word expressions
+ - **Level {num_levels}**: Fine-grained single words or precise technical terms
+ - ONLY extract from the user query context; do NOT add external information
+ - Ensure keywords at different levels are complementary, not redundant
+
+ ### Output Format:
+ Output {num_levels} separate JSON-like dicts within their respective tags:
+
+ {output_format_example}
+
+ ### User Query:
+ {{user_input}}
+
+ ### {num_levels}-Level Keywords (Coarse to Fine):
+ """
+
+
+ def generate_keyword_extraction_prompt(num_levels: int = 3) -> str:
+     """
+     Generate a dynamic keyword extraction prompt template based on the number of levels.
+
+     The returned template still contains a {user_input} placeholder that needs to be
+     filled in by the caller.
+
+     Args:
+         num_levels: Number of granularity levels (default: 3)
+
+     Returns:
+         Prompt template string with a {user_input} placeholder
+     """
+     # Generate level descriptions with granularity focus
+     level_descriptions = []
+     for i in range(1, num_levels + 1):
+         # Define granularity characteristics
+         if i == 1:
+             granularity = "Coarse-grained"
+             desc_text = "Multi-word phrases, compound expressions, broader concepts"
+             examples = '"machine learning algorithms", "data processing pipeline", "neural network training"'
+         elif i == num_levels:
+             granularity = "Fine-grained"
+             desc_text = "Single words, precise terms, atomic concepts"
+             examples = '"optimization", "gradient", "tensor", "epoch"'
+         else:
+             granularity = f"Medium-grained (Level {i})"
+             desc_text = "2-3 word phrases or compound terms transitioning to single words"
+             examples = '"deep learning", "batch normalization", "learning rate"'
+
+         level_descriptions.append(
+             f"**Level {i}** ({granularity}):\n"
+             f"  - Granularity: {desc_text}\n"
+             f"  - Example keywords: {examples}\n"
+             f"  - Note: IDF values should reflect term rarity, not granularity level"
+         )
+
+     # Generate output format examples (avoiding f-string interpolation issues)
+     output_examples = []
+     for i in range(1, num_levels + 1):
+         # Use double braces to escape them in the format string
+         example_dict = '{{"keyword1": idf_value, "keyword2": idf_value, ...}}'
+         output_examples.append(
+             f"<KEYWORDS_LEVEL_{i}>\n{example_dict}\n</KEYWORDS_LEVEL_{i}>"
+         )
+
+     # Format the template with num_levels, descriptions, and examples
+     # Note: {{user_input}} becomes {user_input} after this format call
+     return QUERY_KEYWORDS_EXTRACTION.format(
+         num_levels=num_levels,
+         level_descriptions="\n\n".join(level_descriptions),
+         output_format_example="\n\n".join(output_examples)
+     )
+
+
+ EVIDENCE_SUMMARY = """
+ ## Role: High-Precision Information Synthesis Expert
+
+ ## Task:
+ Synthesize a structured response based on the User Input and the provided Evidences.
+
+ ### Critical Constraints:
+ 1. **Language Consistency:** All output fields (<DESCRIPTION>, <NAME>, and <CONTENT>) MUST be written in the **same language** as the User Input.
+ 2. **Ignore irrelevant noise:** Focus exclusively on information that directly relates to the User Input. If evidences contain conflicting or redundant data, prioritize accuracy and relevance.
+
+ ### Input Data:
+ - **User Input:** {user_input}
+ - **Retrieved Evidences:** {evidences}
+
+ ### Output Instructions:
+ 1. **<DESCRIPTION>**: A high-level, concise synthesis of how the evidences address the user input.
+    - *Constraint:* Maximum 3 sentences. Written in the language of {user_input}.
+ 2. **<NAME>**: An ultra-short, catchy title or identifier for the description.
+    - *Constraint:* Exactly 1 sentence, maximum 30 characters. Written in the language of {user_input}.
+ 3. **<CONTENT>**: A detailed and comprehensive summary of all relevant key points extracted from the evidences.
+    - *Constraint:* Written in the language of {user_input}.
+
+ ### Output Format:
+ <DESCRIPTION>[Concise synthesis]</DESCRIPTION>
+ <NAME>[Short title]</NAME>
+ <CONTENT>[Detailed summary]</CONTENT>
+ """
+
+
+ SEARCH_RESULT_SUMMARY = """
+ ### Task
+ Analyze the provided {text_content} and generate a concise summary in the form of a Markdown Briefing.
+
+ ### Constraints
+ 1. **Language Continuity**: The output must be in the SAME language as the User Input.
+ 2. **Format**: Use Markdown (headings, bullet points, and bold text) for high readability.
+ 3. **Style**: Keep it professional, objective, and clear. Avoid fluff.
+
+ ### Input Data
+ - **User Input**: {user_input}
+ - **Search Result Text**: {text_content}
+
+ ### Output
+ [Generate the Markdown Briefing here]
+ """
+
+
+ EVALUATE_EVIDENCE_SAMPLE = """
+ You are a document retrieval assistant. Please evaluate whether the text snippet contains clues to answer the user's question.
+
+ ### Language Constraint:
+ Detect the language of the "Query" and provide the "reasoning" and "output" in the same language (e.g., if the query is in Chinese, the reasoning must be in Chinese).
+
+ ### Inputs:
+ Query: "{query}"
+
+ Text Snippet (Source: {sample_source}):
+ "...{sample_content}..."
+
+ ### Output Requirement:
+ Return JSON:
+ - score (0-10):
+   0-3: Completely irrelevant.
+   4-7: Contains relevant keywords or context but no direct answer.
+   8-10: Contains exact data, facts, or a direct answer.
+ - reasoning: Short reasoning in the SAME language as the query.
+
+ JSON format only.
+ """
+
+
+ ROI_RESULT_SUMMARY = """
+ ### Task
+ Analyze the provided {text_content} and generate a concise summary in the form of a Markdown Briefing.
+
+ ### Constraints
+ 1. **Language Continuity**: The output must be in the SAME language as the User Input.
+ 2. **Format**: Use Markdown (headings, bullet points, and bold text) for high readability.
+ 3. **Style**: Keep it professional, objective, and clear. Avoid fluff.
+
+ ### Input Data
+ - **User Input**: {user_input}
+ - **Search Result Text**: {text_content}
+
+ ### Output
+ [Generate the Markdown Briefing here]
+ """
sirchmunk/retrieve/__init__.py
@@ -0,0 +1 @@
+ # Copyright (c) ModelScope Contributors. All rights reserved.