sirchmunk 0.0.0__py3-none-any.whl → 0.0.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. sirchmunk/__init__.py +8 -0
  2. sirchmunk/base.py +17 -0
  3. sirchmunk/insight/__init__.py +4 -0
  4. sirchmunk/insight/text_insights.py +292 -0
  5. sirchmunk/learnings/__init__.py +1 -0
  6. sirchmunk/learnings/evidence_processor.py +525 -0
  7. sirchmunk/learnings/knowledge_base.py +232 -0
  8. sirchmunk/llm/__init__.py +2 -0
  9. sirchmunk/llm/openai_chat.py +247 -0
  10. sirchmunk/llm/prompts.py +216 -0
  11. sirchmunk/retrieve/__init__.py +1 -0
  12. sirchmunk/retrieve/base.py +25 -0
  13. sirchmunk/retrieve/text_retriever.py +1026 -0
  14. sirchmunk/scan/__init__.py +1 -0
  15. sirchmunk/scan/base.py +18 -0
  16. sirchmunk/scan/file_scanner.py +373 -0
  17. sirchmunk/scan/web_scanner.py +18 -0
  18. sirchmunk/scheduler/__init__.py +0 -0
  19. sirchmunk/schema/__init__.py +2 -0
  20. sirchmunk/schema/cognition.py +106 -0
  21. sirchmunk/schema/context.py +25 -0
  22. sirchmunk/schema/knowledge.py +318 -0
  23. sirchmunk/schema/metadata.py +658 -0
  24. sirchmunk/schema/request.py +221 -0
  25. sirchmunk/schema/response.py +20 -0
  26. sirchmunk/schema/snapshot.py +346 -0
  27. sirchmunk/search.py +475 -0
  28. sirchmunk/storage/__init__.py +7 -0
  29. sirchmunk/storage/duckdb.py +676 -0
  30. sirchmunk/storage/knowledge_manager.py +720 -0
  31. sirchmunk/utils/__init__.py +15 -0
  32. sirchmunk/utils/constants.py +15 -0
  33. sirchmunk/utils/deps.py +23 -0
  34. sirchmunk/utils/file_utils.py +70 -0
  35. sirchmunk/utils/install_rga.py +124 -0
  36. sirchmunk/utils/log_utils.py +360 -0
  37. sirchmunk/utils/tokenizer_util.py +55 -0
  38. sirchmunk/utils/utils.py +108 -0
  39. sirchmunk/version.py +1 -1
  40. sirchmunk-0.0.1.post1.dist-info/METADATA +483 -0
  41. sirchmunk-0.0.1.post1.dist-info/RECORD +45 -0
  42. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/WHEEL +1 -1
  43. sirchmunk-0.0.0.dist-info/METADATA +0 -26
  44. sirchmunk-0.0.0.dist-info/RECORD +0 -8
  45. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/entry_points.txt +0 -0
  46. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/licenses/LICENSE +0 -0
  47. {sirchmunk-0.0.0.dist-info → sirchmunk-0.0.1.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,221 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ from dataclasses import dataclass
3
+ from typing import Dict, List, Literal, Optional, Union
4
+
5
+
6
@dataclass
class ImageURL:
    """An image reference: its URL plus rendering detail and MIME media type."""

    # Location of the image; may be an http(s) URL or a base64 data URI.
    url: str
    # Requested vision detail level (OpenAI convention: "auto", "low", "high").
    detail: str = "auto"
    # MIME type of the image payload. Necessary for Anthropic.
    media_type: str = "image/jpeg"
13
+
14
+
15
@dataclass
class ContentItem:
    """A single piece of message content: plain text or an image reference."""

    type: str  # "text" or "image_url"
    text: Optional[str] = None
    image_url: Optional[ImageURL] = None

    def to_openai(self):
        """Render this item in the OpenAI chat-completions content format."""
        if self.type == "text":
            return {"type": "text", "text": self.text}
        image = self.image_url
        return {
            "type": "image_url",
            "image_url": {"url": image.url, "detail": image.detail},
        }

    def to_anthropic(self):
        """Render this item in the Anthropic messages content format."""
        if self.type == "text":
            return {"type": "text", "text": self.text}

        # Anthropic wants the bare base64 payload, so drop any data-URI prefix.
        payload = self.image_url.url
        marker = "base64,"
        if marker in payload:
            payload = payload.split(marker)[1]

        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": self.image_url.media_type,
                "data": payload,
            },
        }
48
+
49
+
50
@dataclass
class Message:
    """One turn of the conversation; role is system/user/assistant."""

    # Speaker of this turn ("system", "user", or "assistant").
    role: str
    # Either a plain string or a list of rich content items.
    content: Union[str, List[ContentItem]]
56
+
57
+
58
@dataclass
class Request:
    """
    Represents a request to Agentic Search API, supporting both OpenAI and Anthropic message formats.
    """

    # Conversation turns; the system prompt is carried separately in `system`.
    messages: List[Message]
    system: Optional[str] = "You are a helpful assistant."
    message_format: Literal["openai", "anthropic"] = "openai"

    def get_system(self) -> str:
        """Get the system prompt."""
        return self.system

    def get_user_input(self) -> str:
        """Extract the user query from the messages.

        Returns:
            The content of the FIRST user message (text items joined with
            spaces for rich content), or "" when no user message exists.
        """
        for m in self.messages:
            if m.role == "user":
                if isinstance(m.content, str):
                    return m.content
                else:
                    texts = [c.text for c in m.content if c.type == "text" and c.text]
                    return " ".join(texts)
        return ""

    def get_image_urls(self) -> List[str]:
        """Extract image URLs from all user messages (document order)."""
        image_urls = []
        for m in self.messages:
            if m.role == "user":
                if isinstance(m.content, list):
                    for c in m.content:
                        if c.type == "image_url" and c.image_url:
                            image_urls.append(c.image_url.url)
        return image_urls

    def to_payload(
        self, prompt_template: Optional[str] = None
    ) -> Union[List[Dict], Dict]:
        """Convert messages to the appropriate API payload format based on message_format.

        Args:
            prompt_template: Optional template containing a ``{user_input}``
                placeholder, applied to user text content.

        Returns:
            A list of message dicts (OpenAI) or a dict with top-level
            ``system`` and ``messages`` keys (Anthropic).

        Raises:
            ValueError: If ``message_format`` is neither 'openai' nor 'anthropic'.
        """
        if self.message_format == "openai":
            return self._to_openai_payload(prompt_template=prompt_template)
        elif self.message_format == "anthropic":
            return self._to_anthropic_payload(prompt_template=prompt_template)
        else:
            raise ValueError(
                f"Unsupported message format: {self.message_format}, must be 'openai' or 'anthropic'."
            )

    def _to_openai_payload(self, prompt_template: Optional[str] = None) -> List[Dict]:
        """Convert messages to OpenAI API payload format.

        Bug fix: the system message is prepended to a local copy of the
        message list. The original called ``self.messages.insert(0, ...)``,
        mutating the request, so every ``to_payload`` call grew the list by
        one system message (which also leaked into subsequent Anthropic
        payloads built from the same Request).
        """
        formatted_msgs = []
        system_msg: Message = Message(role="system", content=self.system)
        # Do NOT mutate self.messages — build the payload from a fresh list.
        all_messages = [system_msg] + self.messages

        for m in all_messages:
            if m.role == "user" and prompt_template:
                # Apply prompt template if provided
                if isinstance(m.content, str):
                    content = prompt_template.format(user_input=m.content)
                else:
                    content = []
                    for c in m.content:
                        if c.type == "text":
                            formatted_text = prompt_template.format(user_input=c.text)
                            content.append({"type": "text", "text": formatted_text})
                        else:
                            content.append(c.to_openai())
            else:
                content = (
                    m.content
                    if isinstance(m.content, str)
                    else [c.to_openai() for c in m.content]
                )
            formatted_msgs.append({"role": m.role, "content": content})

        return formatted_msgs

    def _to_anthropic_payload(self, prompt_template: Optional[str] = None) -> Dict:
        """Convert messages to Anthropic API payload format.

        Anthropic expects 'system' as a top-level parameter, not inside the
        messages list, so only the conversation turns are formatted here.
        """
        formatted_msgs = []

        for m in self.messages:
            if m.role == "user" and prompt_template:
                # Apply prompt template if provided
                if isinstance(m.content, str):
                    content = prompt_template.format(user_input=m.content)
                else:
                    content = []
                    for c in m.content:
                        if c.type == "text":
                            formatted_text = prompt_template.format(user_input=c.text)
                            content.append({"type": "text", "text": formatted_text})
                        else:
                            content.append(c.to_anthropic())
            else:
                content = (
                    m.content
                    if isinstance(m.content, str)
                    else [c.to_anthropic() for c in m.content]
                )

            formatted_msgs.append({"role": m.role, "content": content})

        payload = {"system": self.system, "messages": formatted_msgs}

        return payload
166
+
167
+
168
if __name__ == "__main__":
    import json

    prompt_template: str = (
        "Please answer the following question carefully based on the given information: {user_input}"
    )

    # --- Demo: Anthropic-format request with text + base64 image content ---
    anthropic_request = Request(
        message_format="anthropic",
        system="Analyze the video frames carefully.",
        messages=[
            Message(
                role="user",
                content=[
                    ContentItem(type="text", text="What is happening here?"),
                    ContentItem(
                        type="image_url",
                        image_url=ImageURL(url="base64_string_here..."),
                    ),
                ],
            ),
        ],
    )
    anthropic_json = json.dumps(
        anthropic_request.to_payload(prompt_template=prompt_template),
        ensure_ascii=False,
        indent=2,
    )
    print(f"Anthropic Payload:\n{anthropic_json}")

    # --- Demo: OpenAI-format request with text + remote image URL ---
    openai_request = Request(
        message_format="openai",
        system="You are a helpful assistant.",
        messages=[
            Message(
                role="user",
                content=[
                    ContentItem(type="text", text="What is unusual about this image?"),
                    ContentItem(
                        type="image_url",
                        image_url=ImageURL(
                            url="https://example.com/strange-building.jpg"
                        ),
                    ),
                ],
            ),
        ],
    )
    openai_json = json.dumps(
        openai_request.to_payload(prompt_template=prompt_template),
        ensure_ascii=False,
        indent=2,
    )
    print(f"\nOpenAI Payload:\n{openai_json}")

    print(f"\nUser Query (OpenAI format): {openai_request.get_user_input()}")

    print(f"\nImage URLs (OpenAI format): {openai_request.get_image_urls()}")
@@ -0,0 +1,20 @@
1
# Copyright (c) ModelScope Contributors. All rights reserved.
from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass
class Response:
    """
    Represents a response generated by the agentic search system.
    """

    # The main content of the response
    content: str

    # Additional data or metadata associated with the response.
    # Bug fix: the original declared ``metadata: Dict[str, Any] = None``,
    # which mistypes the default; ``field(default_factory=dict)`` gives each
    # instance its own fresh dict and matches the annotation.
    metadata: Optional[Dict[str, Any]] = field(default_factory=dict)

    def __post_init__(self):
        # Kept for backward compatibility: callers may still pass
        # ``metadata=None`` explicitly and expect an empty dict.
        if self.metadata is None:
            self.metadata = {}
@@ -0,0 +1,346 @@
1
+ # Copyright (c) ModelScope Contributors. All rights reserved.
2
+ import random
3
+ import re
4
+ from abc import ABC, abstractmethod
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, List, Optional, Union
8
+
9
+ from loguru import logger
10
+
11
+ from sirchmunk.llm.openai_chat import OpenAIChat
12
+ from sirchmunk.utils.tokenizer_util import TokenizerUtil
13
+
14
+
15
@dataclass
class SnapshotInfo:
    """
    Data class to hold snapshot information of the specified file.
    """

    # Human-readable title of the file.
    title: str = field(default="")

    # Short description / table-of-contents style summary.
    description: str = field(default="")

    # Extracted keywords or key phrases.
    keywords: List[str] = field(default_factory=list)

    # Sampled content lines from the file.
    contents: List[str] = field(default_factory=list)

    # Additional resources (e.g., images, tables, or others associated with current file)
    resources: List[Any] = field(default_factory=list)

    def to_dict(self):
        """Convert SnapshotInfo to a dictionary (fixed key order)."""
        return {
            name: getattr(self, name)
            for name in ("title", "description", "keywords", "contents", "resources")
        }
41
+
42
+
43
class Snapshot(ABC):
    """
    Base class for file snapshotting strategies.

    Concrete strategies implement :meth:`sampling` to select representative
    content from a file; an optional LLM client may be used for enrichment.
    """

    def __init__(
        self,
        llm: Optional[OpenAIChat] = None,
        **kwargs,
    ):
        # Optional LLM client used by subclasses for content enrichment.
        self.llm: Optional[OpenAIChat] = llm
        # Extra configuration passed through to subclasses.
        self.kwargs = kwargs

    @abstractmethod
    def sampling(self, **kwargs) -> List[str]:
        """
        Abstract method to perform sampling on a file.
        """
        raise NotImplementedError("Subclasses must implement this method.")
63
+
64
+
65
class TextSnapshot(Snapshot):
    """
    Text file sampling strategies for snapshotting large text files.

    Streams a file once, drops low-information lines via :meth:`filter_line`,
    and probabilistically keeps lines until a token budget is reached; the
    kept lines are then enriched with keywords and a description via LLM
    helpers (``TextInsights``).
    """

    # TODO: support multiple sampling passes, especially over middle chunks;
    # later the LLM can adaptively re-invoke sampling for content enrichment.
    # "Staring mode" (cf. the concept in Radar): scheduled heuristically by
    # the LLM during the search phase — the more complex the document, the
    # more middle chunks get sampled; summaries are iterated multiple times
    # and written back to the metadata snapshot.
    # For contents, use multi-stage summarization.
    # def resampling(): ...

    MAX_SNAPSHOT_TOKENS = 2048  # Can be adaptive based on file size

    MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB

    def __init__(self, llm: Optional[OpenAIChat] = None, **kwargs):
        # Local import to avoid a circular dependency with the insight module.
        from sirchmunk.insight.text_insights import TextInsights

        super().__init__(llm=llm, **kwargs)

        # LLM-backed keyword / TOC extraction helper.
        self.text_insights = TextInsights(llm=llm)
        # Token counter used to enforce the snapshot token budget.
        self.tokenizer_util = TokenizerUtil()

    @staticmethod
    def filter_line(line: str) -> Optional[str]:
        """Filter out lines with low information content or noise.

        Filters out:
        - Empty lines or whitespace-only lines
        - Markdown formatting noise (horizontal rules, excessive headings, etc.)
        - Lines consisting mostly of symbols/punctuation
        - URLs, email addresses, and common noise patterns
        - Very short lines (typically < 3 characters after cleaning)
        - Lines with excessive repeated characters
        - Common boilerplate/footer patterns

        Args:
            line: Input line string.

        Returns:
            Cleaned line string if it passes filters, None otherwise.
        """
        if not line:
            return None

        # Strip whitespace and check for empty lines
        stripped = line.strip()
        if not stripped:
            return None

        # Remove leading/trailing whitespace but preserve internal structure
        cleaned = line.rstrip(
            "\n\r"
        )  # Keep original indentation, only remove line endings

        # Check line length (after stripping); note the docstring says "< 3
        # characters" but this first gate actually requires >= 10 characters.
        if len(stripped) < 10:
            return None

        # Markdown-specific noise patterns
        markdown_noise_patterns = [
            r"^\s*-{3,}\s*$",  # Horizontal rules (---, --- , etc.)
            r"^\s*\*{3,}\s*$",  # Horizontal rules (***)
            r"^\s*_{3,}\s*$",  # Horizontal rules (___)
            r"^\s*#{6,}.*$",  # Excessive headings (6+ #)
            r"^\s*>+\s*$",  # Empty blockquotes
            r"^\s*[-*+]\s*$",  # Empty list items
            r"^\s*\d+\.\s*$",  # Empty numbered list items
            r"^\s*!\[.*\]\(.*\)\s*$",  # Standalone images without context
            r"^\s*\[.*\]:\s*https?://\S+\s*$",  # Reference-style link definitions
            r"^\s*```\s*\w*\s*$",  # Code block delimiters (``` or ```python)
            r"^\s*~~~\s*\w*\s*$",  # Alternative code block delimiters
            r"^\s*\|\s*[-:]+\s*\|\s*$",  # Table separator rows
            r"^\s*\|(\s*:?-+:?\s*\|)+\s*$",  # Alternative table separator format
        ]

        # Check for Markdown noise patterns
        for pattern in markdown_noise_patterns:
            if re.match(pattern, stripped):
                return None

        # Remove common noise patterns and check if line becomes empty
        # (applied further below together with the markdown patterns).
        noise_patterns = [
            r"^https?://\S+",  # URLs at start of line
            r"\bhttps?://\S+\b",  # URLs anywhere
            r"\bwww\.\S+\b",  # www URLs
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",  # emails
            r"^[\s\*\-\_\=\#\+]+$",  # lines with only symbols
        ]

        # Check for excessive symbols ratio
        alphanumeric_chars = sum(c.isalnum() for c in stripped)
        if len(stripped) > 0 and alphanumeric_chars / len(stripped) < 0.3:
            return None

        # Check for excessive repeated characters (e.g., "..........." or "----------")
        if TextSnapshot._has_excessive_repetition(stripped):
            return None

        # Check for common boilerplate patterns
        # NOTE(review): broad prefixes like r"^contents" / r"^index" /
        # r"^this document" also drop legitimate prose lines that merely
        # start with these words — confirm this aggressiveness is intended.
        boilerplate_patterns = [
            r"^(copyright|©|\(c\))",
            r"^all rights reserved",
            r"^confidential",
            r"^page \d+",
            r"^\d+\s+of\s+\d+$",
            r"^file:|^path:",
            r"^created:|^modified:",
            r"^author:",
            r"^version:",
            r"^build:",
            r"^generated by",
            r"^this document",
            r"^last updated",
            r"^table of contents",
            r"^toc:",
            r"^contents",
            r"^index",
        ]

        stripped_lower = stripped.lower()
        for pattern in boilerplate_patterns:
            if re.search(pattern, stripped_lower):
                return None

        # Apply noise pattern removal and check if meaningful content remains
        temp_line = stripped
        for pattern in noise_patterns + markdown_noise_patterns:
            temp_line = re.sub(pattern, "", temp_line)

        # After removing noise, check if we still have meaningful content
        # (at least 3 chars and >= 40% alphanumeric after noise removal).
        temp_stripped = temp_line.strip()
        if (
            len(temp_stripped) < 3
            or (
                sum(c.isalnum() for c in temp_stripped) / len(temp_stripped)
                if len(temp_stripped) > 0
                else 0
            )
            < 0.4
        ):
            return None

        # Additional check: reject lines that are mostly Markdown syntax characters
        markdown_chars = sum(1 for c in stripped if c in "#*_-~`>![](){}|:")
        if len(stripped) > 0 and markdown_chars / len(stripped) > 0.6:
            return None

        return cleaned

    @staticmethod
    def _has_excessive_repetition(text: str) -> bool:
        """Check if text has excessive character repetition.

        Args:
            text: Input text string.

        Returns:
            True if excessive repetition detected.
        """
        if len(text) < 10:
            return False

        # Check for sequences of same character (e.g., "------", "......")
        for i in range(len(text) - 4):
            if text[i] == text[i + 1] == text[i + 2] == text[i + 3] == text[i + 4]:
                # Allow some legitimate cases like "hello....." but not full line
                if text.count(text[i]) / len(text) > 0.6:
                    return True

        # Check for alternating patterns (e.g., "- - - -", ". . . .")
        if re.search(r"([^\w\s])\s*\1\s*\1\s*\1", text):
            return True

        return False

    def sampling(
        self,
        file_path: Union[str, Path],
        max_snapshot_tokens: int = MAX_SNAPSHOT_TOKENS,
        max_file_size: int = MAX_FILE_SIZE,
    ) -> Union[SnapshotInfo, None]:
        """Build a :class:`SnapshotInfo` by streaming and sampling a text file.

        Args:
            file_path: Path to the text file to snapshot.
            max_snapshot_tokens: Soft token budget for selected content
                (up to 50% overflow is tolerated per line).
            max_file_size: Files larger than this (bytes) are skipped.

        Returns:
            A populated SnapshotInfo, or None when the file is too large.
            Note: sampling uses ``random`` and is therefore non-deterministic
            across calls unless the caller seeds the RNG.
        """
        file_path = Path(file_path)
        file_size = file_path.stat().st_size
        if file_size > max_file_size:  # TODO: add more strategies for large files
            logger.warning(
                f"File size {file_size} exceeds maximum allowed size of {max_file_size} bytes, skipping snapshot."
            )
            return None

        snapshot_info = SnapshotInfo()

        selected_lines = []
        accumulated_tokens = 0
        line_count = 0  # counts only lines that survive filter_line

        # Stream through file once
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = self.filter_line(line)
                if not line:
                    continue

                line_count += 1
                line_tokens = self.tokenizer_util.count_tokens(contents=line.strip())

                # Adaptive sampling strategy:
                # - Early in file: higher acceptance probability
                # - Near token limit: lower probability
                # - Always accept if budget allows and line is small

                # Calculate acceptance probability
                token_fill_ratio = (
                    accumulated_tokens / max_snapshot_tokens
                    if max_snapshot_tokens > 0
                    else 0
                )
                line_token_ratio = (
                    line_tokens / max_snapshot_tokens if max_snapshot_tokens > 0 else 0
                )

                # Base probability decreases as we fill token budget
                base_prob = max(0.2, 1.0 - token_fill_ratio * 2.0)

                # Adjust for line size (prefer smaller lines when budget is tight)
                size_penalty = (
                    min(1.0, 0.8 + (1.0 - line_token_ratio) * 0.4)
                    if token_fill_ratio > 0.5
                    else 1.0
                )

                # Add some randomness to avoid deterministic patterns
                noise = random.uniform(0.8, 1.2)
                acceptance_prob = min(1.0, base_prob * size_penalty * noise)

                # Force accept conditions
                if (accumulated_tokens == 0 and line_tokens > 0) or (
                    accumulated_tokens + line_tokens <= max_snapshot_tokens * 0.7
                ):
                    acceptance_prob = 1.0

                # Sampling decision
                if random.random() < acceptance_prob:
                    # Check if adding this line keeps us within reasonable bounds
                    if (
                        accumulated_tokens + line_tokens <= max_snapshot_tokens * 1.5
                    ):  # Allow 50% overflow max
                        selected_lines.append((line, line_count))
                        accumulated_tokens += line_tokens

                # Early termination if we've significantly exceeded target
                if accumulated_tokens >= max_snapshot_tokens * 1.2:
                    break

        # Sort by original line number to preserve document structure
        # (line_count is the post-filter ordinal, which preserves file order).
        selected_lines.sort(key=lambda x: x[1])
        logger.info(
            f"Got {len(selected_lines)} selected lines, total tokens: {accumulated_tokens}"
        )

        snapshot_info.contents = [line for line, _ in selected_lines]

        # Get keywords/key phrase
        try:
            snapshot_info.keywords = self.text_insights.extract_phrase(
                contents=snapshot_info.contents
            )
        except Exception as e:
            logger.error(f"Error extracting keywords: {e}")
            snapshot_info.keywords = []

        try:
            # TODO: fold this into the same LLM call as the phrase extraction
            snapshot_info.description = self.text_insights.extract_toc(
                contents=snapshot_info.contents
            )
        except Exception as e:
            logger.error(f"Error extracting description: {e}")
            snapshot_info.description = ""

        return snapshot_info