entari-plugin-hyw 3.2.112__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of entari-plugin-hyw has been flagged as potentially problematic.

@@ -1,92 +1,128 @@
1
- VISION_SYSTEM_PROMPT = """你是一个专业的视觉转文字专家.
1
+ VISION_SP = """# 你是一个专业的视觉转文字专家.
2
2
 
3
- [用户消息]
4
- {user_msgs}
5
-
6
- [核心任务]
3
+ # 核心任务
7
4
  - 智能分析图片内容, 转述成文本, 除此之外不要添加任何内容
8
5
  - 文字优先: 若包含清晰文字(文档、截图等), 必须完整准确转录, 不要遗漏.
9
6
  - 视觉补充: 若无文字, 重点描述视觉内容(物体、场景、氛围).
10
7
  - 用户要求: 根据用户消息中提示侧重转文本的偏向, 若无或无关联则不理会常规完成.
11
- """
12
-
13
- INTRUCT_SYSTEM_PROMPT = """你是一个专业的指导专家.
14
8
 
15
- [用户消息]
9
+ ## 用户消息
10
+ ```text
16
11
  {user_msgs}
12
+ ```
13
+ """
17
14
 
18
- [核心任务]
19
- - 决定是否使用搜索工具
20
- - 如果用户消息包含典型名词、可能的专有名词组合, 且意图为解释此词, 请使用搜索工具, 搜索工具会给你返回最新的资料和图片.
21
- - 如果用户消息明显不需要搜索, 或虽然存在名词但是作为过程参与不涉及结果, 则不调用搜索工具
22
- - 如果用户的消息明显有两个搜索的方向, 本次对话最多同时调用两个搜索工具分开搜索
23
- - 理解用户话语, 提炼出搜索关键词.
24
- - 保持原意, 禁止添加额外内容.
25
- - 禁止擅自分割关键词导致语意变化.
26
- - 决定是否放权 mcp工具 给 agent
27
- - 如果用户显式地表达了要求模型使用mcp帮助完成任务的意图, 调用工具放权
15
+ INTRUCT_SP = """# 你是一个专业的指导专家.
16
+
17
+ ## 核心任务
18
+ - 决定预处理工具:
19
+ - 用户消息包含链接: 调用 crawl_page 获取内容, 无需其他工具
20
+ - 用户消息包含典型名词、可能的专有名词组合: internal_web_search (提炼出关键词搜索, 保持原意, 指向不同的领域时优先搜索最贴切的, 最多同时调用2)
21
+ - 用户消息适合加入图片点缀: internal_image_search
22
+ - 用户消息不需要搜索: 不调用工具
23
+ - 调用 set_mode:
24
+ - 绝大部分常规问题: standard
25
+ - 用户要求研究/深度搜索: agent
26
+ - 需要获取页面具体信息才能回答问题: agent
28
27
  > 所有工具需要在本次对话同时调用
29
28
 
30
- [调用工具]
29
+ ## 调用工具
30
+ - 使用工具时, 必须通过 function_call / tool_call 机制调用.
31
31
  {tools_desc}
32
+
33
+ ## 你的回复
34
+ 调用工具后无需额外文本.
35
+
36
+ ## 用户消息
37
+ ```
38
+ {user_msgs}
39
+ ```
32
40
  """
33
41
 
34
- INTRUCT_SYSTEM_PROMPT_VISION_ADD = """
35
- [视觉专家消息]
42
+
43
+ INTRUCT_SP_VISION_ADD = """
44
+ ## 视觉专家消息
45
+ ```text
36
46
  {vision_msgs}
47
+ ```
37
48
  """
38
49
 
50
+ AGENT_SP = """# 你是一个 Agent 总控专家, 你需要理解用户意图, 根据已有信息给出最终回复.
51
+ > 请确保你输出的任何消息有着准确的来源, 减少输出错误信息.
39
52
 
40
- AGENT_SYSTEM_PROMPT = """
41
- 你是一个全能助手, 请根据用户需求和搜索结果中贴切用户意图的可靠信息解释用户消息中的关键词.
53
+ 当前模式: {mode}, {mode_desc}
42
54
 
43
- [用户消息]
44
- {user_msgs}
55
+ ## 最终回复格式要求
56
+ - 直接输出 Markdown 正文.
45
57
 
46
- [回复格式要求]
47
58
  当不调用工具发送文本, 即会变成最终回复, 请遵守:
48
- - 语言: 简体中文, 百科式风格.
49
- - 正文格式: 使用 Markdown, 有大标题, 可以使用数学公式, 格式内容丰富.
59
+ - 语言: 简体中文, 百科式风格, 语言严谨不啰嗦.
60
+ - 正文格式: 使用 Markdown格式, [hightlight, katex], 有大标题, 内容丰富突出重点.
61
+ - 工具引用:
62
+ - 搜索摘要引用: 使用 `search:数字id` 如 `search:3`
63
+ - 页面内容引用: 使用 `page:数字id` 如 `page:5`
64
+ - 每个引用必须分开标注
65
+ - 在正文底部添加 references 代码块:
66
+ - 用不到的条目不写, 没有专家给信息就不写.
67
+ ```references
68
+ [1] [search] [文本描述](url)
69
+ [3] [search] [文本描述](url)
70
+ [5] [page] [页面标题](url)
71
+ [7] [page] [页面标题](url)
72
+ ```
73
+
74
+ ## 用户消息
75
+ ```text
76
+ {user_msgs}
77
+ ```
50
78
  """
51
79
 
52
- AGENT_SYSTEM_PROMPT_INTRUCT_VISION_ADD = """
53
- [视觉专家消息]
54
- {vision_msgs}
80
+ # PS: agent 无搜索图片权限
81
+ AGENT_SP_TOOLS_STANDARD_ADD = """
82
+ 你需要整合已有的信息, 提炼用户消息中的关键词, 进行最终回复.
55
83
  """
56
84
 
57
- AGENT_SYSTEM_PROMPT_SEARCH_ADD = """
58
- [搜索专家给出的信息]
59
- {search_msgs}
60
85
 
61
- [最终回复]
62
- - 图片: 如果本次回答适合配图, 对搜索到的图片, 选择 1-3 张合适的尽量类型、来源、不同、主题契合的图片, 美观分布嵌入正文 `![alt](url)`.
63
- - 搜索引用: 在正文中使用 `ref:数字id` (代码形式) 如 `ref:1` 标注来源, 并在文末通过 `references` 代码块列出, 每个引用必须分开标注.
64
- - 你需要在最终回复底部添加 `references` 代码块.
86
+ AGENT_SP_TOOLS_AGENT_ADD = """
87
+ - 你现在可以使用工具: {tools_desc}
88
+ - 你需要判断顺序或并发使用工具获取信息:
89
+ - 0-1 internal_web_search
90
+ - 0-1 次 internal_image_search (如果用户需要图片, 通常和 internal_web_search 并发执行)
91
+ - 1-2 次 crawl_page
92
+ - 使用工具时, 必须通过 function_call / tool_call 机制调用.
93
+ """
94
+
65
95
 
66
- ```references
67
- 1. [标题](url)
68
- 2. [标题](url)
96
+
97
+ AGENT_SP_INTRUCT_VISION_ADD = """
98
+ ## 视觉专家消息
99
+ ```text
100
+ {vision_msgs}
69
101
  ```
70
102
  """
71
103
 
72
- AGENT_SYSTEM_PROMPT_MCP_ADD = """
73
- [MCP 工具已授权]
74
- 可用工具:
75
- {tools_desc}
76
-
77
- > 积极使用工具完成任务,工具优先于文本回复。
78
- 秧根扭起来,我们喜羊羊大东北是我的家乡,我就在这一个人
79
- [最终回复格式]
80
- - 工具引用: 在正文中使用 `mcp:字母顺序` (代码形式) 如 `mcp:a` 标注来源, 并在文末通过 `mcp` 代码块列出, 每个引用必须分开标注.
81
- - 底部添加 `mcp` 代码块列出工具调用流程:
82
- - 格式: `[图标] 工具名称` + 文本描述
83
- - 图标: navigate, snapshot, click, type, code, wait, default
84
-
85
- ```mcp
86
- 1. [code] browser_run_code: 执行JavaScript计算
87
- 2. [navigate] navigate: 导航到xxx网站
104
+ AGENT_SP_SEARCH_ADD = """
105
+ ## 搜索专家消息
106
+ ```text
107
+ {search_msgs}
88
108
  ```
109
+
110
+
89
111
  """
90
112
 
113
+ AGENT_SP_PAGE_ADD = """
114
+ ## 页面内容专家消息
115
+ ```text
116
+ {page_msgs}
117
+ ```
118
+ - 引用页面内容时, 必须使用 `page:id` 格式
119
+ """
91
120
 
121
+ AGENT_SP_IMAGE_SEARCH_ADD = """
122
+ ## 图像搜索专家消息
123
+ ```text
124
+ {image_search_msgs}
125
+ ```
126
+ - 每进行一次 internal_image_search, 挑选 1 张图像插入正文
127
+ """
92
128
 
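
The rewritten prompt constants keep `str.format`-style placeholders (`{user_msgs}`, `{tools_desc}`, `{vision_msgs}`, `{mode}`, ...). Below is a minimal sketch of how they might be assembled at call time; the import path, the `build_instruct_prompt` helper, and the sample values are hypothetical and only illustrate the placeholders, not the plugin's actual wiring.

```python
# Hypothetical import path; INTRUCT_SP and INTRUCT_SP_VISION_ADD are the
# templates added in the hunk above.
from entari_plugin_hyw.prompt import INTRUCT_SP, INTRUCT_SP_VISION_ADD


def build_instruct_prompt(user_msgs: str, tools_desc: str, vision_msgs: str | None = None) -> str:
    # Fill the {user_msgs} / {tools_desc} placeholders, then append the
    # vision fragment only when a vision-expert transcript is available.
    prompt = INTRUCT_SP.format(user_msgs=user_msgs, tools_desc=tools_desc)
    if vision_msgs:
        prompt += INTRUCT_SP_VISION_ADD.format(vision_msgs=vision_msgs)
    return prompt


# Illustrative values; the tool list mirrors the tools named in the template.
system_prompt = build_instruct_prompt(
    user_msgs="What is Crawl4AI?",
    tools_desc="internal_web_search / internal_image_search / crawl_page / set_mode",
)
```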
@@ -1,193 +1,241 @@
1
- import re
2
- import httpx
3
1
  import urllib.parse
4
2
  from typing import List, Dict, Optional, Any
5
3
  from loguru import logger
4
+ from crawl4ai import AsyncWebCrawler
5
+ from crawl4ai.async_configs import CrawlerRunConfig
6
+ from crawl4ai.cache_context import CacheMode
7
+
8
+ # Shared crawler instance to avoid repeated init
9
+ _shared_crawler: Optional[AsyncWebCrawler] = None
10
+
11
+
12
+ async def get_shared_crawler() -> AsyncWebCrawler:
13
+ global _shared_crawler
14
+ if _shared_crawler is None:
15
+ _shared_crawler = AsyncWebCrawler()
16
+ await _shared_crawler.start()
17
+ return _shared_crawler
18
+
19
+
20
+ async def close_shared_crawler():
21
+ global _shared_crawler
22
+ if _shared_crawler:
23
+ try:
24
+ await _shared_crawler.close()
25
+ except Exception:
26
+ pass
27
+ _shared_crawler = None
6
28
 
7
29
  class SearchService:
8
30
  """
9
- Specialized service for interacting with SearXNG.
10
- Uses regex-based HTML parsing to ensure O(n) performance and zero blocking,
11
- bypassing heavy DOM parsers like Trafilatura.
31
+ Crawl4AI-backed search & fetch service.
32
+ Uses the configured search engine results page (SERP) URL and parses links from the HTML.
12
33
  """
13
34
  def __init__(self, config: Any):
14
35
  self.config = config
36
+ self._default_limit = 8
37
+ self._crawler: Optional[AsyncWebCrawler] = None
15
38
 
16
- async def search(self, query: str) -> List[Dict[str, str]]:
17
- """
18
- Execute search and parse results using Regex.
19
- Returns a list of dicts: {'title': str, 'url': str, 'content': str}
20
- """
21
- # 1. Construct URL (Force HTML format since JSON is 403)
39
+ def _build_search_url(self, query: str) -> str:
22
40
  encoded_query = urllib.parse.quote(query)
23
- base = getattr(self.config, "search_base_url", "http://127.0.0.1:8888/search?")
24
-
25
- # Ensure we don't have double '?' or '&' issues
41
+ base = getattr(self.config, "search_base_url", "https://lite.duckduckgo.com/lite/?q={query}")
42
+ if "{query}" in base:
43
+ return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
26
44
  sep = "&" if "?" in base else "?"
27
-
28
- # Remove any existing format=json if present in base (just in case)
29
- base = base.replace("format=json&", "").replace("&format=json", "")
30
-
31
- # Handle {query} placeholder if present (common in config defaults)
45
+ return f"{base}{sep}q={encoded_query}"
46
+
47
+ def _build_image_url(self, query: str) -> str:
48
+ encoded_query = urllib.parse.quote(query)
49
+ base = getattr(self.config, "image_search_base_url", "https://duckduckgo.com/?q={query}&iax=images&ia=images")
32
50
  if "{query}" in base:
33
- # We need to handle potential other placeholders like {limit} if they exist, or escape them
34
- # For simplicity, we just replace {query} and ignore format/limit changes since we parse HTML
35
- # Actually, standard python format() might fail if other braces exist.
36
- # safe replace:
37
- url = base.replace("{query}", encoded_query)
38
- # Remove other common placeholders if they linger
39
- url = url.replace("{limit}", "8")
40
- else:
41
- # Append mode
42
- url = f"{base}{sep}q={encoded_query}&language=zh-CN"
43
-
44
- logger.info(f"SearchService: Fetching {url}")
51
+ return base.replace("{query}", encoded_query).replace("{limit}", str(self._default_limit))
52
+ sep = "&" if "?" in base else "?"
53
+ return f"{base}{sep}q={encoded_query}&iax=images&ia=images"
54
+
55
+ async def search(self, query: str) -> List[Dict[str, str]]:
56
+ """
57
+ Crawl the configured SERP using Crawl4AI and return parsed results.
58
+ """
59
+ if not query:
60
+ return []
61
+
62
+ url = self._build_search_url(query)
63
+ logger.info(f"SearchService(Crawl4AI): fetching {url}")
45
64
 
46
65
  try:
47
- async with httpx.AsyncClient(timeout=10.0) as client:
48
- resp = await client.get(url)
49
- if resp.status_code != 200:
50
- logger.error(f"Search failed: {resp.status_code}")
51
- return []
52
- html = resp.text
53
- return self._parse_searxng_html(html)
66
+ crawler = await self._get_crawler()
67
+ result = await crawler.arun(
68
+ url=url,
69
+ config=CrawlerRunConfig(
70
+ wait_until="domcontentloaded",
71
+ wait_for="article",
72
+ cache_mode=CacheMode.BYPASS,
73
+ word_count_threshold=1,
74
+ screenshot=False,
75
+ capture_console_messages=False,
76
+ capture_network_requests=False,
77
+ ),
78
+ )
79
+ return self._parse_markdown_result(result, limit=self._default_limit)
54
80
  except Exception as e:
55
- logger.error(f"Search execution failed: {e}")
81
+ logger.error(f"Crawl4AI search failed: {e}")
56
82
  return []
57
83
 
58
- def _parse_searxng_html(self, html: str) -> List[Dict[str, str]]:
84
+ def _parse_markdown_result(self, result, limit: int = 8) -> List[Dict[str, str]]:
85
+ """Parse Crawl4AI result into search items without manual HTML parsing."""
86
+ md = (result.markdown or result.extracted_content or "").strip()
87
+ lines = [ln.strip() for ln in md.splitlines() if ln.strip()]
88
+ links = result.links.get("external", []) if getattr(result, "links", None) else []
89
+ seen = set()
90
+ results: List[Dict[str, str]] = []
91
+
92
+ def find_snippet(url: str, domain: str) -> str:
93
+ for ln in lines:
94
+ if url in ln or (domain and domain in ln):
95
+ return ln[:400]
96
+ # fallback to first non-empty line
97
+ return lines[0][:400] if lines else ""
98
+
99
+ for link in links:
100
+ url = link.get("href") or ""
101
+ if not url or url in seen:
102
+ continue
103
+ seen.add(url)
104
+ domain = urllib.parse.urlparse(url).hostname or ""
105
+ title = link.get("title") or link.get("text") or url
106
+ snippet = find_snippet(url, domain)
107
+ results.append({
108
+ "title": title.strip(),
109
+ "url": url,
110
+ "domain": domain,
111
+ "content": snippet or title,
112
+ })
113
+ if len(results) >= limit:
114
+ break
115
+
116
+ if not results:
117
+ logger.warning(f"SearchService: no results parsed; md_length={len(md)}, links={len(links)}")
118
+ else:
119
+ logger.info(f"SearchService: parsed {len(results)} results via Crawl4AI links")
120
+ return results
121
+
122
+ async def fetch_page(self, url: str) -> Dict[str, str]:
59
123
  """
60
- Parse SearXNG HTML results using Regex.
61
- Target structure:
62
- <article class="result ...">
63
- <h3><a href="(url)">(title)</a></h3>
64
- <p class="content">(snippet)</p>
65
- </article>
124
+ Fetch a single page via Crawl4AI and return cleaned markdown/text plus metadata.
66
125
  """
67
- results = []
68
-
69
- # Regex to find result blocks.
70
- # We split by <article to find chunks, then parse each chunk.
71
- # This is safer than a global regex which might get confused by nested structures.
72
- chunks = html.split('<article')
73
-
74
- for chunk in chunks[1:]: # Skip preamble
75
- try:
76
- # 1. Extract URL and Title
77
- # Look for <a href="..." ... >Title</a> inside h3
78
- # Simplified pattern: href="([^"]+)" text is >([^<]+)<
79
- link_match = re.search(r'href="([^"]+)".*?>([^<]+)<', chunk)
80
- if not link_match:
81
- continue
82
-
83
- url = link_match.group(1)
84
- title = link_match.group(2).strip()
85
-
86
- # Verify it's a valid result link (sometimes engine links appear)
87
- if "searxng" in url or url.startswith("/"):
88
- continue
126
+ if not url:
127
+ return {"content": "Error: missing url", "title": "Error", "url": ""}
89
128
 
90
- # 2. Extract Snippet
91
- # Look for class="content">...<
92
- # We try to capture text until the next tag open
93
- snippet_match = re.search(r'class="content"[^>]*>([\s\S]*?)</p>', chunk)
94
- snippet = ""
95
- if snippet_match:
96
- # Clean up HTML tags from snippet if any remain (basic check)
97
- raw_snippet = snippet_match.group(1)
98
- snippet = re.sub(r'<[^>]+>', '', raw_snippet).strip()
99
-
100
- if url and title:
101
- # SAFETY: Truncate snippet to 500 chars to prevent context explosion
102
- final_snippet = (snippet or title)[:500]
103
- results.append({
104
- "title": title,
105
- "url": url,
106
- "content": final_snippet
107
- })
108
-
109
- if len(results) >= 8: # Limit to 8 results
110
- break
111
-
129
+ try:
130
+ crawler = await self._get_crawler()
131
+ result = await crawler.arun(
132
+ url=url,
133
+ config=CrawlerRunConfig(
134
+ wait_until="networkidle",
135
+ wait_for_images=False, # Faster: skip image loading
136
+ cache_mode=CacheMode.BYPASS,
137
+ word_count_threshold=1,
138
+ screenshot=False,
139
+ capture_console_messages=False,
140
+ capture_network_requests=False,
141
+ ),
142
+ )
143
+ if not result.success:
144
+ return {"content": f"Error: crawl failed ({result.error_message or 'unknown'})", "title": "Error", "url": url}
145
+
146
+ content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
147
+ # Extract metadata if available, otherwise fallback
148
+ title = "No Title"
149
+ if result.metadata:
150
+ title = result.metadata.get("title") or result.metadata.get("og:title") or title
151
+
152
+ # If metadata title is missing/generic, try to grab from links or url? No, metadata is best.
153
+ if title == "No Title" and result.links:
154
+ # Minimal fallback not really possible without parsing HTML again or regex
155
+ pass
156
+
157
+ return {
158
+ "content": content[:8000],
159
+ "title": title,
160
+ "url": result.url or url
161
+ }
162
+ except Exception as e:
163
+ logger.error(f"Crawl4AI fetch failed: {e}")
164
+ return {"content": f"Error: crawl failed ({e})", "title": "Error", "url": url}
165
+
166
+ async def _get_crawler(self) -> AsyncWebCrawler:
167
+ # Prefer shared crawler to minimize INIT logs; fall back to local if needed
168
+ try:
169
+ return await get_shared_crawler()
170
+ except Exception as e:
171
+ logger.warning(f"Shared crawler unavailable, creating local: {e}")
172
+ if self._crawler is None:
173
+ self._crawler = AsyncWebCrawler()
174
+ await self._crawler.start()
175
+ return self._crawler
176
+
177
+ async def close(self):
178
+ if self._crawler:
179
+ try:
180
+ await self._crawler.close()
112
181
  except Exception:
113
- continue
114
-
115
- logger.info(f"SearchService: Parsed {len(results)} results")
116
- return results
182
+ pass
183
+ self._crawler = None
117
184
 
118
185
  async def image_search(self, query: str) -> List[Dict[str, str]]:
119
186
  """
120
- Perform image search using regex parsing on HTML results.
187
+ Image search via Crawl4AI media extraction.
121
188
  """
122
- if not query: return []
123
-
124
- encoded_query = urllib.parse.quote(query)
125
- base = getattr(self.config, "image_search_base_url", "http://127.0.0.1:8888/search?")
126
- sep = "&" if "?" in base else "?"
127
-
128
- # Clean format=json
129
- base = base.replace("format=json&", "").replace("&format=json", "")
130
-
131
- if "{query}" in base:
132
- url = base.replace("{query}", encoded_query)
133
- url = url.replace("{limit}", "8")
134
- else:
135
- url = f"{base}{sep}q={encoded_query}&iax=images&ia=images"
136
-
137
- logger.info(f"SearchService: Fetching Images {url}")
138
-
189
+ if not query:
190
+ return []
191
+
192
+ url = self._build_image_url(query)
193
+ logger.info(f"SearchService(Crawl4AI Image): fetching {url}")
194
+
139
195
  try:
140
- async with httpx.AsyncClient(timeout=10.0) as client:
141
- resp = await client.get(url)
142
- resp.raise_for_status()
143
- html_content = resp.text
196
+ # Use image crawler (text_mode=False) for image search
197
+ crawler = await self._get_crawler()
198
+ result = await crawler.arun(
199
+ url=url,
200
+ config=CrawlerRunConfig(
201
+ wait_until="networkidle",
202
+ wait_for_images=True,
203
+ wait_for="img",
204
+ cache_mode=CacheMode.BYPASS,
205
+ word_count_threshold=1,
206
+ screenshot=False,
207
+ capture_console_messages=False,
208
+ capture_network_requests=False,
209
+ ),
210
+ )
211
+ images = []
212
+ seen = set()
213
+ for img in result.media.get("images", []):
214
+ src = img.get("src") or ""
215
+ if not src:
216
+ continue
217
+ if src.startswith("//"):
218
+ src = "https:" + src
219
+ if not src.startswith("http"):
220
+ continue
221
+ if src in seen:
222
+ continue
223
+ seen.add(src)
224
+ alt = (img.get("alt") or img.get("desc") or "").strip()
225
+ domain = urllib.parse.urlparse(src).hostname or ""
226
+ images.append({
227
+ "title": alt or "Image",
228
+ "url": src,
229
+ "domain": domain,
230
+ "content": alt or "Image",
231
+ })
232
+ if len(images) >= self._default_limit:
233
+ break
234
+ if not images:
235
+ logger.warning(f"SearchService: no images parsed; media_count={len(result.media.get('images', []))}")
236
+ else:
237
+ logger.info(f"SearchService: parsed {len(images)} images via Crawl4AI media")
238
+ return images
144
239
  except Exception as e:
145
- logger.error(f"Image Search failed: {e}")
240
+ logger.error(f"Crawl4AI image search failed: {e}")
146
241
  return []
147
-
148
- # Regex for Images (DuckDuckGo style / Generic)
149
- # DDG images usually in a script or complex layout.
150
- # For simplicity in V2 regex approach, we look for common img tags with logical classes or structure
151
- # OR, since the user's SearXNG likely returns standard HTML list for images too.
152
- # SearXNG Image results usually: <img src="..." alt="..."> inside a result container.
153
- # Let's try a generic pattern for SearXNG image results
154
-
155
- results = []
156
- # SearXNG pattern: <div class="img-search-result"> ... <img src="URL" ...>
157
- # Or just look for img tags with src that are http
158
-
159
- # More robust SearXNG specific regex:
160
- # Pattern: <img class="image" src="(?P<url>[^"]+)" alt="(?P<title>[^"]+)"
161
- # This is a guess. Let's try to match standard "result_image" or similar if possible.
162
-
163
- # Assuming SearXNG:
164
- # More robust regex to capture images from various engines (SearXNG, Google, Bing)
165
- # 1. Try generic <img ... src="..."> with http
166
- # 2. Try to extract alt text if available
167
-
168
- # Pattern 1: Standard img tag with src
169
- # We look for src="http..." and optional alt
170
- image_matches = re.finditer(r'<img[^>]+src=["\'](http[^"\']+)["\'][^>]*>', html_content, re.IGNORECASE)
171
-
172
- for match in image_matches:
173
- img_tag = match.group(0)
174
- img_url = match.group(1)
175
-
176
- # Extract alt/title
177
- alt_match = re.search(r'alt=["\']([^"\']*)["\']', img_tag, re.IGNORECASE)
178
- title = alt_match.group(1) if alt_match else ""
179
-
180
- # Filter out tiny icons/favicons/data uris if possible
181
- if "favicon" in img_url or "static" in img_url or "data:image" in img_url:
182
- continue
183
-
184
- results.append({
185
- "title": title or "Image",
186
- "url": img_url,
187
- "content": f"Image: {title}"
188
- })
189
-
190
- if len(results) >= 8:
191
- break
192
-
193
- return results
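
The new `SearchService` hides Crawl4AI behind the same `search` / `fetch_page` / `image_search` surface. A minimal usage sketch follows, assuming a config object that exposes the `search_base_url` / `image_search_base_url` attributes read above; the import path and the `SimpleNamespace` config are illustrative, not the plugin's real configuration flow.

```python
import asyncio
from types import SimpleNamespace

# Hypothetical import path; SearchService and close_shared_crawler are the
# definitions added in the hunk above.
from entari_plugin_hyw.search import SearchService, close_shared_crawler


async def main() -> None:
    # The service falls back to DuckDuckGo-style URLs with a {query} placeholder
    # when these attributes are missing.
    config = SimpleNamespace(
        search_base_url="https://lite.duckduckgo.com/lite/?q={query}",
        image_search_base_url="https://duckduckgo.com/?q={query}&iax=images&ia=images",
    )
    service = SearchService(config)
    try:
        results = await service.search("crawl4ai")  # [{'title', 'url', 'domain', 'content'}, ...]
        if results:
            page = await service.fetch_page(results[0]["url"])  # content capped at 8000 chars
            print(page["title"], page["url"])
    finally:
        await service.close()         # closes the local fallback crawler, if one was created
        await close_shared_crawler()  # closes the shared AsyncWebCrawler instance


asyncio.run(main())
```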
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: entari_plugin_hyw
3
- Version: 3.2.112
3
+ Version: 3.3.0
4
4
  Summary: Use large language models to interpret chat messages
5
5
  Author-email: kumoSleeping <zjr2992@outlook.com>
6
6
  License: MIT
@@ -18,15 +18,10 @@ Requires-Python: >=3.10
18
18
  Description-Content-Type: text/markdown
19
19
  Requires-Dist: arclet-entari[full]>=0.16.5
20
20
  Requires-Dist: openai
21
- Requires-Dist: mcp
22
21
  Requires-Dist: httpx
23
22
  Requires-Dist: markdown>=3.10
24
- Requires-Dist: trafilatura>=2.0.0
25
- Requires-Dist: playwright>=1.56.0
23
+ Requires-Dist: crawl4ai>=0.7.8
26
24
  Requires-Dist: jinja2>=3.0
27
- Provides-Extra: playwright
28
- Requires-Dist: playwright>=1.56.0; extra == "playwright"
29
- Requires-Dist: trafilatura>=2.0.0; extra == "playwright"
30
25
  Provides-Extra: dev
31
26
  Requires-Dist: entari-plugin-server>=0.5.0; extra == "dev"
32
27
  Requires-Dist: satori-python-adapter-onebot11>=0.2.5; extra == "dev"
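
The dependency change swaps the httpx + trafilatura/playwright stack for `crawl4ai>=0.7.8`. A minimal sketch of the underlying Crawl4AI calls the new service builds on, using the same imports, lifecycle (`start` / `arun` / `close`), and `CrawlerRunConfig` options as the hunk above; the target URL is only an example.

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_context import CacheMode


async def main() -> None:
    # Same pattern as the new SearchService: start one crawler, call arun()
    # per URL with an explicit run config, close it when done.
    crawler = AsyncWebCrawler()
    await crawler.start()
    try:
        result = await crawler.arun(
            url="https://github.com/unclecode/crawl4ai",  # example target
            config=CrawlerRunConfig(
                wait_until="domcontentloaded",
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=1,
            ),
        )
        print(result.success, result.url)
    finally:
        await crawler.close()


asyncio.run(main())
```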
@@ -43,7 +38,9 @@ Requires-Dist: satori-python-adapter-onebot11>=0.2.5; extra == "dev"
43
38
 
44
39
  </div>
45
40
 
46
- # v3.2迎来大幅度改动、现在图文不符
41
+ # v3.3 迎来大幅度改动、现在图文不符
42
+
43
+
47
44
 
48
45
  ## 🎑 效果展示
49
46
 
@@ -140,3 +137,6 @@ hyw -t 一大段话。
140
137
  ### 引用回复
141
138
  支持引用消息进行追问,机器人会自动读取被引用的消息作为上下文:
142
139
  - **引用 + 命令**:机器人将理解被引用消息的内容(包括图片)通过 `MessageChain` 操作拼接 `Text`、`Image` 与部分 `Custom`。
140
+
141
+ UncleCode. (2024). Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper [Computer software].
142
+ GitHub. https://github.com/unclecode/crawl4ai