entari-plugin-hyw 3.2.112__py3-none-any.whl → 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of entari-plugin-hyw might be problematic.
- entari_plugin_hyw/__init__.py +37 -32
- entari_plugin_hyw/assets/libs/tailwind.css +1 -1
- entari_plugin_hyw/assets/tailwind.input.css +1 -1
- entari_plugin_hyw/assets/template.j2 +103 -55
- entari_plugin_hyw/core/config.py +3 -4
- entari_plugin_hyw/core/pipeline.py +507 -282
- entari_plugin_hyw/core/render.py +184 -119
- entari_plugin_hyw/utils/__init__.py +1 -2
- entari_plugin_hyw/utils/browser.py +25 -46
- entari_plugin_hyw/utils/playwright_tool.py +22 -32
- entari_plugin_hyw/utils/prompts.py +94 -58
- entari_plugin_hyw/utils/search.py +212 -164
- {entari_plugin_hyw-3.2.112.dist-info → entari_plugin_hyw-3.3.0.dist-info}/METADATA +8 -8
- {entari_plugin_hyw-3.2.112.dist-info → entari_plugin_hyw-3.3.0.dist-info}/RECORD +16 -17
- entari_plugin_hyw/utils/mcp_playwright.py +0 -128
- {entari_plugin_hyw-3.2.112.dist-info → entari_plugin_hyw-3.3.0.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-3.2.112.dist-info → entari_plugin_hyw-3.3.0.dist-info}/top_level.txt +0 -0
entari_plugin_hyw/core/render.py
CHANGED
@@ -3,6 +3,7 @@ import gc
 import os
 import markdown
 import base64
+import html  # Import html for escaping
 import mimetypes
 from datetime import datetime
 from urllib.parse import urlparse
@@ -10,9 +11,49 @@ from typing import List, Dict, Optional, Any, Union
 import re
 import json
 from pathlib import Path
-from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 from loguru import logger
 from jinja2 import Environment, FileSystemLoader, select_autoescape
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+from playwright.async_api import async_playwright
+
+# Patch Crawl4AI 0.7.x to support screenshot from raw/file HTML
+async def _c4a_generate_screenshot_from_html(self, html: str) -> str:
+    """
+    Monkey-patched fallback: render arbitrary HTML to a screenshot using Playwright.
+    """
+    page, context = await self.browser_manager.get_page(
+        crawlerRunConfig=CrawlerRunConfig(
+            adjust_viewport_to_content=True,
+            wait_until="networkidle",
+            wait_for_images=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+    )
+    try:
+        try:
+            await page.set_viewport_size({"width": 520, "height": 1200})
+        except Exception:
+            pass
+        await page.set_content(html, wait_until="networkidle")
+        await page.wait_for_timeout(150)
+        element = await page.query_selector("#main-container")
+        if element:
+            screenshot_bytes = await element.screenshot()
+        else:
+            screenshot_bytes = await page.screenshot(full_page=True)
+        import base64 as _b64
+        return _b64.b64encode(screenshot_bytes).decode()
+    finally:
+        try:
+            await context.close()
+        except Exception:
+            pass
+
+if not hasattr(AsyncPlaywrightCrawlerStrategy, "_generate_screenshot_from_html"):
+    AsyncPlaywrightCrawlerStrategy._generate_screenshot_from_html = _c4a_generate_screenshot_from_html
 
 class ContentRenderer:
     def __init__(self, template_path: str = None):
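Note: the patch above fills a gap in Crawl4AI 0.7.x, which lacks a screenshot path for raw/file HTML sources. A minimal sketch of how the patched path gets exercised, assuming Crawl4AI's `raw:` URL prefix for in-memory HTML (illustrative, not part of the diff):

```python
# Sketch: request a screenshot for raw HTML; without the patch above,
# Crawl4AI 0.7.x has no _generate_screenshot_from_html to fall back on.
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_context import CacheMode

async def main() -> None:
    html = "<div id='main-container'><h1>Hello</h1></div>"
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="raw:" + html,  # assumed raw-HTML URL scheme
            config=CrawlerRunConfig(screenshot=True, cache_mode=CacheMode.BYPASS),
        )
        # On success, result.screenshot holds the base64 string returned
        # by the monkey-patched helper.
        print(result.success, bool(result.screenshot))

asyncio.run(main())
```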
@@ -65,18 +106,6 @@ class ContentRenderer:
         )
         self.template = self.env.get_template(template_name)
 
-    async def _set_content_safe(self, page, html: str, timeout_ms: int) -> bool:
-        html_size = len(html)
-        try:
-            await page.set_content(html, wait_until="networkidle", timeout=timeout_ms)
-            return True
-        except PlaywrightTimeoutError:
-            logger.warning(f"ContentRenderer: page.set_content timed out after {timeout_ms}ms (html_size={html_size})")
-            return False
-        except Exception as exc:
-            logger.warning(f"ContentRenderer: page.set_content failed (html_size={html_size}): {exc}")
-            return False
-
     def _get_icon_data_url(self, icon_name: str) -> str:
         if not icon_name:
             return ""
@@ -143,8 +172,9 @@ class ContentRenderer:
                      suggestions: List[str] = None,
                      stats: Dict[str, Any] = None,
                      references: List[Dict[str, Any]] = None,
-
+                     page_references: List[Dict[str, Any]] = None,
                      stages_used: List[Dict[str, Any]] = None,
+                     flow_steps: List[Dict[str, Any]] = None,
                      model_name: str = "",
                      provider_name: str = "Unknown",
                      behavior_summary: str = "Text Generation",
@@ -156,16 +186,25 @@ class ContentRenderer:
                      billing_info: Dict[str, Any] = None,
                      render_timeout_ms: int = 6000):
         """
-        Render markdown content to an image using
+        Render markdown content to an image using Crawl4AI (headless) and Jinja2.
         """
         render_start_time = asyncio.get_event_loop().time()
+
+        # Resolve output path early to avoid relative URI issues
+        resolved_output_path = Path(output_path).resolve()
+        resolved_output_path.parent.mkdir(parents=True, exist_ok=True)
 
         # Preprocess to fix common markdown issues
-        markdown_content = re.sub(r'(?<=\S)\n(?=\s*(\d
+        markdown_content = re.sub(r'(?<=\S)\n(?=\s*(\d+\.|\-|\*|\+) )', r'\n\n', markdown_content)
 
         # AGGRESSIVE CLEANING: Strip out "References" section and "[code]" blocks from the text
         # because we are rendering them as structured UI elements now.
 
+        # 0. Remove content before first # heading (keep the heading)
+        heading_match = re.search(r'^(#[^#])', markdown_content, re.MULTILINE)
+        if heading_match:
+            markdown_content = markdown_content[heading_match.start():]
+
         # 1. Remove "References" or "Citations" header and everything after it at the end of the file
         # Matches ### References, ## References, **References**, etc., followed by list items
         markdown_content = re.sub(r'(?i)^\s*(#{1,3}|\*\*)\s*(References|Citations|Sources).*$', '', markdown_content, flags=re.MULTILINE | re.DOTALL)
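Note: the list-repair regex above inserts the blank line that Python-Markdown needs before a list marker that directly follows text. A quick, self-contained illustration:

```python
import re

text = "Intro text\n1. first item\n2. second item"
fixed = re.sub(r'(?<=\S)\n(?=\s*(\d+\.|\-|\*|\+) )', '\n\n', text)
print(repr(fixed))
# 'Intro text\n\n1. first item\n\n2. second item'
```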
@@ -184,8 +223,11 @@ class ContentRenderer:
         math_blocks = {}
 
         def protect_math(match):
-            key = f"
-
+            key = f"MATHBLOCK{len(math_blocks)}PLACEHOLDER"
+            # Escape ONLY < and > to prevent them from being parsed as HTML tags
+            # We preserve & and other chars to avoid breaking LaTeX alignment
+            escaped_math = match.group(0).replace("<", "&lt;").replace(">", "&gt;")
+            math_blocks[key] = escaped_math
             return key
 
         # Patterns for math:
@@ -195,10 +237,10 @@ class ContentRenderer:
         # Note: We must handle multiline for $$ and \[
 
         # Regex for $$...$$
-        markdown_content = re.sub(r'\$\$(.*?)
+        markdown_content = re.sub(r'\$\$(.*?)\$\$', protect_math, markdown_content, flags=re.DOTALL)
 
         # Regex for \[...\]
-        markdown_content = re.sub(r'\\\[(.*?)\\\]
+        markdown_content = re.sub(r'\\\[(.*?)\\\]', protect_math, markdown_content, flags=re.DOTALL)
 
         # Regex for \(...\) (usually single line, but DOTALL is safest if user wraps lines)
         markdown_content = re.sub(r'\\\((.*?)\\\)', protect_math, markdown_content, flags=re.DOTALL)
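Note: protect_math swaps each math span for an opaque placeholder before markdown conversion so the converter cannot mangle LaTeX; only < and > are entity-escaped. A sketch of the round trip (the restore step happens after conversion, outside these hunks):

```python
import re

math_blocks = {}

def protect_math(match):
    key = f"MATHBLOCK{len(math_blocks)}PLACEHOLDER"
    # Escape only < and > so the HTML parser can't eat pieces of the LaTeX
    math_blocks[key] = match.group(0).replace("<", "&lt;").replace(">", "&gt;")
    return key

src = r"Energy: $$E = mc^2$$ and inline \(a < b\)."
src = re.sub(r'\$\$(.*?)\$\$', protect_math, src, flags=re.DOTALL)
src = re.sub(r'\\\((.*?)\\\)', protect_math, src, flags=re.DOTALL)
print(src)   # Energy: MATHBLOCK0PLACEHOLDER and inline MATHBLOCK1PLACEHOLDER.
for key, block in math_blocks.items():
    src = src.replace(key, block)  # restore after markdown conversion
print(src)   # Energy: $$E = mc^2$$ and inline \(a &lt; b\).
```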
@@ -227,7 +269,7 @@ class ContentRenderer:
         for i, part in enumerate(parts):
             # Check if this part is a code block containing our specific citation format
             if part.startswith('<code'):
-                # Match <code>ref:123</code>
+                # Match <code>ref:123</code>
                 # Note: attributes like class might be present if we are unlucky, but `ref:` inside usually means inline code.
 
                 # 1. Numeric: <code>ref:123</code>
@@ -236,12 +278,11 @@ class ContentRenderer:
                     citation_id = ref_match.group(1)
                     parts[i] = f'<span class="inline-flex items-center justify-center min-w-[16px] h-4 px-0.5 text-[10px] font-bold text-blue-600 bg-blue-50 border border-blue-200 rounded mx-0.5 align-top relative -top-0.5">{citation_id}</span>'
                     continue
-
-
-
-
-
-                    parts[i] = f'<span class="inline-flex items-center justify-center min-w-[16px] h-4 px-0.5 text-[10px] font-bold text-orange-600 bg-orange-50 border border-orange-200 rounded mx-0.5 align-top relative -top-0.5">{mcp_id}</span>'
+                # 2. Flow marker: <code>flow:a</code>
+                flow_match = re.match(r'^<code.*?>flow:([a-zA-Z])</code>$', part)
+                if flow_match:
+                    flow_id = flow_match.group(1).lower()
+                    parts[i] = f'<span class="inline-flex items-center justify-center min-w-[16px] h-4 px-0.5 text-[10px] font-bold text-orange-700 bg-orange-50 border border-orange-200 rounded mx-0.5 align-top relative -top-0.5">{flow_id}</span>'
                     continue
 
             # If it's NOT a code block, or a code block we didn't transform, we leave it alone.
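Note: the flow-marker branch turns single-letter <code>flow:x</code> tokens into small inline badges, replacing the deleted mcp: handling. The match itself is just:

```python
import re

part = "<code>flow:B</code>"
flow_match = re.match(r'^<code.*?>flow:([a-zA-Z])</code>$', part)
if flow_match:
    print(flow_match.group(1).lower())  # 'b' -> rendered as an orange badge span
```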
@@ -250,12 +291,12 @@ class ContentRenderer:
         content_html = "".join(parts)
 
         # Strip out the structured JSON blocks if they leaked into the content
-        # Look for <pre>... containing "
+        # Look for <pre>... containing "references" at the end
         # Make regex robust to any language class or no class
-        content_html = re.sub(r'<pre><code[^>]*>[^<]*
+        content_html = re.sub(r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$', '', content_html, flags=re.DOTALL | re.IGNORECASE)
         # Loop to remove multiple if present
-        while re.search(r'<pre><code[^>]*>[^<]*
-        content_html = re.sub(r'<pre><code[^>]*>[^<]*
+        while re.search(r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$', content_html, flags=re.DOTALL | re.IGNORECASE):
+            content_html = re.sub(r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$', '', content_html, flags=re.DOTALL | re.IGNORECASE)
 
         # --- PREPARE DATA FOR JINJA TEMPLATE ---
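Note: the anchored, case-insensitive pattern only strips a trailing <pre><code>…references…</code></pre> block, so fenced code earlier in the body survives. For example:

```python
import re

content_html = '<p>Body</p><pre><code class="language-json">{"references": []}</code></pre>'
pattern = r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$'
print(re.sub(pattern, '', content_html, flags=re.DOTALL | re.IGNORECASE))
# <p>Body</p>
```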
@@ -264,6 +305,7 @@ class ContentRenderer:
 
         # Unified Search Icon (RemixIcon)
         SEARCH_ICON = '<i class="ri-search-line text-[16px]"></i>'
+        BROWSER_ICON = '<i class="ri-global-line text-[16px]"></i>'
         DEFAULT_ICON = '<i class="ri-box-3-line text-[16px]"></i>'
 
         # Helper to infer provider/icon name from model string
@@ -301,6 +343,26 @@ class ContentRenderer:
                     "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
                 })
 
+        # 2b. Page Reference Processing (crawled pages)
+        processed_page_refs = []
+        if page_references:
+            for ref in page_references[:8]:
+                url = ref.get("url", "#")
+                try:
+                    domain = urlparse(url).netloc
+                    if domain.startswith("www."): domain = domain[4:]
+                except:
+                    domain = "unknown"
+
+                processed_page_refs.append({
+                    "title": ref.get("title", "No Title"),
+                    "url": url,
+                    "domain": domain,
+                    "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
+                })
+
+        flow_steps = flow_steps or []
+
         if stages_used:
             for stage in stages_used:
                 name = stage.get("name", "Step")
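Note: both reference lists derive a bare domain and resolve its favicon through Google's s2 endpoint:

```python
from urllib.parse import urlparse

url = "https://www.example.com/article"
domain = urlparse(url).netloc
if domain.startswith("www."):
    domain = domain[4:]
print(f"https://www.google.com/s2/favicons?domain={domain}&sz=32")
# https://www.google.com/s2/favicons?domain=example.com&sz=32
```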
@@ -310,6 +372,8 @@ class ContentRenderer:
 
                 if name == "Search":
                     icon_html = SEARCH_ICON
+                elif name == "Crawler":
+                    icon_html = BROWSER_ICON
                 else:
                     # Try to find vendor logo
                     # 1. Check explicit icon_config
@@ -342,35 +406,29 @@ class ContentRenderer:
                 # References go to "Search"
                 if name == "Search" and processed_refs:
                     stage_children['references'] = processed_refs
-
-                #
-
-
-
-                STEP_ICONS = {
-                    "navigate": '<i class="ri-compass-3-line"></i>',
-                    "snapshot": '<i class="ri-camera-lens-line"></i>',
-                    "click": '<i class="ri-cursor-fill"></i>',
-                    "type": '<i class="ri-keyboard-line"></i>',
-                    "code": '<i class="ri-code-line"></i>',
-                    "search": SEARCH_ICON,
-                    "default": '<i class="ri-arrow-right-s-line"></i>',
+
+                # Flow steps go to "Agent"
+                if name == "Agent" and flow_steps:
+                    FLOW_ICONS = {
+                        "search": SEARCH_ICON,
+                        "page": '<i class="ri-file-text-line text-[16px]"></i>',
                     }
-
+                    formatted_flow = []
+                    for step in flow_steps:
                         icon_key = step.get("icon", "").lower()
-
-
-
-                        elif "type" in icon_key or "input" in icon_key: icon_key = "type"
-                        elif "shot" in icon_key: icon_key = "snapshot"
-
-                        stage_mcp_steps.append({
-                            "name": step.get("name", "unknown"),
-                            "description": step.get("description", ""),
-                            "icon_svg": STEP_ICONS.get(icon_key, STEP_ICONS["default"])
+                        formatted_flow.append({
+                            "icon_svg": FLOW_ICONS.get(icon_key, FLOW_ICONS.get("search")),
+                            "description": step.get("description", "")
                         })
-                stage_children['
+                    stage_children['flow_steps'] = formatted_flow
+
+                # Pass through Search Queries
+                if "queries" in stage:
+                    stage_children["queries"] = stage["queries"]
+
+                # Pass through Crawled Pages
+                if "crawled_pages" in stage:
+                    stage_children["crawled_pages"] = stage["crawled_pages"]
 
             processed_stages.append({
                 "name": name,
@@ -432,14 +490,31 @@ class ContentRenderer:
             "billing_html": billing_html
         }
 
+        # 5. Feature Flags for Header Icons
+        feature_flags = {
+            "has_vision": False,
+            "has_search": False,
+        }
+
+        # Check Vision
+        if stats_dict.get("vision_duration", 0) > 0:
+            feature_flags["has_vision"] = True
+
+        # Check Search
+        if any(s.get("name") == "Search" for s in stages_used or []):
+            feature_flags["has_search"] = True
+
         # Render Template
         context = {
             "content_html": content_html,
             "suggestions": suggestions or [],
             "stages": processed_stages,
             "references": processed_refs,
+            "page_references": processed_page_refs,
             "references_json": json.dumps(references or []),
             "stats": processed_stats,
+            "flags": feature_flags,
+            "total_time": stats_dict.get("total_time", 0) or 0,
             **self.assets
         }
@@ -455,73 +530,25 @@ class ContentRenderer:
                 continue
 
             try:
-                #
+                # Use Playwright directly for crisp element screenshot (Crawl4AI already depends on it)
                 async with async_playwright() as p:
-                    # logger.info("ContentRenderer: playwright context ready, launching browser...")
                     browser = await p.chromium.launch(headless=True)
                     try:
-
-
-
-
-
-
-                            raise RuntimeError("set_content failed")
-
-                        # Wait for images with user-configured timeout (render_timeout_ms)
-                        image_timeout_sec = render_timeout_ms / 1000.0
-                        try:
-                            await asyncio.wait_for(
-                                page.evaluate("""
-                                    () => Promise.all(
-                                        Array.from(document.images).map(img => {
-                                            if (img.complete) {
-                                                if (img.naturalWidth === 0 || img.naturalHeight === 0) {
-                                                    img.style.display = 'none';
-                                                }
-                                                return Promise.resolve();
-                                            }
-                                            return new Promise((resolve) => {
-                                                img.onload = () => {
-                                                    if (img.naturalWidth === 0 || img.naturalHeight === 0) {
-                                                        img.style.display = 'none';
-                                                    }
-                                                    resolve();
-                                                };
-                                                img.onerror = () => {
-                                                    img.style.display = 'none';
-                                                    resolve();
-                                                };
-                                            });
-                                        })
-                                    )
-                                """),
-                                timeout=image_timeout_sec
-                            )
-                        except asyncio.TimeoutError:
-                            logger.warning(f"ContentRenderer: image loading timed out after {image_timeout_sec}s, continuing...")
-
-                        # Brief wait for layout to stabilize
-                        await asyncio.sleep(0.1)
-
-                        # Try element screenshot first, fallback to full page
+                        page = await browser.new_page(
+                            viewport={"width": 520, "height": 1400},
+                            device_scale_factor=3,
+                        )
+                        await page.set_content(final_html, wait_until="networkidle")
+                        await page.wait_for_timeout(150)
                         element = await page.query_selector("#main-container")
-
-
-
-
-
-                        await page.screenshot(path=output_path, full_page=True)
-                    except Exception as screenshot_exc:
-                        logger.warning(f"ContentRenderer: element screenshot failed ({screenshot_exc}), trying full page...")
-                        await page.screenshot(path=output_path, full_page=True)
-
+                        if element:
+                            await element.screenshot(path=resolved_output_path, type="jpeg", quality=98)
+                        else:
+                            await page.screenshot(path=resolved_output_path, full_page=True, type="jpeg", quality=98)
+                        return True
                     finally:
-
-
-                        except Exception as exc:
-                            logger.warning(f"ContentRenderer: failed to close browser ({exc})")
-            return True
+                        await browser.close()
+
             except Exception as exc:
                 last_exc = exc
                 logger.warning(f"ContentRenderer: render attempt {attempt}/{max_attempts} failed ({exc})")
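Note: the rewritten screenshot path drops the bespoke image-wait JavaScript in favor of networkidle plus a short settle. A standalone sketch of the same flow (final_html stands in for the rendered template output):

```python
import asyncio
from pathlib import Path
from playwright.async_api import async_playwright

async def snap(final_html: str, output_path: str) -> None:
    resolved = Path(output_path).resolve()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            # device_scale_factor=3 renders at 3x DPI for a crisp JPEG
            page = await browser.new_page(
                viewport={"width": 520, "height": 1400},
                device_scale_factor=3,
            )
            await page.set_content(final_html, wait_until="networkidle")
            await page.wait_for_timeout(150)
            element = await page.query_selector("#main-container")
            if element:
                await element.screenshot(path=str(resolved), type="jpeg", quality=98)
            else:
                await page.screenshot(path=str(resolved), full_page=True, type="jpeg", quality=98)
        finally:
            await browser.close()

asyncio.run(snap("<div id='main-container'>hi</div>", "out.jpg"))
```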
@@ -529,3 +556,41 @@ class ContentRenderer:
             content_html = None
             final_html = None
             gc.collect()
+
+        logger.error(f"ContentRenderer: render failed after {max_attempts} attempts ({last_exc})")
+        return False
+
+    async def render_models_list(
+        self,
+        models: List[Dict[str, Any]],
+        output_path: str,
+        default_base_url: str = "https://openrouter.ai/api/v1",
+        render_timeout_ms: int = 6000,
+    ) -> bool:
+        """
+        Lightweight models list renderer leveraging the main render pipeline.
+        """
+        lines = ["# 模型列表"]
+        for idx, model in enumerate(models or [], start=1):
+            name = model.get("name", "unknown")
+            base_url = model.get("base_url") or default_base_url
+            provider = model.get("provider", "")
+            lines.append(f"{idx}. **{name}** \n - base_url: {base_url} \n - provider: {provider}")
+
+        markdown_content = "\n\n".join(lines) if len(lines) > 1 else "# 模型列表\n暂无模型"
+
+        return await self.render(
+            markdown_content=markdown_content,
+            output_path=output_path,
+            suggestions=[],
+            stats={"time": 0.0},
+            references=[],
+            stages_used=[],
+            model_name="",
+            provider_name="Models",
+            behavior_summary="Model List",
+            icon_config="openai",
+            base_url=default_base_url,
+            billing_info=None,
+            render_timeout_ms=render_timeout_ms,
+        )
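Note: render_models_list just formats the model table as markdown and reuses render(). A hypothetical call site (the model dicts follow the keys the method reads):

```python
# Inside an async context:
renderer = ContentRenderer()
ok = await renderer.render_models_list(
    models=[
        {"name": "gpt-4o-mini", "provider": "openai"},
        {"name": "claude-sonnet", "provider": "anthropic",
         "base_url": "https://api.anthropic.com/v1"},
    ],
    output_path="models.jpg",
)
```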
entari_plugin_hyw/utils/browser.py
CHANGED
@@ -1,61 +1,40 @@
-import
-import urllib.parse
-from typing import Any, Optional
-
-import httpx
-import trafilatura
+from typing import Any
 from loguru import logger
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
 
 
 class BrowserTool:
-    """
+    """Crawl4AI-based page fetcher."""
 
     def __init__(self, config: Any):
         self.config = config
-        self._client: Optional[httpx.AsyncClient] = None
-
-    async def _ensure_client(self) -> httpx.AsyncClient:
-        if self._client is None:
-            timeout = httpx.Timeout(8.0)
-            self._client = httpx.AsyncClient(
-                timeout=timeout,
-                follow_redirects=True,
-                headers={
-                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-                    "Accept": "application/json,text/html;q=0.9,*/*;q=0.8",
-                    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                },
-                verify=False
-            )
-        return self._client
 
     async def navigate(self, url: str) -> str:
-        """Fetch URL content via
+        """Fetch URL content via Crawl4AI and return markdown."""
+        if not url:
+            return "Error: missing url"
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            content = html[:4000]
-            return content
+            async with AsyncWebCrawler() as crawler:
+                result = await crawler.arun(
+                    url=url,
+                    config=CrawlerRunConfig(
+                        wait_until="networkidle",
+                        wait_for_images=True,
+                        cache_mode=CacheMode.BYPASS,
+                        word_count_threshold=1,
+                        screenshot=False,
+                    ),
+                )
+                if not result.success:
+                    return f"Error navigating to {url}: {result.error_message or result.status_code}"
+
+                content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
+                return content[:8000]
         except Exception as e:
             logger.error(f"HTTP navigation failed: {e}")
             return f"Error navigating to {url}: {e}"
 
     async def close(self):
-
-        await self._client.aclose()
-        self._client = None
-
+        return None
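Note: BrowserTool keeps its navigate()/close() surface but now returns Crawl4AI markdown, truncated to 8000 characters (up from 4000 characters of raw HTML before). A hedged usage sketch (the config argument is stored but unused by navigate()):

```python
# Inside an async context:
tool = BrowserTool(config=None)
text = await tool.navigate("https://example.com")
if text.startswith("Error"):
    ...  # crawl failed; message carries error_message or status code
print(text[:200])
```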
entari_plugin_hyw/utils/playwright_tool.py
CHANGED
@@ -1,46 +1,36 @@
-import
-from typing import Any, Optional
-
-import trafilatura
+from typing import Any
 from loguru import logger
-
-
-
-except Exception:  # pragma: no cover
-    async_playwright = None
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
+from .search import get_shared_crawler
 
 
 class PlaywrightTool:
+    """
+    Backwards-compatible wrapper now powered by Crawl4AI.
+    """
     def __init__(self, config: Any):
         self.config = config
 
     async def navigate(self, url: str) -> str:
         if not url:
             return "Error: Missing url"
-        if async_playwright is None:
-            return "Error: Playwright is not available in this environment."
 
-        headless = bool(getattr(self.config, "headless", True))
         try:
-
-
-
-
-
-
-
-
-
-
-                trafilatura.extract,
-                html,
-                include_links=True,
-                include_images=True,
-                include_tables=True,
-                output_format="markdown",
+            crawler = await get_shared_crawler()
+            result = await crawler.arun(
+                url=url,
+                config=CrawlerRunConfig(
+                    wait_until="networkidle",
+                    wait_for_images=True,
+                    cache_mode=CacheMode.BYPASS,
+                    word_count_threshold=1,
+                    screenshot=False,
+                ),
             )
-
+            if not result.success:
+                return f"Error: crawl failed ({result.error_message or result.status_code})"
+            return (result.markdown or result.extracted_content or result.cleaned_html or result.html or "")[:8000]
         except Exception as e:
-            logger.warning(f"
-            return f"Error:
-
+            logger.warning(f"Crawl navigation failed: {e}")
+            return f"Error: Crawl navigation failed: {e}"