entari-plugin-hyw 3.2.113__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of entari-plugin-hyw might be problematic.

@@ -3,6 +3,7 @@ import gc
  import os
  import markdown
  import base64
+ import html  # Import html for escaping
  import mimetypes
  from datetime import datetime
  from urllib.parse import urlparse
@@ -10,9 +11,49 @@ from typing import List, Dict, Optional, Any, Union
  import re
  import json
  from pathlib import Path
- from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
  from loguru import logger
  from jinja2 import Environment, FileSystemLoader, select_autoescape
+ from crawl4ai import AsyncWebCrawler
+ from crawl4ai.async_configs import CrawlerRunConfig
+ from crawl4ai.cache_context import CacheMode
+ from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+ from playwright.async_api import async_playwright
+
+ # Patch Crawl4AI 0.7.x to support screenshots from raw/file HTML
+ async def _c4a_generate_screenshot_from_html(self, html: str) -> str:
+     """
+     Monkey-patched fallback: render arbitrary HTML to a screenshot using Playwright.
+     """
+     page, context = await self.browser_manager.get_page(
+         crawlerRunConfig=CrawlerRunConfig(
+             adjust_viewport_to_content=True,
+             wait_until="networkidle",
+             wait_for_images=True,
+             cache_mode=CacheMode.BYPASS,
+         )
+     )
+     try:
+         try:
+             await page.set_viewport_size({"width": 520, "height": 1200})
+         except Exception:
+             pass
+         await page.set_content(html, wait_until="networkidle")
+         await page.wait_for_timeout(150)
+         element = await page.query_selector("#main-container")
+         if element:
+             screenshot_bytes = await element.screenshot()
+         else:
+             screenshot_bytes = await page.screenshot(full_page=True)
+         import base64 as _b64
+         return _b64.b64encode(screenshot_bytes).decode()
+     finally:
+         try:
+             await context.close()
+         except Exception:
+             pass
+
+ if not hasattr(AsyncPlaywrightCrawlerStrategy, "_generate_screenshot_from_html"):
+     AsyncPlaywrightCrawlerStrategy._generate_screenshot_from_html = _c4a_generate_screenshot_from_html

  class ContentRenderer:
      def __init__(self, template_path: str = None):
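With the patch installed, Crawl4AI's `raw:` URLs can produce screenshots too. A minimal sketch of how the fallback would be exercised (the `raw:` scheme and the `result.screenshot` field follow Crawl4AI's documented conventions; the exact invocation here is an assumption, not code from this package):

```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_context import CacheMode

async def snapshot_raw_html(html: str) -> str:
    # screenshot=True on a raw: URL is exactly the path the monkey-patch enables
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=f"raw:{html}",
            config=CrawlerRunConfig(screenshot=True, cache_mode=CacheMode.BYPASS),
        )
    return result.screenshot or ""  # base64-encoded image on success

b64 = asyncio.run(snapshot_raw_html('<div id="main-container">hello</div>'))
print(len(b64))
```

The `hasattr` guard keeps the patch from clobbering a future Crawl4AI release that ships its own `_generate_screenshot_from_html`.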
@@ -65,18 +106,6 @@ class ContentRenderer:
          )
          self.template = self.env.get_template(template_name)

-     async def _set_content_safe(self, page, html: str, timeout_ms: int) -> bool:
-         html_size = len(html)
-         try:
-             await page.set_content(html, wait_until="networkidle", timeout=timeout_ms)
-             return True
-         except PlaywrightTimeoutError:
-             logger.warning(f"ContentRenderer: page.set_content timed out after {timeout_ms}ms (html_size={html_size})")
-             return False
-         except Exception as exc:
-             logger.warning(f"ContentRenderer: page.set_content failed (html_size={html_size}): {exc}")
-             return False
-
      def _get_icon_data_url(self, icon_name: str) -> str:
          if not icon_name:
              return ""
@@ -143,8 +172,9 @@ class ContentRenderer:
              suggestions: List[str] = None,
              stats: Dict[str, Any] = None,
              references: List[Dict[str, Any]] = None,
-             mcp_steps: List[Dict[str, Any]] = None,
+             page_references: List[Dict[str, Any]] = None,
              stages_used: List[Dict[str, Any]] = None,
+             flow_steps: List[Dict[str, Any]] = None,
              model_name: str = "",
              provider_name: str = "Unknown",
              behavior_summary: str = "Text Generation",
@@ -156,16 +186,25 @@ class ContentRenderer:
              billing_info: Dict[str, Any] = None,
              render_timeout_ms: int = 6000):
          """
-         Render markdown content to an image using Playwright and Jinja2.
+         Render markdown content to an image using Crawl4AI (headless) and Jinja2.
          """
          render_start_time = asyncio.get_event_loop().time()
+
+         # Resolve the output path early to avoid relative URI issues
+         resolved_output_path = Path(output_path).resolve()
+         resolved_output_path.parent.mkdir(parents=True, exist_ok=True)

          # Preprocess to fix common markdown issues
-         markdown_content = re.sub(r'(?<=\S)\n(?=\s*(\d+\.|[-*+]) )', r'\n\n', markdown_content)
+         markdown_content = re.sub(r'(?<=\S)\n(?=\s*(\d+\.|\-|\*|\+) )', r'\n\n', markdown_content)

          # AGGRESSIVE CLEANING: strip the "References" section and "[code]" blocks from the text,
          # because we render them as structured UI elements now.

+         # 0. Remove content before the first # heading (keep the heading)
+         heading_match = re.search(r'^(#[^#])', markdown_content, re.MULTILINE)
+         if heading_match:
+             markdown_content = markdown_content[heading_match.start():]
+
          # 1. Remove a "References" or "Citations" header and everything after it to the end of the file.
          # Matches ### References, ## References, **References**, etc., followed by list items
          markdown_content = re.sub(r'(?i)^\s*(#{1,3}|\*\*)\s*(References|Citations|Sources).*$', '', markdown_content, flags=re.MULTILINE | re.DOTALL)
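The rewritten list regex spells out `\-`, `\*`, and `\+` instead of the character class `[-*+]`; behavior is equivalent. A standalone check (not package code) of what the pass does:

```python
import re

# Force a blank line before list items that directly follow text, so the
# markdown parser recognizes them as lists rather than plain continuation lines.
src = "Intro line\n1. first\n- second"
fixed = re.sub(r'(?<=\S)\n(?=\s*(\d+\.|\-|\*|\+) )', '\n\n', src)
print(fixed)
# Intro line
#
# 1. first
#
# - second
```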
@@ -184,8 +223,11 @@ class ContentRenderer:
          math_blocks = {}

          def protect_math(match):
-             key = f"__MATH_BLOCK_{len(math_blocks)}__"
-             math_blocks[key] = match.group(0)
+             key = f"MATHBLOCK{len(math_blocks)}PLACEHOLDER"
+             # Escape ONLY < and > to prevent them from being parsed as HTML tags.
+             # We preserve & and other chars to avoid breaking LaTeX alignment
+             escaped_math = match.group(0).replace("<", "&lt;").replace(">", "&gt;")
+             math_blocks[key] = escaped_math
              return key

          # Patterns for math:
@@ -195,10 +237,10 @@ class ContentRenderer:
          # Note: we must handle multiline for $$ and \[

          # Regex for $$...$$
-         markdown_content = re.sub(r'\$\$(.*?)\$\$\s*', protect_math, markdown_content, flags=re.DOTALL)
+         markdown_content = re.sub(r'\$\$(.*?)\$\$', protect_math, markdown_content, flags=re.DOTALL)

          # Regex for \[...\]
-         markdown_content = re.sub(r'\\\[(.*?)\\\]\s*', protect_math, markdown_content, flags=re.DOTALL)
+         markdown_content = re.sub(r'\\\[(.*?)\\\]', protect_math, markdown_content, flags=re.DOTALL)

          # Regex for \(...\) (usually single-line, but DOTALL is safest if the user wraps lines)
          markdown_content = re.sub(r'\\\((.*?)\\\)', protect_math, markdown_content, flags=re.DOTALL)
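The placeholder switch matters because the old `__MATH_BLOCK_0__` form contains double underscores, which the markdown pass can interpret as bold markers and mangle. A self-contained round trip (the restore step is assumed; this hunk only shows the protect side):

```python
import re

math_blocks = {}

def protect_math(match):
    # Alphanumeric placeholder survives markdown conversion untouched
    key = f"MATHBLOCK{len(math_blocks)}PLACEHOLDER"
    math_blocks[key] = match.group(0).replace("<", "&lt;").replace(">", "&gt;")
    return key

text = r"Compare $$a < b$$ and inline \(x > y\)."
text = re.sub(r'\$\$(.*?)\$\$', protect_math, text, flags=re.DOTALL)
text = re.sub(r'\\\((.*?)\\\)', protect_math, text, flags=re.DOTALL)
# ... markdown-to-HTML conversion would run here ...
for key, block in math_blocks.items():  # restore (assumed to happen after conversion)
    text = text.replace(key, block)
print(text)  # Compare $$a &lt; b$$ and inline \(x &gt; y\).
```

Dropping the trailing `\s*` from the `$$...$$` and `\[...\]` patterns also stops the placeholder from swallowing the whitespace after a math block, which previously could glue the following text onto it.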
@@ -227,7 +269,7 @@ class ContentRenderer:
              for i, part in enumerate(parts):
                  # Check if this part is a code block containing our specific citation format
                  if part.startswith('<code'):
-                     # Match <code>ref:123</code> or <code>mcp:abc</code>
+                     # Match <code>ref:123</code>
                      # Note: attributes like class might be present if we are unlucky, but `ref:` inside usually means inline code.

                      # 1. Numeric: <code>ref:123</code>
@@ -236,12 +278,11 @@ class ContentRenderer:
                          citation_id = ref_match.group(1)
                          parts[i] = f'<span class="inline-flex items-center justify-center min-w-[16px] h-4 px-0.5 text-[10px] font-bold text-blue-600 bg-blue-50 border border-blue-200 rounded mx-0.5 align-top relative -top-0.5">{citation_id}</span>'
                          continue
-
-                     # 2. Alpha: <code>mcp:abc</code>
-                     mcp_match = re.match(r'^<code.*?>mcp:([a-zA-Z]+)</code>$', part)
-                     if mcp_match:
-                         mcp_id = mcp_match.group(1)
-                         parts[i] = f'<span class="inline-flex items-center justify-center min-w-[16px] h-4 px-0.5 text-[10px] font-bold text-orange-600 bg-orange-50 border border-orange-200 rounded mx-0.5 align-top relative -top-0.5">{mcp_id}</span>'
+                     # 2. Flow marker: <code>flow:a</code>
+                     flow_match = re.match(r'^<code.*?>flow:([a-zA-Z])</code>$', part)
+                     if flow_match:
+                         flow_id = flow_match.group(1).lower()
+                         parts[i] = f'<span class="inline-flex items-center justify-center min-w-[16px] h-4 px-0.5 text-[10px] font-bold text-orange-700 bg-orange-50 border border-orange-200 rounded mx-0.5 align-top relative -top-0.5">{flow_id}</span>'
                      continue

                  # If it's NOT a code block, or a code block we didn't transform, we leave it alone.
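A quick standalone check of the new flow-marker rewrite (the badge class string is abbreviated here; the full one appears in the diff above):

```python
import re

part = '<code class="language-x">flow:B</code>'
m = re.match(r'^<code.*?>flow:([a-zA-Z])</code>$', part)
if m:
    flow_id = m.group(1).lower()  # single letter, normalized to lowercase
    print(f'<span class="...orange badge classes...">{flow_id}</span>')
# -> <span class="...orange badge classes...">b</span>
```

Note the pattern accepts exactly one letter (`[a-zA-Z]`), where the old `mcp:` variant took a whole word (`[a-zA-Z]+`).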
@@ -250,12 +291,12 @@ class ContentRenderer:
          content_html = "".join(parts)

          # Strip out the structured JSON blocks if they leaked into the content
-         # Look for <pre>... containing "mcp_steps" or "references" at the end
+         # Look for <pre>... containing "references" at the end
          # Make the regex robust to any language class or no class
-         content_html = re.sub(r'<pre><code[^>]*>[^<]*(mcp_steps|references)[^<]*</code></pre>\s*$', '', content_html, flags=re.DOTALL | re.IGNORECASE)
+         content_html = re.sub(r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$', '', content_html, flags=re.DOTALL | re.IGNORECASE)
          # Loop to remove multiple if present
-         while re.search(r'<pre><code[^>]*>[^<]*(mcp_steps|references)[^<]*</code></pre>\s*$', content_html, flags=re.DOTALL | re.IGNORECASE):
-             content_html = re.sub(r'<pre><code[^>]*>[^<]*(mcp_steps|references)[^<]*</code></pre>\s*$', '', content_html, flags=re.DOTALL | re.IGNORECASE)
+         while re.search(r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$', content_html, flags=re.DOTALL | re.IGNORECASE):
+             content_html = re.sub(r'<pre><code[^>]*>[^<]*references[^<]*</code></pre>\s*$', '', content_html, flags=re.DOTALL | re.IGNORECASE)

          # --- PREPARE DATA FOR JINJA TEMPLATE ---
@@ -264,6 +305,7 @@ class ContentRenderer:

          # Unified Search Icon (RemixIcon)
          SEARCH_ICON = '<i class="ri-search-line text-[16px]"></i>'
+         BROWSER_ICON = '<i class="ri-global-line text-[16px]"></i>'
          DEFAULT_ICON = '<i class="ri-box-3-line text-[16px]"></i>'

          # Helper to infer provider/icon name from the model string
@@ -301,6 +343,26 @@ class ContentRenderer:
                      "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
                  })

+         # 2b. Page Reference Processing (crawled pages)
+         processed_page_refs = []
+         if page_references:
+             for ref in page_references[:8]:
+                 url = ref.get("url", "#")
+                 try:
+                     domain = urlparse(url).netloc
+                     if domain.startswith("www."):
+                         domain = domain[4:]
+                 except Exception:
+                     domain = "unknown"
+
+                 processed_page_refs.append({
+                     "title": ref.get("title", "No Title"),
+                     "url": url,
+                     "domain": domain,
+                     "favicon_url": f"https://www.google.com/s2/favicons?domain={domain}&sz=32"
+                 })
+
+         flow_steps = flow_steps or []
+
          if stages_used:
              for stage in stages_used:
                  name = stage.get("name", "Step")
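The same domain/favicon logic now appears twice (web references and crawled pages); extracted as a helper it reads like this (a sketch, not a refactor the diff performs):

```python
from urllib.parse import urlparse

def favicon_for(url: str) -> str:
    # Strip a leading "www." and point at Google's s2 favicon service
    try:
        domain = urlparse(url).netloc or "unknown"
        if domain.startswith("www."):
            domain = domain[4:]
    except Exception:
        domain = "unknown"
    return f"https://www.google.com/s2/favicons?domain={domain}&sz=32"

print(favicon_for("https://www.example.com/a/b"))
# https://www.google.com/s2/favicons?domain=example.com&sz=32
```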
@@ -310,6 +372,8 @@ class ContentRenderer:

              if name == "Search":
                  icon_html = SEARCH_ICON
+             elif name == "Crawler":
+                 icon_html = BROWSER_ICON
              else:
                  # Try to find a vendor logo
                  # 1. Check explicit icon_config
@@ -342,35 +406,29 @@ class ContentRenderer:
              # References go to "Search"
              if name == "Search" and processed_refs:
                  stage_children['references'] = processed_refs
-
-             # MCP Steps go to "Agent"
-             # Process MCP steps here for the template
-             stage_mcp_steps = []
-             if name == "Agent" and mcp_steps:
-                 # RemixIcon Mapping
-                 STEP_ICONS = {
-                     "navigate": '<i class="ri-compass-3-line"></i>',
-                     "snapshot": '<i class="ri-camera-lens-line"></i>',
-                     "click": '<i class="ri-cursor-fill"></i>',
-                     "type": '<i class="ri-keyboard-line"></i>',
-                     "code": '<i class="ri-code-line"></i>',
-                     "search": SEARCH_ICON,
-                     "default": '<i class="ri-arrow-right-s-line"></i>',
+
+             # Flow steps go to "Agent"
+             if name == "Agent" and flow_steps:
+                 FLOW_ICONS = {
+                     "search": SEARCH_ICON,
+                     "page": '<i class="ri-file-text-line text-[16px]"></i>',
                  }
-                 for step in mcp_steps:
+                 formatted_flow = []
+                 for step in flow_steps:
                      icon_key = step.get("icon", "").lower()
-                     if "search" in icon_key: icon_key = "search"
-                     elif "nav" in icon_key or "visit" in icon_key: icon_key = "navigate"
-                     elif "click" in icon_key: icon_key = "click"
-                     elif "type" in icon_key or "input" in icon_key: icon_key = "type"
-                     elif "shot" in icon_key: icon_key = "snapshot"
-
-                     stage_mcp_steps.append({
-                         "name": step.get("name", "unknown"),
-                         "description": step.get("description", ""),
-                         "icon_svg": STEP_ICONS.get(icon_key, STEP_ICONS["default"])
+                     formatted_flow.append({
+                         "icon_svg": FLOW_ICONS.get(icon_key, FLOW_ICONS.get("search")),
+                         "description": step.get("description", "")
                      })
-                 stage_children['mcp_steps'] = stage_mcp_steps
+                 stage_children['flow_steps'] = formatted_flow
+
+             # Pass through Search Queries
+             if "queries" in stage:
+                 stage_children["queries"] = stage["queries"]
+
+             # Pass through Crawled Pages
+             if "crawled_pages" in stage:
+                 stage_children["crawled_pages"] = stage["crawled_pages"]

              processed_stages.append({
                  "name": name,
@@ -432,14 +490,31 @@ class ContentRenderer:
              "billing_html": billing_html
          }

+         # 5. Feature Flags for Header Icons
+         feature_flags = {
+             "has_vision": False,
+             "has_search": False,
+         }
+
+         # Check Vision
+         if stats_dict.get("vision_duration", 0) > 0:
+             feature_flags["has_vision"] = True
+
+         # Check Search
+         if any(s.get("name") == "Search" for s in stages_used or []):
+             feature_flags["has_search"] = True
+
          # Render Template
          context = {
              "content_html": content_html,
              "suggestions": suggestions or [],
              "stages": processed_stages,
              "references": processed_refs,
+             "page_references": processed_page_refs,
              "references_json": json.dumps(references or []),
              "stats": processed_stats,
+             "flags": feature_flags,
+             "total_time": stats_dict.get("total_time", 0) or 0,
              **self.assets
          }

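How the new `flags` and `total_time` context keys might be consumed: the real Jinja template ships with the package and is not part of this diff, so the fragment below is purely illustrative:

```python
from jinja2 import Environment, DictLoader

env = Environment(loader=DictLoader({
    "card.html": (
        "{% if flags.has_vision %}[vision]{% endif %}"
        "{% if flags.has_search %}[search]{% endif %} "
        "{{ '%.1f'|format(total_time) }}s"
    )
}))
print(env.get_template("card.html").render(
    flags={"has_vision": False, "has_search": True},
    total_time=3.21,
))  # [search] 3.2s
```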
@@ -455,73 +530,25 @@ class ContentRenderer:
                  continue

              try:
-                 # logger.info("ContentRenderer: launching playwright...")
+                 # Use Playwright directly for a crisp element screenshot (Crawl4AI already depends on it)
                  async with async_playwright() as p:
-                     # logger.info("ContentRenderer: playwright context ready, launching browser...")
                      browser = await p.chromium.launch(headless=True)
                      try:
-                         # Use device_scale_factor=2 for high-DPI rendering (better quality)
-                         page = await browser.new_page(viewport={"width": 450, "height": 1200}, device_scale_factor=2)
-
-                         # Set content (10s timeout to handle slow CDN loading)
-                         set_ok = await self._set_content_safe(page, final_html, 10000)
-                         if not set_ok or page.is_closed():
-                             raise RuntimeError("set_content failed")
-
-                         # Wait for images with the user-configured timeout (render_timeout_ms)
-                         image_timeout_sec = render_timeout_ms / 1000.0
-                         try:
-                             await asyncio.wait_for(
-                                 page.evaluate("""
-                                     () => Promise.all(
-                                         Array.from(document.images).map(img => {
-                                             if (img.complete) {
-                                                 if (img.naturalWidth === 0 || img.naturalHeight === 0) {
-                                                     img.style.display = 'none';
-                                                 }
-                                                 return Promise.resolve();
-                                             }
-                                             return new Promise((resolve) => {
-                                                 img.onload = () => {
-                                                     if (img.naturalWidth === 0 || img.naturalHeight === 0) {
-                                                         img.style.display = 'none';
-                                                     }
-                                                     resolve();
-                                                 };
-                                                 img.onerror = () => {
-                                                     img.style.display = 'none';
-                                                     resolve();
-                                                 };
-                                             });
-                                         })
-                                     )
-                                 """),
-                                 timeout=image_timeout_sec
-                             )
-                         except asyncio.TimeoutError:
-                             logger.warning(f"ContentRenderer: image loading timed out after {image_timeout_sec}s, continuing...")
-
-                         # Brief wait for layout to stabilize
-                         await asyncio.sleep(0.1)
-
-                         # Try element screenshot first, fallback to full page
+                         page = await browser.new_page(
+                             viewport={"width": 520, "height": 1400},
+                             device_scale_factor=3,
+                         )
+                         await page.set_content(final_html, wait_until="networkidle")
+                         await page.wait_for_timeout(150)
                          element = await page.query_selector("#main-container")
-
-                         try:
-                             if element:
-                                 await element.screenshot(path=output_path)
-                             else:
-                                 await page.screenshot(path=output_path, full_page=True)
-                         except Exception as screenshot_exc:
-                             logger.warning(f"ContentRenderer: element screenshot failed ({screenshot_exc}), trying full page...")
-                             await page.screenshot(path=output_path, full_page=True)
-
+                         if element:
+                             await element.screenshot(path=resolved_output_path, type="jpeg", quality=98)
+                         else:
+                             await page.screenshot(path=resolved_output_path, full_page=True, type="jpeg", quality=98)
+                         return True
                      finally:
-                         try:
-                             await browser.close()
-                         except Exception as exc:
-                             logger.warning(f"ContentRenderer: failed to close browser ({exc})")
-                         return True
+                         await browser.close()
+
              except Exception as exc:
                  last_exc = exc
                  logger.warning(f"ContentRenderer: render attempt {attempt}/{max_attempts} failed ({exc})")
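The new render path drops the custom image-wait JavaScript and `_set_content_safe` in favor of `networkidle` plus a short settle delay. The core technique, isolated as a runnable sketch using the same Playwright calls as above:

```python
import asyncio
from playwright.async_api import async_playwright

async def card_screenshot(html: str, path: str) -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page(
                viewport={"width": 520, "height": 1400},
                device_scale_factor=3,  # 3x DPI keeps text crisp in the JPEG
            )
            await page.set_content(html, wait_until="networkidle")
            await page.wait_for_timeout(150)  # brief settle for layout/fonts
            element = await page.query_selector("#main-container")
            if element:
                await element.screenshot(path=path, type="jpeg", quality=98)
            else:
                await page.screenshot(path=path, full_page=True, type="jpeg", quality=98)
        finally:
            await browser.close()

asyncio.run(card_screenshot('<div id="main-container">hello</div>', "card.jpg"))
```

Switching from the implicit PNG default to JPEG at quality 98 trades a little fidelity for substantially smaller files, which matters when the rendered card is pushed over a chat protocol.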
@@ -529,3 +556,41 @@ class ContentRenderer:
                  content_html = None
                  final_html = None
                  gc.collect()
+
+         logger.error(f"ContentRenderer: render failed after {max_attempts} attempts ({last_exc})")
+         return False
+
+     async def render_models_list(
+         self,
+         models: List[Dict[str, Any]],
+         output_path: str,
+         default_base_url: str = "https://openrouter.ai/api/v1",
+         render_timeout_ms: int = 6000,
+     ) -> bool:
+         """
+         Lightweight models list renderer leveraging the main render pipeline.
+         """
+         lines = ["# 模型列表"]
+         for idx, model in enumerate(models or [], start=1):
+             name = model.get("name", "unknown")
+             base_url = model.get("base_url") or default_base_url
+             provider = model.get("provider", "")
+             lines.append(f"{idx}. **{name}** \n - base_url: {base_url} \n - provider: {provider}")
+
+         markdown_content = "\n\n".join(lines) if len(lines) > 1 else "# 模型列表\n暂无模型"
+
+         return await self.render(
+             markdown_content=markdown_content,
+             output_path=output_path,
+             suggestions=[],
+             stats={"time": 0.0},
+             references=[],
+             stages_used=[],
+             model_name="",
+             provider_name="Models",
+             behavior_summary="Model List",
+             icon_config="openai",
+             base_url=default_base_url,
+             billing_info=None,
+             render_timeout_ms=render_timeout_ms,
+         )
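A hypothetical call site for the new helper (model dicts and file name assumed, not taken from the package):

```python
# Inside an async function:
renderer = ContentRenderer()
ok = await renderer.render_models_list(
    models=[
        {"name": "gpt-4o-mini", "provider": "openai"},
        {"name": "claude-sonnet", "provider": "anthropic",
         "base_url": "https://api.anthropic.com/v1"},
    ],
    output_path="models.jpg",
)
```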
@@ -1,3 +1,2 @@
- from .browser import BrowserTool
- from .prompts import AGENT_SYSTEM_PROMPT
+ from .prompts import AGENT_SP
  from .misc import process_onebot_json, process_images
@@ -1,61 +1,40 @@
- import asyncio
- import urllib.parse
- from typing import Any, Optional
-
- import httpx
- import trafilatura
+ from typing import Any
  from loguru import logger
+ from crawl4ai import AsyncWebCrawler
+ from crawl4ai.async_configs import CrawlerRunConfig
+ from crawl4ai.cache_context import CacheMode


  class BrowserTool:
-     """Simple HTTP fetcher for search and page content."""
+     """Crawl4AI-based page fetcher."""

      def __init__(self, config: Any):
          self.config = config
-         self._client: Optional[httpx.AsyncClient] = None
-
-     async def _ensure_client(self) -> httpx.AsyncClient:
-         if self._client is None:
-             timeout = httpx.Timeout(8.0)
-             self._client = httpx.AsyncClient(
-                 timeout=timeout,
-                 follow_redirects=True,
-                 headers={
-                     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-                     "Accept": "application/json,text/html;q=0.9,*/*;q=0.8",
-                     "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                 },
-                 verify=False
-             )
-         return self._client

      async def navigate(self, url: str) -> str:
-         """Fetch URL content via HTTP and extract markdown."""
+         """Fetch URL content via Crawl4AI and return markdown."""
+         if not url:
+             return "Error: missing url"
          try:
-             client = await self._ensure_client()
-             resp = await client.get(url)
-             if resp.status_code >= 400:
-                 logger.error(f"HTTP navigation failed status={resp.status_code} url={url}")
-                 return f"Error navigating to {url}: {resp.status_code}"
-
-             html = resp.text
-             content = await asyncio.to_thread(
-                 trafilatura.extract,
-                 html,
-                 include_links=True,
-                 include_images=True,
-                 include_tables=True,
-                 output_format="markdown",
-             )
-             if not content:
-                 content = html[:4000]
-             return content
+             async with AsyncWebCrawler() as crawler:
+                 result = await crawler.arun(
+                     url=url,
+                     config=CrawlerRunConfig(
+                         wait_until="networkidle",
+                         wait_for_images=True,
+                         cache_mode=CacheMode.BYPASS,
+                         word_count_threshold=1,
+                         screenshot=False,
+                     ),
+                 )
+                 if not result.success:
+                     return f"Error navigating to {url}: {result.error_message or result.status_code}"
+
+                 content = result.markdown or result.extracted_content or result.cleaned_html or result.html or ""
+                 return content[:8000]
          except Exception as e:
              logger.error(f"HTTP navigation failed: {e}")
              return f"Error navigating to {url}: {e}"

      async def close(self):
-         if self._client:
-             await self._client.aclose()
-             self._client = None
-
+         return None
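A hypothetical round trip with the rewritten tool (`BrowserTool.__init__` only stores `config`, so any object works here):

```python
import asyncio

async def main() -> None:
    tool = BrowserTool(config=None)
    text = await tool.navigate("https://example.com")
    print(text[:200])  # markdown, capped at 8000 chars by navigate()
    await tool.close()  # now a no-op; Crawl4AI's context manager owns the browser

asyncio.run(main())
```

Note that `navigate()` now spins up a fresh `AsyncWebCrawler` per call; the shared-crawler variant in the next file avoids that cost.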
@@ -1,46 +1,36 @@
- import asyncio
- from typing import Any, Optional
-
- import trafilatura
+ from typing import Any
  from loguru import logger
-
- try:
-     from playwright.async_api import async_playwright
- except Exception:  # pragma: no cover
-     async_playwright = None
+ from crawl4ai.async_configs import CrawlerRunConfig
+ from crawl4ai.cache_context import CacheMode
+ from .search import get_shared_crawler


  class PlaywrightTool:
+     """
+     Backwards-compatible wrapper now powered by Crawl4AI.
+     """
      def __init__(self, config: Any):
          self.config = config

      async def navigate(self, url: str) -> str:
          if not url:
              return "Error: Missing url"
-         if async_playwright is None:
-             return "Error: Playwright is not available in this environment."

-         headless = bool(getattr(self.config, "headless", True))
          try:
-             async with async_playwright() as p:
-                 browser = await p.chromium.launch(headless=headless)
-                 context = await browser.new_context()
-                 page = await context.new_page()
-                 await page.goto(url, wait_until="domcontentloaded", timeout=15000)
-                 html = await page.content()
-                 await context.close()
-                 await browser.close()
-
-             content = await asyncio.to_thread(
-                 trafilatura.extract,
-                 html,
-                 include_links=True,
-                 include_images=True,
-                 include_tables=True,
-                 output_format="markdown",
+             crawler = await get_shared_crawler()
+             result = await crawler.arun(
+                 url=url,
+                 config=CrawlerRunConfig(
+                     wait_until="networkidle",
+                     wait_for_images=True,
+                     cache_mode=CacheMode.BYPASS,
+                     word_count_threshold=1,
+                     screenshot=False,
+                 ),
              )
-             return content or html[:4000]
+             if not result.success:
+                 return f"Error: crawl failed ({result.error_message or result.status_code})"
+             return (result.markdown or result.extracted_content or result.cleaned_html or result.html or "")[:8000]
          except Exception as e:
-             logger.warning(f"Playwright navigation failed: {e}")
-             return f"Error: Playwright navigation failed: {e}"
-
+             logger.warning(f"Crawl navigation failed: {e}")
+             return f"Error: Crawl navigation failed: {e}"
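`get_shared_crawler` is imported from `.search`, which is outside this diff. A plausible shape for it, assuming Crawl4AI's explicit `start()` lifecycle (this is a guess at the helper, not its actual source):

```python
from typing import Optional
from crawl4ai import AsyncWebCrawler

_shared_crawler: Optional[AsyncWebCrawler] = None

async def get_shared_crawler() -> AsyncWebCrawler:
    # Lazily create one crawler and reuse its browser across navigate() calls,
    # instead of launching a fresh browser per page.
    global _shared_crawler
    if _shared_crawler is None:
        _shared_crawler = AsyncWebCrawler()
        await _shared_crawler.start()
    return _shared_crawler
```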