entari-plugin-hyw 4.0.0rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (99) hide show
  1. entari_plugin_hyw/__init__.py +532 -0
  2. entari_plugin_hyw/assets/card-dist/index.html +387 -0
  3. entari_plugin_hyw/assets/card-dist/logos/anthropic.svg +1 -0
  4. entari_plugin_hyw/assets/card-dist/logos/cerebras.svg +9 -0
  5. entari_plugin_hyw/assets/card-dist/logos/deepseek.png +0 -0
  6. entari_plugin_hyw/assets/card-dist/logos/gemini.svg +1 -0
  7. entari_plugin_hyw/assets/card-dist/logos/google.svg +1 -0
  8. entari_plugin_hyw/assets/card-dist/logos/grok.png +0 -0
  9. entari_plugin_hyw/assets/card-dist/logos/huggingface.png +0 -0
  10. entari_plugin_hyw/assets/card-dist/logos/microsoft.svg +15 -0
  11. entari_plugin_hyw/assets/card-dist/logos/minimax.png +0 -0
  12. entari_plugin_hyw/assets/card-dist/logos/mistral.png +0 -0
  13. entari_plugin_hyw/assets/card-dist/logos/nvida.png +0 -0
  14. entari_plugin_hyw/assets/card-dist/logos/openai.svg +1 -0
  15. entari_plugin_hyw/assets/card-dist/logos/openrouter.png +0 -0
  16. entari_plugin_hyw/assets/card-dist/logos/perplexity.svg +24 -0
  17. entari_plugin_hyw/assets/card-dist/logos/qwen.png +0 -0
  18. entari_plugin_hyw/assets/card-dist/logos/xai.png +0 -0
  19. entari_plugin_hyw/assets/card-dist/logos/xiaomi.png +0 -0
  20. entari_plugin_hyw/assets/card-dist/logos/zai.png +0 -0
  21. entari_plugin_hyw/assets/card-dist/vite.svg +1 -0
  22. entari_plugin_hyw/assets/icon/anthropic.svg +1 -0
  23. entari_plugin_hyw/assets/icon/cerebras.svg +9 -0
  24. entari_plugin_hyw/assets/icon/deepseek.png +0 -0
  25. entari_plugin_hyw/assets/icon/gemini.svg +1 -0
  26. entari_plugin_hyw/assets/icon/google.svg +1 -0
  27. entari_plugin_hyw/assets/icon/grok.png +0 -0
  28. entari_plugin_hyw/assets/icon/huggingface.png +0 -0
  29. entari_plugin_hyw/assets/icon/microsoft.svg +15 -0
  30. entari_plugin_hyw/assets/icon/minimax.png +0 -0
  31. entari_plugin_hyw/assets/icon/mistral.png +0 -0
  32. entari_plugin_hyw/assets/icon/nvida.png +0 -0
  33. entari_plugin_hyw/assets/icon/openai.svg +1 -0
  34. entari_plugin_hyw/assets/icon/openrouter.png +0 -0
  35. entari_plugin_hyw/assets/icon/perplexity.svg +24 -0
  36. entari_plugin_hyw/assets/icon/qwen.png +0 -0
  37. entari_plugin_hyw/assets/icon/xai.png +0 -0
  38. entari_plugin_hyw/assets/icon/xiaomi.png +0 -0
  39. entari_plugin_hyw/assets/icon/zai.png +0 -0
  40. entari_plugin_hyw/browser/__init__.py +10 -0
  41. entari_plugin_hyw/browser/engines/base.py +13 -0
  42. entari_plugin_hyw/browser/engines/bing.py +95 -0
  43. entari_plugin_hyw/browser/engines/searxng.py +137 -0
  44. entari_plugin_hyw/browser/landing.html +172 -0
  45. entari_plugin_hyw/browser/manager.py +153 -0
  46. entari_plugin_hyw/browser/service.py +275 -0
  47. entari_plugin_hyw/card-ui/.gitignore +24 -0
  48. entari_plugin_hyw/card-ui/README.md +5 -0
  49. entari_plugin_hyw/card-ui/index.html +16 -0
  50. entari_plugin_hyw/card-ui/package-lock.json +2342 -0
  51. entari_plugin_hyw/card-ui/package.json +31 -0
  52. entari_plugin_hyw/card-ui/public/logos/anthropic.svg +1 -0
  53. entari_plugin_hyw/card-ui/public/logos/cerebras.svg +9 -0
  54. entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
  55. entari_plugin_hyw/card-ui/public/logos/gemini.svg +1 -0
  56. entari_plugin_hyw/card-ui/public/logos/google.svg +1 -0
  57. entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
  58. entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
  59. entari_plugin_hyw/card-ui/public/logos/microsoft.svg +15 -0
  60. entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
  61. entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
  62. entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
  63. entari_plugin_hyw/card-ui/public/logos/openai.svg +1 -0
  64. entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
  65. entari_plugin_hyw/card-ui/public/logos/perplexity.svg +24 -0
  66. entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
  67. entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
  68. entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
  69. entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
  70. entari_plugin_hyw/card-ui/public/vite.svg +1 -0
  71. entari_plugin_hyw/card-ui/src/App.vue +756 -0
  72. entari_plugin_hyw/card-ui/src/assets/vue.svg +1 -0
  73. entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +41 -0
  74. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +382 -0
  75. entari_plugin_hyw/card-ui/src/components/SectionCard.vue +41 -0
  76. entari_plugin_hyw/card-ui/src/components/StageCard.vue +240 -0
  77. entari_plugin_hyw/card-ui/src/main.ts +5 -0
  78. entari_plugin_hyw/card-ui/src/style.css +29 -0
  79. entari_plugin_hyw/card-ui/src/test_regex.js +103 -0
  80. entari_plugin_hyw/card-ui/src/types.ts +61 -0
  81. entari_plugin_hyw/card-ui/tsconfig.app.json +16 -0
  82. entari_plugin_hyw/card-ui/tsconfig.json +7 -0
  83. entari_plugin_hyw/card-ui/tsconfig.node.json +26 -0
  84. entari_plugin_hyw/card-ui/vite.config.ts +16 -0
  85. entari_plugin_hyw/definitions.py +130 -0
  86. entari_plugin_hyw/history.py +248 -0
  87. entari_plugin_hyw/image_cache.py +274 -0
  88. entari_plugin_hyw/misc.py +135 -0
  89. entari_plugin_hyw/modular_pipeline.py +351 -0
  90. entari_plugin_hyw/render_vue.py +401 -0
  91. entari_plugin_hyw/search.py +116 -0
  92. entari_plugin_hyw/stage_base.py +88 -0
  93. entari_plugin_hyw/stage_instruct.py +328 -0
  94. entari_plugin_hyw/stage_instruct_review.py +92 -0
  95. entari_plugin_hyw/stage_summary.py +164 -0
  96. entari_plugin_hyw-4.0.0rc5.dist-info/METADATA +116 -0
  97. entari_plugin_hyw-4.0.0rc5.dist-info/RECORD +99 -0
  98. entari_plugin_hyw-4.0.0rc5.dist-info/WHEEL +5 -0
  99. entari_plugin_hyw-4.0.0rc5.dist-info/top_level.txt +1 -0
@@ -0,0 +1,401 @@
1
+ """
2
+ Vue-based Card Renderer (DrissionPage-based)
3
+
4
+ Renders content to image using the shared DrissionPage browser.
5
+ Wraps synchronous DrissionPage operations in a thread pool.
6
+ """
7
+
8
+ import json
9
+ import asyncio
10
+ from pathlib import Path
11
+ from typing import List, Dict, Any, Optional
12
+ from concurrent.futures import ThreadPoolExecutor
13
+
14
+ from loguru import logger
15
+ from .browser.manager import SharedBrowserManager
16
+
17
+
18
class ContentRenderer:
    """Render markdown cards to images through the shared DrissionPage browser.

    The DrissionPage calls used here are synchronous, so every public
    coroutine hands the real work to a private ``*_sync`` method running on
    this instance's thread pool.
    """

    # Fixed viewport width for every render; the height is fitted to content.
    _VIEWPORT_WIDTH = 1920
    # Extra pixels added below the measured content height to avoid clipping.
    _VIEWPORT_PADDING = 200
    _SCROLL_HEIGHT_JS = 'return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);'

    def __init__(self, template_path: Optional[str] = None, auto_start: bool = True, headless: bool = True):
        """Load the Vue card template and optionally attach to the browser.

        Args:
            template_path: Path to the built ``index.html`` template. Defaults
                to ``assets/card-dist/index.html`` next to this module.
            auto_start: When true, obtain the shared browser manager right
                away (synchronously, on the calling thread).
            headless: Passed through to the shared browser manager.

        Raises:
            FileNotFoundError: If the template file does not exist.
        """
        self.headless = headless

        if template_path is None:
            template_path = Path(__file__).parent / "assets" / "card-dist" / "index.html"

        self.template_path = Path(template_path)
        if not self.template_path.exists():
            raise FileNotFoundError(f"Vue template not found: {self.template_path}")

        self.template_content = self.template_path.read_text(encoding="utf-8")
        logger.info(f"ContentRenderer: loaded Vue template ({len(self.template_content)} bytes)")

        self._manager = None
        self._executor = ThreadPoolExecutor(max_workers=10)  # Enough for batch crawls
        self._render_tab = None

        if auto_start:
            self._ensure_manager()

    def _ensure_manager(self):
        """Ensure the shared browser manager has been obtained."""
        if not self._manager:
            from .browser.manager import get_shared_browser_manager
            self._manager = get_shared_browser_manager(headless=self.headless)

    async def start(self, timeout: int = 6000):
        """Obtain the browser manager on the thread pool.

        ``timeout`` is accepted for interface compatibility but is not used.
        """
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(self._executor, self._ensure_manager)

    async def prepare_tab(self) -> str:
        """Create and warm up a render tab on the thread pool; return its ID."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(self._executor, self._prepare_tab_sync)

    def _prepare_tab_sync(self) -> str:
        """Open the template in a new tab, push warm-up data, return the tab ID.

        Raises:
            Exception: Any error from the browser is logged and re-raised. A
                tab that was already opened is closed first so it is not
                leaked.
        """
        import time

        started = time.time()
        self._ensure_manager()
        tab = None
        try:
            tab = self._manager.new_tab(self.template_path.as_uri())
            tab_id = tab.tab_id

            # Basic wait for the page to load.
            tab.wait(1)

            # Pre-warm the Vue app with a minimal payload.
            warmup_data = {
                "markdown": "# Ready",
                "total_time": 0,
                "stages": [],
                "references": [],
                "stats": {},
                "theme_color": "#ef4444",
            }

            if tab.ele('#app', timeout=5):
                tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")

            elapsed = time.time() - started
            logger.info(f"ContentRenderer: Prepared tab {tab_id} in {elapsed:.2f}s")
            return tab_id
        except Exception as e:
            logger.error(f"ContentRenderer: Failed to prepare tab: {e}")
            if tab is not None:
                # The caller never learns this tab's ID, so close it here.
                self._close_tab_quietly(tab)
            raise

    async def render_pages_batch(
        self,
        pages: List[Dict[str, Any]],
        theme_color: str = "#ef4444"
    ) -> List[Optional[str]]:
        """Render multiple page markdown contents to images concurrently.

        Args:
            pages: List of dicts with 'title', 'content', 'url' keys.
            theme_color: Theme color for rendering.

        Returns:
            One entry per page whose tab could be prepared, in input order:
            a base64-encoded JPG string, or ``None`` when that render failed.
            Pages whose tab preparation failed are omitted, so the result may
            be shorter than ``pages``. Returns ``[]`` when ``pages`` is empty
            or no tab could be prepared.
        """
        if not pages:
            return []

        loop = asyncio.get_running_loop()

        # Prepare one tab per page concurrently.
        logger.info(f"ContentRenderer: Preparing {len(pages)} tabs for batch render")
        tab_ids = await asyncio.gather(
            *(loop.run_in_executor(self._executor, self._prepare_tab_sync) for _ in pages),
            return_exceptions=True,
        )

        # Keep the caller's page index so later log lines name the right page.
        prepared = []
        for index, (page, tab_id) in enumerate(zip(pages, tab_ids)):
            if isinstance(tab_id, BaseException):
                logger.warning(f"ContentRenderer: Failed to prepare tab for page {index}: {tab_id}")
            else:
                prepared.append((index, page, tab_id))

        if not prepared:
            return []

        # Render concurrently; each worker closes its own tab when done.
        results = await asyncio.gather(
            *(
                loop.run_in_executor(
                    self._executor,
                    self._render_page_to_b64_sync,
                    page,
                    tab_id,
                    theme_color,
                )
                for _, page, tab_id in prepared
            ),
            return_exceptions=True,
        )

        screenshots = []
        for (index, _page, _tab_id), res in zip(prepared, results):
            if isinstance(res, BaseException):
                logger.warning(f"ContentRenderer: Batch render error for page {index}: {res}")
                screenshots.append(None)
            else:
                screenshots.append(res)

        logger.info(f"ContentRenderer: Batch rendered {len([s for s in screenshots if s])} pages")
        return screenshots

    @staticmethod
    def _close_tab_quietly(tab) -> None:
        """Close ``tab`` on a best-effort basis; failures are logged, not raised."""
        try:
            tab.close()
        except Exception as e:
            logger.debug(f"ContentRenderer: ignoring error while closing tab: {e}")

    def _fit_viewport_to_content(self, tab) -> None:
        """Resize the emulated viewport to the content height, then hide overflow."""
        # Measure the actual content height to prevent clipping.
        scroll_height = tab.run_js(self._SCROLL_HEIGHT_JS)
        viewport_height = int(scroll_height) + self._VIEWPORT_PADDING

        tab.run_cdp(
            'Emulation.setDeviceMetricsOverride',
            width=self._VIEWPORT_WIDTH, height=viewport_height, deviceScaleFactor=1, mobile=False
        )

        # The viewport is now tall enough, so overflow:hidden will not clip.
        tab.run_js('document.documentElement.style.overflow = "hidden"')
        tab.run_js('document.body.style.overflow = "hidden"')

    def _screenshot_element(self, tab, element):
        """Hide scrollbars, screenshot ``element`` as base64 JPG, restore scrollbars."""
        # Scrollbar hiding is delegated to the shared manager helper.
        SharedBrowserManager.hide_scrollbars(tab)

        # Force root styles to eliminate gutter and ensure full width.
        tab.run_js('document.documentElement.style.overflow = "hidden";')
        tab.run_js('document.body.style.overflow = "hidden";')
        tab.run_js('document.documentElement.style.scrollbarGutter = "unset";')
        tab.run_js('document.documentElement.style.width = "100%";')

        b64_img = element.get_screenshot(as_base64='jpg')

        # Restoring scrollbars is optional: render tabs are closed afterwards.
        try:
            tab.set.scroll_bars(True)
        except Exception as e:
            logger.debug(f"ContentRenderer: could not restore scrollbars: {e}")
        return b64_img

    def _render_page_to_b64_sync(
        self,
        page_data: Dict[str, Any],
        tab_id: str,
        theme_color: str
    ) -> Optional[str]:
        """Render a single page's markdown to a base64 image.

        Returns ``None`` when the tab cannot be found or rendering fails. The
        tab is always closed before returning.
        """
        tab = None
        try:
            self._ensure_manager()
            browser_page = self._manager.page

            try:
                tab = browser_page.get_tab(tab_id)
            except Exception:
                return None

            if not tab:
                return None

            # Build render data for this page.
            markdown = f"# {page_data.get('title', 'Page')}\n\n{page_data.get('content', '')}"

            render_data = {
                "markdown": markdown,
                "total_time": 0,
                "stages": [],
                "references": [],
                "page_references": [],
                "image_references": [],
                "stats": {},
                "theme_color": theme_color,
            }

            # 1. Update data and let the page settle.
            tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
            tab.wait(0.5)

            # 2. Resize to the content and hide overflow.
            self._fit_viewport_to_content(tab)

            # 3. Capture the main container, or fall back to the viewport.
            main_ele = tab.ele('#main-container', timeout=3)
            if main_ele:
                return self._screenshot_element(tab, main_ele)
            return tab.get_screenshot(as_base64='jpg', full_page=False)

        except Exception as e:
            logger.error(f"ContentRenderer: Failed to render page: {e}")
            return None
        finally:
            if tab:
                self._close_tab_quietly(tab)

    async def render(
        self,
        markdown_content: str,
        output_path: str,
        tab_id: Optional[str] = None,
        stats: Dict[str, Any] = None,
        references: List[Dict[str, Any]] = None,
        page_references: List[Dict[str, Any]] = None,
        image_references: List[Dict[str, Any]] = None,
        stages_used: List[Dict[str, Any]] = None,
        theme_color: str = "#ef4444",
        **kwargs
    ) -> bool:
        """Render content to an image file using a pre-warmed tab or a new one.

        Extra keyword arguments are accepted and ignored. Returns ``True`` on
        success and ``False`` when rendering failed.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            self._executor,
            self._render_sync,
            markdown_content,
            output_path,
            tab_id,
            stats,
            references,
            page_references,
            image_references,
            stages_used,
            theme_color
        )

    def _render_sync(
        self,
        markdown_content: str,
        output_path: str,
        tab_id: Optional[str],
        stats: Dict[str, Any],
        references: List[Dict[str, Any]],
        page_references: List[Dict[str, Any]],
        image_references: List[Dict[str, Any]],
        stages_used: List[Dict[str, Any]],
        theme_color: str
    ) -> bool:
        """Synchronous render implementation behind :meth:`render`.

        Writes the screenshot to ``output_path``, creating parent directories
        as needed. The tab is always closed before returning.
        """
        import base64

        tab = None

        try:
            self._ensure_manager()
            page = self._manager.page

            if tab_id:
                try:
                    tab = page.get_tab(tab_id)
                except Exception:
                    tab = None
                if not tab:
                    # Only warn when a pre-warmed tab was actually requested.
                    logger.warning("ContentRenderer: Pre-warmed tab not found, creating new.")

            if not tab:
                tab = page.new_tab(self.template_path.as_uri())
                tab.wait(0.5)

            resolved_output_path = Path(output_path).resolve()
            resolved_output_path.parent.mkdir(parents=True, exist_ok=True)

            # ``stats`` may arrive as a list of dicts; only the first is used.
            stats_dict = stats[0] if isinstance(stats, list) and stats else (stats or {})

            render_data = {
                "markdown": markdown_content,
                "total_time": stats_dict.get("total_time", 0) or 0,
                "stages": stages_used or [],
                "references": references or [],
                "page_references": page_references or [],
                "image_references": image_references or [],
                "stats": stats_dict,
                "theme_color": theme_color,
            }

            tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")

            # Brief settle wait for masonry/images.
            tab.wait(0.6)

            self._fit_viewport_to_content(tab)

            main_ele = tab.ele('#main-container', timeout=5)
            if main_ele:
                b64_img = self._screenshot_element(tab, main_ele)
                with open(str(resolved_output_path), 'wb') as f:
                    f.write(base64.b64decode(b64_img))
            else:
                logger.warning("ContentRenderer: #main-container not found, using fallback")
                tab.get_screenshot(path=str(resolved_output_path.parent), name=resolved_output_path.name, full_page=True)

            return True
        except Exception as e:
            logger.error(f"ContentRenderer: Render failed: {e}")
            return False
        finally:
            if tab:
                self._close_tab_quietly(tab)

    async def close(self):
        """Shut down the thread pool (without waiting) and drop any held tab."""
        self._executor.shutdown(wait=False)
        if self._render_tab:
            self._close_tab_quietly(self._render_tab)
            self._render_tab = None
383
+
384
+
385
# Module-wide renderer instance; created lazily by get_content_renderer()
# or installed explicitly through set_global_renderer().
_content_renderer: Optional[ContentRenderer] = None


async def get_content_renderer() -> ContentRenderer:
    """Return the module-wide ContentRenderer, creating and starting it on first use."""
    global _content_renderer
    if _content_renderer is not None:
        return _content_renderer

    # Publish the instance before awaiting start(), as later callers rely on
    # the global being set once construction has happened.
    _content_renderer = ContentRenderer()
    await _content_renderer.start()
    return _content_renderer
396
+
397
+
398
def set_global_renderer(renderer: ContentRenderer):
    """Install ``renderer`` as the module-wide renderer instance.

    Later calls to ``get_content_renderer()`` return this instance instead of
    constructing a new one.
    """
    global _content_renderer
    _content_renderer = renderer
@@ -0,0 +1,116 @@
1
+ import asyncio
2
+ import urllib.parse
3
+ import re
4
+ import time
5
+ from typing import List, Dict, Any, Optional
6
+ from loguru import logger
7
+
8
+ from .browser.service import get_screenshot_service
9
+ # New engines
10
+ from .browser.engines.bing import BingEngine
11
+ from .browser.engines.searxng import SearXNGEngine
12
+
13
class SearchService:
    """Run web searches and page fetches through the shared browser service.

    Settings are read from ``config`` with ``getattr`` defaults, so any object
    (or one missing some attributes) is accepted.
    """

    def __init__(self, config: Any):
        """Read settings from ``config`` and pick the search engine.

        The engine is ``BingEngine`` when ``config.search_engine`` is "bing"
        (case-insensitive, also the default when unset or empty); any other
        value selects ``SearXNGEngine``.
        """
        self.config = config
        self._headless = getattr(config, "headless", True)
        self._fetch_timeout = getattr(config, "fetch_timeout", 20.0)
        self._default_limit = getattr(config, "search_limit", 10)

        # Domain blocking
        self._blocked_domains = getattr(config, "blocked_domains", []) or []

        # Select engine. ``or "bing"`` covers an attribute explicitly set to
        # None or "", which would otherwise break the ``.lower()`` call.
        self._engine_name = str(getattr(config, "search_engine", "bing") or "bing").lower()
        if self._engine_name == "bing":
            self._engine = BingEngine()
        else:
            self._engine = SearXNGEngine()

        logger.info(f"SearchService initialized with engine: {self._engine_name}")

    def _build_search_url(self, query: str) -> str:
        """Build the engine-specific search URL for ``query``."""
        return self._engine.build_url(query, self._default_limit)

    async def search_batch(self, queries: List[str]) -> List[List[Dict[str, Any]]]:
        """Execute multiple searches concurrently; results follow input order."""
        tasks = [self.search(q) for q in queries]
        return await asyncio.gather(*tasks)

    async def search(self, query: str) -> List[Dict[str, Any]]:
        """Main search entry point.

        Returns:
            ``[]`` for an empty query. Otherwise a list whose first item is
            the raw search page (``type == "search_raw_page"``, not hidden),
            followed by the engine's parsed results. On failure, a single
            hidden error item of the same type is returned instead.
        """
        if not query:
            return []

        # Apply domain blocking unless the query already carries exclusions.
        final_query = query
        enable_blocking = getattr(self.config, "enable_domain_blocking", True)
        if enable_blocking and self._blocked_domains and "-site:" not in query:
            exclusions = " ".join(f"-site:{d}" for d in self._blocked_domains)
            final_query = f"{query} {exclusions}"

        url = self._build_search_url(final_query)
        logger.info(f"Search: '{query}' -> {url}")

        results = []
        try:
            # Search parsing doesn't need a screenshot, only HTML.
            page_data = await self.fetch_page_raw(url, include_screenshot=False)
            # The trailing ``or ""`` keeps content a string even when both
            # keys hold None, so the slicing below cannot raise.
            content = page_data.get("html", "") or page_data.get("content", "") or ""

            # 1. Add raw page item (always), so the raw search page can be
            #    saved for debugging.
            raw_item = {
                "title": f"Raw Search: {query}",
                "url": url,
                "content": content,  # Keep original content
                "type": "search_raw_page",  # Special type for history
                "_hidden": False,  # Unhidden to allow LLM access if needed
                "query": query,
                "images": page_data.get("images", [])
            }
            results.append(raw_item)

            # 2. Parse results
            if content and not content.startswith("Error"):
                parsed = self._engine.parse(content)
                logger.info(f"Search parsed {len(parsed)} results for '{query}' using {self._engine_name}")
                results.extend(parsed)
            else:
                logger.warning(f"Search failed/empty for '{query}': {content[:100]}")

            return results

        except Exception as e:
            logger.error(f"Search error for '{query}': {e}")
            # Return at least an error item, with the same keys as the raw
            # item so consumers can treat both uniformly.
            return [{
                "title": f"Error Search: {query}",
                "url": url,
                "content": f"Error: {e}",
                "type": "search_raw_page",
                "_hidden": True,
                "query": query,
                "images": []
            }]

    async def fetch_pages_batch(self, urls: List[str], include_screenshot: bool = True) -> List[Dict[str, Any]]:
        """Fetch multiple pages concurrently; results follow input order.

        An exception from any single fetch propagates to the caller.
        """
        tasks = [self.fetch_page(u, include_screenshot=include_screenshot) for u in urls]
        return await asyncio.gather(*tasks)

    async def fetch_page(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
        """Fetch a single page for reading/extracting content.

        ``timeout`` defaults to the configured fetch timeout.
        """
        if timeout is None:
            timeout = self._fetch_timeout
        return await self.fetch_page_raw(url, timeout, include_screenshot=include_screenshot)

    async def fetch_page_raw(self, url: str, timeout: Optional[float] = None, include_screenshot: bool = True) -> Dict[str, Any]:
        """Internal: get raw page data from the browser screenshot service."""
        if timeout is None:
            timeout = self._fetch_timeout
        service = get_screenshot_service(headless=self._headless)
        return await service.fetch_page(url, timeout=timeout, include_screenshot=include_screenshot)
@@ -0,0 +1,88 @@
1
+ """
2
+ Stage Base Classes
3
+
4
+ Abstract base classes for pipeline stages.
5
+ Each stage is a self-contained unit of work.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from openai import AsyncOpenAI
13
+
14
+
15
@dataclass
class StageContext:
    """Mutable state handed from one pipeline stage to the next."""

    # --- Request inputs ---
    user_input: str
    images: List[str] = field(default_factory=list)
    conversation_history: List[Dict] = field(default_factory=list)
    # History for Instruct stage rounds
    instruct_history: List[Dict] = field(default_factory=list)

    # --- Data accumulated while stages run ---
    web_results: List[Dict] = field(default_factory=list)
    agent_context: str = ""
    # Context passed from Instruct to Review stage
    review_context: str = ""

    # --- Mode info (set by Instruct stage) ---
    task_list: List[str] = field(default_factory=list)

    # --- Control flags ---
    should_refuse: bool = False
    refuse_reason: str = ""

    # --- Counter backing next_id(), for unified referencing ---
    global_id_counter: int = 0

    def next_id(self) -> int:
        """Advance the shared counter by one and return the new value."""
        allocated = self.global_id_counter + 1
        self.global_id_counter = allocated
        return allocated
42
+
43
+
44
@dataclass
class StageResult:
    """Outcome of running a single pipeline stage."""

    # Whether the stage completed successfully.
    success: bool
    # Stage-specific payload.
    data: Dict[str, Any] = field(default_factory=dict)
    # Token accounting; every instance starts with its own zeroed counters.
    usage: Dict[str, int] = field(default_factory=lambda: dict(input_tokens=0, output_tokens=0))
    # Trace information recorded by the stage.
    trace: Dict[str, Any] = field(default_factory=dict)
    # Error description, or None when no error was recorded.
    error: Optional[str] = None
52
+
53
+
54
class BaseStage(ABC):
    """Common interface and shared dependencies for every pipeline stage."""

    def __init__(self, config: Any, search_service: Any, client: AsyncOpenAI):
        """Store the configuration, search service and default API client."""
        self.config = config
        self.search_service = search_service
        self.client = client

    @property
    @abstractmethod
    def name(self) -> str:
        """Stage name for logging and tracing."""

    @abstractmethod
    async def execute(self, context: StageContext) -> StageResult:
        """Run the stage.

        Args:
            context: Shared context with accumulated data.

        Returns:
            StageResult with success status, data, usage, and trace info.
        """

    def _client_for(self, api_key: Optional[str], base_url: Optional[str]) -> AsyncOpenAI:
        """Return the default client, or a new one when overrides are given.

        A missing override falls back to the matching value on ``self.config``.
        """
        no_overrides = not (api_key or base_url)
        if no_overrides:
            return self.client

        return AsyncOpenAI(
            base_url=base_url or self.config.base_url,
            api_key=api_key or self.config.api_key,
        )