entari-plugin-hyw 0.3.5__py3-none-any.whl → 4.0.0rc14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (78) hide show
  1. entari_plugin_hyw/Untitled-1 +1865 -0
  2. entari_plugin_hyw/__init__.py +979 -116
  3. entari_plugin_hyw/filters.py +83 -0
  4. entari_plugin_hyw/history.py +251 -0
  5. entari_plugin_hyw/misc.py +214 -0
  6. entari_plugin_hyw/search_cache.py +154 -0
  7. entari_plugin_hyw-4.0.0rc14.dist-info/METADATA +118 -0
  8. entari_plugin_hyw-4.0.0rc14.dist-info/RECORD +72 -0
  9. {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/WHEEL +1 -1
  10. {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/top_level.txt +1 -0
  11. hyw_core/__init__.py +94 -0
  12. hyw_core/agent.py +768 -0
  13. hyw_core/browser_control/__init__.py +63 -0
  14. hyw_core/browser_control/assets/card-dist/index.html +425 -0
  15. hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +1 -0
  16. hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +9 -0
  17. hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
  18. hyw_core/browser_control/assets/card-dist/logos/gemini.svg +1 -0
  19. hyw_core/browser_control/assets/card-dist/logos/google.svg +1 -0
  20. hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
  21. hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
  22. hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +15 -0
  23. hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
  24. hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
  25. hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
  26. hyw_core/browser_control/assets/card-dist/logos/openai.svg +1 -0
  27. hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
  28. hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +24 -0
  29. hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
  30. hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
  31. hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
  32. hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
  33. hyw_core/browser_control/assets/card-dist/vite.svg +1 -0
  34. hyw_core/browser_control/assets/index.html +5691 -0
  35. hyw_core/browser_control/assets/logos/anthropic.svg +1 -0
  36. hyw_core/browser_control/assets/logos/cerebras.svg +9 -0
  37. hyw_core/browser_control/assets/logos/deepseek.png +0 -0
  38. hyw_core/browser_control/assets/logos/gemini.svg +1 -0
  39. hyw_core/browser_control/assets/logos/google.svg +1 -0
  40. hyw_core/browser_control/assets/logos/grok.png +0 -0
  41. hyw_core/browser_control/assets/logos/huggingface.png +0 -0
  42. hyw_core/browser_control/assets/logos/microsoft.svg +15 -0
  43. hyw_core/browser_control/assets/logos/minimax.png +0 -0
  44. hyw_core/browser_control/assets/logos/mistral.png +0 -0
  45. hyw_core/browser_control/assets/logos/nvida.png +0 -0
  46. hyw_core/browser_control/assets/logos/openai.svg +1 -0
  47. hyw_core/browser_control/assets/logos/openrouter.png +0 -0
  48. hyw_core/browser_control/assets/logos/perplexity.svg +24 -0
  49. hyw_core/browser_control/assets/logos/qwen.png +0 -0
  50. hyw_core/browser_control/assets/logos/xai.png +0 -0
  51. hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
  52. hyw_core/browser_control/assets/logos/zai.png +0 -0
  53. hyw_core/browser_control/engines/__init__.py +15 -0
  54. hyw_core/browser_control/engines/base.py +13 -0
  55. hyw_core/browser_control/engines/default.py +166 -0
  56. hyw_core/browser_control/engines/duckduckgo.py +171 -0
  57. hyw_core/browser_control/landing.html +172 -0
  58. hyw_core/browser_control/manager.py +173 -0
  59. hyw_core/browser_control/renderer.py +446 -0
  60. hyw_core/browser_control/service.py +940 -0
  61. hyw_core/config.py +154 -0
  62. hyw_core/core.py +462 -0
  63. hyw_core/crawling/__init__.py +18 -0
  64. hyw_core/crawling/completeness.py +437 -0
  65. hyw_core/crawling/models.py +88 -0
  66. hyw_core/definitions.py +104 -0
  67. hyw_core/image_cache.py +274 -0
  68. hyw_core/pipeline.py +502 -0
  69. hyw_core/search.py +171 -0
  70. hyw_core/stages/__init__.py +21 -0
  71. hyw_core/stages/base.py +95 -0
  72. hyw_core/stages/summary.py +191 -0
  73. entari_plugin_hyw/agent.py +0 -419
  74. entari_plugin_hyw/compressor.py +0 -59
  75. entari_plugin_hyw/tools.py +0 -236
  76. entari_plugin_hyw/vision.py +0 -35
  77. entari_plugin_hyw-0.3.5.dist-info/METADATA +0 -112
  78. entari_plugin_hyw-0.3.5.dist-info/RECORD +0 -9
@@ -0,0 +1,437 @@
1
+ """
2
+ Page Completeness Checker
3
+
4
+ Multi-signal page readiness detection with focus on image loading guarantees.
5
+ Implements Crawl4AI's wait_for_images and networkidle concepts.
6
+ """
7
+
8
+ import time
9
+ from typing import Any, Optional, Dict
10
+ from dataclasses import dataclass
11
+ from loguru import logger
12
+
13
+ from .models import CrawlConfig, CompletenessResult
14
+
15
+
# CSP-compliant JavaScript for image loading verification.
# Based on Crawl4AI's wait_for_images logic.
#
# The template carries exactly two printf-style placeholders, filled in this
# order by CompletenessChecker._check_images:
#   1. %d - minimum rendered size in pixels; smaller images are skipped
#   2. %s - JavaScript boolean literal for "scan the full page"
# Keep every other literal percent sign out of the template, otherwise the
# Python %-formatting step will fail.
#
# The snippet evaluates to an object with the keys total, loaded, failed,
# placeholders and details.
IMAGE_CHECK_JS = """
(() => {
    const results = {
        total: 0,
        loaded: 0,
        failed: 0,
        placeholders: 0,
        details: []
    };

    const imgs = Array.from(document.querySelectorAll('img'));
    const viewportHeight = window.innerHeight;
    const minSize = %d; // Injected from Python

    // img.src is always reported as an absolute URL, so a lazy-load attribute
    // has to be resolved against the document base before the two are
    // compared. Without this a relative data-src could never match and the
    // image would be counted as a placeholder forever.
    const resolveUrl = (value) => {
        if (!value) {
            return '';
        }
        try {
            return new URL(value, document.baseURI).href;
        } catch (e) {
            return value;
        }
    };

    for (const img of imgs) {
        // Skip tiny images (icons, tracking pixels)
        if (img.clientWidth < minSize && img.clientHeight < minSize) {
            continue;
        }

        // Skip images outside viewport (unless scan_full_page)
        const rect = img.getBoundingClientRect();
        const inViewport = rect.top < viewportHeight * 2 && rect.bottom > -viewportHeight;

        if (!inViewport && !%s) { // scan_full_page injected
            continue;
        }

        results.total++;

        const src = img.src || img.getAttribute('data-src') || '';
        const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
            img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
        const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
        const loadingAttr = img.getAttribute('loading') || '';

        // Enhanced placeholder detection for blurred preview images (like mcmod.cn)
        const isPlaceholder = (
            // 1. data-src exists but not yet loaded into src
            (dataSrc && img.src !== resolveUrl(dataSrc)) ||
            // 2. Natural size much smaller than display size (blurred placeholder)
            (img.naturalWidth < 50 && img.clientWidth > 100) ||
            (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
            // 3. Common placeholder keywords in src
            src.includes('placeholder') ||
            src.includes('loading') ||
            src.includes('blank') ||
            // 4. SVG placeholder or 1x1 tracking pixel
            src.startsWith('data:image/svg+xml') ||
            (img.naturalWidth === 1 && img.naturalHeight === 1) ||
            // 5. Lazy-loading class indicators (common patterns)
            className.includes('lazy') ||
            className.includes('lazyload') ||
            className.includes('lozad') ||
            className.includes('b-lazy') ||
            // 6. Blur indicators (common for LQIP - Low Quality Image Placeholder)
            className.includes('blur') ||
            src.includes('blur') ||
            src.includes('thumb') ||
            src.includes('thumbnail') ||
            // 7. loading="lazy" + not complete (browser native lazy loading)
            (loadingAttr === 'lazy' && !img.complete) ||
            // 8. CSS blur filter applied (visual blurring)
            (window.getComputedStyle(img).filter || '').includes('blur')
        );

        if (isPlaceholder) {
            results.placeholders++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'placeholder',
                natural: [img.naturalWidth, img.naturalHeight],
                display: [img.clientWidth, img.clientHeight]
            });
            continue;
        }

        // Check if fully loaded
        if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
            results.loaded++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'loaded',
                natural: [img.naturalWidth, img.naturalHeight]
            });
        } else {
            results.failed++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'pending',
                complete: img.complete,
                natural: [img.naturalWidth, img.naturalHeight]
            });
        }
    }

    return results;
})()
"""
117
+
# JavaScript for the height stability check.
# Evaluates to an object with two keys:
#   height - the largest of the scroll/offset heights of <body> and the root
#            element (0 when neither is available)
#   ready  - true once document.readyState is 'complete'
# document.body can be null (for example in documents without a body
# element), so every element access is guarded instead of throwing.
HEIGHT_CHECK_JS = """
(() => {
    const body = document.body;
    const root = document.documentElement;
    return {
        height: Math.max(
            (body && body.scrollHeight) || 0,
            (root && root.scrollHeight) || 0,
            (body && body.offsetHeight) || 0,
            (root && root.offsetHeight) || 0
        ),
        ready: document.readyState === 'complete'
    };
})()
"""
132
+
133
+
class CompletenessChecker:
    """
    Multi-signal page completeness verification.

    Signals checked:
    1. Image Loading (naturalWidth > 0, not placeholder)
    2. Height Stability (no layout shifts)
    3. DOM Ready State

    Usage:
        checker = CompletenessChecker(CrawlConfig())
        result = checker.check(tab)

        # Or wait until complete
        result = checker.wait_for_complete(tab)
    """

    def __init__(self, config: Optional[CrawlConfig] = None):
        """Store the crawl configuration, falling back to CrawlConfig() defaults."""
        self._config = config or CrawlConfig()

    def check(self, tab: Any) -> CompletenessResult:
        """
        Perform a single completeness check.

        Args:
            tab: DrissionPage tab object

        Returns:
            CompletenessResult with all signals. ``height_stable`` and
            ``network_idle`` are always reported as True here because a single
            sample cannot measure either of them.
        """
        # Monotonic clock: the value is only used to measure a duration.
        start = time.monotonic()

        img_result = self._check_images(tab)
        height_result = self._check_height(tab)

        # Complete if: no image is pending or a placeholder AND the DOM is ready
        total_pending = img_result.get('failed', 0) + img_result.get('placeholders', 0)
        all_images_loaded = total_pending == 0 or img_result.get('total', 0) == 0

        return CompletenessResult(
            is_complete=all_images_loaded and height_result.get('ready', False),
            total_images=img_result.get('total', 0),
            loaded_images=img_result.get('loaded', 0),
            failed_images=img_result.get('failed', 0),
            placeholder_images=img_result.get('placeholders', 0),
            height=height_result.get('height', 0),
            height_stable=True,  # Single check can't determine stability
            network_idle=True,  # TODO: network monitoring
            check_duration=time.monotonic() - start
        )

    def wait_for_complete(
        self,
        tab: Any,
        timeout: Optional[float] = None
    ) -> CompletenessResult:
        """
        Wait for page to be fully loaded with image guarantees.

        Uses multi-signal approach:
        1. Poll image loading status
        2. Track height stability
        3. Respect timeout

        Args:
            tab: DrissionPage tab object
            timeout: Max wait time in seconds. A falsy value (None or 0)
                selects ``image_load_timeout`` from the config.

        Returns:
            CompletenessResult (may not be complete if timeout). When the
            timeout leaves no room for even one check, an all-zero result
            with ``is_complete=False`` is returned.
        """
        timeout = timeout or self._config.image_load_timeout
        start = time.monotonic()

        height_history = []
        stable_count = 0
        last_result = None
        height_stable = False

        logger.debug(f"CompletenessChecker: Starting wait (timeout={timeout}s)")

        while time.monotonic() - start < timeout:
            result = self.check(tab)
            last_result = result

            # Keep a sliding window of the most recent height samples
            height_history.append(result.height)
            if len(height_history) > self._config.height_stability_checks:
                height_history.pop(0)

            height_stable = self._is_height_stable(height_history)

            elapsed = time.monotonic() - start
            logger.debug(
                f"CompletenessChecker: [{elapsed:.1f}s] "
                f"images={result.loaded_images}/{result.total_images} "
                f"pending={result.failed_images} "
                f"placeholders={result.placeholder_images} "
                f"height={result.height} stable={height_stable}"
            )

            all_loaded = (
                result.failed_images == 0 and
                result.placeholder_images == 0
            )

            if all_loaded and height_stable:
                # Require several consecutive good polls before declaring success
                stable_count += 1
                if stable_count >= self._config.image_stability_checks:
                    logger.info(
                        f"CompletenessChecker: Page complete! "
                        f"{result.loaded_images} images loaded, "
                        f"height={result.height}, "
                        f"took {elapsed:.2f}s"
                    )
                    return CompletenessResult(
                        is_complete=True,
                        total_images=result.total_images,
                        loaded_images=result.loaded_images,
                        failed_images=0,
                        placeholder_images=0,
                        height=result.height,
                        height_stable=True,
                        network_idle=True,
                        check_duration=elapsed
                    )
            else:
                stable_count = 0

            # Wait before next check
            time.sleep(self._config.image_check_interval)

        # Timeout reached
        elapsed = time.monotonic() - start

        if last_result is None:
            # The loop body never ran (non-positive timeout), so there is no
            # measurement to report; do not touch last_result here.
            logger.warning(
                f"CompletenessChecker: Timeout after {elapsed:.1f}s before any check ran"
            )
            return CompletenessResult(
                is_complete=False,
                total_images=0, loaded_images=0, failed_images=0, placeholder_images=0,
                height=0, height_stable=False, network_idle=False,
                check_duration=elapsed
            )

        logger.warning(
            f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
            f"images={last_result.loaded_images}/{last_result.total_images} "
            f"pending={last_result.failed_images}"
        )

        # Report the last measurement; height stability uses the same
        # threshold rule as the polling loop.
        return CompletenessResult(
            is_complete=False,
            total_images=last_result.total_images,
            loaded_images=last_result.loaded_images,
            failed_images=last_result.failed_images,
            placeholder_images=last_result.placeholder_images,
            height=last_result.height,
            height_stable=height_stable,
            network_idle=True,
            check_duration=elapsed
        )

    def _is_height_stable(self, height_history) -> bool:
        """Return True when the sampled heights stay within the configured threshold."""
        if len(height_history) < self._config.height_stability_checks:
            return False
        if not height_history:
            # height_stability_checks <= 0: no samples are required
            return True
        spread = max(height_history) - min(height_history)
        return spread <= self._config.height_stability_threshold

    def _check_images(self, tab: Any) -> Dict[str, Any]:
        """Run image check JavaScript and return results (all-zero counts on failure)."""
        empty = {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
        try:
            js = IMAGE_CHECK_JS % (
                self._config.min_image_size,
                'true' if self._config.scan_full_page else 'false'
            )
            result = tab.run_js(js, as_expr=True)
        except Exception as e:
            logger.warning(f"CompletenessChecker: Image check failed: {e}")
            return empty
        # Callers use .get(), so anything that is not a non-empty dict is
        # replaced by the neutral default.
        if isinstance(result, dict) and result:
            return result
        return empty

    def _check_height(self, tab: Any) -> Dict[str, Any]:
        """Run height check JavaScript and return results (height 0, not ready on failure)."""
        empty = {'height': 0, 'ready': False}
        try:
            result = tab.run_js(HEIGHT_CHECK_JS, as_expr=True)
        except Exception as e:
            logger.warning(f"CompletenessChecker: Height check failed: {e}")
            return empty
        if isinstance(result, dict) and result:
            return result
        return empty
326
+
327
+
328
+ def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
329
+ """
330
+ Scroll through page to trigger lazy-loaded images.
331
+
332
+ Implements Crawl4AI's scan_full_page behavior.
333
+ Strategy: Fast scroll with minimal delay (0.2s) per step to trigger network requests,
334
+ then wait at the bottom for all images to settle.
335
+
336
+ Args:
337
+ tab: DrissionPage tab object
338
+ config: Crawl configuration
339
+ """
340
+ config = config or CrawlConfig()
341
+ if not config.scan_full_page:
342
+ return
343
+
344
+ start = time.time()
345
+ current_pos = 0
346
+
347
+ logger.info(f"CompletenessChecker: Starting lazy load scroll (fast scroll + final wait)")
348
+
349
+ try:
350
+ max_scroll_steps = 100
351
+ step_count = 0
352
+
353
+ # 1. Fast Scroll Phase
354
+ while step_count < max_scroll_steps:
355
+ step_count += 1
356
+ current_pos += config.scroll_step
357
+ tab.run_js(f"window.scrollTo(0, {current_pos});")
358
+
359
+ # Simple fixed delay per step (0.2s) as requested
360
+ time.sleep(0.2)
361
+
362
+ # Check if reached bottom
363
+ height = tab.run_js("""
364
+ Math.max(
365
+ document.body.scrollHeight || 0,
366
+ document.documentElement.scrollHeight || 0
367
+ )
368
+ """, as_expr=True) or 0
369
+
370
+ if current_pos >= height:
371
+ logger.debug(f"CompletenessChecker: Reached bottom at position {current_pos}")
372
+ break
373
+
374
+ # 2. Wait Phase at Bottom (Wait for images to settle - reduced timeout)
375
+ logger.debug("CompletenessChecker: Reached bottom, waiting for images to settle (max 2s)...")
376
+ wait_start = time.time()
377
+ max_wait_at_bottom = 2.0 # Reduced from 8s to 2s - scroll usually triggers loading quickly
378
+
379
+ # Quick check: just verify images are not placeholders (simplified check)
380
+ check_all_images_js = """
381
+ (() => {
382
+ const imgs = Array.from(document.querySelectorAll('img'));
383
+ if (imgs.length === 0) return true;
384
+
385
+ // Quick check: count non-placeholder images that are loaded
386
+ let loaded_count = 0;
387
+ let total_count = 0;
388
+
389
+ for (const img of imgs) {
390
+ // Skip tiny images
391
+ if (img.clientWidth < 50 && img.clientHeight < 50) continue;
392
+
393
+ total_count++;
394
+ const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || '';
395
+ const src = img.src || '';
396
+
397
+ // Check if placeholder
398
+ const isPlaceholder = (
399
+ (dataSrc && img.src !== dataSrc) ||
400
+ (img.naturalWidth < 50 && img.clientWidth > 100) ||
401
+ src.includes('placeholder') || src.includes('loading')
402
+ );
403
+
404
+ // If not placeholder and loaded, count it
405
+ if (!isPlaceholder && img.complete && img.naturalWidth > 0) {
406
+ loaded_count++;
407
+ }
408
+ }
409
+
410
+ // If most images are loaded (80%+), consider it done
411
+ return total_count === 0 || (loaded_count / total_count) >= 0.8;
412
+ })()
413
+ """
414
+
415
+ # Quick check loop with shorter interval
416
+ check_count = 0
417
+ max_checks = 4 # 2s / 0.5s = 4 checks max
418
+ while check_count < max_checks:
419
+ try:
420
+ all_loaded = tab.run_js(check_all_images_js, as_expr=True)
421
+ if all_loaded:
422
+ elapsed_wait = time.time() - wait_start
423
+ logger.debug(f"CompletenessChecker: Images settled at bottom in {elapsed_wait:.1f}s")
424
+ break
425
+ except:
426
+ pass
427
+ time.sleep(0.5)
428
+ check_count += 1
429
+
430
+ # Scroll back to top
431
+ tab.run_js("window.scrollTo(0, 0);")
432
+
433
+ elapsed = time.time() - start
434
+ logger.info(f"CompletenessChecker: Lazy load scroll complete - {step_count} steps in {elapsed:.1f}s")
435
+
436
+ except Exception as e:
437
+ logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")
@@ -0,0 +1,88 @@
1
+ """
2
+ Crawling Data Models
3
+
4
+ Core data structures for the intelligent crawling system.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Optional, Set
9
+
10
+
@dataclass
class CrawlConfig:
    """
    Tunable parameters for the crawler, modelled on Crawl4AI's options.

    The emphasis is on deciding when a page is complete and on making sure
    its images have really loaded. Every field has a default, so
    ``CrawlConfig()`` is a usable configuration.
    """

    # --- Wait strategy (image loading has priority) ---
    wait_for_images: bool = True
    # One of: domcontentloaded | networkidle | load
    wait_until: str = "networkidle"
    delay_before_return: float = 0.1
    page_timeout: float = 30.0

    # --- Image loading ---
    # Upper bound, in seconds, on waiting for images
    image_load_timeout: float = 10.0
    # Number of consecutive stable checks required
    image_stability_checks: int = 3
    # Seconds between two checks
    image_check_interval: float = 0.2
    # Images smaller than this are ignored
    min_image_size: int = 50

    # --- Scrolling to trigger lazy loading ---
    scan_full_page: bool = True
    scroll_step: int = 800
    scroll_delay: float = 0.5
    scroll_timeout: float = 15.0

    # --- Height stability ---
    height_stability_checks: int = 3
    # Allowed variation, in pixels
    height_stability_threshold: int = 10

    # --- Reserved for future adaptive stop logic ---
    confidence_threshold: float = 0.75
    min_gain_threshold: float = 0.1
    max_pages: int = 20
+
45
+
@dataclass
class CompletenessResult:
    """Outcome of one completeness check: image counters, page height and readiness flags."""

    is_complete: bool
    total_images: int
    loaded_images: int
    failed_images: int
    placeholder_images: int
    height: int
    height_stable: bool
    network_idle: bool
    check_duration: float

    @property
    def image_load_ratio(self) -> float:
        """Share of counted images that are loaded; 1.0 when no image was counted."""
        counted = self.total_images
        return self.loaded_images / counted if counted != 0 else 1.0
64
+
65
+
@dataclass
class PageResult:
    """Everything collected while fetching one page."""

    url: str
    final_url: str
    title: str
    html: str
    # Extracted markdown
    content: str
    # base64 images
    images: List[str] = field(default_factory=list)
    screenshot: Optional[str] = None

    # Quality signals
    load_time: float = 0.0
    completeness: Optional[CompletenessResult] = None

    # Error handling
    error: Optional[str] = None

    @property
    def is_complete(self) -> bool:
        """Mirror ``completeness.is_complete``; False when no completeness result is attached."""
        report = self.completeness
        return False if report is None else report.is_complete
@@ -0,0 +1,104 @@
1
+ """
2
+ Centralized Definitions
3
+
4
+ All global prompts and tool definitions for the pipeline stages.
5
+ """
6
+
7
+ from typing import Dict, Any
8
+
# Used by SummaryStage - language appended at runtime.
# System prompt, written in Chinese, for the summarising agent. It tells the
# model to answer (or explain the keywords of) the user's question from the
# search-tool output, to rely on its own knowledge as little as possible, to
# call `refuse_answer` when appropriate, and to format the reply as a short
# `# ` title, a <summary> block of about 100 characters, second-level
# headings with markdown body text, and [1]-style source citations.
# The text is sent to the model verbatim, so edit it only on purpose.
SUMMARY_REPORT_SP = """# 你是一个总结助手 (Agent), 你的职责是基于搜索工具给出的信息,回答用户的问题或解释用户问题中的关键词。
## 核心原则
最小限度使用自身知识, 尽可能使用 web_tool 获取信息.

## 工具使用指南
- 适当时候调用 `refuse_answer`

## 回答格式
- `# ` 大标题约 8-10 个字
- <summary>...</summary> 约 100 字的概括
- 二级标题 + markdown 正文
- 正文使用 [1] 格式引用信息来源, 无需写出源, 系统自动渲染
"""
23
+
def get_refuse_answer_tool() -> Dict[str, Any]:
    """Build the function-tool definition the model calls to refuse inappropriate content.

    Returns:
        A fresh dict describing the ``refuse_answer`` function, whose single
        required string parameter ``reason`` carries the refusal reason.
    """
    # Schema of the only argument
    reason_schema = {"type": "string", "description": "拒绝回答的原因(展示给用户)"}

    parameters = {
        "type": "object",
        "properties": {
            "reason": reason_schema,
        },
        "required": ["reason"],
    }

    function_spec = {
        "name": "refuse_answer",
        "description": "违规内容拒绝回答,内容涉及隐喻政治事件、任务、现役国家领导人、r18+、r18g(但不包含正常galgame、科普等)",
        "parameters": parameters,
    }

    return {"type": "function", "function": function_spec}
40
+
41
+
def get_web_tool() -> Dict[str, Any]:
    """Build the function-tool definition for web search and URL screenshots.

    Returns:
        A fresh dict describing the ``web_tool`` function. Its single
        required string parameter ``query`` is a search phrase, a URL, or a
        search phrase with the filter prefix explained in the description.
    """
    # Usage guide shown to the model; sent verbatim.
    usage_text = """搜索网页或截图指定URL。用于获取最新信息、查找资料。
## 使用方式
网页搜索(大部分问题优先使用此方法):
直接传入搜索词如 "python async" 会返回搜索结果列表

网页截图(当用户明确要求截图时使用):
传入完整URL如 "https://example.com" 会直接截图该页面

网页搜索 + 网页截图(可以预测能直接搜到什么样的结果时使用): (最终截图最多3张)
- 域名过滤: "github=2: python async" → 会搜索 "python async github" 并截图 链接/标题包含 "github" 的前2个结果
- 序号选择: "1,2: minecraft mods" → 会搜索 "minecraft mods" 并截图第1、2个结果
- 多域名: "mcmod=1, github=1: forge mod" → 会搜索 "forge mod mcmod github" 并截图 链接/标题包含 "mcmod" 的前1个结果和 链接/标题包含 "github" 的前1个结果
"""

    query_schema = {
        "type": "string",
        "description": "搜索查询或URL。支持过滤器语法(见描述)"
    }

    parameters = {
        "type": "object",
        "properties": {
            "query": query_schema
        },
        "required": ["query"]
    }

    function_spec = {
        "name": "web_tool",
        "description": usage_text,
        "parameters": parameters
    }

    return {"type": "function", "function": function_spec}
73
+
74
+
# =============================================================================
# AGENT PROMPTS
# =============================================================================

# System prompt, written in Chinese, for the tool-using agent. It covers:
#   - the role: use `web_tool` to search the web or screenshot URLs while
#     carrying out the task the user assigned; with no explicit task, the
#     default is to explain the keywords in the user's question;
#   - the core principle: rely on own knowledge as little as possible;
#   - tool rules: call tools in parallel (up to 3 searches at once, at most 1
#     URL screenshot at once), keep search keywords simple and precise,
#     screenshot only official-looking sites, use at most 2 rounds of tool
#     calls, call `refuse_answer` when appropriate, and answer directly for
#     tasks such as paraphrasing, formatting or translation;
#   - the answer format: a short `# ` title, a <summary> block of about 100
#     characters, second-level headings with markdown body text, and
#     [1]-style source citations.
# The text is sent to the model verbatim, so edit it only on purpose.
AGENT_SYSTEM_PROMPT = """# 你是一个智能助手 (Agent), 你的职责是使用 `web_tool` 工具来帮助用户搜索网页或截图URL, 同时完成用户分配给你的任务.
## 任务
理解用户意图分配给你的任务.
如果用户没有明确分配任务, 则默认任务为解释用户问题中的关键词.

## 核心原则
最小限度使用自身知识, 尽可能使用 web_tool 获取信息.

## 工具使用指南
- 并行调用工具
- 网页搜索: 可以同时调用3次, 其中URL截图消耗较大, 最多同时调用1个
- 积极使用 web_tool 获取信息
- 搜索时, 关键词保证简单、指向准确、利于传统搜索引擎.
- 获取页面截图时, 只使用官方性较强的 wiki、官方网站、资源站等等, 不使用第三方转载新闻网站.
- 最多可调用2轮工具, 之后必须给出最终回答
- 适当时候调用 `refuse_answer`
- 对于具体任务, 如果是转述、格式化、翻译等, 请直接给出最终回答, 不再调用工具

## 回答格式
- `# ` 大标题约 8-10 个字
- <summary>...</summary> 约 100 字的概括
- 二级标题 + markdown 正文
- 正文使用 [1] 格式引用信息来源, 无需写出源, 系统自动渲染
"""
103
+
104
+