entari-plugin-hyw 0.3.5__py3-none-any.whl → 4.0.0rc14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic. Click here for more details.
- entari_plugin_hyw/Untitled-1 +1865 -0
- entari_plugin_hyw/__init__.py +979 -116
- entari_plugin_hyw/filters.py +83 -0
- entari_plugin_hyw/history.py +251 -0
- entari_plugin_hyw/misc.py +214 -0
- entari_plugin_hyw/search_cache.py +154 -0
- entari_plugin_hyw-4.0.0rc14.dist-info/METADATA +118 -0
- entari_plugin_hyw-4.0.0rc14.dist-info/RECORD +72 -0
- {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/WHEEL +1 -1
- {entari_plugin_hyw-0.3.5.dist-info → entari_plugin_hyw-4.0.0rc14.dist-info}/top_level.txt +1 -0
- hyw_core/__init__.py +94 -0
- hyw_core/agent.py +768 -0
- hyw_core/browser_control/__init__.py +63 -0
- hyw_core/browser_control/assets/card-dist/index.html +425 -0
- hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +9 -0
- hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/gemini.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/google.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +15 -0
- hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/openai.svg +1 -0
- hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +24 -0
- hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
- hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
- hyw_core/browser_control/assets/card-dist/vite.svg +1 -0
- hyw_core/browser_control/assets/index.html +5691 -0
- hyw_core/browser_control/assets/logos/anthropic.svg +1 -0
- hyw_core/browser_control/assets/logos/cerebras.svg +9 -0
- hyw_core/browser_control/assets/logos/deepseek.png +0 -0
- hyw_core/browser_control/assets/logos/gemini.svg +1 -0
- hyw_core/browser_control/assets/logos/google.svg +1 -0
- hyw_core/browser_control/assets/logos/grok.png +0 -0
- hyw_core/browser_control/assets/logos/huggingface.png +0 -0
- hyw_core/browser_control/assets/logos/microsoft.svg +15 -0
- hyw_core/browser_control/assets/logos/minimax.png +0 -0
- hyw_core/browser_control/assets/logos/mistral.png +0 -0
- hyw_core/browser_control/assets/logos/nvida.png +0 -0
- hyw_core/browser_control/assets/logos/openai.svg +1 -0
- hyw_core/browser_control/assets/logos/openrouter.png +0 -0
- hyw_core/browser_control/assets/logos/perplexity.svg +24 -0
- hyw_core/browser_control/assets/logos/qwen.png +0 -0
- hyw_core/browser_control/assets/logos/xai.png +0 -0
- hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
- hyw_core/browser_control/assets/logos/zai.png +0 -0
- hyw_core/browser_control/engines/__init__.py +15 -0
- hyw_core/browser_control/engines/base.py +13 -0
- hyw_core/browser_control/engines/default.py +166 -0
- hyw_core/browser_control/engines/duckduckgo.py +171 -0
- hyw_core/browser_control/landing.html +172 -0
- hyw_core/browser_control/manager.py +173 -0
- hyw_core/browser_control/renderer.py +446 -0
- hyw_core/browser_control/service.py +940 -0
- hyw_core/config.py +154 -0
- hyw_core/core.py +462 -0
- hyw_core/crawling/__init__.py +18 -0
- hyw_core/crawling/completeness.py +437 -0
- hyw_core/crawling/models.py +88 -0
- hyw_core/definitions.py +104 -0
- hyw_core/image_cache.py +274 -0
- hyw_core/pipeline.py +502 -0
- hyw_core/search.py +171 -0
- hyw_core/stages/__init__.py +21 -0
- hyw_core/stages/base.py +95 -0
- hyw_core/stages/summary.py +191 -0
- entari_plugin_hyw/agent.py +0 -419
- entari_plugin_hyw/compressor.py +0 -59
- entari_plugin_hyw/tools.py +0 -236
- entari_plugin_hyw/vision.py +0 -35
- entari_plugin_hyw-0.3.5.dist-info/METADATA +0 -112
- entari_plugin_hyw-0.3.5.dist-info/RECORD +0 -9
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Page Completeness Checker
|
|
3
|
+
|
|
4
|
+
Multi-signal page readiness detection with focus on image loading guarantees.
|
|
5
|
+
Implements Crawl4AI's wait_for_images and networkidle concepts.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Optional, Dict
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from loguru import logger
|
|
12
|
+
|
|
13
|
+
from .models import CrawlConfig, CompletenessResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# CSP-compliant JavaScript for image loading verification
# Based on Crawl4AI's wait_for_images logic
#
# The script is a %-format template with exactly two placeholders, filled in
# by CompletenessChecker._check_images:
#   %d - minimum rendered size; images whose clientWidth AND clientHeight are
#        both below it are skipped
#   %s - the JavaScript literal `true` or `false` (scan_full_page); when false,
#        images far outside the viewport are skipped
# Any literal percent sign added to the script must be written as a doubled
# percent sign, otherwise the %-formatting raises.
#
# The expression evaluates to an object with the counters `total`, `loaded`,
# `failed`, `placeholders` and a per-image `details` list.
#
# Placeholder rule 1 compares against the *resolved* data-src as well as the
# raw attribute: `img.src` is always an absolute URL, so a relative data-src
# would otherwise never match and the image would stay a "placeholder" forever.
#
# NOTE(review): several keyword heuristics (class contains 'lazy', src contains
# 'thumb') stay true after the real image has loaded on some sites - confirm
# whether that is intended.
IMAGE_CHECK_JS = """
(() => {
    const results = {
        total: 0,
        loaded: 0,
        failed: 0,
        placeholders: 0,
        details: []
    };

    const imgs = Array.from(document.querySelectorAll('img'));
    const viewportHeight = window.innerHeight;
    const minSize = %d; // Injected from Python

    // Resolve a possibly-relative URL the same way the browser resolves img.src
    const toAbsolute = (url) => {
        try {
            return new URL(url, document.baseURI).href;
        } catch (e) {
            return url;
        }
    };

    for (const img of imgs) {
        // Skip tiny images (icons, tracking pixels)
        if (img.clientWidth < minSize && img.clientHeight < minSize) {
            continue;
        }

        // Skip images outside viewport (unless scan_full_page)
        const rect = img.getBoundingClientRect();
        const inViewport = rect.top < viewportHeight * 2 && rect.bottom > -viewportHeight;

        if (!inViewport && !%s) { // scan_full_page injected
            continue;
        }

        results.total++;

        const src = img.src || img.getAttribute('data-src') || '';
        const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
                        img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
        const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
        const loadingAttr = img.getAttribute('loading') || '';

        // Enhanced placeholder detection for blurred preview images (like mcmod.cn)
        const isPlaceholder = (
            // 1. data-src exists but not yet loaded into src
            //    (img.src is absolute, so also compare the resolved data-src)
            (dataSrc && img.src !== dataSrc && img.src !== toAbsolute(dataSrc)) ||
            // 2. Natural size much smaller than display size (blurred placeholder)
            (img.naturalWidth < 50 && img.clientWidth > 100) ||
            (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
            // 3. Common placeholder keywords in src
            src.includes('placeholder') ||
            src.includes('loading') ||
            src.includes('blank') ||
            // 4. SVG placeholder or 1x1 tracking pixel
            src.startsWith('data:image/svg+xml') ||
            (img.naturalWidth === 1 && img.naturalHeight === 1) ||
            // 5. Lazy-loading class indicators (common patterns)
            className.includes('lazy') ||
            className.includes('lazyload') ||
            className.includes('lozad') ||
            className.includes('b-lazy') ||
            // 6. Blur indicators (common for LQIP - Low Quality Image Placeholder)
            className.includes('blur') ||
            src.includes('blur') ||
            src.includes('thumb') ||
            src.includes('thumbnail') ||
            // 7. loading="lazy" + not complete (browser native lazy loading)
            (loadingAttr === 'lazy' && !img.complete) ||
            // 8. CSS blur filter applied (visual blurring)
            (window.getComputedStyle(img).filter || '').includes('blur')
        );

        if (isPlaceholder) {
            results.placeholders++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'placeholder',
                natural: [img.naturalWidth, img.naturalHeight],
                display: [img.clientWidth, img.clientHeight]
            });
            continue;
        }

        // Check if fully loaded
        if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
            results.loaded++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'loaded',
                natural: [img.naturalWidth, img.naturalHeight]
            });
        } else {
            results.failed++;
            results.details.push({
                src: src.substring(0, 100),
                status: 'pending',
                complete: img.complete,
                natural: [img.naturalWidth, img.naturalHeight]
            });
        }
    }

    return results;
})()
"""
|
|
117
|
+
|
|
118
|
+
# JavaScript for height stability check
#
# Evaluates to an object with two keys:
#   height - the largest of the body / documentElement scrollHeight and
#            offsetHeight values (each falling back to 0)
#   ready  - true only when document.readyState is 'complete'
# CompletenessChecker._check_height runs this script and substitutes
# {'height': 0, 'ready': False} when it fails or returns nothing.
HEIGHT_CHECK_JS = """
(() => {
    return {
        height: Math.max(
            document.body.scrollHeight || 0,
            document.documentElement.scrollHeight || 0,
            document.body.offsetHeight || 0,
            document.documentElement.offsetHeight || 0
        ),
        ready: document.readyState === 'complete'
    };
})()
"""
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class CompletenessChecker:
    """
    Multi-signal page completeness verification.

    Signals checked:
    1. Image Loading (naturalWidth > 0, not placeholder)
    2. Height Stability (no layout shifts)
    3. DOM Ready State

    Usage:
        checker = CompletenessChecker(CrawlConfig())
        result = checker.check(tab)

        # Or wait until complete
        result = checker.wait_for_complete(tab)
    """

    def __init__(self, config: Optional[CrawlConfig] = None):
        """Store the crawl configuration, falling back to ``CrawlConfig()`` defaults."""
        self._config = config or CrawlConfig()

    def check(self, tab: Any) -> CompletenessResult:
        """
        Perform a single completeness check.

        Runs the image script and the height script once each. The page counts
        as complete when no image is pending or a placeholder and the document
        reports ``readyState === 'complete'``.

        Args:
            tab: DrissionPage tab object

        Returns:
            CompletenessResult with all signals. ``height_stable`` and
            ``network_idle`` are always reported as True here because one
            sample cannot measure either.
        """
        # Monotonic clock: durations must not jump when the wall clock is adjusted.
        start = time.monotonic()

        # Check images
        img_result = self._check_images(tab)

        # Check height
        height_result = self._check_height(tab)

        # Determine overall completeness
        # Complete if: all real images loaded AND the document is ready
        total_pending = img_result.get('failed', 0) + img_result.get('placeholders', 0)
        all_images_loaded = total_pending == 0 or img_result.get('total', 0) == 0

        return CompletenessResult(
            is_complete=bool(all_images_loaded and height_result.get('ready', False)),
            total_images=img_result.get('total', 0),
            loaded_images=img_result.get('loaded', 0),
            failed_images=img_result.get('failed', 0),
            placeholder_images=img_result.get('placeholders', 0),
            height=height_result.get('height', 0),
            height_stable=True,  # Single check can't determine stability
            network_idle=True,   # TODO: network monitoring
            check_duration=time.monotonic() - start
        )

    def wait_for_complete(
        self,
        tab: Any,
        timeout: Optional[float] = None
    ) -> CompletenessResult:
        """
        Wait for page to be fully loaded with image guarantees.

        Uses multi-signal approach:
        1. Poll image loading status
        2. Track height stability
        3. Respect timeout

        The page is reported complete once "no pending/placeholder images and
        stable height" has held for ``image_stability_checks`` consecutive polls.

        Args:
            tab: DrissionPage tab object
            timeout: Max wait time (default from config). A falsy value
                (None or 0) selects ``config.image_load_timeout``.

        Returns:
            CompletenessResult (``is_complete`` is False if the timeout expired,
            including the case where no poll could run at all).
        """
        cfg = self._config
        timeout = timeout or cfg.image_load_timeout
        start = time.monotonic()

        height_history = []
        stable_count = 0
        last_result = None

        logger.debug(f"CompletenessChecker: Starting wait (timeout={timeout}s)")

        while time.monotonic() - start < timeout:
            result = self.check(tab)
            last_result = result

            # Track height stability over a sliding window of recent samples
            height_history.append(result.height)
            if len(height_history) > cfg.height_stability_checks:
                height_history.pop(0)

            # Check if height is stable
            height_stable = self._is_height_stable(height_history)

            # Log progress
            elapsed = time.monotonic() - start
            logger.debug(
                f"CompletenessChecker: [{elapsed:.1f}s] "
                f"images={result.loaded_images}/{result.total_images} "
                f"pending={result.failed_images} "
                f"placeholders={result.placeholder_images} "
                f"height={result.height} stable={height_stable}"
            )

            # Check if all images loaded AND height stable
            all_loaded = (
                result.failed_images == 0 and
                result.placeholder_images == 0
            )

            if all_loaded and height_stable:
                stable_count += 1
                if stable_count >= cfg.image_stability_checks:
                    logger.info(
                        f"CompletenessChecker: Page complete! "
                        f"{result.loaded_images} images loaded, "
                        f"height={result.height}, "
                        f"took {elapsed:.2f}s"
                    )
                    return CompletenessResult(
                        is_complete=True,
                        total_images=result.total_images,
                        loaded_images=result.loaded_images,
                        failed_images=0,
                        placeholder_images=0,
                        height=result.height,
                        height_stable=True,
                        network_idle=True,
                        check_duration=elapsed
                    )
            else:
                # Any regression restarts the consecutive-stable counter
                stable_count = 0

            # Wait before next check
            time.sleep(cfg.image_check_interval)

        # Timeout reached
        elapsed = time.monotonic() - start

        if last_result is None:
            # The loop never ran (non-positive effective timeout): there is no
            # sample to report, so do not touch last_result's attributes.
            logger.warning(
                f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
                f"no completeness check was performed"
            )
            return CompletenessResult(
                is_complete=False,
                total_images=0, loaded_images=0, failed_images=0, placeholder_images=0,
                height=0, height_stable=False, network_idle=False,
                check_duration=elapsed
            )

        logger.warning(
            f"CompletenessChecker: Timeout after {elapsed:.1f}s! "
            f"images={last_result.loaded_images}/{last_result.total_images} "
            f"pending={last_result.failed_images}"
        )

        # Return last result with is_complete=False
        return CompletenessResult(
            is_complete=False,
            total_images=last_result.total_images,
            loaded_images=last_result.loaded_images,
            failed_images=last_result.failed_images,
            placeholder_images=last_result.placeholder_images,
            height=last_result.height,
            # Same rule as inside the loop, so both paths agree on "stable"
            height_stable=self._is_height_stable(height_history),
            network_idle=True,
            check_duration=elapsed
        )

    def _is_height_stable(self, history: list) -> bool:
        """Return True when a full window of samples varies by at most the configured threshold."""
        if not history or len(history) < self._config.height_stability_checks:
            return False
        return (max(history) - min(history)) <= self._config.height_stability_threshold

    def _check_images(self, tab: Any) -> Dict[str, Any]:
        """Run image check JavaScript and return results (all-zero counters on failure)."""
        empty = {'total': 0, 'loaded': 0, 'failed': 0, 'placeholders': 0}
        try:
            js = IMAGE_CHECK_JS % (
                self._config.min_image_size,
                'true' if self._config.scan_full_page else 'false'
            )
            result = tab.run_js(js, as_expr=True)
        except Exception as e:
            logger.warning(f"CompletenessChecker: Image check failed: {e}")
            return empty
        # Anything that is not a non-empty dict cannot be queried with .get()
        return result if isinstance(result, dict) and result else empty

    def _check_height(self, tab: Any) -> Dict[str, Any]:
        """Run height check JavaScript and return results (height 0 / not ready on failure)."""
        empty = {'height': 0, 'ready': False}
        try:
            result = tab.run_js(HEIGHT_CHECK_JS, as_expr=True)
        except Exception as e:
            logger.warning(f"CompletenessChecker: Height check failed: {e}")
            return empty
        return result if isinstance(result, dict) and result else empty
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
    """
    Scroll through page to trigger lazy-loaded images.

    Implements Crawl4AI's scan_full_page behavior.
    Strategy: Fast scroll with minimal delay (0.2s) per step to trigger network requests,
    then wait at the bottom for all images to settle.

    Only ``config.scan_full_page`` and ``config.scroll_step`` are read here; the
    per-step delay, the step cap (100) and the wait at the bottom (2s) are fixed.
    Failures are logged as warnings and never raised to the caller.

    Args:
        tab: DrissionPage tab object
        config: Crawl configuration (defaults to ``CrawlConfig()``)
    """
    config = config or CrawlConfig()
    if not config.scan_full_page:
        return

    # Monotonic clock: only durations are measured here.
    start = time.monotonic()
    current_pos = 0

    logger.info("CompletenessChecker: Starting lazy load scroll (fast scroll + final wait)")

    try:
        max_scroll_steps = 100  # hard cap so endlessly growing pages terminate
        step_count = 0

        # 1. Fast Scroll Phase
        while step_count < max_scroll_steps:
            step_count += 1
            current_pos += config.scroll_step
            tab.run_js(f"window.scrollTo(0, {current_pos});")

            # Simple fixed delay per step (0.2s) as requested
            time.sleep(0.2)

            # Check if reached bottom
            height = tab.run_js("""
                Math.max(
                    document.body.scrollHeight || 0,
                    document.documentElement.scrollHeight || 0
                )
            """, as_expr=True) or 0

            if current_pos >= height:
                logger.debug(f"CompletenessChecker: Reached bottom at position {current_pos}")
                break

        # 2. Wait Phase at Bottom (Wait for images to settle - reduced timeout)
        logger.debug("CompletenessChecker: Reached bottom, waiting for images to settle (max 2s)...")
        wait_start = time.monotonic()
        max_wait_at_bottom = 2.0  # Reduced from 8s to 2s - scroll usually triggers loading quickly
        check_interval = 0.5
        max_checks = int(max_wait_at_bottom / check_interval)  # 2s / 0.5s = 4 checks max

        # Quick check: just verify images are not placeholders (simplified check)
        check_all_images_js = """
        (() => {
            const imgs = Array.from(document.querySelectorAll('img'));
            if (imgs.length === 0) return true;

            // Quick check: count non-placeholder images that are loaded
            let loaded_count = 0;
            let total_count = 0;

            for (const img of imgs) {
                // Skip tiny images
                if (img.clientWidth < 50 && img.clientHeight < 50) continue;

                total_count++;
                const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || '';
                const src = img.src || '';

                // Check if placeholder
                const isPlaceholder = (
                    (dataSrc && img.src !== dataSrc) ||
                    (img.naturalWidth < 50 && img.clientWidth > 100) ||
                    src.includes('placeholder') || src.includes('loading')
                );

                // If not placeholder and loaded, count it
                if (!isPlaceholder && img.complete && img.naturalWidth > 0) {
                    loaded_count++;
                }
            }

            // If most images are loaded (80%+), consider it done
            return total_count === 0 || (loaded_count / total_count) >= 0.8;
        })()
        """

        # Quick check loop with shorter interval
        for _ in range(max_checks):
            try:
                all_loaded = tab.run_js(check_all_images_js, as_expr=True)
            except Exception as e:
                # Best effort: a failed probe counts as "not settled yet", but
                # is logged rather than silently swallowed.
                logger.debug(f"CompletenessChecker: Image settle check failed: {e}")
                all_loaded = False

            if all_loaded:
                elapsed_wait = time.monotonic() - wait_start
                logger.debug(f"CompletenessChecker: Images settled at bottom in {elapsed_wait:.1f}s")
                break
            time.sleep(check_interval)

        # Scroll back to top
        tab.run_js("window.scrollTo(0, 0);")

        elapsed = time.monotonic() - start
        logger.info(f"CompletenessChecker: Lazy load scroll complete - {step_count} steps in {elapsed:.1f}s")

    except Exception as e:
        logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Crawling Data Models
|
|
3
|
+
|
|
4
|
+
Core data structures for the intelligent crawling system.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import List, Optional, Set
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class CrawlConfig:
    """
    Crawling configuration with Crawl4AI-style parameters.

    Focuses on page completeness and image loading guarantees.

    Every field has a default, so ``CrawlConfig()`` is a usable configuration.
    The field order below is also the positional order of the generated
    ``__init__``, so it must not be rearranged.
    """
    # Wait Strategy (Priority: Image Loading)
    # NOTE(review): these four are not read by the completeness checker;
    # timeouts/delays are presumably in seconds - confirm against the caller.
    wait_for_images: bool = True
    wait_until: str = "networkidle"  # domcontentloaded | networkidle | load
    delay_before_return: float = 0.1
    page_timeout: float = 30.0

    # Image Loading Specific
    image_load_timeout: float = 10.0    # Max wait for images (seconds; default timeout of wait_for_complete)
    image_stability_checks: int = 3     # Consecutive stable checks needed
    image_check_interval: float = 0.2   # Interval between checks (seconds, passed to time.sleep)
    min_image_size: int = 50            # Ignore images smaller than this (compared with clientWidth/clientHeight)

    # Scroll for lazy loading
    scan_full_page: bool = True   # When False, trigger_lazy_load returns immediately
    scroll_step: int = 800        # Added to the window.scrollTo offset on every step
    scroll_delay: float = 0.5     # NOTE(review): trigger_lazy_load sleeps a fixed 0.2s and does not read this
    scroll_timeout: float = 15.0  # NOTE(review): not read by trigger_lazy_load - confirm where it is used

    # Height Stability
    height_stability_checks: int = 3       # Number of recent height samples compared
    height_stability_threshold: int = 10   # pixels (max allowed spread between those samples)

    # Future: Adaptive Stop Logic
    confidence_threshold: float = 0.75
    min_gain_threshold: float = 0.1
    max_pages: int = 20
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class CompletenessResult:
    """Snapshot of the signals gathered by one completeness check."""
    is_complete: bool
    total_images: int
    loaded_images: int
    failed_images: int
    placeholder_images: int
    height: int
    height_stable: bool
    network_idle: bool
    check_duration: float

    @property
    def image_load_ratio(self) -> float:
        """Fraction of counted images that are loaded; 1.0 when no image was counted."""
        counted = self.total_images
        return self.loaded_images / counted if counted != 0 else 1.0
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class PageResult:
    """Everything collected while fetching one page."""
    url: str
    final_url: str
    title: str
    html: str
    content: str  # Extracted markdown
    images: List[str] = field(default_factory=list)  # base64 images
    screenshot: Optional[str] = None

    # Quality Signals
    load_time: float = 0.0
    completeness: Optional[CompletenessResult] = None

    # Error handling
    error: Optional[str] = None

    @property
    def is_complete(self) -> bool:
        """Completeness verdict of the attached check; False when none is attached."""
        report = self.completeness
        return False if report is None else report.is_complete
|
hyw_core/definitions.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized Definitions
|
|
3
|
+
|
|
4
|
+
All global prompts and tool definitions for the pipeline stages.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Dict, Any
|
|
8
|
+
|
|
9
|
+
# Used by SummaryStage - language appended at runtime
#
# System prompt (Chinese) for the summarising agent. In short it tells the model to:
#   - answer from the information returned by the search tool, relying on its
#     own knowledge as little as possible;
#   - call `refuse_answer` when appropriate;
#   - format the reply as an H1 title of roughly 8-10 characters, a
#     <summary>...</summary> of about 100 characters, H2 sections with markdown
#     body text, and [1]-style source citations (rendered by the system).
# The text is sent to the model verbatim, so edits here change model behaviour.
SUMMARY_REPORT_SP = """# 你是一个总结助手 (Agent), 你的职责是基于搜索工具给出的信息,回答用户的问题或解释用户问题中的关键词。
## 核心原则
最小限度使用自身知识, 尽可能使用 web_tool 获取信息.

## 工具使用指南
- 适当时候调用 `refuse_answer`

## 回答格式
- `# ` 大标题约 8-10 个字
- <summary>...</summary> 约 100 字的概括
- 二级标题 + markdown 正文
- 正文使用 [1] 格式引用信息来源, 无需写出源, 系统自动渲染
"""
|
|
23
|
+
|
|
24
|
+
def get_refuse_answer_tool() -> Dict[str, Any]:
    """Build the function-calling schema of the `refuse_answer` tool.

    The tool takes a single required string argument, `reason`. A fresh
    dictionary is built on every call.
    """
    reason_schema = {"type": "string", "description": "拒绝回答的原因(展示给用户)"}
    parameters = {
        "type": "object",
        "properties": {
            "reason": reason_schema,
        },
        "required": ["reason"],
    }
    function_spec = {
        "name": "refuse_answer",
        "description": "违规内容拒绝回答,内容涉及隐喻政治事件、任务、现役国家领导人、r18+、r18g(但不包含正常galgame、科普等)",
        "parameters": parameters,
    }
    return {"type": "function", "function": function_spec}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_web_tool() -> Dict[str, Any]:
    """Build the function-calling schema of the `web_tool` tool.

    The tool takes a single required string argument, `query`, which may be a
    plain search phrase, a full URL, or a filter expression as explained in
    the tool description. A fresh dictionary is built on every call.
    """
    description = """搜索网页或截图指定URL。用于获取最新信息、查找资料。
## 使用方式
网页搜索(大部分问题优先使用此方法):
直接传入搜索词如 "python async" 会返回搜索结果列表

网页截图(当用户明确要求截图时使用):
传入完整URL如 "https://example.com" 会直接截图该页面

网页搜索 + 网页截图(可以预测能直接搜到什么样的结果时使用): (最终截图最多3张)
- 域名过滤: "github=2: python async" → 会搜索 "python async github" 并截图 链接/标题包含 "github" 的前2个结果
- 序号选择: "1,2: minecraft mods" → 会搜索 "minecraft mods" 并截图第1、2个结果
- 多域名: "mcmod=1, github=1: forge mod" → 会搜索 "forge mod mcmod github" 并截图 链接/标题包含 "mcmod" 的前1个结果和 链接/标题包含 "github" 的前1个结果
"""
    query_schema = {
        "type": "string",
        "description": "搜索查询或URL。支持过滤器语法(见描述)"
    }
    parameters = {
        "type": "object",
        "properties": {"query": query_schema},
        "required": ["query"]
    }
    function_spec = {
        "name": "web_tool",
        "description": description,
        "parameters": parameters
    }
    return {"type": "function", "function": function_spec}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# =============================================================================
|
|
76
|
+
# AGENT PROMPTS
|
|
77
|
+
# =============================================================================
|
|
78
|
+
|
|
79
|
+
AGENT_SYSTEM_PROMPT = """# 你是一个智能助手 (Agent), 你的职责是使用 `web_tool` 工具来帮助用户搜索网页或截图URL, 同时完成用户分配给你的任务.
|
|
80
|
+
## 任务
|
|
81
|
+
理解用户意图分配给你的任务.
|
|
82
|
+
如果用户没有明确分配任务, 则默认任务为解释用户问题中的关键词.
|
|
83
|
+
|
|
84
|
+
## 核心原则
|
|
85
|
+
最小限度使用自身知识, 尽可能使用 web_tool 获取信息.
|
|
86
|
+
|
|
87
|
+
## 工具使用指南
|
|
88
|
+
- 并行调用工具
|
|
89
|
+
- 网页搜索: 可以同时调用3次, 其中URL截图消耗较大, 最多同时调用1个
|
|
90
|
+
- 积极使用 web_tool 获取信息
|
|
91
|
+
- 搜索时, 关键词保证简单、指向准确、利于传统搜索引擎.
|
|
92
|
+
- 获取页面截图时, 只使用官方性较强的 wiki、官方网站、资源站等等, 不使用第三方转载新闻网站.
|
|
93
|
+
- 最多可调用2轮工具, 之后必须给出最终回答
|
|
94
|
+
- 适当时候调用 `refuse_answer`
|
|
95
|
+
- 对于具体任务, 如果是转述、格式化、翻译等, 请直接给出最终回答, 不再调用工具
|
|
96
|
+
|
|
97
|
+
## 回答格式
|
|
98
|
+
- `# ` 大标题约 8-10 个字
|
|
99
|
+
- <summary>...</summary> 约 100 字的概括
|
|
100
|
+
- 二级标题 + markdown 正文
|
|
101
|
+
- 正文使用 [1] 格式引用信息来源, 无需写出源, 系统自动渲染
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
|