entari-plugin-hyw 4.0.0rc9__py3-none-any.whl → 4.0.0rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of entari-plugin-hyw might be problematic. Click here for more details.
- entari_plugin_hyw/__init__.py +76 -39
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/METADATA +1 -1
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/RECORD +16 -14
- hyw_core/browser_control/__init__.py +1 -3
- hyw_core/browser_control/assets/card-dist/index.html +48 -32
- hyw_core/browser_control/engines/__init__.py +0 -2
- hyw_core/browser_control/manager.py +18 -5
- hyw_core/browser_control/renderer.py +36 -6
- hyw_core/browser_control/service.py +245 -142
- hyw_core/core.py +0 -7
- hyw_core/crawling/__init__.py +18 -0
- hyw_core/crawling/completeness.py +437 -0
- hyw_core/crawling/models.py +88 -0
- hyw_core/search.py +4 -6
- hyw_core/browser_control/engines/google.py +0 -155
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/WHEEL +0 -0
- {entari_plugin_hyw-4.0.0rc9.dist-info → entari_plugin_hyw-4.0.0rc11.dist-info}/top_level.txt +0 -0
|
hyw_core/browser_control/engines/__init__.py
CHANGED
|
@@ -5,13 +5,11 @@ Provides search engine adapters for different search providers.
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from .base import SearchEngine
|
|
8
|
-
from .google import GoogleEngine
|
|
9
8
|
from .duckduckgo import DuckDuckGoEngine
|
|
10
9
|
from .default import DefaultEngine
|
|
11
10
|
|
|
12
11
|
__all__ = [
|
|
13
12
|
"SearchEngine",
|
|
14
|
-
"GoogleEngine",
|
|
15
13
|
"DuckDuckGoEngine",
|
|
16
14
|
"DefaultEngine",
|
|
17
15
|
]
|
|
hyw_core/browser_control/manager.py
CHANGED
|
@@ -124,15 +124,28 @@ class SharedBrowserManager:
|
|
|
124
124
|
@staticmethod
|
|
125
125
|
def hide_scrollbars(page: ChromiumPage):
|
|
126
126
|
"""
|
|
127
|
-
Robustly hide scrollbars using CDP commands.
|
|
128
|
-
This
|
|
127
|
+
Robustly hide scrollbars using CDP commands AND CSS injection.
|
|
128
|
+
This provides double protection against scrollbar gutters.
|
|
129
129
|
"""
|
|
130
130
|
try:
|
|
131
|
-
#
|
|
131
|
+
# 1. CDP Command
|
|
132
132
|
page.run_cdp('Emulation.setScrollbarsHidden', hidden=True)
|
|
133
|
-
|
|
133
|
+
|
|
134
|
+
# 2. CSS Injection (Standard + Webkit)
|
|
135
|
+
css = """
|
|
136
|
+
::-webkit-scrollbar { display: none !important; width: 0 !important; height: 0 !important; }
|
|
137
|
+
* { -ms-overflow-style: none !important; scrollbar-width: none !important; }
|
|
138
|
+
"""
|
|
139
|
+
# Inject into current page
|
|
140
|
+
page.run_js(f"""
|
|
141
|
+
const style = document.createElement('style');
|
|
142
|
+
style.textContent = `{css}`;
|
|
143
|
+
document.head.appendChild(style);
|
|
144
|
+
""")
|
|
145
|
+
|
|
146
|
+
logger.debug("SharedBrowserManager: Scrollbars hidden via CDP + CSS.")
|
|
134
147
|
except Exception as e:
|
|
135
|
-
logger.warning(f"SharedBrowserManager: Failed to hide scrollbars
|
|
148
|
+
logger.warning(f"SharedBrowserManager: Failed to hide scrollbars: {e}")
|
|
136
149
|
|
|
137
150
|
|
|
138
151
|
# Module-level singleton accessor
|
|
hyw_core/browser_control/renderer.py
CHANGED
|
@@ -55,16 +55,43 @@ class ContentRenderer:
|
|
|
55
55
|
loop = asyncio.get_running_loop()
|
|
56
56
|
return await loop.run_in_executor(self._executor, self._prepare_tab_sync)
|
|
57
57
|
|
|
58
|
-
def _wait_for_render_finished(self, tab, timeout: float =
|
|
58
|
+
def _wait_for_render_finished(self, tab, timeout: float = 12.0, context: str = ""):
|
|
59
59
|
"""Wait for window.RENDER_FINISHED to be true in the tab."""
|
|
60
60
|
import time as pytime
|
|
61
61
|
start = pytime.time()
|
|
62
|
+
|
|
63
|
+
# Check initial state
|
|
64
|
+
initial_state = tab.run_js("return window.RENDER_FINISHED")
|
|
65
|
+
logger.debug(f"ContentRenderer[{context}]: Starting wait, initial RENDER_FINISHED={initial_state}")
|
|
66
|
+
|
|
67
|
+
# If already true, it's stale from previous render - need to wait for JS to reset it
|
|
68
|
+
if initial_state:
|
|
69
|
+
logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED was true, waiting for reset...")
|
|
70
|
+
# Wait for JS to reset it to false (updateRenderData sets it to false)
|
|
71
|
+
reset_start = pytime.time()
|
|
72
|
+
while pytime.time() - reset_start < 1.0: # 1s max to wait for reset
|
|
73
|
+
is_reset = tab.run_js("return window.RENDER_FINISHED")
|
|
74
|
+
if not is_reset:
|
|
75
|
+
logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED reset to false")
|
|
76
|
+
break
|
|
77
|
+
pytime.sleep(0.05)
|
|
78
|
+
else:
|
|
79
|
+
logger.warning(f"ContentRenderer[{context}]: RENDER_FINISHED not reset, force resetting via JS")
|
|
80
|
+
tab.run_js("window.RENDER_FINISHED = false")
|
|
81
|
+
|
|
82
|
+
# Now wait for it to become true
|
|
83
|
+
poll_count = 0
|
|
62
84
|
while pytime.time() - start < timeout:
|
|
63
85
|
is_finished = tab.run_js("return window.RENDER_FINISHED")
|
|
86
|
+
poll_count += 1
|
|
64
87
|
if is_finished:
|
|
88
|
+
elapsed = pytime.time() - start
|
|
89
|
+
logger.debug(f"ContentRenderer[{context}]: RENDER_FINISHED=true after {elapsed:.2f}s ({poll_count} polls)")
|
|
65
90
|
return True
|
|
66
|
-
pytime.sleep(0.
|
|
67
|
-
|
|
91
|
+
pytime.sleep(0.1) # Poll every 100ms
|
|
92
|
+
|
|
93
|
+
elapsed = pytime.time() - start
|
|
94
|
+
logger.warning(f"ContentRenderer[{context}]: Wait for RENDER_FINISHED timed out after {elapsed:.2f}s ({poll_count} polls)")
|
|
68
95
|
return False
|
|
69
96
|
|
|
70
97
|
def _prepare_tab_sync(self) -> str:
|
|
@@ -89,8 +116,9 @@ class ContentRenderer:
|
|
|
89
116
|
"theme_color": "#ef4444",
|
|
90
117
|
}
|
|
91
118
|
|
|
119
|
+
logger.debug(f"ContentRenderer: Calling warmup updateRenderData for tab {tab_id}")
|
|
92
120
|
tab.run_js(f"window.updateRenderData({json.dumps(warmup_data)})")
|
|
93
|
-
self._wait_for_render_finished(tab, timeout=
|
|
121
|
+
self._wait_for_render_finished(tab, timeout=12.0, context=f"warmup:{tab_id}")
|
|
94
122
|
|
|
95
123
|
# Wait for main-container after warmup (Vue needs to render it)
|
|
96
124
|
tab.ele('#main-container', timeout=3)
|
|
@@ -203,7 +231,7 @@ class ContentRenderer:
|
|
|
203
231
|
|
|
204
232
|
# 1. Update Data & Wait for Finished flag
|
|
205
233
|
tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
|
|
206
|
-
self._wait_for_render_finished(tab)
|
|
234
|
+
self._wait_for_render_finished(tab, context=f"batch:{tab_id}")
|
|
207
235
|
|
|
208
236
|
# 2. Dynamic Resize
|
|
209
237
|
# Get actual content height to prevent clipping
|
|
@@ -330,10 +358,12 @@ class ContentRenderer:
|
|
|
330
358
|
"theme_color": theme_color,
|
|
331
359
|
}
|
|
332
360
|
|
|
361
|
+
actual_tab_id = getattr(tab, 'tab_id', 'unknown')
|
|
362
|
+
logger.info(f"ContentRenderer: Calling updateRenderData for tab {actual_tab_id}, markdown length={len(markdown_content)}")
|
|
333
363
|
tab.run_js(f"window.updateRenderData({json.dumps(render_data)})")
|
|
334
364
|
|
|
335
365
|
# Wait for event-driven finish
|
|
336
|
-
self._wait_for_render_finished(tab, timeout=
|
|
366
|
+
self._wait_for_render_finished(tab, timeout=12.0, context=f"render:{actual_tab_id}")
|
|
337
367
|
|
|
338
368
|
# Dynamic Resize
|
|
339
369
|
scroll_height = tab.run_js('return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);')
|
|
hyw_core/browser_control/service.py
CHANGED
|
@@ -13,6 +13,10 @@ from typing import Optional, Dict, Any, List
|
|
|
13
13
|
from loguru import logger
|
|
14
14
|
import trafilatura
|
|
15
15
|
|
|
16
|
+
# Import intelligent completeness checker
|
|
17
|
+
from ..crawling.completeness import CompletenessChecker, trigger_lazy_load
|
|
18
|
+
from ..crawling.models import CrawlConfig
|
|
19
|
+
|
|
16
20
|
class ScreenshotService:
|
|
17
21
|
"""
|
|
18
22
|
Browser Service using DrissionPage.
|
|
@@ -281,14 +285,27 @@ class ScreenshotService:
|
|
|
281
285
|
// 2. Check loading status using decode() AND heuristic for placeholders
|
|
282
286
|
// Some sites load a tiny blurred placeholder first.
|
|
283
287
|
const checks = visibleImgs.map(img => {
|
|
284
|
-
//
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
+
// Enhanced placeholder detection (matching completeness.py)
|
|
289
|
+
const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
|
|
290
|
+
img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
|
|
291
|
+
const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
|
|
292
|
+
const loadingAttr = img.getAttribute('loading') || '';
|
|
293
|
+
const src = img.src || '';
|
|
288
294
|
|
|
289
295
|
const isPlaceholder = (
|
|
290
|
-
|
|
291
|
-
(
|
|
296
|
+
// data-src not yet loaded
|
|
297
|
+
(dataSrc && img.src !== dataSrc) ||
|
|
298
|
+
// Natural size much smaller than display (blurred placeholder)
|
|
299
|
+
(img.naturalWidth < 50 && img.clientWidth > 100) ||
|
|
300
|
+
(img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
|
|
301
|
+
// Lazy-loading class indicators
|
|
302
|
+
className.includes('lazy') ||
|
|
303
|
+
className.includes('lazyload') ||
|
|
304
|
+
className.includes('lozad') ||
|
|
305
|
+
// CSS blur filter applied
|
|
306
|
+
(window.getComputedStyle(img).filter || '').includes('blur') ||
|
|
307
|
+
// loading="lazy" + not complete
|
|
308
|
+
(loadingAttr === 'lazy' && !img.complete)
|
|
292
309
|
);
|
|
293
310
|
|
|
294
311
|
if (isPlaceholder) {
|
|
@@ -537,81 +554,71 @@ class ScreenshotService:
|
|
|
537
554
|
"""Synchronous screenshot."""
|
|
538
555
|
if not url: return None
|
|
539
556
|
tab = None
|
|
557
|
+
capture_width = 1024 # Standard capture width
|
|
558
|
+
|
|
540
559
|
try:
|
|
541
560
|
self._ensure_ready()
|
|
542
561
|
page = self._manager.page
|
|
543
562
|
if not page: return None
|
|
544
563
|
|
|
545
|
-
tab
|
|
564
|
+
# Create blank tab first
|
|
565
|
+
tab = page.new_tab()
|
|
546
566
|
|
|
547
|
-
#
|
|
567
|
+
# Set viewport BEFORE navigation so page renders at target width from the start
|
|
568
|
+
# This eliminates the need for post-load resize and reflow
|
|
548
569
|
try:
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
570
|
+
tab.run_cdp('Emulation.setDeviceMetricsOverride',
|
|
571
|
+
width=capture_width, height=900, deviceScaleFactor=1, mobile=False)
|
|
572
|
+
except:
|
|
573
|
+
pass
|
|
574
|
+
|
|
575
|
+
# Now navigate to the URL - page will render at target width
|
|
576
|
+
tab.get(url)
|
|
577
|
+
|
|
578
|
+
# Network monitoring removed - not needed for simplified wait logic
|
|
579
|
+
|
|
580
|
+
# Initialize crawl config for completeness checking (defined outside try for scope)
|
|
581
|
+
crawl_config = CrawlConfig(
|
|
582
|
+
scan_full_page=True,
|
|
583
|
+
scroll_step=800,
|
|
584
|
+
scroll_delay=0.5,
|
|
585
|
+
scroll_timeout=20.0, # Increased for lazy-loading pages
|
|
586
|
+
image_load_timeout=3.0, # Image loading timeout: 3 seconds
|
|
587
|
+
image_stability_checks=3,
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
# === Start scrolling immediately to trigger lazy loading ===
|
|
591
|
+
# Scroll first, then wait for DOM - this allows lazy loading to start in parallel
|
|
592
|
+
logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
|
|
593
|
+
trigger_lazy_load(tab, crawl_config)
|
|
594
|
+
|
|
595
|
+
# Now wait for DOM to be ready (after scroll has triggered lazy loading)
|
|
555
596
|
try:
|
|
556
597
|
# Wait for full page load (including JS execution)
|
|
557
|
-
|
|
598
|
+
try:
|
|
599
|
+
tab.wait.doc_loaded(timeout=timeout)
|
|
600
|
+
except:
|
|
601
|
+
pass
|
|
558
602
|
|
|
559
603
|
# Wait for actual content to appear (for CDN verification pages)
|
|
560
|
-
#
|
|
561
|
-
# 1. Network Idle: Wait for silence in XHR/POST
|
|
562
|
-
# 2. Stability: Wait for Height/Text/DOM stability
|
|
604
|
+
# Simplified wait logic for screenshot: just check basic readiness
|
|
563
605
|
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
last_h = 0
|
|
567
|
-
last_text_len = 0
|
|
568
|
-
last_html_len = 0
|
|
569
|
-
stable_count = 0
|
|
570
|
-
|
|
571
|
-
for i in range(200): # Max 200 iterations (~20s)
|
|
606
|
+
for i in range(20): # Max 20 iterations (~1s) - much faster
|
|
572
607
|
try:
|
|
573
|
-
# 1. Check Network Activity
|
|
574
|
-
has_recent_network = False
|
|
575
|
-
try:
|
|
576
|
-
# Iterate over any captured packets since last check
|
|
577
|
-
for packet in tab.listen.steps(timeout=0.01):
|
|
578
|
-
# Check if it's a significant request (POST or XHR/Fetch)
|
|
579
|
-
method = packet.method.upper()
|
|
580
|
-
r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
|
|
581
|
-
|
|
582
|
-
# Interested in: POST requests OR any XHR/Fetch response
|
|
583
|
-
if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
|
|
584
|
-
# Ignore some common noise? (Optional: analytics, tracking)
|
|
585
|
-
# For now, simplistic approach: any API traffic resets stability
|
|
586
|
-
has_recent_network = True
|
|
587
|
-
# logger.debug(f"Network Activity: {method} {packet.url[:50]}")
|
|
588
|
-
break
|
|
589
|
-
except:
|
|
590
|
-
pass
|
|
591
|
-
|
|
592
|
-
# 2. Check DOM State
|
|
593
608
|
state = tab.run_js('''
|
|
594
609
|
return {
|
|
595
610
|
ready: document.readyState === 'complete',
|
|
596
611
|
title: document.title,
|
|
597
|
-
|
|
598
|
-
document.body.scrollHeight || 0,
|
|
599
|
-
document.documentElement.scrollHeight || 0
|
|
600
|
-
),
|
|
601
|
-
text: document.body.innerText.substring(0, 1000) || "",
|
|
602
|
-
html_len: document.body.innerHTML.length || 0
|
|
612
|
+
text: document.body.innerText.substring(0, 500) || ""
|
|
603
613
|
};
|
|
604
|
-
''') or {'ready': False, 'title': "", '
|
|
614
|
+
''') or {'ready': False, 'title': "", 'text': ""}
|
|
605
615
|
|
|
606
616
|
is_ready = state.get('ready', False)
|
|
607
617
|
title = state.get('title', "").lower()
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
text_len = len(text_content)
|
|
611
|
-
html_len = int(state.get('html_len', 0))
|
|
612
|
-
text_lower = text_content.lower()
|
|
618
|
+
text_lower = state.get('text', "").lower()
|
|
619
|
+
text_len = len(text_lower)
|
|
613
620
|
|
|
614
|
-
#
|
|
621
|
+
# Check for verification pages
|
|
615
622
|
is_verification = "checking your browser" in text_lower or \
|
|
616
623
|
"just a moment" in text_lower or \
|
|
617
624
|
"please wait" in text_lower or \
|
|
@@ -619,54 +626,19 @@ class ScreenshotService:
|
|
|
619
626
|
"just a moment" in title or \
|
|
620
627
|
"loading..." in title
|
|
621
628
|
|
|
622
|
-
#
|
|
623
|
-
|
|
624
|
-
is_height_stable = current_h == last_h
|
|
625
|
-
is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
|
|
626
|
-
is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
|
|
627
|
-
|
|
628
|
-
if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
|
|
629
|
-
stable_count += 1
|
|
630
|
-
else:
|
|
631
|
-
# Reset if ANY metric changed or NETWORK active
|
|
632
|
-
stable_count = 0
|
|
633
|
-
# if has_recent_network: logger.debug("Stability reset: Network Activity")
|
|
629
|
+
# Basic content check
|
|
630
|
+
has_content = text_len > 100
|
|
634
631
|
|
|
635
|
-
#
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
# Dynamic Stability Requirement:
|
|
639
|
-
# If page looks like it's loading (small content), require longer stability
|
|
640
|
-
if has_recent_network:
|
|
641
|
-
# If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
|
|
642
|
-
required_stability = max(10, 40 if text_len < 500 else 25)
|
|
643
|
-
else:
|
|
644
|
-
required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
|
|
645
|
-
|
|
646
|
-
is_stable = stable_count >= required_stability
|
|
647
|
-
|
|
648
|
-
# Pass if all conditions met
|
|
649
|
-
if is_ready and not is_verification and has_content and is_stable:
|
|
632
|
+
# Pass if ready, not verification, and has content
|
|
633
|
+
if is_ready and not is_verification and has_content:
|
|
634
|
+
logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
|
|
650
635
|
break
|
|
651
|
-
|
|
652
|
-
last_h = current_h
|
|
653
|
-
last_text_len = text_len
|
|
654
|
-
last_html_len = html_len
|
|
655
|
-
|
|
656
|
-
# Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
|
|
657
|
-
try: time.sleep(0.05)
|
|
658
|
-
except: pass
|
|
659
636
|
|
|
637
|
+
time.sleep(0.05)
|
|
660
638
|
except Exception:
|
|
661
|
-
|
|
662
|
-
try: time.sleep(0.1)
|
|
663
|
-
except: pass
|
|
639
|
+
time.sleep(0.05)
|
|
664
640
|
continue
|
|
665
641
|
|
|
666
|
-
# Cleanup listener
|
|
667
|
-
try: tab.listen.stop()
|
|
668
|
-
except: pass
|
|
669
|
-
|
|
670
642
|
# DEBUG: Save HTML to inspect what happened (in data dir)
|
|
671
643
|
try:
|
|
672
644
|
import os
|
|
@@ -675,50 +647,179 @@ class ScreenshotService:
|
|
|
675
647
|
f.write(f"<!-- URL: {url} -->\n")
|
|
676
648
|
f.write(tab.html)
|
|
677
649
|
except: pass
|
|
678
|
-
|
|
679
|
-
# Use faster scroll step (800) to ensure lazy loaded images appear
|
|
680
|
-
self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
|
|
681
|
-
|
|
682
|
-
except:
|
|
683
|
-
pass
|
|
684
|
-
|
|
685
|
-
# Refine calculation: Set viewport width to 1024
|
|
686
650
|
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
# Calculate actual content height after lazy loading
|
|
690
|
-
try:
|
|
691
|
-
# Use a robust height calculation
|
|
692
|
-
content_height = tab.run_js('''
|
|
693
|
-
return Math.max(
|
|
694
|
-
document.body.scrollHeight || 0,
|
|
695
|
-
document.documentElement.scrollHeight || 0,
|
|
696
|
-
document.body.offsetHeight || 0,
|
|
697
|
-
document.documentElement.offsetHeight || 0,
|
|
698
|
-
document.documentElement.clientHeight || 0
|
|
699
|
-
);
|
|
700
|
-
''')
|
|
701
|
-
# Add a small buffer and cap at 15000px to prevent memory issues
|
|
702
|
-
h = min(int(content_height) + 50, 15000)
|
|
703
|
-
except:
|
|
704
|
-
h = 1000 # Fallback
|
|
651
|
+
except Exception as e:
|
|
652
|
+
logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
|
|
705
653
|
|
|
706
|
-
#
|
|
707
|
-
try:
|
|
708
|
-
tab.run_cdp('Emulation.setDeviceMetricsOverride',
|
|
709
|
-
width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
|
|
710
|
-
except:
|
|
711
|
-
pass
|
|
712
|
-
|
|
713
|
-
# Scrollbar Hiding
|
|
654
|
+
# Scrollbar Hiding first (before any height calculation)
|
|
714
655
|
from .manager import SharedBrowserManager
|
|
715
656
|
SharedBrowserManager.hide_scrollbars(tab)
|
|
716
657
|
|
|
717
|
-
# Scroll back to top
|
|
658
|
+
# Scroll back to top
|
|
718
659
|
tab.run_js("window.scrollTo(0, 0);")
|
|
719
660
|
|
|
720
|
-
#
|
|
721
|
-
#
|
|
661
|
+
# Image loading monitoring with time tracking - DISABLED
|
|
662
|
+
# No longer waiting for images to load
|
|
663
|
+
# Initialize image tracking JavaScript
|
|
664
|
+
# image_tracking_js = """
|
|
665
|
+
# (() => {
|
|
666
|
+
# if (!window._imageLoadTracker) {
|
|
667
|
+
# window._imageLoadTracker = {
|
|
668
|
+
# startTime: Date.now(),
|
|
669
|
+
# images: new Map()
|
|
670
|
+
# };
|
|
671
|
+
#
|
|
672
|
+
# const imgs = Array.from(document.querySelectorAll('img'));
|
|
673
|
+
# const minSize = 50;
|
|
674
|
+
#
|
|
675
|
+
# imgs.forEach((img, idx) => {
|
|
676
|
+
# if (img.clientWidth < minSize && img.clientHeight < minSize) return;
|
|
677
|
+
#
|
|
678
|
+
# const src = img.src || img.getAttribute('data-src') || '';
|
|
679
|
+
# const key = `${idx}_${src.substring(0, 100)}`;
|
|
680
|
+
#
|
|
681
|
+
# if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
|
|
682
|
+
# // Already loaded
|
|
683
|
+
# window._imageLoadTracker.images.set(key, {
|
|
684
|
+
# src: src.substring(0, 150),
|
|
685
|
+
# status: 'loaded',
|
|
686
|
+
# loadTime: 0, // Already loaded before tracking
|
|
687
|
+
# naturalSize: [img.naturalWidth, img.naturalHeight],
|
|
688
|
+
# displaySize: [img.clientWidth, img.clientHeight]
|
|
689
|
+
# });
|
|
690
|
+
# } else {
|
|
691
|
+
# // Track loading
|
|
692
|
+
# window._imageLoadTracker.images.set(key, {
|
|
693
|
+
# src: src.substring(0, 150),
|
|
694
|
+
# status: 'pending',
|
|
695
|
+
# startTime: Date.now(),
|
|
696
|
+
# naturalSize: [img.naturalWidth, img.naturalHeight],
|
|
697
|
+
# displaySize: [img.clientWidth, img.clientHeight]
|
|
698
|
+
# });
|
|
699
|
+
#
|
|
700
|
+
# // Add load event listener
|
|
701
|
+
# img.addEventListener('load', () => {
|
|
702
|
+
# const entry = window._imageLoadTracker.images.get(key);
|
|
703
|
+
# if (entry && entry.status === 'pending') {
|
|
704
|
+
# entry.status = 'loaded';
|
|
705
|
+
# entry.loadTime = Date.now() - entry.startTime;
|
|
706
|
+
# }
|
|
707
|
+
# });
|
|
708
|
+
#
|
|
709
|
+
# img.addEventListener('error', () => {
|
|
710
|
+
# const entry = window._imageLoadTracker.images.get(key);
|
|
711
|
+
# if (entry && entry.status === 'pending') {
|
|
712
|
+
# entry.status = 'failed';
|
|
713
|
+
# entry.loadTime = Date.now() - entry.startTime;
|
|
714
|
+
# }
|
|
715
|
+
# });
|
|
716
|
+
# }
|
|
717
|
+
# });
|
|
718
|
+
# }
|
|
719
|
+
#
|
|
720
|
+
# // Return current status
|
|
721
|
+
# const results = [];
|
|
722
|
+
# window._imageLoadTracker.images.forEach((value, key) => {
|
|
723
|
+
# const entry = {
|
|
724
|
+
# src: value.src,
|
|
725
|
+
# status: value.status,
|
|
726
|
+
# loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
|
|
727
|
+
# naturalSize: value.naturalSize,
|
|
728
|
+
# displaySize: value.displaySize
|
|
729
|
+
# };
|
|
730
|
+
# results.push(entry);
|
|
731
|
+
# });
|
|
732
|
+
#
|
|
733
|
+
# return {
|
|
734
|
+
# total: results.length,
|
|
735
|
+
# loaded: results.filter(r => r.status === 'loaded').length,
|
|
736
|
+
# pending: results.filter(r => r.status === 'pending').length,
|
|
737
|
+
# failed: results.filter(r => r.status === 'failed').length,
|
|
738
|
+
# details: results
|
|
739
|
+
# };
|
|
740
|
+
# })()
|
|
741
|
+
# """
|
|
742
|
+
#
|
|
743
|
+
# # Initialize tracking
|
|
744
|
+
# tab.run_js(image_tracking_js)
|
|
745
|
+
#
|
|
746
|
+
# # Monitor image loading with dynamic stop logic
|
|
747
|
+
# check_interval = 0.2 # Check every 200ms
|
|
748
|
+
# image_timeout = 3.0 # Image loading timeout: 3 seconds
|
|
749
|
+
# monitoring_start = time.time()
|
|
750
|
+
# loaded_times = [] # Track load times of completed images
|
|
751
|
+
#
|
|
752
|
+
# logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
|
|
753
|
+
#
|
|
754
|
+
# while True:
|
|
755
|
+
# elapsed = time.time() - monitoring_start
|
|
756
|
+
#
|
|
757
|
+
# # Check timeout first
|
|
758
|
+
# if elapsed >= image_timeout:
|
|
759
|
+
# logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
|
|
760
|
+
# break
|
|
761
|
+
#
|
|
762
|
+
# # Get current image status
|
|
763
|
+
# status = tab.run_js(image_tracking_js, as_expr=True) or {
|
|
764
|
+
# 'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
|
|
765
|
+
# }
|
|
766
|
+
#
|
|
767
|
+
# # Log each image's status and load time
|
|
768
|
+
# for img_detail in status.get('details', []):
|
|
769
|
+
# src_short = img_detail.get('src', '')[:80]
|
|
770
|
+
# status_str = img_detail.get('status', 'unknown')
|
|
771
|
+
# load_time = img_detail.get('loadTime', 0)
|
|
772
|
+
# logger.info(
|
|
773
|
+
# f"ScreenshotService: Image [{status_str}] "
|
|
774
|
+
# f"loadTime={load_time:.0f}ms "
|
|
775
|
+
# f"src={src_short}"
|
|
776
|
+
# )
|
|
777
|
+
#
|
|
778
|
+
# # Collect load times of completed images
|
|
779
|
+
# loaded_times = [
|
|
780
|
+
# img.get('loadTime', 0)
|
|
781
|
+
# for img in status.get('details', [])
|
|
782
|
+
# if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
|
|
783
|
+
# ]
|
|
784
|
+
#
|
|
785
|
+
# pending_count = status.get('pending', 0)
|
|
786
|
+
# loaded_count = status.get('loaded', 0)
|
|
787
|
+
#
|
|
788
|
+
# # Check stop conditions
|
|
789
|
+
# if pending_count == 0:
|
|
790
|
+
# logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
|
|
791
|
+
# break
|
|
792
|
+
#
|
|
793
|
+
# # Check dynamic stop condition (if we have loaded images to calculate average)
|
|
794
|
+
# if loaded_times:
|
|
795
|
+
# avg_load_time = sum(loaded_times) / len(loaded_times)
|
|
796
|
+
# max_wait_time = avg_load_time * 2
|
|
797
|
+
#
|
|
798
|
+
# # Check if any pending image has exceeded max wait time
|
|
799
|
+
# pending_images = [
|
|
800
|
+
# img for img in status.get('details', [])
|
|
801
|
+
# if img.get('status') == 'pending'
|
|
802
|
+
# ]
|
|
803
|
+
#
|
|
804
|
+
# should_stop = False
|
|
805
|
+
# for pending_img in pending_images:
|
|
806
|
+
# wait_time = pending_img.get('loadTime', 0)
|
|
807
|
+
# if wait_time >= max_wait_time:
|
|
808
|
+
# should_stop = True
|
|
809
|
+
# logger.info(
|
|
810
|
+
# f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
|
|
811
|
+
# f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
|
|
812
|
+
# )
|
|
813
|
+
# break
|
|
814
|
+
#
|
|
815
|
+
# if should_stop:
|
|
816
|
+
# break
|
|
817
|
+
#
|
|
818
|
+
# # Wait before next check
|
|
819
|
+
# time.sleep(check_interval)
|
|
820
|
+
|
|
821
|
+
# Now calculate final height ONCE after all content loaded
|
|
822
|
+
# CompletenessChecker already verified height stability
|
|
722
823
|
try:
|
|
723
824
|
final_height = tab.run_js('''
|
|
724
825
|
return Math.max(
|
|
@@ -728,13 +829,15 @@ class ScreenshotService:
|
|
|
728
829
|
document.documentElement.offsetHeight || 0
|
|
729
830
|
);
|
|
730
831
|
''')
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
|
|
832
|
+
h = min(int(final_height) + 50, 15000)
|
|
833
|
+
tab.run_cdp('Emulation.setDeviceMetricsOverride',
|
|
834
|
+
width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
|
|
735
835
|
except:
|
|
736
836
|
pass
|
|
737
837
|
|
|
838
|
+
# Final scroll to top
|
|
839
|
+
tab.run_js("window.scrollTo(0, 0);")
|
|
840
|
+
|
|
738
841
|
# Use full_page=False because we manually set the viewport to the full height
|
|
739
842
|
# This avoids stitching artifacts and blank spaces
|
|
740
843
|
return tab.get_screenshot(as_base64='jpg', full_page=False)
|
hyw_core/core.py
CHANGED
|
@@ -10,7 +10,6 @@ from dataclasses import dataclass, field
|
|
|
10
10
|
from typing import Dict, List, Any, Optional, Callable, Awaitable
|
|
11
11
|
|
|
12
12
|
from loguru import logger
|
|
13
|
-
from openai import AsyncOpenAI
|
|
14
13
|
|
|
15
14
|
from .config import HywCoreConfig, ModelConfig
|
|
16
15
|
from .pipeline import ModularPipeline
|
|
@@ -95,12 +94,6 @@ class HywCore:
|
|
|
95
94
|
self.config = config
|
|
96
95
|
self._send_func = send_func
|
|
97
96
|
|
|
98
|
-
# Create OpenAI client
|
|
99
|
-
self._client = AsyncOpenAI(
|
|
100
|
-
api_key=config.api_key,
|
|
101
|
-
base_url=config.base_url if config.base_url else None
|
|
102
|
-
)
|
|
103
|
-
|
|
104
97
|
# Create search service
|
|
105
98
|
self._search_service = SearchService(config)
|
|
106
99
|
|
|
hyw_core/crawling/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
hyw_core.crawling - Intelligent Web Crawling Module
|
|
3
|
+
|
|
4
|
+
Provides Crawl4AI-inspired adaptive crawling with:
|
|
5
|
+
- Page completeness guarantees (image loading verification)
|
|
6
|
+
- Content quality scoring
|
|
7
|
+
- Adaptive stop logic
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .models import CrawlConfig, PageResult, CompletenessResult
|
|
11
|
+
from .completeness import CompletenessChecker
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"CrawlConfig",
|
|
15
|
+
"PageResult",
|
|
16
|
+
"CompletenessResult",
|
|
17
|
+
"CompletenessChecker",
|
|
18
|
+
]
|