entari-plugin-hyw 4.0.0rc10__tar.gz → 4.0.0rc11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of entari-plugin-hyw might be problematic. Click here for more details.

Files changed (76)
  1. {entari_plugin_hyw-4.0.0rc10/src/entari_plugin_hyw.egg-info → entari_plugin_hyw-4.0.0rc11}/PKG-INFO +1 -1
  2. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/pyproject.toml +1 -1
  3. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw/__init__.py +23 -14
  4. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11/src/entari_plugin_hyw.egg-info}/PKG-INFO +1 -1
  5. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/service.py +208 -119
  6. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/core.py +0 -7
  7. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/crawling/completeness.py +99 -10
  8. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/MANIFEST.in +0 -0
  9. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/README.md +0 -0
  10. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/setup.cfg +0 -0
  11. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw/Untitled-1 +0 -0
  12. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw/history.py +0 -0
  13. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw/misc.py +0 -0
  14. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw/search_cache.py +0 -0
  15. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw.egg-info/SOURCES.txt +0 -0
  16. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw.egg-info/dependency_links.txt +0 -0
  17. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw.egg-info/requires.txt +0 -0
  18. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/entari_plugin_hyw.egg-info/top_level.txt +0 -0
  19. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/__init__.py +0 -0
  20. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/__init__.py +0 -0
  21. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/index.html +0 -0
  22. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/anthropic.svg +0 -0
  23. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/cerebras.svg +0 -0
  24. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/deepseek.png +0 -0
  25. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/gemini.svg +0 -0
  26. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/google.svg +0 -0
  27. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/grok.png +0 -0
  28. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/huggingface.png +0 -0
  29. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/microsoft.svg +0 -0
  30. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/minimax.png +0 -0
  31. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/mistral.png +0 -0
  32. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/nvida.png +0 -0
  33. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/openai.svg +0 -0
  34. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/openrouter.png +0 -0
  35. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/perplexity.svg +0 -0
  36. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/qwen.png +0 -0
  37. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/xai.png +0 -0
  38. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/xiaomi.png +0 -0
  39. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/logos/zai.png +0 -0
  40. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/card-dist/vite.svg +0 -0
  41. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/index.html +0 -0
  42. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/anthropic.svg +0 -0
  43. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/cerebras.svg +0 -0
  44. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/deepseek.png +0 -0
  45. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/gemini.svg +0 -0
  46. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/google.svg +0 -0
  47. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/grok.png +0 -0
  48. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/huggingface.png +0 -0
  49. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/microsoft.svg +0 -0
  50. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/minimax.png +0 -0
  51. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/mistral.png +0 -0
  52. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/nvida.png +0 -0
  53. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/openai.svg +0 -0
  54. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/openrouter.png +0 -0
  55. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/perplexity.svg +0 -0
  56. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/qwen.png +0 -0
  57. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/xai.png +0 -0
  58. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/xiaomi.png +0 -0
  59. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/assets/logos/zai.png +0 -0
  60. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/engines/__init__.py +0 -0
  61. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/engines/base.py +0 -0
  62. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/engines/default.py +0 -0
  63. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/engines/duckduckgo.py +0 -0
  64. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/landing.html +0 -0
  65. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/manager.py +0 -0
  66. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/browser_control/renderer.py +0 -0
  67. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/config.py +0 -0
  68. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/crawling/__init__.py +0 -0
  69. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/crawling/models.py +0 -0
  70. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/definitions.py +0 -0
  71. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/image_cache.py +0 -0
  72. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/pipeline.py +0 -0
  73. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/search.py +0 -0
  74. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/stages/__init__.py +0 -0
  75. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/stages/base.py +0 -0
  76. {entari_plugin_hyw-4.0.0rc10 → entari_plugin_hyw-4.0.0rc11}/src/hyw_core/stages/summary.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: entari_plugin_hyw
3
- Version: 4.0.0rc10
3
+ Version: 4.0.0rc11
4
4
  Summary: Use large language models to interpret chat messages
5
5
  Author-email: kumoSleeping <zjr2992@outlook.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "entari_plugin_hyw"
7
- version = "4.0.0-rc10"
7
+ version = "4.0.0-rc11"
8
8
  description = "Use large language models to interpret chat messages"
9
9
  authors = [{name = "kumoSleeping", email = "zjr2992@outlook.com"}]
10
10
  dependencies = [
@@ -108,6 +108,13 @@ def parse_filter_syntax(query: str, max_count: int = 3):
108
108
  if total > max_count:
109
109
  return [], search_query, f"最多选择{max_count}个结果 (当前选择了{total}个)"
110
110
 
111
+ # Append filter names to search query
112
+ # Extract filter names (only 'link' type, skip 'index' type)
113
+ filter_names = [f[1] for f in filters if f[0] == 'link']
114
+ if filter_names:
115
+ # Append filter names to search query: "search_query filter1 filter2"
116
+ search_query = f"{search_query} {' '.join(filter_names)}"
117
+
111
118
  return filters, search_query, None
112
119
 
113
120
 
@@ -212,12 +219,11 @@ renderer = ContentRenderer(headless=conf.headless)
212
219
  set_global_renderer(renderer)
213
220
  search_cache = SearchResultCache(ttl_seconds=600.0) # 10 minutes
214
221
 
215
- _hyw_core: Optional[HywCore] = None
222
+ # Initialize HywCore immediately at plugin load time (not lazy)
223
+ # This avoids the 2s delay on first user request caused by AsyncOpenAI client creation
224
+ _hyw_core: HywCore = HywCore(conf.to_hyw_core_config())
216
225
 
217
226
  def get_hyw_core() -> HywCore:
218
- global _hyw_core
219
- if _hyw_core is None:
220
- _hyw_core = HywCore(conf.to_hyw_core_config())
221
227
  return _hyw_core
222
228
 
223
229
 
@@ -799,10 +805,14 @@ async def handle_web_command(session: Session[MessageCreatedEvent], result: Arpa
799
805
  await session.send(filter_error)
800
806
  return
801
807
 
802
- # Start search and tab pre-warming in parallel
808
+ # Start search first
803
809
  local_renderer = await get_content_renderer()
804
810
  search_task = asyncio.create_task(core.search([search_query]))
805
- tab_task = asyncio.create_task(local_renderer.prepare_tab())
811
+
812
+ # Only pre-warm tab if NOT in filter mode (filter mode = screenshots only, no card render)
813
+ tab_task = None
814
+ if not filters:
815
+ tab_task = asyncio.create_task(local_renderer.prepare_tab())
806
816
 
807
817
  if conf.reaction:
808
818
  asyncio.create_task(react(session, "🔍"))
@@ -811,24 +821,23 @@ async def handle_web_command(session: Session[MessageCreatedEvent], result: Arpa
811
821
  flat_results = results[0] if results else []
812
822
 
813
823
  if not flat_results:
814
- try: await tab_task
815
- except: pass
824
+ if tab_task:
825
+ try: await tab_task
826
+ except: pass
816
827
  await session.send("Search returned no results.")
817
828
  return
818
829
 
819
830
  visible = [r for r in flat_results if not r.get("_hidden", False)]
820
831
 
821
832
  if not visible:
822
- try: await tab_task
823
- except: pass
833
+ if tab_task:
834
+ try: await tab_task
835
+ except: pass
824
836
  await session.send("Search returned no visible results.")
825
837
  return
826
838
 
827
- # === Filter Mode: Screenshot matching links ===
839
+ # === Filter Mode: Screenshot matching links (NO tab needed) ===
828
840
  if filters:
829
- # No need for tab in filter/screenshot mode, cancel it
830
- try: await tab_task
831
- except: pass
832
841
 
833
842
  urls_to_screenshot = []
834
843
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: entari_plugin_hyw
3
- Version: 4.0.0rc10
3
+ Version: 4.0.0rc11
4
4
  Summary: Use large language models to interpret chat messages
5
5
  Author-email: kumoSleeping <zjr2992@outlook.com>
6
6
  License: MIT
@@ -285,14 +285,27 @@ class ScreenshotService:
285
285
  // 2. Check loading status using decode() AND heuristic for placeholders
286
286
  // Some sites load a tiny blurred placeholder first.
287
287
  const checks = visibleImgs.map(img => {
288
- // HEURISTIC: content is likely not ready if:
289
- // - img has 'data-src' but src is different (or src is empty)
290
- // - img has 'loading="lazy"' and is not complete
291
- // - naturalWidth is very small (placeholder) compared to display width
288
+ // Enhanced placeholder detection (matching completeness.py)
289
+ const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
290
+ img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
291
+ const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
292
+ const loadingAttr = img.getAttribute('loading') || '';
293
+ const src = img.src || '';
292
294
 
293
295
  const isPlaceholder = (
294
- (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
295
- (img.naturalWidth < 50 && img.clientWidth > 100)
296
+ // data-src not yet loaded
297
+ (dataSrc && img.src !== dataSrc) ||
298
+ // Natural size much smaller than display (blurred placeholder)
299
+ (img.naturalWidth < 50 && img.clientWidth > 100) ||
300
+ (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
301
+ // Lazy-loading class indicators
302
+ className.includes('lazy') ||
303
+ className.includes('lazyload') ||
304
+ className.includes('lozad') ||
305
+ // CSS blur filter applied
306
+ (window.getComputedStyle(img).filter || '').includes('blur') ||
307
+ // loading="lazy" + not complete
308
+ (loadingAttr === 'lazy' && !img.complete)
296
309
  );
297
310
 
298
311
  if (isPlaceholder) {
@@ -562,84 +575,50 @@ class ScreenshotService:
562
575
  # Now navigate to the URL - page will render at target width
563
576
  tab.get(url)
564
577
 
565
- # Start monitoring network traffic
566
- try:
567
- # Listen for data packets (XHR/Fetch/POST)
568
- # Targets: xhr, fetch. POST usually falls under these or Document.
569
- tab.listen.start(targets=True) # Listen to everything for now to be safe
570
- except Exception as e:
571
- logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
578
+ # Network monitoring removed - not needed for simplified wait logic
572
579
 
573
580
  # Initialize crawl config for completeness checking (defined outside try for scope)
574
581
  crawl_config = CrawlConfig(
575
582
  scan_full_page=True,
576
583
  scroll_step=800,
577
584
  scroll_delay=0.5,
578
- scroll_timeout=min(timeout, 10),
579
- image_load_timeout=8.0,
585
+ scroll_timeout=20.0, # Increased for lazy-loading pages
586
+ image_load_timeout=3.0, # Image loading timeout: 3 seconds
580
587
  image_stability_checks=3,
581
588
  )
582
589
 
590
+ # === Start scrolling immediately to trigger lazy loading ===
591
+ # Scroll first, then wait for DOM - this allows lazy loading to start in parallel
592
+ logger.info(f"ScreenshotService: Starting lazy load scroll for {url} (before DOM wait)")
593
+ trigger_lazy_load(tab, crawl_config)
594
+
595
+ # Now wait for DOM to be ready (after scroll has triggered lazy loading)
583
596
  try:
584
597
  # Wait for full page load (including JS execution)
585
- tab.wait.load_complete(timeout=timeout)
598
+ try:
599
+ tab.wait.doc_loaded(timeout=timeout)
600
+ except:
601
+ pass
586
602
 
587
603
  # Wait for actual content to appear (for CDN verification pages)
588
- # Smart Wait Logic (Final Robust):
589
- # 1. Network Idle: Wait for silence in XHR/POST
590
- # 2. Stability: Wait for Height/Text/DOM stability
591
-
592
- time.sleep(1.5) # user request: force wait 1.5s before detection
604
+ # Simplified wait logic for screenshot: just check basic readiness
593
605
 
594
- last_h = 0
595
- last_text_len = 0
596
- last_html_len = 0
597
- stable_count = 0
598
-
599
- for i in range(200): # Max 200 iterations (~20s)
606
+ for i in range(20): # Max 20 iterations (~1s) - much faster
600
607
  try:
601
- # 1. Check Network Activity
602
- has_recent_network = False
603
- try:
604
- # Iterate over any captured packets since last check
605
- for packet in tab.listen.steps(timeout=0.01):
606
- # Check if it's a significant request (POST or XHR/Fetch)
607
- method = packet.method.upper()
608
- r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
609
-
610
- # Interested in: POST requests OR any XHR/Fetch response
611
- if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
612
- # Ignore some common noise? (Optional: analytics, tracking)
613
- # For now, simplistic approach: any API traffic resets stability
614
- has_recent_network = True
615
- # logger.debug(f"Network Activity: {method} {packet.url[:50]}")
616
- break
617
- except:
618
- pass
619
-
620
- # 2. Check DOM State
621
608
  state = tab.run_js('''
622
609
  return {
623
610
  ready: document.readyState === 'complete',
624
611
  title: document.title,
625
- height: Math.max(
626
- document.body.scrollHeight || 0,
627
- document.documentElement.scrollHeight || 0
628
- ),
629
- text: document.body.innerText.substring(0, 1000) || "",
630
- html_len: document.body.innerHTML.length || 0
612
+ text: document.body.innerText.substring(0, 500) || ""
631
613
  };
632
- ''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
614
+ ''') or {'ready': False, 'title': "", 'text': ""}
633
615
 
634
616
  is_ready = state.get('ready', False)
635
617
  title = state.get('title', "").lower()
636
- current_h = int(state.get('height', 0))
637
- text_content = state.get('text', "")
638
- text_len = len(text_content)
639
- html_len = int(state.get('html_len', 0))
640
- text_lower = text_content.lower()
618
+ text_lower = state.get('text', "").lower()
619
+ text_len = len(text_lower)
641
620
 
642
- # Blacklist check (Loading indicators)
621
+ # Check for verification pages
643
622
  is_verification = "checking your browser" in text_lower or \
644
623
  "just a moment" in text_lower or \
645
624
  "please wait" in text_lower or \
@@ -647,54 +626,19 @@ class ScreenshotService:
647
626
  "just a moment" in title or \
648
627
  "loading..." in title
649
628
 
650
- # Stability check (Multi-metric + Network)
651
- # We require STABILITY across Height, Text Length, DOM Size AND Network Silence
652
- is_height_stable = current_h == last_h
653
- is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
654
- is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
655
-
656
- if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
657
- stable_count += 1
658
- else:
659
- # Reset if ANY metric changed or NETWORK active
660
- stable_count = 0
661
- # if has_recent_network: logger.debug("Stability reset: Network Activity")
662
-
663
- # Conditions
664
- has_content = text_len > 100 # At least 100 real chars
629
+ # Basic content check
630
+ has_content = text_len > 100
665
631
 
666
- # Dynamic Stability Requirement:
667
- # If page looks like it's loading (small content), require longer stability
668
- if has_recent_network:
669
- # If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
670
- required_stability = max(10, 40 if text_len < 500 else 25)
671
- else:
672
- required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
673
-
674
- is_stable = stable_count >= required_stability
675
-
676
- # Pass if all conditions met
677
- if is_ready and not is_verification and has_content and is_stable:
632
+ # Pass if ready, not verification, and has content
633
+ if is_ready and not is_verification and has_content:
634
+ logger.debug(f"ScreenshotService: Page ready after {i * 0.05:.2f}s")
678
635
  break
679
-
680
- last_h = current_h
681
- last_text_len = text_len
682
- last_html_len = html_len
683
-
684
- # Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
685
- try: time.sleep(0.05)
686
- except: pass
687
636
 
637
+ time.sleep(0.05)
688
638
  except Exception:
689
- stable_count = 0
690
- try: time.sleep(0.1)
691
- except: pass
639
+ time.sleep(0.05)
692
640
  continue
693
641
 
694
- # Cleanup listener
695
- try: tab.listen.stop()
696
- except: pass
697
-
698
642
  # DEBUG: Save HTML to inspect what happened (in data dir)
699
643
  try:
700
644
  import os
@@ -703,12 +647,9 @@ class ScreenshotService:
703
647
  f.write(f"<!-- URL: {url} -->\n")
704
648
  f.write(tab.html)
705
649
  except: pass
706
-
707
- # Use crawling module for lazy loading trigger (config defined above)
708
- trigger_lazy_load(tab, crawl_config)
709
-
710
- except:
711
- pass
650
+
651
+ except Exception as e:
652
+ logger.warning(f"ScreenshotService: Page readiness check failed: {e}")
712
653
 
713
654
  # Scrollbar Hiding first (before any height calculation)
714
655
  from .manager import SharedBrowserManager
@@ -717,17 +658,165 @@ class ScreenshotService:
717
658
  # Scroll back to top
718
659
  tab.run_js("window.scrollTo(0, 0);")
719
660
 
720
- # Use CompletenessChecker to verify all images are loaded
721
- checker = CompletenessChecker(crawl_config)
722
- completeness = checker.wait_for_complete(tab, timeout=crawl_config.image_load_timeout)
723
-
724
- logger.info(
725
- f"ScreenshotService: Image completeness: "
726
- f"{completeness.loaded_images}/{completeness.total_images} loaded, "
727
- f"{completeness.failed_images} pending, "
728
- f"{completeness.placeholder_images} placeholders, "
729
- f"complete={completeness.is_complete}"
730
- )
661
+ # Image loading monitoring with time tracking - DISABLED
662
+ # No longer waiting for images to load
663
+ # Initialize image tracking JavaScript
664
+ # image_tracking_js = """
665
+ # (() => {
666
+ # if (!window._imageLoadTracker) {
667
+ # window._imageLoadTracker = {
668
+ # startTime: Date.now(),
669
+ # images: new Map()
670
+ # };
671
+ #
672
+ # const imgs = Array.from(document.querySelectorAll('img'));
673
+ # const minSize = 50;
674
+ #
675
+ # imgs.forEach((img, idx) => {
676
+ # if (img.clientWidth < minSize && img.clientHeight < minSize) return;
677
+ #
678
+ # const src = img.src || img.getAttribute('data-src') || '';
679
+ # const key = `${idx}_${src.substring(0, 100)}`;
680
+ #
681
+ # if (img.complete && img.naturalWidth > 0 && img.naturalHeight > 0) {
682
+ # // Already loaded
683
+ # window._imageLoadTracker.images.set(key, {
684
+ # src: src.substring(0, 150),
685
+ # status: 'loaded',
686
+ # loadTime: 0, // Already loaded before tracking
687
+ # naturalSize: [img.naturalWidth, img.naturalHeight],
688
+ # displaySize: [img.clientWidth, img.clientHeight]
689
+ # });
690
+ # } else {
691
+ # // Track loading
692
+ # window._imageLoadTracker.images.set(key, {
693
+ # src: src.substring(0, 150),
694
+ # status: 'pending',
695
+ # startTime: Date.now(),
696
+ # naturalSize: [img.naturalWidth, img.naturalHeight],
697
+ # displaySize: [img.clientWidth, img.clientHeight]
698
+ # });
699
+ #
700
+ # // Add load event listener
701
+ # img.addEventListener('load', () => {
702
+ # const entry = window._imageLoadTracker.images.get(key);
703
+ # if (entry && entry.status === 'pending') {
704
+ # entry.status = 'loaded';
705
+ # entry.loadTime = Date.now() - entry.startTime;
706
+ # }
707
+ # });
708
+ #
709
+ # img.addEventListener('error', () => {
710
+ # const entry = window._imageLoadTracker.images.get(key);
711
+ # if (entry && entry.status === 'pending') {
712
+ # entry.status = 'failed';
713
+ # entry.loadTime = Date.now() - entry.startTime;
714
+ # }
715
+ # });
716
+ # }
717
+ # });
718
+ # }
719
+ #
720
+ # // Return current status
721
+ # const results = [];
722
+ # window._imageLoadTracker.images.forEach((value, key) => {
723
+ # const entry = {
724
+ # src: value.src,
725
+ # status: value.status,
726
+ # loadTime: value.status === 'loaded' ? (value.loadTime || 0) : (Date.now() - value.startTime),
727
+ # naturalSize: value.naturalSize,
728
+ # displaySize: value.displaySize
729
+ # };
730
+ # results.push(entry);
731
+ # });
732
+ #
733
+ # return {
734
+ # total: results.length,
735
+ # loaded: results.filter(r => r.status === 'loaded').length,
736
+ # pending: results.filter(r => r.status === 'pending').length,
737
+ # failed: results.filter(r => r.status === 'failed').length,
738
+ # details: results
739
+ # };
740
+ # })()
741
+ # """
742
+ #
743
+ # # Initialize tracking
744
+ # tab.run_js(image_tracking_js)
745
+ #
746
+ # # Monitor image loading with dynamic stop logic
747
+ # check_interval = 0.2 # Check every 200ms
748
+ # image_timeout = 3.0 # Image loading timeout: 3 seconds
749
+ # monitoring_start = time.time()
750
+ # loaded_times = [] # Track load times of completed images
751
+ #
752
+ # logger.info(f"ScreenshotService: Starting image load monitoring (timeout={image_timeout}s)...")
753
+ #
754
+ # while True:
755
+ # elapsed = time.time() - monitoring_start
756
+ #
757
+ # # Check timeout first
758
+ # if elapsed >= image_timeout:
759
+ # logger.info(f"ScreenshotService: Image loading timeout ({image_timeout}s) reached")
760
+ # break
761
+ #
762
+ # # Get current image status
763
+ # status = tab.run_js(image_tracking_js, as_expr=True) or {
764
+ # 'total': 0, 'loaded': 0, 'pending': 0, 'failed': 0, 'details': []
765
+ # }
766
+ #
767
+ # # Log each image's status and load time
768
+ # for img_detail in status.get('details', []):
769
+ # src_short = img_detail.get('src', '')[:80]
770
+ # status_str = img_detail.get('status', 'unknown')
771
+ # load_time = img_detail.get('loadTime', 0)
772
+ # logger.info(
773
+ # f"ScreenshotService: Image [{status_str}] "
774
+ # f"loadTime={load_time:.0f}ms "
775
+ # f"src={src_short}"
776
+ # )
777
+ #
778
+ # # Collect load times of completed images
779
+ # loaded_times = [
780
+ # img.get('loadTime', 0)
781
+ # for img in status.get('details', [])
782
+ # if img.get('status') == 'loaded' and img.get('loadTime', 0) > 0
783
+ # ]
784
+ #
785
+ # pending_count = status.get('pending', 0)
786
+ # loaded_count = status.get('loaded', 0)
787
+ #
788
+ # # Check stop conditions
789
+ # if pending_count == 0:
790
+ # logger.info(f"ScreenshotService: All images loaded. Total: {status.get('total', 0)}, Loaded: {loaded_count}")
791
+ # break
792
+ #
793
+ # # Check dynamic stop condition (if we have loaded images to calculate average)
794
+ # if loaded_times:
795
+ # avg_load_time = sum(loaded_times) / len(loaded_times)
796
+ # max_wait_time = avg_load_time * 2
797
+ #
798
+ # # Check if any pending image has exceeded max wait time
799
+ # pending_images = [
800
+ # img for img in status.get('details', [])
801
+ # if img.get('status') == 'pending'
802
+ # ]
803
+ #
804
+ # should_stop = False
805
+ # for pending_img in pending_images:
806
+ # wait_time = pending_img.get('loadTime', 0)
807
+ # if wait_time >= max_wait_time:
808
+ # should_stop = True
809
+ # logger.info(
810
+ # f"ScreenshotService: Stopping - pending image waited {wait_time:.0f}ms, "
811
+ # f"exceeds 2x avg load time ({max_wait_time:.0f}ms, avg={avg_load_time:.0f}ms)"
812
+ # )
813
+ # break
814
+ #
815
+ # if should_stop:
816
+ # break
817
+ #
818
+ # # Wait before next check
819
+ # time.sleep(check_interval)
731
820
 
732
821
  # Now calculate final height ONCE after all content loaded
733
822
  # CompletenessChecker already verified height stability
@@ -10,7 +10,6 @@ from dataclasses import dataclass, field
10
10
  from typing import Dict, List, Any, Optional, Callable, Awaitable
11
11
 
12
12
  from loguru import logger
13
- from openai import AsyncOpenAI
14
13
 
15
14
  from .config import HywCoreConfig, ModelConfig
16
15
  from .pipeline import ModularPipeline
@@ -95,12 +94,6 @@ class HywCore:
95
94
  self.config = config
96
95
  self._send_func = send_func
97
96
 
98
- # Create OpenAI client
99
- self._client = AsyncOpenAI(
100
- api_key=config.api_key,
101
- base_url=config.base_url if config.base_url else None
102
- )
103
-
104
97
  # Create search service
105
98
  self._search_service = SearchService(config)
106
99
 
@@ -46,13 +46,39 @@ IMAGE_CHECK_JS = """
46
46
  results.total++;
47
47
 
48
48
  const src = img.src || img.getAttribute('data-src') || '';
49
+ const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') ||
50
+ img.getAttribute('data-lazy-src') || img.getAttribute('data-lazy') || '';
51
+ const className = (typeof img.className === 'string' ? img.className : '').toLowerCase();
52
+ const loadingAttr = img.getAttribute('loading') || '';
53
+
54
+ // Enhanced placeholder detection for blurred preview images (like mcmod.cn)
49
55
  const isPlaceholder = (
50
- (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
56
+ // 1. data-src exists but not yet loaded into src
57
+ (dataSrc && img.src !== dataSrc) ||
58
+ // 2. Natural size much smaller than display size (blurred placeholder)
51
59
  (img.naturalWidth < 50 && img.clientWidth > 100) ||
60
+ (img.naturalWidth < 100 && img.clientWidth > 200 && img.naturalWidth * 4 < img.clientWidth) ||
61
+ // 3. Common placeholder keywords in src
52
62
  src.includes('placeholder') ||
53
63
  src.includes('loading') ||
64
+ src.includes('blank') ||
65
+ // 4. SVG placeholder or 1x1 tracking pixel
54
66
  src.startsWith('data:image/svg+xml') ||
55
- (img.naturalWidth === 1 && img.naturalHeight === 1)
67
+ (img.naturalWidth === 1 && img.naturalHeight === 1) ||
68
+ // 5. Lazy-loading class indicators (common patterns)
69
+ className.includes('lazy') ||
70
+ className.includes('lazyload') ||
71
+ className.includes('lozad') ||
72
+ className.includes('b-lazy') ||
73
+ // 6. Blur indicators (common for LQIP - Low Quality Image Placeholder)
74
+ className.includes('blur') ||
75
+ src.includes('blur') ||
76
+ src.includes('thumb') ||
77
+ src.includes('thumbnail') ||
78
+ // 7. loading="lazy" + not complete (browser native lazy loading)
79
+ (loadingAttr === 'lazy' && !img.complete) ||
80
+ // 8. CSS blur filter applied (visual blurring)
81
+ (window.getComputedStyle(img).filter || '').includes('blur')
56
82
  );
57
83
 
58
84
  if (isPlaceholder) {
@@ -304,6 +330,8 @@ def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
304
330
  Scroll through page to trigger lazy-loaded images.
305
331
 
306
332
  Implements Crawl4AI's scan_full_page behavior.
333
+ Strategy: Fast scroll with minimal delay (0.2s) per step to trigger network requests,
334
+ then wait at the bottom for all images to settle.
307
335
 
308
336
  Args:
309
337
  tab: DrissionPage tab object
@@ -316,33 +344,94 @@ def trigger_lazy_load(tab: Any, config: Optional[CrawlConfig] = None) -> None:
316
344
  start = time.time()
317
345
  current_pos = 0
318
346
 
319
- logger.debug("CompletenessChecker: Starting lazy load trigger scroll")
347
+ logger.info(f"CompletenessChecker: Starting lazy load scroll (fast scroll + final wait)")
320
348
 
321
349
  try:
322
- while time.time() - start < config.scroll_timeout:
323
- # Scroll down
350
+ max_scroll_steps = 100
351
+ step_count = 0
352
+
353
+ # 1. Fast Scroll Phase
354
+ while step_count < max_scroll_steps:
355
+ step_count += 1
324
356
  current_pos += config.scroll_step
325
357
  tab.run_js(f"window.scrollTo(0, {current_pos});")
326
358
 
327
- # Wait for lazy load triggers
328
- time.sleep(config.scroll_delay)
359
+ # Simple fixed delay per step (0.2s) as requested
360
+ time.sleep(0.2)
329
361
 
330
362
  # Check if reached bottom
331
363
  height = tab.run_js("""
332
- return Math.max(
364
+ Math.max(
333
365
  document.body.scrollHeight || 0,
334
366
  document.documentElement.scrollHeight || 0
335
- );
367
+ )
336
368
  """, as_expr=True) or 0
337
369
 
338
370
  if current_pos >= height:
371
+ logger.debug(f"CompletenessChecker: Reached bottom at position {current_pos}")
339
372
  break
373
+
374
+ # 2. Wait Phase at Bottom (Wait for images to settle - reduced timeout)
375
+ logger.debug("CompletenessChecker: Reached bottom, waiting for images to settle (max 2s)...")
376
+ wait_start = time.time()
377
+ max_wait_at_bottom = 2.0 # Reduced from 8s to 2s - scroll usually triggers loading quickly
378
+
379
+ # Quick check: just verify images are not placeholders (simplified check)
380
+ check_all_images_js = """
381
+ (() => {
382
+ const imgs = Array.from(document.querySelectorAll('img'));
383
+ if (imgs.length === 0) return true;
384
+
385
+ // Quick check: count non-placeholder images that are loaded
386
+ let loaded_count = 0;
387
+ let total_count = 0;
388
+
389
+ for (const img of imgs) {
390
+ // Skip tiny images
391
+ if (img.clientWidth < 50 && img.clientHeight < 50) continue;
392
+
393
+ total_count++;
394
+ const dataSrc = img.getAttribute('data-src') || img.getAttribute('data-original') || '';
395
+ const src = img.src || '';
396
+
397
+ // Check if placeholder
398
+ const isPlaceholder = (
399
+ (dataSrc && img.src !== dataSrc) ||
400
+ (img.naturalWidth < 50 && img.clientWidth > 100) ||
401
+ src.includes('placeholder') || src.includes('loading')
402
+ );
403
+
404
+ // If not placeholder and loaded, count it
405
+ if (!isPlaceholder && img.complete && img.naturalWidth > 0) {
406
+ loaded_count++;
407
+ }
408
+ }
409
+
410
+ // If most images are loaded (80%+), consider it done
411
+ return total_count === 0 || (loaded_count / total_count) >= 0.8;
412
+ })()
413
+ """
414
+
415
+ # Quick check loop with shorter interval
416
+ check_count = 0
417
+ max_checks = 4 # 2s / 0.5s = 4 checks max
418
+ while check_count < max_checks:
419
+ try:
420
+ all_loaded = tab.run_js(check_all_images_js, as_expr=True)
421
+ if all_loaded:
422
+ elapsed_wait = time.time() - wait_start
423
+ logger.debug(f"CompletenessChecker: Images settled at bottom in {elapsed_wait:.1f}s")
424
+ break
425
+ except:
426
+ pass
427
+ time.sleep(0.5)
428
+ check_count += 1
340
429
 
341
430
  # Scroll back to top
342
431
  tab.run_js("window.scrollTo(0, 0);")
343
432
 
344
433
  elapsed = time.time() - start
345
- logger.debug(f"CompletenessChecker: Lazy load scroll complete in {elapsed:.1f}s")
434
+ logger.info(f"CompletenessChecker: Lazy load scroll complete - {step_count} steps in {elapsed:.1f}s")
346
435
 
347
436
  except Exception as e:
348
437
  logger.warning(f"CompletenessChecker: Lazy load scroll failed: {e}")