entari-plugin-hyw 4.0.0rc7__py3-none-any.whl → 4.0.0rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. entari_plugin_hyw/Untitled-1 +1865 -0
  2. entari_plugin_hyw/__init__.py +726 -394
  3. entari_plugin_hyw/history.py +26 -13
  4. entari_plugin_hyw/misc.py +3 -0
  5. entari_plugin_hyw/search_cache.py +154 -0
  6. {entari_plugin_hyw-4.0.0rc7.dist-info → entari_plugin_hyw-4.0.0rc9.dist-info}/METADATA +3 -1
  7. entari_plugin_hyw-4.0.0rc9.dist-info/RECORD +68 -0
  8. {entari_plugin_hyw-4.0.0rc7.dist-info → entari_plugin_hyw-4.0.0rc9.dist-info}/WHEEL +1 -1
  9. {entari_plugin_hyw-4.0.0rc7.dist-info → entari_plugin_hyw-4.0.0rc9.dist-info}/top_level.txt +1 -0
  10. hyw_core/__init__.py +94 -0
  11. hyw_core/browser_control/__init__.py +65 -0
  12. hyw_core/browser_control/assets/card-dist/index.html +409 -0
  13. hyw_core/browser_control/assets/index.html +5691 -0
  14. hyw_core/browser_control/engines/__init__.py +17 -0
  15. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/duckduckgo.py +42 -8
  16. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/google.py +1 -1
  17. {entari_plugin_hyw/browser → hyw_core/browser_control}/manager.py +15 -8
  18. entari_plugin_hyw/render_vue.py → hyw_core/browser_control/renderer.py +29 -14
  19. {entari_plugin_hyw/browser → hyw_core/browser_control}/service.py +340 -112
  20. hyw_core/config.py +154 -0
  21. hyw_core/core.py +322 -0
  22. hyw_core/definitions.py +83 -0
  23. entari_plugin_hyw/modular_pipeline.py → hyw_core/pipeline.py +121 -97
  24. {entari_plugin_hyw → hyw_core}/search.py +19 -14
  25. hyw_core/stages/__init__.py +21 -0
  26. entari_plugin_hyw/stage_base.py → hyw_core/stages/base.py +2 -2
  27. entari_plugin_hyw/stage_summary.py → hyw_core/stages/summary.py +34 -11
  28. entari_plugin_hyw/assets/card-dist/index.html +0 -387
  29. entari_plugin_hyw/browser/__init__.py +0 -10
  30. entari_plugin_hyw/browser/engines/bing.py +0 -95
  31. entari_plugin_hyw/card-ui/.gitignore +0 -24
  32. entari_plugin_hyw/card-ui/README.md +0 -5
  33. entari_plugin_hyw/card-ui/index.html +0 -16
  34. entari_plugin_hyw/card-ui/package-lock.json +0 -2342
  35. entari_plugin_hyw/card-ui/package.json +0 -31
  36. entari_plugin_hyw/card-ui/public/logos/anthropic.svg +0 -1
  37. entari_plugin_hyw/card-ui/public/logos/cerebras.svg +0 -9
  38. entari_plugin_hyw/card-ui/public/logos/deepseek.png +0 -0
  39. entari_plugin_hyw/card-ui/public/logos/gemini.svg +0 -1
  40. entari_plugin_hyw/card-ui/public/logos/google.svg +0 -1
  41. entari_plugin_hyw/card-ui/public/logos/grok.png +0 -0
  42. entari_plugin_hyw/card-ui/public/logos/huggingface.png +0 -0
  43. entari_plugin_hyw/card-ui/public/logos/microsoft.svg +0 -15
  44. entari_plugin_hyw/card-ui/public/logos/minimax.png +0 -0
  45. entari_plugin_hyw/card-ui/public/logos/mistral.png +0 -0
  46. entari_plugin_hyw/card-ui/public/logos/nvida.png +0 -0
  47. entari_plugin_hyw/card-ui/public/logos/openai.svg +0 -1
  48. entari_plugin_hyw/card-ui/public/logos/openrouter.png +0 -0
  49. entari_plugin_hyw/card-ui/public/logos/perplexity.svg +0 -24
  50. entari_plugin_hyw/card-ui/public/logos/qwen.png +0 -0
  51. entari_plugin_hyw/card-ui/public/logos/xai.png +0 -0
  52. entari_plugin_hyw/card-ui/public/logos/xiaomi.png +0 -0
  53. entari_plugin_hyw/card-ui/public/logos/zai.png +0 -0
  54. entari_plugin_hyw/card-ui/public/vite.svg +0 -1
  55. entari_plugin_hyw/card-ui/src/App.vue +0 -787
  56. entari_plugin_hyw/card-ui/src/assets/vue.svg +0 -1
  57. entari_plugin_hyw/card-ui/src/components/HelloWorld.vue +0 -41
  58. entari_plugin_hyw/card-ui/src/components/MarkdownContent.vue +0 -382
  59. entari_plugin_hyw/card-ui/src/components/SectionCard.vue +0 -41
  60. entari_plugin_hyw/card-ui/src/components/StageCard.vue +0 -240
  61. entari_plugin_hyw/card-ui/src/main.ts +0 -5
  62. entari_plugin_hyw/card-ui/src/style.css +0 -29
  63. entari_plugin_hyw/card-ui/src/test_regex.js +0 -103
  64. entari_plugin_hyw/card-ui/src/types.ts +0 -61
  65. entari_plugin_hyw/card-ui/tsconfig.app.json +0 -16
  66. entari_plugin_hyw/card-ui/tsconfig.json +0 -7
  67. entari_plugin_hyw/card-ui/tsconfig.node.json +0 -26
  68. entari_plugin_hyw/card-ui/vite.config.ts +0 -16
  69. entari_plugin_hyw/definitions.py +0 -174
  70. entari_plugin_hyw/stage_instruct.py +0 -355
  71. entari_plugin_hyw/stage_instruct_deepsearch.py +0 -104
  72. entari_plugin_hyw/stage_vision.py +0 -113
  73. entari_plugin_hyw-4.0.0rc7.dist-info/RECORD +0 -102
  74. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/anthropic.svg +0 -0
  75. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/cerebras.svg +0 -0
  76. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/deepseek.png +0 -0
  77. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/gemini.svg +0 -0
  78. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/google.svg +0 -0
  79. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/grok.png +0 -0
  80. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/huggingface.png +0 -0
  81. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/microsoft.svg +0 -0
  82. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/minimax.png +0 -0
  83. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/mistral.png +0 -0
  84. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/nvida.png +0 -0
  85. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openai.svg +0 -0
  86. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/openrouter.png +0 -0
  87. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/perplexity.svg +0 -0
  88. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/qwen.png +0 -0
  89. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xai.png +0 -0
  90. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/xiaomi.png +0 -0
  91. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/logos/zai.png +0 -0
  92. {entari_plugin_hyw → hyw_core/browser_control}/assets/card-dist/vite.svg +0 -0
  93. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/anthropic.svg +0 -0
  94. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/cerebras.svg +0 -0
  95. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/deepseek.png +0 -0
  96. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/gemini.svg +0 -0
  97. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/google.svg +0 -0
  98. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/grok.png +0 -0
  99. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/huggingface.png +0 -0
  100. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/microsoft.svg +0 -0
  101. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/minimax.png +0 -0
  102. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/mistral.png +0 -0
  103. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/nvida.png +0 -0
  104. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openai.svg +0 -0
  105. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/openrouter.png +0 -0
  106. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/perplexity.svg +0 -0
  107. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/qwen.png +0 -0
  108. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xai.png +0 -0
  109. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/xiaomi.png +0 -0
  110. {entari_plugin_hyw/assets/icon → hyw_core/browser_control/assets/logos}/zai.png +0 -0
  111. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/base.py +0 -0
  112. {entari_plugin_hyw/browser → hyw_core/browser_control}/engines/default.py +0 -0
  113. {entari_plugin_hyw/browser → hyw_core/browser_control}/landing.html +0 -0
  114. {entari_plugin_hyw → hyw_core}/image_cache.py +0 -0
@@ -22,65 +22,22 @@ class ScreenshotService:
22
22
  self.headless = headless
23
23
  self._manager = None
24
24
  self._executor = ThreadPoolExecutor(max_workers=10)
25
- self._search_tab_pool = [] # List of Tab objects
26
- self._pool_lock = threading.Lock()
27
25
 
28
26
  if auto_start:
29
27
  self._ensure_ready()
30
-
31
- def prepare_search_tabs_background(self, count: int, url: str = "https://www.google.com") -> None:
32
- """
33
- Pre-launch tabs for search (BACKGROUND - fire and forget).
34
- Tabs are created in background thread, may not be ready immediately.
35
- """
36
- self._executor.submit(self._prepare_search_tabs_sync, count, url)
37
28
 
38
- def _prepare_search_tabs_sync(self, count: int, url: str = "https://www.google.com"):
39
- """Sync implementation of tab preparation - creates tabs in PARALLEL."""
29
+ def _get_tab(self, url: str) -> Any:
30
+ """Create a new tab and navigate to URL."""
31
+ self._ensure_ready()
32
+ return self._manager.new_tab(url)
33
+
34
+ def _release_tab(self, tab: Any):
35
+ """Close tab after use."""
36
+ if not tab: return
40
37
  try:
41
- self._ensure_ready()
42
- page = self._manager.page
43
- if not page: return
44
-
45
- with self._pool_lock:
46
- current_count = len(self._search_tab_pool)
47
- needed = count - current_count
48
-
49
- if needed <= 0:
50
- return
51
-
52
- logger.info(f"ScreenshotService: Pre-launching {needed} search tabs for {url} (parallel)...")
53
-
54
- # Create tabs in parallel using threads
55
- created_tabs = [None] * needed
56
-
57
- def create_single_tab(index):
58
- try:
59
- tab = page.new_tab(url)
60
- created_tabs[index] = tab
61
- logger.debug(f"ScreenshotService: Tab {index} ready")
62
- except Exception as e:
63
- logger.error(f"ScreenshotService: Failed to create tab {index}: {e}")
64
-
65
- threads = []
66
- for i in range(needed):
67
- t = threading.Thread(target=create_single_tab, args=(i,))
68
- t.start()
69
- threads.append(t)
70
-
71
- # Wait for all threads to complete
72
- for t in threads:
73
- t.join()
74
-
75
- # Add successfully created tabs to pool
76
- with self._pool_lock:
77
- for tab in created_tabs:
78
- if tab:
79
- self._search_tab_pool.append(tab)
80
- logger.info(f"ScreenshotService: Tab pool ready ({len(self._search_tab_pool)} tabs)")
81
-
82
- except Exception as e:
83
- logger.error(f"ScreenshotService: Failed to prepare tabs: {e}")
38
+ tab.close()
39
+ except:
40
+ pass
84
41
 
85
42
  async def search_via_page_input_batch(self, queries: List[str], url: str, selector: str = "#input") -> List[Dict[str, Any]]:
86
43
  """
@@ -104,17 +61,13 @@ class ScreenshotService:
104
61
 
105
62
  for i in range(len(queries)):
106
63
  tab = None
107
- # Try to get from pool first
108
- with self._pool_lock:
109
- if self._search_tab_pool:
110
- tab = self._search_tab_pool.pop(0)
111
- logger.debug(f"ScreenshotService: Got tab {i} from pool")
64
+ # Try to get from pool first (using shared logic now)
65
+ try:
66
+ tab = self._get_tab(target_url)
67
+ except Exception as e:
68
+ logger.warning(f"ScreenshotService: Batch search tab creation failed: {e}")
112
69
 
113
- if not tab:
114
- # Create new
115
- self._ensure_ready()
116
- tab = self._manager.page.new_tab(target_url)
117
- logger.debug(f"ScreenshotService: Created tab {i} for {target_url}")
70
+
118
71
 
119
72
  tabs.append(tab)
120
73
 
@@ -155,7 +108,8 @@ class ScreenshotService:
155
108
 
156
109
  logger.debug(f"Search[{index}]: Waiting for search results...")
157
110
  tab.wait.doc_loaded(timeout=10)
158
- time.sleep(0.5)
111
+ # Reduced settle wait for extraction
112
+ time.sleep(0.1)
159
113
 
160
114
  logger.debug(f"Search[{index}]: Extracting content...")
161
115
  html = tab.html
@@ -178,8 +132,7 @@ class ScreenshotService:
178
132
  logger.error(f"ScreenshotService: Search error for '{query}': {e}")
179
133
  results[index] = {"content": f"Error: {e}", "title": "Error", "url": "", "html": ""}
180
134
  finally:
181
- try: tab.close()
182
- except: pass
135
+ self._release_tab(tab)
183
136
 
184
137
  threads = []
185
138
  for i, (tab, query) in enumerate(zip(tabs, queries)):
@@ -248,7 +201,7 @@ class ScreenshotService:
248
201
 
249
202
  # Small delay for address bar to focus
250
203
  import time as _time
251
- _time.sleep(0.1)
204
+ _time.sleep(0.05)
252
205
 
253
206
  # Type the query
254
207
  tab.actions.type(query)
@@ -259,8 +212,8 @@ class ScreenshotService:
259
212
  # Wait for page to load
260
213
  try:
261
214
  tab.wait.doc_loaded(timeout=timeout)
262
- # Additional wait for search results
263
- _time.sleep(1)
215
+ # Reduced wait for initial results
216
+ _time.sleep(0.2)
264
217
  except:
265
218
  pass
266
219
 
@@ -292,6 +245,89 @@ class ScreenshotService:
292
245
  try: tab.close()
293
246
  except: pass
294
247
 
248
+ def _scroll_to_bottom(self, tab, step: int = 800, delay: float = 2.0, timeout: float = 10.0):
249
+ """
250
+ Scroll down gradually to trigger lazy loading.
251
+
252
+ Args:
253
+ delay: Max wait time per scroll step (seconds) if images aren't loading.
254
+ """
255
+ import time
256
+ start = time.time()
257
+ current_pos = 0
258
+ try:
259
+ while time.time() - start < timeout:
260
+ # Scroll down
261
+ current_pos += step
262
+ tab.run_js(f"window.scrollTo(0, {current_pos});")
263
+
264
+ # Active Wait: Check if images in viewport are loaded
265
+ # Poll every 100ms, up to 'delay' seconds
266
+ wait_start = time.time()
267
+ while time.time() - wait_start < delay:
268
+ all_loaded = tab.run_js("""
269
+ return (async () => {
270
+ const imgs = Array.from(document.querySelectorAll('img'));
271
+ const viewportHeight = window.innerHeight;
272
+
273
+ // 1. Identify images currently in viewport
274
+ const visibleImgs = imgs.filter(img => {
275
+ const rect = img.getBoundingClientRect();
276
+ return (rect.top < viewportHeight && rect.bottom > 0) && (rect.width > 0 && rect.height > 0);
277
+ });
278
+
279
+ if (visibleImgs.length === 0) return true;
280
+
281
+ // 2. Check loading status using decode() AND heuristic for placeholders
282
+ // Some sites load a tiny blurred placeholder first.
283
+ const checks = visibleImgs.map(img => {
284
+ // HEURISTIC: content is likely not ready if:
285
+ // - img has 'data-src' but src is different (or src is empty)
286
+ // - img has 'loading="lazy"' and is not complete
287
+ // - naturalWidth is very small (placeholder) compared to display width
288
+
289
+ const isPlaceholder = (
290
+ (img.getAttribute('data-src') && img.src !== img.getAttribute('data-src')) ||
291
+ (img.naturalWidth < 50 && img.clientWidth > 100)
292
+ );
293
+
294
+ if (isPlaceholder) {
295
+ // If it looks like a placeholder, we return false (not loaded)
296
+ // unless it stays like this for too long (handled by outer timeout)
297
+ return Promise.resolve(false);
298
+ }
299
+
300
+ if (img.complete && img.naturalHeight > 0) return Promise.resolve(true);
301
+
302
+ return img.decode().then(() => true).catch(() => false);
303
+ });
304
+
305
+ // Race against a small timeout to avoid hanging on one broken image
306
+ const allDecoded = Promise.all(checks);
307
+ const timeout = new Promise(resolve => setTimeout(() => resolve(false), 500));
308
+
309
+ // If any check returned false (meaning placeholder or not decoded), result is false
310
+ return Promise.race([allDecoded, timeout]).then(results => {
311
+ if (!Array.isArray(results)) return results === true;
312
+ return results.every(res => res === true);
313
+ });
314
+ })();
315
+ """)
316
+ if all_loaded:
317
+ break
318
+ time.sleep(0.1)
319
+
320
+ # Check if reached bottom
321
+ height = tab.run_js("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight);")
322
+ if current_pos >= height:
323
+ break
324
+
325
+ # Ensure final layout settle
326
+ time.sleep(0.2)
327
+
328
+ except Exception as e:
329
+ logger.warning(f"ScreenshotService: Scroll failed: {e}")
330
+
295
331
  def _fetch_page_sync(self, url: str, timeout: float, include_screenshot: bool) -> Dict[str, Any]:
296
332
  """Synchronous fetch logic."""
297
333
  if not url:
@@ -304,27 +340,48 @@ class ScreenshotService:
304
340
  if not page:
305
341
  return {"content": "Error: Browser not available", "title": "Error", "url": url}
306
342
 
307
- # New Tab with URL directly
308
- tab = page.new_tab(url)
343
+ # Get from pool
344
+ tab = self._get_tab(url)
309
345
 
310
346
  # Wait logic - optimized for search pages
311
347
  is_search_page = any(s in url.lower() for s in ['search', 'bing.com', 'duckduckgo', 'google.com/search', 'searx'])
312
348
  if is_search_page:
313
- # Optimized waiting for search engine results
314
- try:
315
- # Google uses #search or #rso
316
- # DuckDuckGo uses #react-layout
317
- # Bing uses #b_results
318
- if 'google' in url.lower():
319
- # Wait for results container (fastest possible return)
320
- tab.ele('#search', timeout=timeout)
321
- elif 'bing' in url.lower():
322
- tab.ele('#b_results', timeout=timeout)
323
- else:
324
- # Generic search fallback
325
- tab.wait.doc_loaded(timeout=timeout)
326
- except:
327
- pass
349
+ # Optimized waiting: Rapidly poll for ACTUAL results > 0
350
+ start_time = time.time()
351
+
352
+ # Special fast-path for DDG Lite (HTML only, no JS rendering needed)
353
+ if 'lite.duckduckgo' in url:
354
+ # just wait for body, it's static HTML
355
+ try:
356
+ tab.wait.doc_loaded(timeout=timeout)
357
+ except: pass
358
+ # Sleep tiny bit to ensure render
359
+ time.sleep(0.5)
360
+ else:
361
+ while time.time() - start_time < timeout:
362
+ found_results = False
363
+ try:
364
+ if 'google' in url.lower():
365
+ # Check if we have any result items (.g, .MjjYud) or the main container (#search)
366
+ # Using checks with minimal timeout to allow fast looping
367
+ if tab.ele('.g', timeout=0.1) or tab.ele('.MjjYud', timeout=0.1) or tab.ele('#search', timeout=0.1):
368
+ found_results = True
369
+ elif 'bing' in url.lower():
370
+ if tab.ele('.b_algo', timeout=0.1) or tab.ele('#b_results', timeout=0.1):
371
+ found_results = True
372
+ elif 'duckduckgo' in url.lower():
373
+ if tab.ele('.result', timeout=0.1) or tab.ele('#react-layout', timeout=0.1):
374
+ found_results = True
375
+ else:
376
+ # Generic fallback: wait for body to be populated
377
+ if tab.ele('body', timeout=0.1):
378
+ found_results = True
379
+ except:
380
+ pass
381
+
382
+ if found_results:
383
+ break
384
+ time.sleep(0.05) # Faster polling (50ms) as requested
328
385
  else:
329
386
  # 1. Wait for document to settle (Fast Dynamic Wait)
330
387
  try:
@@ -451,8 +508,7 @@ class ScreenshotService:
451
508
  return {"content": f"Error: fetch failed ({e})", "title": "Error", "url": url}
452
509
  finally:
453
510
  if tab:
454
- try: tab.close()
455
- except: pass
511
+ self._release_tab(tab)
456
512
 
457
513
  async def fetch_pages_batch(self, urls: List[str], timeout: float = 20.0, include_screenshot: bool = True) -> List[Dict[str, Any]]:
458
514
  """Fetch multiple pages concurrently."""
@@ -461,6 +517,13 @@ class ScreenshotService:
461
517
  tasks = [self.fetch_page(url, timeout, include_screenshot) for url in urls]
462
518
  return await asyncio.gather(*tasks, return_exceptions=True)
463
519
 
520
+ async def screenshot_urls_batch(self, urls: List[str], timeout: float = 15.0, full_page: bool = True) -> List[Optional[str]]:
521
+ """Take screenshots of multiple URLs concurrently."""
522
+ if not urls: return []
523
+ logger.info(f"ScreenshotService: Batch screenshot {len(urls)} URLs")
524
+ tasks = [self.screenshot_url(url, timeout=timeout, full_page=full_page) for url in urls]
525
+ return await asyncio.gather(*tasks, return_exceptions=True)
526
+
464
527
  async def screenshot_url(self, url: str, wait_load: bool = True, timeout: float = 15.0, full_page: bool = False, quality: int = 80) -> Optional[str]:
465
528
  """Screenshot URL (Async wrapper for sync)."""
466
529
  loop = asyncio.get_running_loop()
@@ -480,36 +543,201 @@ class ScreenshotService:
480
543
  if not page: return None
481
544
 
482
545
  tab = page.new_tab(url)
546
+
547
+ # Start monitoring network traffic
483
548
  try:
484
- if wait_load:
485
- tab.wait.load_complete(timeout=timeout)
486
- else:
487
- tab.wait.doc_loaded(timeout=timeout)
488
- except: pass
549
+ # Listen for data packets (XHR/Fetch/POST)
550
+ # Targets: xhr, fetch. POST usually falls under these or Document.
551
+ tab.listen.start(targets=True) # Listen to everything for now to be safe
552
+ except Exception as e:
553
+ logger.warning(f"ScreenshotService: Failed to start network listener: {e}")
554
+
555
+ try:
556
+ # Wait for full page load (including JS execution)
557
+ tab.wait.load_complete(timeout=timeout)
558
+
559
+ # Wait for actual content to appear (for CDN verification pages)
560
+ # Smart Wait Logic (Final Robust):
561
+ # 1. Network Idle: Wait for silence in XHR/POST
562
+ # 2. Stability: Wait for Height/Text/DOM stability
563
+
564
+ time.sleep(1.5) # user request: force wait 1.5s before detection
565
+
566
+ last_h = 0
567
+ last_text_len = 0
568
+ last_html_len = 0
569
+ stable_count = 0
570
+
571
+ for i in range(200): # Max 200 iterations (~20s)
572
+ try:
573
+ # 1. Check Network Activity
574
+ has_recent_network = False
575
+ try:
576
+ # Iterate over any captured packets since last check
577
+ for packet in tab.listen.steps(timeout=0.01):
578
+ # Check if it's a significant request (POST or XHR/Fetch)
579
+ method = packet.method.upper()
580
+ r_type = packet.resourceType.upper() if getattr(packet, 'resourceType', None) else ""
581
+
582
+ # Interested in: POST requests OR any XHR/Fetch response
583
+ if method == 'POST' or 'XMLHTTPREQUEST' in r_type or 'FETCH' in r_type:
584
+ # Ignore some common noise? (Optional: analytics, tracking)
585
+ # For now, simplistic approach: any API traffic resets stability
586
+ has_recent_network = True
587
+ # logger.debug(f"Network Activity: {method} {packet.url[:50]}")
588
+ break
589
+ except:
590
+ pass
591
+
592
+ # 2. Check DOM State
593
+ state = tab.run_js('''
594
+ return {
595
+ ready: document.readyState === 'complete',
596
+ title: document.title,
597
+ height: Math.max(
598
+ document.body.scrollHeight || 0,
599
+ document.documentElement.scrollHeight || 0
600
+ ),
601
+ text: document.body.innerText.substring(0, 1000) || "",
602
+ html_len: document.body.innerHTML.length || 0
603
+ };
604
+ ''') or {'ready': False, 'title': "", 'height': 0, 'text': "", 'html_len': 0}
605
+
606
+ is_ready = state.get('ready', False)
607
+ title = state.get('title', "").lower()
608
+ current_h = int(state.get('height', 0))
609
+ text_content = state.get('text', "")
610
+ text_len = len(text_content)
611
+ html_len = int(state.get('html_len', 0))
612
+ text_lower = text_content.lower()
613
+
614
+ # Blacklist check (Loading indicators)
615
+ is_verification = "checking your browser" in text_lower or \
616
+ "just a moment" in text_lower or \
617
+ "please wait" in text_lower or \
618
+ "security check" in title or \
619
+ "just a moment" in title or \
620
+ "loading..." in title
621
+
622
+ # Stability check (Multi-metric + Network)
623
+ # We require STABILITY across Height, Text Length, DOM Size AND Network Silence
624
+ is_height_stable = current_h == last_h
625
+ is_text_stable = abs(text_len - last_text_len) < 5 # Allow minor fluctuations
626
+ is_dom_stable = abs(html_len - last_html_len) < 20 # Allow minor fluctuations (ads/tracking)
627
+
628
+ if is_height_stable and is_text_stable and is_dom_stable and not has_recent_network:
629
+ stable_count += 1
630
+ else:
631
+ # Reset if ANY metric changed or NETWORK active
632
+ stable_count = 0
633
+ # if has_recent_network: logger.debug("Stability reset: Network Activity")
634
+
635
+ # Conditions
636
+ has_content = text_len > 100 # At least 100 real chars
637
+
638
+ # Dynamic Stability Requirement:
639
+ # If page looks like it's loading (small content), require longer stability
640
+ if has_recent_network:
641
+ # If we just saw network, enforce at least 1s (10 ticks) clean silence even for large pages
642
+ required_stability = max(10, 40 if text_len < 500 else 25)
643
+ else:
644
+ required_stability = 40 if text_len < 500 else 25 # 4.0s or 2.5s
645
+
646
+ is_stable = stable_count >= required_stability
647
+
648
+ # Pass if all conditions met
649
+ if is_ready and not is_verification and has_content and is_stable:
650
+ break
651
+
652
+ last_h = current_h
653
+ last_text_len = text_len
654
+ last_html_len = html_len
655
+
656
+ # Wait timing within loop (tab.listen.steps consumed some time, so sleep less)
657
+ try: time.sleep(0.05)
658
+ except: pass
659
+
660
+ except Exception:
661
+ stable_count = 0
662
+ try: time.sleep(0.1)
663
+ except: pass
664
+ continue
665
+
666
+ # Cleanup listener
667
+ try: tab.listen.stop()
668
+ except: pass
669
+
670
+ # DEBUG: Save HTML to inspect what happened (in data dir)
671
+ try:
672
+ import os
673
+ log_path = os.path.join(os.getcwd(), "data", "browser.log.html")
674
+ with open(log_path, "w", encoding="utf-8") as f:
675
+ f.write(f"<!-- URL: {url} -->\n")
676
+ f.write(tab.html)
677
+ except: pass
678
+
679
+ # Use faster scroll step (800) to ensure lazy loaded images appear
680
+ self._scroll_to_bottom(tab, step=800, delay=2.0, timeout=min(timeout, 10))
681
+
682
+ except:
683
+ pass
489
684
 
490
- # Wait for main element
491
- if tab.ele("#main-container"):
492
- pass
685
+ # Refine calculation: Set viewport width to 1024
686
+
687
+ capture_width = 1024
493
688
 
689
+ # Calculate actual content height after lazy loading
690
+ try:
691
+ # Use a robust height calculation
692
+ content_height = tab.run_js('''
693
+ return Math.max(
694
+ document.body.scrollHeight || 0,
695
+ document.documentElement.scrollHeight || 0,
696
+ document.body.offsetHeight || 0,
697
+ document.documentElement.offsetHeight || 0,
698
+ document.documentElement.clientHeight || 0
699
+ );
700
+ ''')
701
+ # Add a small buffer and cap at 15000px to prevent memory issues
702
+ h = min(int(content_height) + 50, 15000)
703
+ except:
704
+ h = 1000 # Fallback
705
+
706
+ # Set viewport to full content size for single-shot capture
707
+ try:
708
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
709
+ width=capture_width, height=h, deviceScaleFactor=1, mobile=False)
710
+ except:
711
+ pass
712
+
494
713
  # Scrollbar Hiding
495
714
  from .manager import SharedBrowserManager
496
715
  SharedBrowserManager.hide_scrollbars(tab)
497
- tab.run_js("""
498
- const style = document.createElement('style');
499
- style.textContent = `
500
- ::-webkit-scrollbar { display: none !important; }
501
- html, body { -ms-overflow-style: none !important; scrollbar-width: none !important; }
502
- `;
503
- document.head.appendChild(style);
504
- document.documentElement.style.overflow = 'hidden';
505
- document.body.style.overflow = 'hidden';
506
- """)
507
716
 
508
- ele = tab.ele("#main-container")
509
- if ele:
510
- return ele.get_screenshot(as_base64='jpg', quality=quality)
511
- else:
512
- return tab.get_screenshot(as_base64='jpg', full_page=full_page, quality=quality)
717
+ # Scroll back to top before screenshot
718
+ tab.run_js("window.scrollTo(0, 0);")
719
+
720
+ # Content is already loaded by _scroll_to_bottom which waits for images
721
+ # Just recalculate final height (content may have grown during scrolling)
722
+ try:
723
+ final_height = tab.run_js('''
724
+ return Math.max(
725
+ document.body.scrollHeight || 0,
726
+ document.documentElement.scrollHeight || 0,
727
+ document.body.offsetHeight || 0,
728
+ document.documentElement.offsetHeight || 0
729
+ );
730
+ ''')
731
+ final_h = min(int(final_height) + 50, 15000)
732
+ if final_h != h:
733
+ tab.run_cdp('Emulation.setDeviceMetricsOverride',
734
+ width=capture_width, height=final_h, deviceScaleFactor=1, mobile=False)
735
+ except:
736
+ pass
737
+
738
+ # Use full_page=False because we manually set the viewport to the full height
739
+ # This avoids stitching artifacts and blank spaces
740
+ return tab.get_screenshot(as_base64='jpg', full_page=False)
513
741
 
514
742
  except Exception as e:
515
743
  logger.error(f"ScreenshotService: Screenshot URL failed: {e}")