optexity-browser-use 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. browser_use/__init__.py +157 -0
  2. browser_use/actor/__init__.py +11 -0
  3. browser_use/actor/element.py +1175 -0
  4. browser_use/actor/mouse.py +134 -0
  5. browser_use/actor/page.py +561 -0
  6. browser_use/actor/playground/flights.py +41 -0
  7. browser_use/actor/playground/mixed_automation.py +54 -0
  8. browser_use/actor/playground/playground.py +236 -0
  9. browser_use/actor/utils.py +176 -0
  10. browser_use/agent/cloud_events.py +282 -0
  11. browser_use/agent/gif.py +424 -0
  12. browser_use/agent/judge.py +170 -0
  13. browser_use/agent/message_manager/service.py +473 -0
  14. browser_use/agent/message_manager/utils.py +52 -0
  15. browser_use/agent/message_manager/views.py +98 -0
  16. browser_use/agent/prompts.py +413 -0
  17. browser_use/agent/service.py +2316 -0
  18. browser_use/agent/system_prompt.md +185 -0
  19. browser_use/agent/system_prompt_flash.md +10 -0
  20. browser_use/agent/system_prompt_no_thinking.md +183 -0
  21. browser_use/agent/views.py +743 -0
  22. browser_use/browser/__init__.py +41 -0
  23. browser_use/browser/cloud/cloud.py +203 -0
  24. browser_use/browser/cloud/views.py +89 -0
  25. browser_use/browser/events.py +578 -0
  26. browser_use/browser/profile.py +1158 -0
  27. browser_use/browser/python_highlights.py +548 -0
  28. browser_use/browser/session.py +3225 -0
  29. browser_use/browser/session_manager.py +399 -0
  30. browser_use/browser/video_recorder.py +162 -0
  31. browser_use/browser/views.py +200 -0
  32. browser_use/browser/watchdog_base.py +260 -0
  33. browser_use/browser/watchdogs/__init__.py +0 -0
  34. browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
  35. browser_use/browser/watchdogs/crash_watchdog.py +335 -0
  36. browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
  37. browser_use/browser/watchdogs/dom_watchdog.py +817 -0
  38. browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
  39. browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
  40. browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
  41. browser_use/browser/watchdogs/popups_watchdog.py +143 -0
  42. browser_use/browser/watchdogs/recording_watchdog.py +126 -0
  43. browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
  44. browser_use/browser/watchdogs/security_watchdog.py +280 -0
  45. browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
  46. browser_use/cli.py +2359 -0
  47. browser_use/code_use/__init__.py +16 -0
  48. browser_use/code_use/formatting.py +192 -0
  49. browser_use/code_use/namespace.py +665 -0
  50. browser_use/code_use/notebook_export.py +276 -0
  51. browser_use/code_use/service.py +1340 -0
  52. browser_use/code_use/system_prompt.md +574 -0
  53. browser_use/code_use/utils.py +150 -0
  54. browser_use/code_use/views.py +171 -0
  55. browser_use/config.py +505 -0
  56. browser_use/controller/__init__.py +3 -0
  57. browser_use/dom/enhanced_snapshot.py +161 -0
  58. browser_use/dom/markdown_extractor.py +169 -0
  59. browser_use/dom/playground/extraction.py +312 -0
  60. browser_use/dom/playground/multi_act.py +32 -0
  61. browser_use/dom/serializer/clickable_elements.py +200 -0
  62. browser_use/dom/serializer/code_use_serializer.py +287 -0
  63. browser_use/dom/serializer/eval_serializer.py +478 -0
  64. browser_use/dom/serializer/html_serializer.py +212 -0
  65. browser_use/dom/serializer/paint_order.py +197 -0
  66. browser_use/dom/serializer/serializer.py +1170 -0
  67. browser_use/dom/service.py +825 -0
  68. browser_use/dom/utils.py +129 -0
  69. browser_use/dom/views.py +906 -0
  70. browser_use/exceptions.py +5 -0
  71. browser_use/filesystem/__init__.py +0 -0
  72. browser_use/filesystem/file_system.py +619 -0
  73. browser_use/init_cmd.py +376 -0
  74. browser_use/integrations/gmail/__init__.py +24 -0
  75. browser_use/integrations/gmail/actions.py +115 -0
  76. browser_use/integrations/gmail/service.py +225 -0
  77. browser_use/llm/__init__.py +155 -0
  78. browser_use/llm/anthropic/chat.py +242 -0
  79. browser_use/llm/anthropic/serializer.py +312 -0
  80. browser_use/llm/aws/__init__.py +36 -0
  81. browser_use/llm/aws/chat_anthropic.py +242 -0
  82. browser_use/llm/aws/chat_bedrock.py +289 -0
  83. browser_use/llm/aws/serializer.py +257 -0
  84. browser_use/llm/azure/chat.py +91 -0
  85. browser_use/llm/base.py +57 -0
  86. browser_use/llm/browser_use/__init__.py +3 -0
  87. browser_use/llm/browser_use/chat.py +201 -0
  88. browser_use/llm/cerebras/chat.py +193 -0
  89. browser_use/llm/cerebras/serializer.py +109 -0
  90. browser_use/llm/deepseek/chat.py +212 -0
  91. browser_use/llm/deepseek/serializer.py +109 -0
  92. browser_use/llm/exceptions.py +29 -0
  93. browser_use/llm/google/__init__.py +3 -0
  94. browser_use/llm/google/chat.py +542 -0
  95. browser_use/llm/google/serializer.py +120 -0
  96. browser_use/llm/groq/chat.py +229 -0
  97. browser_use/llm/groq/parser.py +158 -0
  98. browser_use/llm/groq/serializer.py +159 -0
  99. browser_use/llm/messages.py +238 -0
  100. browser_use/llm/models.py +271 -0
  101. browser_use/llm/oci_raw/__init__.py +10 -0
  102. browser_use/llm/oci_raw/chat.py +443 -0
  103. browser_use/llm/oci_raw/serializer.py +229 -0
  104. browser_use/llm/ollama/chat.py +97 -0
  105. browser_use/llm/ollama/serializer.py +143 -0
  106. browser_use/llm/openai/chat.py +264 -0
  107. browser_use/llm/openai/like.py +15 -0
  108. browser_use/llm/openai/serializer.py +165 -0
  109. browser_use/llm/openrouter/chat.py +211 -0
  110. browser_use/llm/openrouter/serializer.py +26 -0
  111. browser_use/llm/schema.py +176 -0
  112. browser_use/llm/views.py +48 -0
  113. browser_use/logging_config.py +330 -0
  114. browser_use/mcp/__init__.py +18 -0
  115. browser_use/mcp/__main__.py +12 -0
  116. browser_use/mcp/client.py +544 -0
  117. browser_use/mcp/controller.py +264 -0
  118. browser_use/mcp/server.py +1114 -0
  119. browser_use/observability.py +204 -0
  120. browser_use/py.typed +0 -0
  121. browser_use/sandbox/__init__.py +41 -0
  122. browser_use/sandbox/sandbox.py +637 -0
  123. browser_use/sandbox/views.py +132 -0
  124. browser_use/screenshots/__init__.py +1 -0
  125. browser_use/screenshots/service.py +52 -0
  126. browser_use/sync/__init__.py +6 -0
  127. browser_use/sync/auth.py +357 -0
  128. browser_use/sync/service.py +161 -0
  129. browser_use/telemetry/__init__.py +51 -0
  130. browser_use/telemetry/service.py +112 -0
  131. browser_use/telemetry/views.py +101 -0
  132. browser_use/tokens/__init__.py +0 -0
  133. browser_use/tokens/custom_pricing.py +24 -0
  134. browser_use/tokens/mappings.py +4 -0
  135. browser_use/tokens/service.py +580 -0
  136. browser_use/tokens/views.py +108 -0
  137. browser_use/tools/registry/service.py +572 -0
  138. browser_use/tools/registry/views.py +174 -0
  139. browser_use/tools/service.py +1675 -0
  140. browser_use/tools/utils.py +82 -0
  141. browser_use/tools/views.py +100 -0
  142. browser_use/utils.py +670 -0
  143. optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
  144. optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
  145. optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
  146. optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
  147. optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1277 @@
1
+ """Downloads watchdog for monitoring and handling file downloads."""
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import tempfile
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Any, ClassVar
9
+ from urllib.parse import urlparse
10
+
11
+ import anyio
12
+ from bubus import BaseEvent
13
+ from cdp_use.cdp.browser import DownloadProgressEvent, DownloadWillBeginEvent
14
+ from cdp_use.cdp.network import ResponseReceivedEvent
15
+ from cdp_use.cdp.target import SessionID, TargetID
16
+ from pydantic import PrivateAttr
17
+
18
+ from browser_use.browser.events import (
19
+ BrowserLaunchEvent,
20
+ BrowserStateRequestEvent,
21
+ BrowserStoppedEvent,
22
+ FileDownloadedEvent,
23
+ NavigationCompleteEvent,
24
+ TabClosedEvent,
25
+ TabCreatedEvent,
26
+ )
27
+ from browser_use.browser.watchdog_base import BaseWatchdog
28
+
29
+ if TYPE_CHECKING:
30
+ pass
31
+
32
+
33
+ class DownloadsWatchdog(BaseWatchdog):
34
+ """Monitors downloads and handles file download events."""
35
+
36
+ # Events this watchdog listens to (for documentation)
37
+ LISTENS_TO: ClassVar[list[type[BaseEvent[Any]]]] = [
38
+ BrowserLaunchEvent,
39
+ BrowserStateRequestEvent,
40
+ BrowserStoppedEvent,
41
+ TabCreatedEvent,
42
+ TabClosedEvent,
43
+ NavigationCompleteEvent,
44
+ ]
45
+
46
+ # Events this watchdog emits
47
+ EMITS: ClassVar[list[type[BaseEvent[Any]]]] = [
48
+ FileDownloadedEvent,
49
+ ]
50
+
51
+ # Private state
52
+ _sessions_with_listeners: set[str] = PrivateAttr(default_factory=set) # Track sessions that already have download listeners
53
+ _active_downloads: dict[str, Any] = PrivateAttr(default_factory=dict)
54
+ _pdf_viewer_cache: dict[str, bool] = PrivateAttr(default_factory=dict) # Cache PDF viewer status by target URL
55
+ _download_cdp_session_setup: bool = PrivateAttr(default=False) # Track if CDP session is set up
56
+ _download_cdp_session: Any = PrivateAttr(default=None) # Store CDP session reference
57
+ _cdp_event_tasks: set[asyncio.Task] = PrivateAttr(default_factory=set) # Track CDP event handler tasks
58
+ _cdp_downloads_info: dict[str, dict[str, Any]] = PrivateAttr(default_factory=dict) # Map guid -> info
59
+ _use_js_fetch_for_local: bool = PrivateAttr(default=False) # Guard JS fetch path for local regular downloads
60
+ _session_pdf_urls: dict[str, str] = PrivateAttr(default_factory=dict) # URL -> path for PDFs downloaded this session
61
+ _network_monitored_targets: set[str] = PrivateAttr(default_factory=set) # Track targets with network monitoring enabled
62
+ _detected_downloads: set[str] = PrivateAttr(default_factory=set) # Track detected download URLs to avoid duplicates
63
+ _network_callback_registered: bool = PrivateAttr(default=False) # Track if global network callback is registered
64
+
65
async def on_BrowserLaunchEvent(self, event: BrowserLaunchEvent) -> None:
    """Create the configured downloads directory when the browser launches.

    When the browser profile has no downloads path, only the debug line
    announcing the event is logged and nothing is created.
    """
    self.logger.debug(f'[DownloadsWatchdog] Received BrowserLaunchEvent, EventBus ID: {id(self.event_bus)}')

    configured_path = self.browser_session.browser_profile.downloads_path
    if not configured_path:
        return

    # Expand "~" and make the path absolute before creating it (parents included).
    downloads_dir = Path(configured_path).expanduser().resolve()
    downloads_dir.mkdir(parents=True, exist_ok=True)
    self.logger.debug(f'[DownloadsWatchdog] Ensured downloads directory exists: {downloads_dir}')
73
+
74
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
    """Start download monitoring for a newly created tab.

    Asserts that the browser profile has a downloads path, then hands the
    tab's target id to ``attach_to_target``. An event without a target id is
    only logged as a warning.
    """
    # The downloads path is expected to always be set by the BrowserProfile default.
    assert self.browser_session.browser_profile.downloads_path is not None, 'Downloads path must be configured'

    new_target = event.target_id
    if not new_target:
        self.logger.warning(f'[DownloadsWatchdog] No target found for tab {event.target_id}')
        return

    await self.attach_to_target(new_target)
86
+
87
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
    """Handle a closed tab; intentionally a no-op.

    No cleanup is performed here: the browser context handles target lifecycle.
    """
    return None
90
+
91
async def on_BrowserStateRequestEvent(self, event: BrowserStateRequestEvent) -> None:
    """Re-announce the focused page as a ``NavigationCompleteEvent``.

    Returns without dispatching when the session has no focused CDP session
    or when the current page URL is empty. The dispatched event is parented
    to the incoming state request.
    """
    focused_session = self.browser_session.agent_focus
    if not focused_session:
        return

    current_url = await self.browser_session.get_current_page_url()
    if not current_url:
        return

    navigation_event = NavigationCompleteEvent(
        event_type='NavigationCompleteEvent',
        url=current_url,
        target_id=focused_session.target_id,
        event_parent_id=event.event_id,
    )
    self.event_bus.dispatch(navigation_event)
110
+
111
async def on_BrowserStoppedEvent(self, event: BrowserStoppedEvent) -> None:
    """Cancel in-flight download handlers and reset all per-browser state.

    Every tracked CDP event task that is still running is cancelled and then
    awaited (exceptions, including cancellation, are collected rather than
    raised). Afterwards all private bookkeeping is reset so a later browser
    start begins from a clean slate.
    """
    # Cancel all CDP event handler tasks that have not finished yet.
    for task in list(self._cdp_event_tasks):
        if not task.done():
            task.cancel()

    # Wait for the cancellations to settle. The set is snapshotted because
    # done-callbacks discard tasks from it while we are awaiting.
    if self._cdp_event_tasks:
        await asyncio.gather(*list(self._cdp_event_tasks), return_exceptions=True)
    self._cdp_event_tasks.clear()

    # CDP sessions are cached and managed by BrowserSession; only drop our reference.
    self._download_cdp_session = None
    self._download_cdp_session_setup = False

    # Clear other state.
    self._sessions_with_listeners.clear()
    self._active_downloads.clear()
    self._pdf_viewer_cache.clear()
    self._session_pdf_urls.clear()
    self._network_monitored_targets.clear()
    self._detected_downloads.clear()
    # Download guid -> info entries belong to the stopped browser as well;
    # leaving them behind would leak stale entries into the next session.
    self._cdp_downloads_info.clear()
    self._network_callback_registered = False
135
+
136
async def on_NavigationCompleteEvent(self, event: NavigationCompleteEvent) -> None:
    """Look for a PDF on the page that finished navigating and auto-download it.

    The cached PDF-viewer verdict for the navigated URL is always dropped,
    since the content may have changed. The PDF check and download only run
    when auto-download is enabled in the browser profile.
    """
    self.logger.debug(f'[DownloadsWatchdog] NavigationCompleteEvent received for {event.url}, tab #{event.target_id[-4:]}')

    # Invalidate any cached verdict for this URL.
    self._pdf_viewer_cache.pop(event.url, None)

    if not self._is_auto_download_enabled():
        return

    # Note: detection is network-based and does not require JavaScript.
    target_id = event.target_id
    self.logger.debug(f'[DownloadsWatchdog] Got target_id={target_id} for tab #{event.target_id[-4:]}')

    page_is_pdf = await self.check_for_pdf_viewer(target_id)
    if not page_is_pdf:
        return

    self.logger.debug(f'[DownloadsWatchdog] 📄 PDF detected at {event.url}, triggering auto-download...')
    saved_path = await self.trigger_pdf_download(target_id)
    if not saved_path:
        self.logger.warning(f'[DownloadsWatchdog] ⚠️ PDF download failed for {event.url}')
160
+
161
+ def _is_auto_download_enabled(self) -> bool:
162
+ """Check if auto-download PDFs is enabled in browser profile."""
163
+ return self.browser_session.browser_profile.auto_download_pdfs
164
+
165
async def attach_to_target(self, target_id: TargetID) -> None:
    """Set up download monitoring for a specific target.

    Two things happen here:

    1. Once per browser session, ``Browser.setDownloadBehavior`` is sent and
       the ``downloadWillBegin`` / ``downloadProgress`` handlers are
       registered on the CDP client.
    2. For every target, ``_setup_network_monitoring`` is called so that the
       target's network responses are inspected too.

    Nothing is set up when the browser profile has no downloads path. A
    failure while setting up the browser-level listener is logged as a
    warning and does not prevent the network-monitoring step.

    Args:
        target_id: The CDP target to monitor.
    """

    def download_will_begin_handler(event: DownloadWillBeginEvent, session_id: SessionID | None) -> None:
        self.logger.debug(f'[DownloadsWatchdog] Download will begin: {event}')
        # Cache info for later completion event handling (esp. remote browsers).
        # Events without a suggested filename are not cached; an explicit check
        # is used instead of ``assert`` so this still holds under ``python -O``.
        guid = event.get('guid', '')
        suggested_filename = event.get('suggestedFilename')
        if suggested_filename:
            self._cdp_downloads_info[guid] = {
                'url': event.get('url', ''),
                'suggested_filename': suggested_filename,
                'handled': False,
            }
        # Create and track the task; it removes itself from the set when done.
        task = asyncio.create_task(self._handle_cdp_download(event, target_id, session_id))
        self._cdp_event_tasks.add(task)
        task.add_done_callback(lambda t: self._cdp_event_tasks.discard(t))

    def download_progress_handler(event: DownloadProgressEvent, session_id: SessionID | None) -> None:
        # Only completed downloads are of interest.
        if event.get('state') != 'completed':
            return

        file_path = event.get('filePath')
        guid = event.get('guid', '')

        if self.browser_session.is_local:
            if not file_path:
                # No local file path provided, local polling in _handle_cdp_download will handle it
                self.logger.debug('[DownloadsWatchdog] No filePath in progress event (local); polling will handle detection')
                return
            self.logger.debug(f'[DownloadsWatchdog] Download completed: {file_path}')
            self._track_download(file_path)
            # Mark as handled to prevent fallback duplicate dispatch.
            cached = self._cdp_downloads_info.get(guid)
            if isinstance(cached, dict):
                cached['handled'] = True
            return

        # Remote browser: do not touch the local filesystem. Fall back to
        # downloads path + suggested filename when no file path is reported.
        info = self._cdp_downloads_info.get(guid, {})
        try:
            suggested_filename = info.get('suggested_filename') or (Path(file_path).name if file_path else 'download')
            downloads_path = str(self.browser_session.browser_profile.downloads_path or '')
            effective_path = file_path or str(Path(downloads_path) / suggested_filename)
            file_name = Path(effective_path).name
            file_ext = Path(file_name).suffix.lower().lstrip('.')
            self.event_bus.dispatch(
                FileDownloadedEvent(
                    url=info.get('url', ''),
                    path=str(effective_path),
                    file_name=file_name,
                    file_size=0,
                    file_type=file_ext if file_ext else None,
                )
            )
            self.logger.debug(f'[DownloadsWatchdog] ✅ (remote) Download completed: {effective_path}')
        finally:
            # The download is finished either way; drop its cached info.
            self._cdp_downloads_info.pop(guid, None)

    try:
        downloads_path = self.browser_session.browser_profile.downloads_path
        if not downloads_path:
            return  # No downloads path configured

        if self._download_cdp_session_setup:
            # The download listener is browser-level, so it is registered only
            # once. Do NOT return here: this target still needs its own
            # network monitoring, which is set up below.
            self.logger.debug('[DownloadsWatchdog] Download listener already set up for browser session')
        else:
            cdp_client = self.browser_session.cdp_client

            # Ensure path is properly expanded (~ -> absolute path).
            expanded_downloads_path = Path(downloads_path).expanduser().resolve()
            # Allow downloads into that directory and enable download events.
            await cdp_client.send.Browser.setDownloadBehavior(
                params={
                    'behavior': 'allow',
                    'downloadPath': str(expanded_downloads_path),
                    'eventsEnabled': True,
                }
            )

            # Register the handlers with CDP.
            cdp_client.register.Browser.downloadWillBegin(download_will_begin_handler)  # type: ignore[arg-type]
            cdp_client.register.Browser.downloadProgress(download_progress_handler)  # type: ignore[arg-type]

            self._download_cdp_session_setup = True
            self.logger.debug('[DownloadsWatchdog] Set up CDP download listeners')

    except Exception as e:
        self.logger.warning(f'[DownloadsWatchdog] Failed to set up CDP download listener for target {target_id}: {e}')

    # Set up network monitoring for this target (catches ALL download variants).
    await self._setup_network_monitoring(target_id)
284
+
285
+ async def _setup_network_monitoring(self, target_id: TargetID) -> None:
286
+ """Set up network monitoring to detect PDFs and downloads from ALL sources.
287
+
288
+ This catches:
289
+ - Direct PDF navigation
290
+ - PDFs in iframes
291
+ - PDFs with embed/object tags
292
+ - JavaScript-triggered downloads
293
+ - Any Content-Disposition: attachment headers
294
+ """
295
+ # Skip if already monitoring this target
296
+ if target_id in self._network_monitored_targets:
297
+ self.logger.debug(f'[DownloadsWatchdog] Network monitoring already enabled for target {target_id[-4:]}')
298
+ return
299
+
300
+ # Check if auto-download is enabled
301
+ if not self._is_auto_download_enabled():
302
+ self.logger.debug('[DownloadsWatchdog] Auto-download disabled, skipping network monitoring')
303
+ return
304
+
305
+ try:
306
+ cdp_client = self.browser_session.cdp_client
307
+
308
+ # Register the global callback once
309
+ if not self._network_callback_registered:
310
+
311
+ def on_response_received(event: ResponseReceivedEvent, session_id: str | None) -> None:
312
+ """Handle Network.responseReceived event to detect downloadable content.
313
+
314
+ This callback is registered globally and uses session_id to determine the correct target.
315
+ """
316
+ try:
317
+ # Look up target_id from session_id
318
+ event_target_id = self.browser_session.get_target_id_from_session_id(session_id)
319
+ if not event_target_id:
320
+ # Session not in pool - might be a stale session or not yet tracked
321
+ return
322
+
323
+ # Only process events for targets we're monitoring
324
+ if event_target_id not in self._network_monitored_targets:
325
+ return
326
+
327
+ response = event.get('response', {})
328
+ url = response.get('url', '')
329
+ content_type = response.get('mimeType', '').lower()
330
+ headers = response.get('headers', {})
331
+
332
+ # Skip non-HTTP URLs (data:, about:, chrome-extension:, etc.)
333
+ if not url.startswith('http'):
334
+ return
335
+
336
+ # Check if it's a PDF
337
+ is_pdf = 'application/pdf' in content_type
338
+
339
+ # Check if it's marked as download via Content-Disposition header
340
+ content_disposition = headers.get('content-disposition', '').lower()
341
+ is_download_attachment = 'attachment' in content_disposition
342
+
343
+ # Filter out image/video/audio files even if marked as attachment
344
+ # These are likely resources, not intentional downloads
345
+ unwanted_content_types = [
346
+ 'image/',
347
+ 'video/',
348
+ 'audio/',
349
+ 'text/css',
350
+ 'text/javascript',
351
+ 'application/javascript',
352
+ 'application/x-javascript',
353
+ 'text/html',
354
+ 'application/json',
355
+ 'font/',
356
+ 'application/font',
357
+ 'application/x-font',
358
+ ]
359
+ is_unwanted_type = any(content_type.startswith(prefix) for prefix in unwanted_content_types)
360
+ if is_unwanted_type:
361
+ return
362
+
363
+ # Check URL extension to filter out obvious images/resources
364
+ url_lower = url.lower().split('?')[0] # Remove query params
365
+ unwanted_extensions = [
366
+ '.jpg',
367
+ '.jpeg',
368
+ '.png',
369
+ '.gif',
370
+ '.webp',
371
+ '.svg',
372
+ '.ico',
373
+ '.css',
374
+ '.js',
375
+ '.woff',
376
+ '.woff2',
377
+ '.ttf',
378
+ '.eot',
379
+ '.mp4',
380
+ '.webm',
381
+ '.mp3',
382
+ '.wav',
383
+ '.ogg',
384
+ ]
385
+ if any(url_lower.endswith(ext) for ext in unwanted_extensions):
386
+ return
387
+
388
+ # Only process if it's a PDF or download
389
+ if not (is_pdf or is_download_attachment):
390
+ return
391
+
392
+ # Check if we've already processed this URL in this session
393
+ if url in self._detected_downloads:
394
+ self.logger.debug(f'[DownloadsWatchdog] Already detected download: {url[:80]}...')
395
+ return
396
+
397
+ # Mark as detected to avoid duplicates
398
+ self._detected_downloads.add(url)
399
+
400
+ # Extract filename from Content-Disposition if available
401
+ suggested_filename = None
402
+ if 'filename=' in content_disposition:
403
+ # Parse filename from Content-Disposition header
404
+ import re
405
+
406
+ filename_match = re.search(r'filename[^;=\n]*=(([\'"]).*?\2|[^;\n]*)', content_disposition)
407
+ if filename_match:
408
+ suggested_filename = filename_match.group(1).strip('\'"')
409
+
410
+ self.logger.info(f'[DownloadsWatchdog] 🔍 Detected downloadable content via network: {url[:80]}...')
411
+ self.logger.debug(
412
+ f'[DownloadsWatchdog] Content-Type: {content_type}, Is PDF: {is_pdf}, Is Attachment: {is_download_attachment}'
413
+ )
414
+
415
+ # Trigger download asynchronously in background (don't block event handler)
416
+ async def download_in_background():
417
+ try:
418
+ download_path = await self.download_file_from_url(
419
+ url=url,
420
+ target_id=event_target_id, # Use target_id from session_id lookup
421
+ content_type=content_type,
422
+ suggested_filename=suggested_filename,
423
+ )
424
+
425
+ if download_path:
426
+ self.logger.info(f'[DownloadsWatchdog] ✅ Successfully downloaded: {download_path}')
427
+ else:
428
+ self.logger.warning(f'[DownloadsWatchdog] ⚠️ Failed to download: {url[:80]}...')
429
+ except Exception as e:
430
+ self.logger.error(f'[DownloadsWatchdog] Error downloading in background: {type(e).__name__}: {e}')
431
+
432
+ # Create background task
433
+ task = asyncio.create_task(download_in_background())
434
+ self._cdp_event_tasks.add(task)
435
+ task.add_done_callback(lambda t: self._cdp_event_tasks.discard(t))
436
+
437
+ except Exception as e:
438
+ self.logger.error(f'[DownloadsWatchdog] Error in network response handler: {type(e).__name__}: {e}')
439
+
440
+ # Register the callback globally (once)
441
+ cdp_client.register.Network.responseReceived(on_response_received)
442
+ self._network_callback_registered = True
443
+ self.logger.debug('[DownloadsWatchdog] ✅ Registered global network response callback')
444
+
445
+ # Get or create CDP session for this target
446
+ cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
447
+
448
+ # Enable Network domain to monitor HTTP responses (per-target/per-session)
449
+ await cdp_client.send.Network.enable(session_id=cdp_session.session_id)
450
+ self.logger.debug(f'[DownloadsWatchdog] Enabled Network domain for target {target_id[-4:]}')
451
+
452
+ # Mark this target as monitored
453
+ self._network_monitored_targets.add(target_id)
454
+ self.logger.debug(f'[DownloadsWatchdog] ✅ Network monitoring enabled for target {target_id[-4:]}')
455
+
456
+ except Exception as e:
457
+ self.logger.warning(f'[DownloadsWatchdog] Failed to set up network monitoring for target {target_id}: {e}')
458
+
459
async def download_file_from_url(
    self, url: str, target_id: TargetID, content_type: str | None = None, suggested_filename: str | None = None
) -> str | None:
    """Generic method to download any file from a URL.

    The file is fetched with a JavaScript ``fetch`` evaluated in the target
    page (so the browser cache can be used), written into the configured
    downloads directory, and announced with a ``FileDownloadedEvent``.

    Args:
        url: The URL to download
        target_id: The target ID for CDP session
        content_type: Optional content type (e.g., 'application/pdf')
        suggested_filename: Optional filename from Content-Disposition header.
            Only its final path component is used, so a value such as
            ``../../x`` cannot place the file outside the downloads directory.

    Returns:
        Path to downloaded file, or None if download failed (no downloads path
        configured, timeout, empty response, or any other error).
    """
    configured_path = self.browser_session.browser_profile.downloads_path
    if not configured_path:
        self.logger.warning('[DownloadsWatchdog] No downloads path configured')
        return None

    # Check if already downloaded in this session
    if url in self._session_pdf_urls:
        existing_path = self._session_pdf_urls[url]
        self.logger.debug(f'[DownloadsWatchdog] File already downloaded in session: {existing_path}')
        return existing_path

    try:
        # Get or create CDP session for this target
        temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)

        # Determine filename. Both sources (response header, URL) are
        # untrusted, so keep only the last path component.
        if suggested_filename:
            filename = os.path.basename(suggested_filename.replace('\\', '/').strip())
            needs_fallback = filename in ('', '.', '..')
        else:
            filename = os.path.basename(url.split('?')[0])  # Remove query params
            needs_fallback = not filename or '.' not in filename or filename in ('.', '..')
        if needs_fallback:
            # Fallback: use content type to determine extension
            filename = 'document.pdf' if content_type and 'pdf' in content_type else 'download'

        # Ensure downloads directory exists (expanded the same way as when it
        # is handed to the browser: "~" resolved, absolute path).
        downloads_dir = str(Path(configured_path).expanduser().resolve())
        os.makedirs(downloads_dir, exist_ok=True)

        # Generate unique filename if file exists
        final_filename = filename
        existing_files = set(os.listdir(downloads_dir))
        if filename in existing_files:
            base, ext = os.path.splitext(filename)
            counter = 1
            while f'{base} ({counter}){ext}' in existing_files:
                counter += 1
            final_filename = f'{base} ({counter}){ext}'
            self.logger.debug(f'[DownloadsWatchdog] File exists, using: {final_filename}')

        self.logger.debug(f'[DownloadsWatchdog] Downloading from: {url[:100]}...')

        # Download using JavaScript fetch to leverage browser cache.
        # json.dumps yields a safely quoted JS string literal for the URL.
        escaped_url = json.dumps(url)

        result = await asyncio.wait_for(
            temp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': f"""
(async () => {{
try {{
const response = await fetch({escaped_url}, {{
cache: 'force-cache'
}});
if (!response.ok) {{
throw new Error(`HTTP error! status: ${{response.status}}`);
}}
const blob = await response.blob();
const arrayBuffer = await blob.arrayBuffer();
const uint8Array = new Uint8Array(arrayBuffer);

return {{
data: Array.from(uint8Array),
responseSize: uint8Array.length
}};
}} catch (error) {{
throw new Error(`Fetch failed: ${{error.message}}`);
}}
}})()
""",
                    'awaitPromise': True,
                    'returnByValue': True,
                },
                session_id=temp_session.session_id,
            ),
            timeout=15.0,  # 15 second timeout
        )

        download_result = result.get('result', {}).get('value', {})

        if not (download_result and download_result.get('data')):
            self.logger.warning(f'[DownloadsWatchdog] No data received when downloading from {url}')
            return None

        download_path = os.path.join(downloads_dir, final_filename)

        # Save the file asynchronously
        async with await anyio.open_file(download_path, 'wb') as f:
            await f.write(bytes(download_result['data']))

        # Verify file was written successfully
        if not os.path.exists(download_path):
            self.logger.error(f'[DownloadsWatchdog] Failed to write file: {download_path}')
            return None

        actual_size = os.path.getsize(download_path)
        self.logger.debug(f'[DownloadsWatchdog] File written: {download_path} ({actual_size} bytes)')

        # Determine file type; without an extension fall back to the generic
        # binary MIME type rather than the malformed "application/".
        file_ext = Path(final_filename).suffix.lower().lstrip('.')
        mime_type = content_type or (f'application/{file_ext}' if file_ext else 'application/octet-stream')

        # Store URL->path mapping for this session
        self._session_pdf_urls[url] = download_path

        # Emit file downloaded event
        self.logger.debug(f'[DownloadsWatchdog] Dispatching FileDownloadedEvent for {final_filename}')
        self.event_bus.dispatch(
            FileDownloadedEvent(
                url=url,
                path=download_path,
                file_name=final_filename,
                file_size=actual_size,
                file_type=file_ext if file_ext else None,
                mime_type=mime_type,
                auto_download=True,
            )
        )

        return download_path

    except (TimeoutError, asyncio.TimeoutError):
        # asyncio.TimeoutError is only an alias of TimeoutError from Python 3.11 on.
        self.logger.warning(f'[DownloadsWatchdog] Download timed out: {url[:80]}...')
        return None
    except Exception as e:
        self.logger.warning(f'[DownloadsWatchdog] Download failed: {type(e).__name__}: {e}')
        return None
602
+
603
+ def _track_download(self, file_path: str) -> None:
604
+ """Track a completed download and dispatch the appropriate event.
605
+
606
+ Args:
607
+ file_path: The path to the downloaded file
608
+ """
609
+ try:
610
+ # Get file info
611
+ path = Path(file_path)
612
+ if path.exists():
613
+ file_size = path.stat().st_size
614
+ self.logger.debug(f'[DownloadsWatchdog] Tracked download: {path.name} ({file_size} bytes)')
615
+
616
+ # Dispatch download event
617
+ from browser_use.browser.events import FileDownloadedEvent
618
+
619
+ self.event_bus.dispatch(
620
+ FileDownloadedEvent(
621
+ url=str(path), # Use the file path as URL for local files
622
+ path=str(path),
623
+ file_name=path.name,
624
+ file_size=file_size,
625
+ )
626
+ )
627
+ else:
628
+ self.logger.warning(f'[DownloadsWatchdog] Downloaded file not found: {file_path}')
629
+ except Exception as e:
630
+ self.logger.error(f'[DownloadsWatchdog] Error tracking download: {e}')
631
+
632
async def _handle_cdp_download(
    self, event: DownloadWillBeginEvent, target_id: TargetID, session_id: SessionID | None
) -> None:
    """Handle a CDP Page.downloadWillBegin event.

    Two strategies are tried in order:

    1. When the session is local and ``self._use_js_fetch_for_local`` is set,
       the URL is re-fetched with JavaScript ``fetch()`` in the frame that
       started the download, the bytes are written under a collision-free
       name, and a ``FileDownloadedEvent`` is dispatched immediately.
    2. Otherwise (or if step 1 fails) a local session polls the downloads
       directory for a file that was not there when polling began and
       dispatches a ``FileDownloadedEvent`` for the first one larger than
       4 bytes. Remote sessions normally return without polling.

    Args:
        event: Event payload; read with ``.get`` for ``url``,
            ``suggestedFilename``, ``guid`` and ``frameId``.
        target_id: Not referenced in this method body.
        session_id: Not referenced in this method body.

    Returns:
        None. Results are reported through ``self.event_bus`` and the
        ``handled`` flag stored in ``self._cdp_downloads_info[guid]``.
    """
    # Falls back to a per-session directory under the system temp dir
    # (suffix = last 4 characters of the session id) when no path is configured.
    downloads_dir = (
        Path(
            self.browser_session.browser_profile.downloads_path
            or f'{tempfile.gettempdir()}/browser_use_downloads.{str(self.browser_session.id)[-4:]}'
        )
        .expanduser()
        .resolve()
    )  # Ensure path is properly expanded

    # Initialize variables that may be used outside try blocks
    unique_filename = None
    file_size = 0
    expected_path = None
    download_result = None
    download_url = event.get('url', '')
    suggested_filename = event.get('suggestedFilename', 'download')
    guid = event.get('guid', '')

    try:
        self.logger.debug(f'[DownloadsWatchdog] ⬇️ File download starting: {suggested_filename} from {download_url[:100]}...')
        self.logger.debug(f'[DownloadsWatchdog] Full CDP event: {event}')

        # Since Browser.setDownloadBehavior is already configured, the browser will download the file
        # We just need to wait for it to appear in the downloads directory
        expected_path = downloads_dir / suggested_filename

        # Debug: List current directory contents
        self.logger.debug(f'[DownloadsWatchdog] Downloads directory: {downloads_dir}')
        if downloads_dir.exists():
            # files_before is only used for the debug line below.
            files_before = list(downloads_dir.iterdir())
            self.logger.debug(f'[DownloadsWatchdog] Files before download: {[f.name for f in files_before]}')

        # Try manual JavaScript fetch as a fallback for local browsers (disabled for regular local downloads)
        if self.browser_session.is_local and self._use_js_fetch_for_local:
            self.logger.debug(f'[DownloadsWatchdog] Attempting JS fetch fallback for {download_url}')

            # Reset the per-attempt state (file_size becomes None here, not 0).
            unique_filename = None
            file_size = None
            download_result = None
            try:
                # Escape the URL for JavaScript
                import json

                # json.dumps yields a quoted string literal that is spliced into the JS source below.
                escaped_url = json.dumps(download_url)

                # Get the proper session for the frame that initiated the download
                cdp_session = await self.browser_session.cdp_client_for_frame(event.get('frameId'))
                assert cdp_session

                # The page-side script returns the body as a plain array of byte values
                # (returnByValue), plus its length and the response content type.
                result = await cdp_session.cdp_client.send.Runtime.evaluate(
                    params={
                        'expression': f"""
                        (async () => {{
                            try {{
                                const response = await fetch({escaped_url});
                                if (!response.ok) {{
                                    throw new Error(`HTTP error! status: ${{response.status}}`);
                                }}
                                const blob = await response.blob();
                                const arrayBuffer = await blob.arrayBuffer();
                                const uint8Array = new Uint8Array(arrayBuffer);
                                return {{
                                    data: Array.from(uint8Array),
                                    size: uint8Array.length,
                                    contentType: response.headers.get('content-type') || 'application/octet-stream'
                                }};
                            }} catch (error) {{
                                throw new Error(`Fetch failed: ${{error.message}}`);
                            }}
                        }})()
                        """,
                        'awaitPromise': True,
                        'returnByValue': True,
                    },
                    session_id=cdp_session.session_id,
                )
                download_result = result.get('result', {}).get('value')

                # An empty 'data' list is falsy, so a zero-byte body takes the "no data" branch.
                if download_result and download_result.get('data'):
                    # Save the file
                    file_data = bytes(download_result['data'])
                    file_size = len(file_data)

                    # Ensure unique filename
                    unique_filename = await self._get_unique_filename(str(downloads_dir), suggested_filename)
                    final_path = downloads_dir / unique_filename

                    # Write the file
                    import anyio

                    async with await anyio.open_file(final_path, 'wb') as f:
                        await f.write(file_data)

                    self.logger.debug(f'[DownloadsWatchdog] ✅ Downloaded and saved file: {final_path} ({file_size} bytes)')
                    expected_path = final_path
                    # Emit download event immediately
                    file_ext = expected_path.suffix.lower().lstrip('.')
                    file_type = file_ext if file_ext else None
                    self.event_bus.dispatch(
                        FileDownloadedEvent(
                            url=download_url,
                            path=str(expected_path),
                            file_name=unique_filename or expected_path.name,
                            file_size=file_size or 0,
                            file_type=file_type,
                            mime_type=(download_result.get('contentType') if download_result else None),
                            from_cache=False,
                            auto_download=False,
                        )
                    )
                    # Mark as handled to prevent duplicate dispatch from progress/polling paths
                    try:
                        if guid in self._cdp_downloads_info:
                            self._cdp_downloads_info[guid]['handled'] = True
                    except (KeyError, AttributeError):
                        pass
                    self.logger.debug(
                        f'[DownloadsWatchdog] ✅ File download completed via CDP: {suggested_filename} ({file_size} bytes) saved to {expected_path}'
                    )
                    return
                else:
                    self.logger.error('[DownloadsWatchdog] ❌ No data received from fetch')

            except Exception as fetch_error:
                # Includes a failed `assert cdp_session`; execution continues to the polling path.
                self.logger.error(f'[DownloadsWatchdog] ❌ Failed to download file via fetch: {fetch_error}')

        # For remote browsers, don't poll local filesystem; downloadProgress handler will emit the event
        if not self.browser_session.is_local:
            return
    except Exception as e:
        # NOTE(review): after an exception here the remote-session early return above is
        # skipped, so the polling below also runs for a remote session - confirm this is intended.
        self.logger.error(f'[DownloadsWatchdog] ❌ Error handling CDP download: {type(e).__name__} {e}')

    # If we reach here, the fetch method failed, so wait for native download
    # Poll the downloads directory for new files
    self.logger.debug(f'[DownloadsWatchdog] Checking if browser auto-download saved the file for us: {suggested_filename}')

    # Get initial list of files in downloads directory
    # (snapshot of visible regular-file names; anything not in it counts as "new" below)
    initial_files = set()
    if Path(downloads_dir).exists():
        for f in Path(downloads_dir).iterdir():
            if f.is_file() and not f.name.startswith('.'):
                initial_files.add(f.name)

    # Poll for new files
    max_wait = 20  # seconds
    start_time = asyncio.get_event_loop().time()

    while asyncio.get_event_loop().time() - start_time < max_wait:
        # The sleep runs before each scan, so the first scan happens about 5 seconds after polling starts.
        await asyncio.sleep(5.0)  # Check every 5 seconds

        if Path(downloads_dir).exists():
            for file_path in Path(downloads_dir).iterdir():
                # Skip hidden files and files that were already there
                # Any other new file qualifies regardless of its name; expected_path is not consulted here.
                if file_path.is_file() and not file_path.name.startswith('.') and file_path.name not in initial_files:
                    # Check if file has content (> 4 bytes)
                    try:
                        file_size = file_path.stat().st_size
                        if file_size > 4:
                            # Found a new download!
                            self.logger.debug(
                                f'[DownloadsWatchdog] ✅ Found downloaded file: {file_path} ({file_size} bytes)'
                            )

                            # Determine file type from extension
                            file_ext = file_path.suffix.lower().lstrip('.')
                            file_type = file_ext if file_ext else None

                            # Dispatch download event
                            # Skip if already handled by progress/JS fetch
                            info = self._cdp_downloads_info.get(guid, {})
                            if info.get('handled'):
                                return
                            self.event_bus.dispatch(
                                FileDownloadedEvent(
                                    url=download_url,
                                    path=str(file_path),
                                    file_name=file_path.name,
                                    file_size=file_size,
                                    file_type=file_type,
                                )
                            )
                            # Mark as handled after dispatch
                            try:
                                if guid in self._cdp_downloads_info:
                                    self._cdp_downloads_info[guid]['handled'] = True
                            except (KeyError, AttributeError):
                                pass
                            return
                    except Exception as e:
                        # A failure on one candidate is only logged; the scan moves on to the next file.
                        self.logger.debug(f'[DownloadsWatchdog] Error checking file {file_path}: {e}')

    self.logger.warning(f'[DownloadsWatchdog] Download did not complete within {max_wait} seconds')
828
+
829
+ async def _handle_download(self, download: Any) -> None:
830
+ """Handle a download event."""
831
+ download_id = f'{id(download)}'
832
+ self._active_downloads[download_id] = download
833
+ self.logger.debug(f'[DownloadsWatchdog] ⬇️ Handling download: {download.suggested_filename} from {download.url[:100]}...')
834
+
835
+ # Debug: Check if download is already being handled elsewhere
836
+ failure = (
837
+ await download.failure()
838
+ ) # TODO: it always fails for some reason, figure out why connect_over_cdp makes accept_downloads not work
839
+ self.logger.warning(f'[DownloadsWatchdog] ❌ Download state - canceled: {failure}, url: {download.url}')
840
+ # logger.info(f'[DownloadsWatchdog] Active downloads count: {len(self._active_downloads)}')
841
+
842
+ try:
843
+ current_step = 'getting_download_info'
844
+ # Get download info immediately
845
+ url = download.url
846
+ suggested_filename = download.suggested_filename
847
+
848
+ current_step = 'determining_download_directory'
849
+ # Determine download directory from browser profile
850
+ downloads_dir = self.browser_session.browser_profile.downloads_path
851
+ if not downloads_dir:
852
+ downloads_dir = str(Path.home() / 'Downloads')
853
+ else:
854
+ downloads_dir = str(downloads_dir) # Ensure it's a string
855
+
856
+ # Check if Playwright already auto-downloaded the file (due to CDP setup)
857
+ original_path = Path(downloads_dir) / suggested_filename
858
+ if original_path.exists() and original_path.stat().st_size > 0:
859
+ self.logger.debug(
860
+ f'[DownloadsWatchdog] File already downloaded by Playwright: {original_path} ({original_path.stat().st_size} bytes)'
861
+ )
862
+
863
+ # Use the existing file instead of creating a duplicate
864
+ download_path = original_path
865
+ file_size = original_path.stat().st_size
866
+ unique_filename = suggested_filename
867
+ else:
868
+ current_step = 'generating_unique_filename'
869
+ # Ensure unique filename
870
+ unique_filename = await self._get_unique_filename(downloads_dir, suggested_filename)
871
+ download_path = Path(downloads_dir) / unique_filename
872
+
873
+ self.logger.debug(f'[DownloadsWatchdog] Download started: {unique_filename} from {url[:100]}...')
874
+
875
+ current_step = 'calling_save_as'
876
+ # Save the download using Playwright's save_as method
877
+ self.logger.debug(f'[DownloadsWatchdog] Saving download to: {download_path}')
878
+ self.logger.debug(f'[DownloadsWatchdog] Download path exists: {download_path.parent.exists()}')
879
+ self.logger.debug(f'[DownloadsWatchdog] Download path writable: {os.access(download_path.parent, os.W_OK)}')
880
+
881
+ try:
882
+ self.logger.debug('[DownloadsWatchdog] About to call download.save_as()...')
883
+ await download.save_as(str(download_path))
884
+ self.logger.debug(f'[DownloadsWatchdog] Successfully saved download to: {download_path}')
885
+ current_step = 'save_as_completed'
886
+ except Exception as save_error:
887
+ self.logger.error(f'[DownloadsWatchdog] save_as() failed with error: {save_error}')
888
+ raise save_error
889
+
890
+ # Get file info
891
+ file_size = download_path.stat().st_size if download_path.exists() else 0
892
+
893
+ # Determine file type from extension
894
+ file_ext = download_path.suffix.lower().lstrip('.')
895
+ file_type = file_ext if file_ext else None
896
+
897
+ # Try to get MIME type from response headers if available
898
+ mime_type = None
899
+ # Note: Playwright doesn't expose response headers directly from Download object
900
+
901
+ # Check if this was a PDF auto-download
902
+ auto_download = False
903
+ if file_type == 'pdf':
904
+ auto_download = self._is_auto_download_enabled()
905
+
906
+ # Emit download event
907
+ self.event_bus.dispatch(
908
+ FileDownloadedEvent(
909
+ url=url,
910
+ path=str(download_path),
911
+ file_name=suggested_filename,
912
+ file_size=file_size,
913
+ file_type=file_type,
914
+ mime_type=mime_type,
915
+ from_cache=False,
916
+ auto_download=auto_download,
917
+ )
918
+ )
919
+
920
+ self.logger.debug(
921
+ f'[DownloadsWatchdog] ✅ Download completed: {suggested_filename} ({file_size} bytes) saved to {download_path}'
922
+ )
923
+
924
+ # File is now tracked on filesystem, no need to track in memory
925
+
926
+ except Exception as e:
927
+ self.logger.error(
928
+ f'[DownloadsWatchdog] Error handling download at step "{locals().get("current_step", "unknown")}", error: {e}'
929
+ )
930
+ self.logger.error(
931
+ f'[DownloadsWatchdog] Download state - URL: {download.url}, filename: {download.suggested_filename}'
932
+ )
933
+ finally:
934
+ # Clean up tracking
935
+ if download_id in self._active_downloads:
936
+ del self._active_downloads[download_id]
937
+
938
async def check_for_pdf_viewer(self, target_id: TargetID) -> bool:
    """Decide whether the given target is showing a PDF.

    The target's URL is looked up through ``Target.getTargets`` and then
    checked, in order, against the URL heuristics, the navigation-history
    check and the Chrome-viewer URL check. No page JavaScript is evaluated by
    this method. Verdicts (including the ``False`` produced when a check
    raises) are memoised per URL in ``self._pdf_viewer_cache``.

    Returns:
        True if a PDF is detected and should be downloaded, otherwise False.
    """
    self.logger.debug(f'[DownloadsWatchdog] Checking if target {target_id} is PDF viewer...')

    # Resolve the target id to its info record so the URL can be read.
    listing = await self.browser_session.cdp_client.send.Target.getTargets()
    matching_target = None
    for candidate in listing['targetInfos']:
        if candidate['targetId'] == target_id:
            matching_target = candidate
            break

    if not matching_target:
        self.logger.warning(f'[DownloadsWatchdog] No target info found for {target_id}')
        return False

    page_url = matching_target.get('url', '')

    # A previous verdict for this URL short-circuits all checks.
    if page_url in self._pdf_viewer_cache:
        remembered = self._pdf_viewer_cache[page_url]
        self.logger.debug(f'[DownloadsWatchdog] Using cached PDF check result for {page_url}: {remembered}')
        return remembered

    try:
        # Checks run from cheapest to most involved; the first hit wins and
        # decides which debug line is emitted.
        if self._check_url_for_pdf(page_url):
            hit_message = f'[DownloadsWatchdog] PDF detected via URL pattern: {page_url}'
        elif await self._check_network_headers_for_pdf(target_id):
            hit_message = f'[DownloadsWatchdog] PDF detected via network headers: {page_url}'
        elif self._is_chrome_pdf_viewer_url(page_url):
            hit_message = f'[DownloadsWatchdog] Chrome PDF viewer detected: {page_url}'
        else:
            hit_message = None

        is_pdf = hit_message is not None
        if is_pdf:
            self.logger.debug(hit_message)
        self._pdf_viewer_cache[page_url] = is_pdf
        return is_pdf

    except Exception as e:
        self.logger.warning(f'[DownloadsWatchdog] ❌ Error checking for PDF viewer: {e}')
        self._pdf_viewer_cache[page_url] = False
        return False
992
+
993
+ def _check_url_for_pdf(self, url: str) -> bool:
994
+ """Check if URL indicates a PDF file."""
995
+ if not url:
996
+ return False
997
+
998
+ url_lower = url.lower()
999
+
1000
+ # Direct PDF file extensions
1001
+ if url_lower.endswith('.pdf'):
1002
+ return True
1003
+
1004
+ # PDF in path
1005
+ if '.pdf' in url_lower:
1006
+ return True
1007
+
1008
+ # PDF MIME type in URL parameters
1009
+ if any(
1010
+ param in url_lower
1011
+ for param in [
1012
+ 'content-type=application/pdf',
1013
+ 'content-type=application%2fpdf',
1014
+ 'mimetype=application/pdf',
1015
+ 'type=application/pdf',
1016
+ ]
1017
+ ):
1018
+ return True
1019
+
1020
+ return False
1021
+
1022
+ def _is_chrome_pdf_viewer_url(self, url: str) -> bool:
1023
+ """Check if this is Chrome's internal PDF viewer URL."""
1024
+ if not url:
1025
+ return False
1026
+
1027
+ url_lower = url.lower()
1028
+
1029
+ # Chrome PDF viewer uses chrome-extension:// URLs
1030
+ if 'chrome-extension://' in url_lower and 'pdf' in url_lower:
1031
+ return True
1032
+
1033
+ # Chrome PDF viewer internal URLs
1034
+ if url_lower.startswith('chrome://') and 'pdf' in url_lower:
1035
+ return True
1036
+
1037
+ return False
1038
+
1039
+ async def _check_network_headers_for_pdf(self, target_id: TargetID) -> bool:
1040
+ """Infer PDF via navigation history/URL; headers are not available post-navigation in this context."""
1041
+ try:
1042
+ import asyncio
1043
+
1044
+ # Get CDP session
1045
+ temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
1046
+
1047
+ # Get navigation history to find the main resource
1048
+ history = await asyncio.wait_for(
1049
+ temp_session.cdp_client.send.Page.getNavigationHistory(session_id=temp_session.session_id), timeout=3.0
1050
+ )
1051
+
1052
+ current_entry = history.get('entries', [])
1053
+ if current_entry:
1054
+ current_index = history.get('currentIndex', 0)
1055
+ if 0 <= current_index < len(current_entry):
1056
+ current_url = current_entry[current_index].get('url', '')
1057
+
1058
+ # Check if the URL itself suggests PDF
1059
+ if self._check_url_for_pdf(current_url):
1060
+ return True
1061
+
1062
+ # Note: CDP doesn't easily expose response headers for completed navigations
1063
+ # For more complex cases, we'd need to set up Network.responseReceived listeners
1064
+ # before navigation, but that's overkill for most PDF detection cases
1065
+
1066
+ return False
1067
+
1068
+ except Exception as e:
1069
+ self.logger.debug(f'[DownloadsWatchdog] Network headers check failed (non-critical): {e}')
1070
+ return False
1071
+
1072
async def trigger_pdf_download(self, target_id: TargetID) -> str | None:
    """Trigger download of a PDF from Chrome's PDF viewer.

    The page URL is read from the target, turned into a file name, and the
    document is re-fetched inside the page with ``fetch(..., cache:
    'force-cache')``. The bytes are written into the configured downloads
    directory under a collision-free name and a ``FileDownloadedEvent`` is
    dispatched. A URL already downloaded in this session (tracked in
    ``self._session_pdf_urls``) returns its previous path without refetching.

    Args:
        target_id: Target whose page should be saved.

    Returns:
        The download path if successful, None otherwise (no downloads path
        configured, URL not determinable, timeout, empty response, or error).
    """
    self.logger.debug(f'[DownloadsWatchdog] trigger_pdf_download called for target_id={target_id}')

    downloads_path = self.browser_session.browser_profile.downloads_path
    if not downloads_path:
        self.logger.warning('[DownloadsWatchdog] ❌ No downloads path configured, cannot save PDF download')
        return None

    self.logger.debug(f'[DownloadsWatchdog] Downloads path: {downloads_path}')

    try:
        # Create a temporary CDP session for this target without switching focus
        import asyncio

        self.logger.debug(f'[DownloadsWatchdog] Creating CDP session for PDF download from target {target_id}')
        temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)

        # Try to get the PDF URL with timeout
        result = await asyncio.wait_for(
            temp_session.cdp_client.send.Runtime.evaluate(
                params={
                    'expression': """
                    (() => {
                        // For Chrome's PDF viewer, the actual URL is in window.location.href
                        // The embed element's src is often "about:blank"
                        const embedElement = document.querySelector('embed[type="application/x-google-chrome-pdf"]') ||
                            document.querySelector('embed[type="application/pdf"]');
                        if (embedElement) {
                            // Chrome PDF viewer detected - use the page URL
                            return { url: window.location.href };
                        }
                        // Fallback to window.location.href anyway
                        return { url: window.location.href };
                    })()
                    """,
                    'returnByValue': True,
                },
                session_id=temp_session.session_id,
            ),
            timeout=5.0,  # 5 second timeout to prevent hanging
        )
        # `or {}` also covers an explicit null value coming back from the page.
        pdf_info = result.get('result', {}).get('value') or {}

        pdf_url = pdf_info.get('url', '')
        if not pdf_url:
            self.logger.warning(f'[DownloadsWatchdog] ❌ Could not determine PDF URL for download {pdf_info}')
            return None

        # Generate filename from URL. The extension test is case-insensitive so that
        # "REPORT.PDF" is kept as is instead of becoming "REPORT.PDF.pdf".
        pdf_filename = os.path.basename(pdf_url.split('?')[0])  # Remove query params
        if not pdf_filename or not pdf_filename.lower().endswith('.pdf'):
            parsed = urlparse(pdf_url)
            pdf_filename = os.path.basename(parsed.path) or 'document.pdf'
            if not pdf_filename.lower().endswith('.pdf'):
                pdf_filename += '.pdf'

        self.logger.debug(f'[DownloadsWatchdog] Generated filename: {pdf_filename}')

        # Check if already downloaded in this session
        self.logger.debug(f'[DownloadsWatchdog] PDF_URL: {pdf_url}, session_pdf_urls: {self._session_pdf_urls}')
        if pdf_url in self._session_pdf_urls:
            existing_path = self._session_pdf_urls[pdf_url]
            self.logger.debug(f'[DownloadsWatchdog] PDF already downloaded in session: {existing_path}')
            return existing_path

        # Generate unique filename if file exists from previous run, using the same
        # "(1)", "(2)", ... helper as the other download handlers.
        downloads_dir = str(downloads_path)
        os.makedirs(downloads_dir, exist_ok=True)
        final_filename = await self._get_unique_filename(downloads_dir, pdf_filename)
        if final_filename != pdf_filename:
            self.logger.debug(f'[DownloadsWatchdog] File exists, using: {final_filename}')

        self.logger.debug(f'[DownloadsWatchdog] Starting PDF download from: {pdf_url[:100]}...')

        # Download using JavaScript fetch to leverage browser cache
        try:
            # Properly escape the URL to prevent JavaScript injection
            escaped_pdf_url = json.dumps(pdf_url)

            result = await asyncio.wait_for(
                temp_session.cdp_client.send.Runtime.evaluate(
                    params={
                        'expression': f"""
                        (async () => {{
                            try {{
                                // Use fetch with cache: 'force-cache' to prioritize cached version
                                const response = await fetch({escaped_pdf_url}, {{
                                    cache: 'force-cache'
                                }});
                                if (!response.ok) {{
                                    throw new Error(`HTTP error! status: ${{response.status}}`);
                                }}
                                const blob = await response.blob();
                                const arrayBuffer = await blob.arrayBuffer();
                                const uint8Array = new Uint8Array(arrayBuffer);

                                // Check if served from cache
                                const fromCache = response.headers.has('age') ||
                                    !response.headers.has('date');

                                return {{
                                    data: Array.from(uint8Array),
                                    fromCache: fromCache,
                                    responseSize: uint8Array.length,
                                    transferSize: response.headers.get('content-length') || 'unknown'
                                }};
                            }} catch (error) {{
                                throw new Error(`Fetch failed: ${{error.message}}`);
                            }}
                        }})()
                        """,
                        'awaitPromise': True,
                        'returnByValue': True,
                    },
                    session_id=temp_session.session_id,
                ),
                timeout=10.0,  # 10 second timeout for download operation
            )
            download_result = result.get('result', {}).get('value') or {}

            # An empty byte list is falsy, so a zero-length body is treated as "no data".
            if not download_result.get('data'):
                self.logger.warning(f'[DownloadsWatchdog] No data received when downloading PDF from {pdf_url}')
                return None

            download_path = os.path.join(downloads_dir, final_filename)

            # Save the PDF asynchronously
            async with await anyio.open_file(download_path, 'wb') as f:
                await f.write(bytes(download_result['data']))

            # Verify file was written successfully
            if not os.path.exists(download_path):
                self.logger.error(f'[DownloadsWatchdog] ❌ Failed to write PDF file to: {download_path}')
                return None
            actual_size = os.path.getsize(download_path)
            self.logger.debug(f'[DownloadsWatchdog] PDF file written successfully: {download_path} ({actual_size} bytes)')

            # Log cache information
            cache_status = 'from cache' if download_result.get('fromCache') else 'from network'
            response_size = download_result.get('responseSize', 0)
            self.logger.debug(
                f'[DownloadsWatchdog] ✅ Auto-downloaded PDF ({cache_status}, {response_size:,} bytes): {download_path}'
            )

            # Store URL->path mapping for this session
            self._session_pdf_urls[pdf_url] = download_path

            # Emit file downloaded event
            self.logger.debug(f'[DownloadsWatchdog] Dispatching FileDownloadedEvent for {final_filename}')
            self.event_bus.dispatch(
                FileDownloadedEvent(
                    url=pdf_url,
                    path=download_path,
                    file_name=final_filename,
                    file_size=response_size,
                    file_type='pdf',
                    mime_type='application/pdf',
                    from_cache=download_result.get('fromCache', False),
                    auto_download=True,
                )
            )

            # No need to detach - session is cached
            return download_path

        except Exception as e:
            self.logger.warning(f'[DownloadsWatchdog] Failed to auto-download PDF from {pdf_url}: {type(e).__name__}: {e}')
            return None

    except TimeoutError:
        self.logger.debug('[DownloadsWatchdog] PDF download operation timed out')
        return None
    except Exception as e:
        self.logger.error(f'[DownloadsWatchdog] Error in PDF download: {type(e).__name__}: {e}')
        return None
1264
+
1265
+ @staticmethod
1266
+ async def _get_unique_filename(directory: str, filename: str) -> str:
1267
+ """Generate a unique filename for downloads by appending (1), (2), etc., if a file already exists."""
1268
+ base, ext = os.path.splitext(filename)
1269
+ counter = 1
1270
+ new_filename = filename
1271
+ while os.path.exists(os.path.join(directory, new_filename)):
1272
+ new_filename = f'{base} ({counter}){ext}'
1273
+ counter += 1
1274
+ return new_filename
1275
+
1276
+
1277
+ # Fix Pydantic circular dependency - this will be called from session.py after BrowserSession is defined