optexity-browser-use 0.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_use/__init__.py +157 -0
- browser_use/actor/__init__.py +11 -0
- browser_use/actor/element.py +1175 -0
- browser_use/actor/mouse.py +134 -0
- browser_use/actor/page.py +561 -0
- browser_use/actor/playground/flights.py +41 -0
- browser_use/actor/playground/mixed_automation.py +54 -0
- browser_use/actor/playground/playground.py +236 -0
- browser_use/actor/utils.py +176 -0
- browser_use/agent/cloud_events.py +282 -0
- browser_use/agent/gif.py +424 -0
- browser_use/agent/judge.py +170 -0
- browser_use/agent/message_manager/service.py +473 -0
- browser_use/agent/message_manager/utils.py +52 -0
- browser_use/agent/message_manager/views.py +98 -0
- browser_use/agent/prompts.py +413 -0
- browser_use/agent/service.py +2316 -0
- browser_use/agent/system_prompt.md +185 -0
- browser_use/agent/system_prompt_flash.md +10 -0
- browser_use/agent/system_prompt_no_thinking.md +183 -0
- browser_use/agent/views.py +743 -0
- browser_use/browser/__init__.py +41 -0
- browser_use/browser/cloud/cloud.py +203 -0
- browser_use/browser/cloud/views.py +89 -0
- browser_use/browser/events.py +578 -0
- browser_use/browser/profile.py +1158 -0
- browser_use/browser/python_highlights.py +548 -0
- browser_use/browser/session.py +3225 -0
- browser_use/browser/session_manager.py +399 -0
- browser_use/browser/video_recorder.py +162 -0
- browser_use/browser/views.py +200 -0
- browser_use/browser/watchdog_base.py +260 -0
- browser_use/browser/watchdogs/__init__.py +0 -0
- browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
- browser_use/browser/watchdogs/crash_watchdog.py +335 -0
- browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
- browser_use/browser/watchdogs/dom_watchdog.py +817 -0
- browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
- browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
- browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
- browser_use/browser/watchdogs/popups_watchdog.py +143 -0
- browser_use/browser/watchdogs/recording_watchdog.py +126 -0
- browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
- browser_use/browser/watchdogs/security_watchdog.py +280 -0
- browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
- browser_use/cli.py +2359 -0
- browser_use/code_use/__init__.py +16 -0
- browser_use/code_use/formatting.py +192 -0
- browser_use/code_use/namespace.py +665 -0
- browser_use/code_use/notebook_export.py +276 -0
- browser_use/code_use/service.py +1340 -0
- browser_use/code_use/system_prompt.md +574 -0
- browser_use/code_use/utils.py +150 -0
- browser_use/code_use/views.py +171 -0
- browser_use/config.py +505 -0
- browser_use/controller/__init__.py +3 -0
- browser_use/dom/enhanced_snapshot.py +161 -0
- browser_use/dom/markdown_extractor.py +169 -0
- browser_use/dom/playground/extraction.py +312 -0
- browser_use/dom/playground/multi_act.py +32 -0
- browser_use/dom/serializer/clickable_elements.py +200 -0
- browser_use/dom/serializer/code_use_serializer.py +287 -0
- browser_use/dom/serializer/eval_serializer.py +478 -0
- browser_use/dom/serializer/html_serializer.py +212 -0
- browser_use/dom/serializer/paint_order.py +197 -0
- browser_use/dom/serializer/serializer.py +1170 -0
- browser_use/dom/service.py +825 -0
- browser_use/dom/utils.py +129 -0
- browser_use/dom/views.py +906 -0
- browser_use/exceptions.py +5 -0
- browser_use/filesystem/__init__.py +0 -0
- browser_use/filesystem/file_system.py +619 -0
- browser_use/init_cmd.py +376 -0
- browser_use/integrations/gmail/__init__.py +24 -0
- browser_use/integrations/gmail/actions.py +115 -0
- browser_use/integrations/gmail/service.py +225 -0
- browser_use/llm/__init__.py +155 -0
- browser_use/llm/anthropic/chat.py +242 -0
- browser_use/llm/anthropic/serializer.py +312 -0
- browser_use/llm/aws/__init__.py +36 -0
- browser_use/llm/aws/chat_anthropic.py +242 -0
- browser_use/llm/aws/chat_bedrock.py +289 -0
- browser_use/llm/aws/serializer.py +257 -0
- browser_use/llm/azure/chat.py +91 -0
- browser_use/llm/base.py +57 -0
- browser_use/llm/browser_use/__init__.py +3 -0
- browser_use/llm/browser_use/chat.py +201 -0
- browser_use/llm/cerebras/chat.py +193 -0
- browser_use/llm/cerebras/serializer.py +109 -0
- browser_use/llm/deepseek/chat.py +212 -0
- browser_use/llm/deepseek/serializer.py +109 -0
- browser_use/llm/exceptions.py +29 -0
- browser_use/llm/google/__init__.py +3 -0
- browser_use/llm/google/chat.py +542 -0
- browser_use/llm/google/serializer.py +120 -0
- browser_use/llm/groq/chat.py +229 -0
- browser_use/llm/groq/parser.py +158 -0
- browser_use/llm/groq/serializer.py +159 -0
- browser_use/llm/messages.py +238 -0
- browser_use/llm/models.py +271 -0
- browser_use/llm/oci_raw/__init__.py +10 -0
- browser_use/llm/oci_raw/chat.py +443 -0
- browser_use/llm/oci_raw/serializer.py +229 -0
- browser_use/llm/ollama/chat.py +97 -0
- browser_use/llm/ollama/serializer.py +143 -0
- browser_use/llm/openai/chat.py +264 -0
- browser_use/llm/openai/like.py +15 -0
- browser_use/llm/openai/serializer.py +165 -0
- browser_use/llm/openrouter/chat.py +211 -0
- browser_use/llm/openrouter/serializer.py +26 -0
- browser_use/llm/schema.py +176 -0
- browser_use/llm/views.py +48 -0
- browser_use/logging_config.py +330 -0
- browser_use/mcp/__init__.py +18 -0
- browser_use/mcp/__main__.py +12 -0
- browser_use/mcp/client.py +544 -0
- browser_use/mcp/controller.py +264 -0
- browser_use/mcp/server.py +1114 -0
- browser_use/observability.py +204 -0
- browser_use/py.typed +0 -0
- browser_use/sandbox/__init__.py +41 -0
- browser_use/sandbox/sandbox.py +637 -0
- browser_use/sandbox/views.py +132 -0
- browser_use/screenshots/__init__.py +1 -0
- browser_use/screenshots/service.py +52 -0
- browser_use/sync/__init__.py +6 -0
- browser_use/sync/auth.py +357 -0
- browser_use/sync/service.py +161 -0
- browser_use/telemetry/__init__.py +51 -0
- browser_use/telemetry/service.py +112 -0
- browser_use/telemetry/views.py +101 -0
- browser_use/tokens/__init__.py +0 -0
- browser_use/tokens/custom_pricing.py +24 -0
- browser_use/tokens/mappings.py +4 -0
- browser_use/tokens/service.py +580 -0
- browser_use/tokens/views.py +108 -0
- browser_use/tools/registry/service.py +572 -0
- browser_use/tools/registry/views.py +174 -0
- browser_use/tools/service.py +1675 -0
- browser_use/tools/utils.py +82 -0
- browser_use/tools/views.py +100 -0
- browser_use/utils.py +670 -0
- optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
- optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
- optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
- optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
- optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1277 @@
|
|
|
1
|
+
"""Downloads watchdog for monitoring and handling file downloads."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
import anyio
|
|
12
|
+
from bubus import BaseEvent
|
|
13
|
+
from cdp_use.cdp.browser import DownloadProgressEvent, DownloadWillBeginEvent
|
|
14
|
+
from cdp_use.cdp.network import ResponseReceivedEvent
|
|
15
|
+
from cdp_use.cdp.target import SessionID, TargetID
|
|
16
|
+
from pydantic import PrivateAttr
|
|
17
|
+
|
|
18
|
+
from browser_use.browser.events import (
|
|
19
|
+
BrowserLaunchEvent,
|
|
20
|
+
BrowserStateRequestEvent,
|
|
21
|
+
BrowserStoppedEvent,
|
|
22
|
+
FileDownloadedEvent,
|
|
23
|
+
NavigationCompleteEvent,
|
|
24
|
+
TabClosedEvent,
|
|
25
|
+
TabCreatedEvent,
|
|
26
|
+
)
|
|
27
|
+
from browser_use.browser.watchdog_base import BaseWatchdog
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DownloadsWatchdog(BaseWatchdog):
	"""Monitors downloads and handles file download events.

	Listens to browser lifecycle and navigation events, registers CDP
	download/network listeners, and dispatches ``FileDownloadedEvent``
	when a download completes.
	"""

	# Events this watchdog listens to (for documentation)
	LISTENS_TO: ClassVar[list[type[BaseEvent[Any]]]] = [
		BrowserLaunchEvent,
		BrowserStateRequestEvent,
		BrowserStoppedEvent,
		TabCreatedEvent,
		TabClosedEvent,
		NavigationCompleteEvent,
	]

	# Events this watchdog emits
	EMITS: ClassVar[list[type[BaseEvent[Any]]]] = [
		FileDownloadedEvent,
	]

	# Private state (pydantic PrivateAttr: excluded from the model's public fields)
	_sessions_with_listeners: set[str] = PrivateAttr(default_factory=set)  # Track sessions that already have download listeners
	_active_downloads: dict[str, Any] = PrivateAttr(default_factory=dict)  # Active-download bookkeeping (entries managed elsewhere in this class)
	_pdf_viewer_cache: dict[str, bool] = PrivateAttr(default_factory=dict)  # Cache PDF viewer status by target URL
	_download_cdp_session_setup: bool = PrivateAttr(default=False)  # Track if CDP session is set up
	_download_cdp_session: Any = PrivateAttr(default=None)  # Store CDP session reference
	_cdp_event_tasks: set[asyncio.Task] = PrivateAttr(default_factory=set)  # Track CDP event handler tasks so they can be cancelled on shutdown
	_cdp_downloads_info: dict[str, dict[str, Any]] = PrivateAttr(default_factory=dict)  # Map download guid -> {url, suggested_filename, handled}
	_use_js_fetch_for_local: bool = PrivateAttr(default=False)  # Guard JS fetch path for local regular downloads
	_session_pdf_urls: dict[str, str] = PrivateAttr(default_factory=dict)  # URL -> path for PDFs downloaded this session
	_network_monitored_targets: set[str] = PrivateAttr(default_factory=set)  # Track targets with network monitoring enabled
	_detected_downloads: set[str] = PrivateAttr(default_factory=set)  # Track detected download URLs to avoid duplicates
	_network_callback_registered: bool = PrivateAttr(default=False)  # Track if global network callback is registered
|
|
64
|
+
|
|
65
|
+
async def on_BrowserLaunchEvent(self, event: BrowserLaunchEvent) -> None:
	"""Ensure the configured downloads directory exists once the browser launches."""
	self.logger.debug(f'[DownloadsWatchdog] Received BrowserLaunchEvent, EventBus ID: {id(self.event_bus)}')

	configured_path = self.browser_session.browser_profile.downloads_path
	if not configured_path:
		return

	# Expand ~ and resolve to an absolute path before creating the directory tree.
	expanded_path = Path(configured_path).expanduser().resolve()
	expanded_path.mkdir(parents=True, exist_ok=True)
	self.logger.debug(f'[DownloadsWatchdog] Ensured downloads directory exists: {expanded_path}')
|
|
73
|
+
|
|
74
|
+
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
	"""Begin download monitoring for a newly created tab."""
	# Downloads path should always be populated by the BrowserProfile default.
	assert self.browser_session.browser_profile.downloads_path is not None, 'Downloads path must be configured'

	if not event.target_id:
		self.logger.warning(f'[DownloadsWatchdog] No target found for tab {event.target_id}')
		return

	await self.attach_to_target(event.target_id)
|
|
86
|
+
|
|
87
|
+
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
	"""Handle a tab being closed.

	Intentionally a no-op: target lifecycle cleanup is handled by the
	browser context, so nothing needs to be torn down here.
	"""
	return None
|
|
90
|
+
|
|
91
|
+
async def on_BrowserStateRequestEvent(self, event: BrowserStateRequestEvent) -> None:
	"""Re-emit a NavigationCompleteEvent for the focused page on state requests.

	This lets PDF auto-detection re-run against the current page whenever
	the browser state is queried.
	"""
	focused_session = self.browser_session.agent_focus
	if not focused_session:
		return

	current_url = await self.browser_session.get_current_page_url()
	if not current_url:
		return

	self.event_bus.dispatch(
		NavigationCompleteEvent(
			event_type='NavigationCompleteEvent',
			url=current_url,
			target_id=focused_session.target_id,
			event_parent_id=event.event_id,
		)
	)
|
|
110
|
+
|
|
111
|
+
async def on_BrowserStoppedEvent(self, event: BrowserStoppedEvent) -> None:
	"""Tear down all download-monitoring state when the browser stops."""
	# Cancel every CDP event handler task that is still running, then wait
	# for all of them to finish cancelling before dropping the references.
	for pending_task in [t for t in self._cdp_event_tasks if not t.done()]:
		pending_task.cancel()
	if self._cdp_event_tasks:
		await asyncio.gather(*self._cdp_event_tasks, return_exceptions=True)
		self._cdp_event_tasks.clear()

	# Drop the cached CDP session reference; sessions themselves are cached
	# and managed by BrowserSession.
	self._download_cdp_session = None
	self._download_cdp_session_setup = False

	# Reset all per-session bookkeeping containers.
	for container in (
		self._sessions_with_listeners,
		self._active_downloads,
		self._pdf_viewer_cache,
		self._session_pdf_urls,
		self._network_monitored_targets,
		self._detected_downloads,
	):
		container.clear()
	self._network_callback_registered = False
|
|
135
|
+
|
|
136
|
+
async def on_NavigationCompleteEvent(self, event: NavigationCompleteEvent) -> None:
	"""Check for PDFs after navigation completes and auto-download them if enabled.

	Invalidates the cached PDF-viewer status for the navigated URL (the page
	content may have changed), then — only when the profile enables
	auto-download of PDFs — probes the target for Chrome's built-in PDF
	viewer and triggers a download.
	"""
	self.logger.debug(f'[DownloadsWatchdog] NavigationCompleteEvent received for {event.url}, tab #{event.target_id[-4:]}')

	# Clear the cached PDF-viewer status for this URL. pop() with a default
	# replaces the original membership-test-then-del double lookup.
	self._pdf_viewer_cache.pop(event.url, None)

	# Nothing more to do unless the profile opts in to auto-downloading PDFs.
	if not self._is_auto_download_enabled():
		return

	# Note: Using network-based PDF detection that doesn't require JavaScript
	target_id = event.target_id
	self.logger.debug(f'[DownloadsWatchdog] Got target_id={target_id} for tab #{event.target_id[-4:]}')

	if await self.check_for_pdf_viewer(target_id):
		self.logger.debug(f'[DownloadsWatchdog] 📄 PDF detected at {event.url}, triggering auto-download...')
		download_path = await self.trigger_pdf_download(target_id)
		if not download_path:
			self.logger.warning(f'[DownloadsWatchdog] ⚠️ PDF download failed for {event.url}')
|
|
160
|
+
|
|
161
|
+
def _is_auto_download_enabled(self) -> bool:
	"""Return True when the browser profile enables automatic PDF downloads."""
	profile = self.browser_session.browser_profile
	return profile.auto_download_pdfs
|
|
164
|
+
|
|
165
|
+
async def attach_to_target(self, target_id: TargetID) -> None:
	"""Set up download monitoring for a specific target.

	Registers browser-level CDP download listeners (only once per browser
	session), configures Browser.setDownloadBehavior to save files into the
	profile's downloads_path, and finally enables per-target network
	monitoring via _setup_network_monitoring.

	Args:
		target_id: CDP target to attach monitoring for.
	"""

	# Define CDP event handlers outside of try to avoid indentation/scope issues
	def download_will_begin_handler(event: DownloadWillBeginEvent, session_id: SessionID | None) -> None:
		# Fired by CDP when a download starts; caches metadata and spawns the
		# async handler task.
		self.logger.debug(f'[DownloadsWatchdog] Download will begin: {event}')
		# Cache info for later completion event handling (esp. remote browsers)
		guid = event.get('guid', '')
		try:
			suggested_filename = event.get('suggestedFilename')
			assert suggested_filename, 'CDP DownloadWillBegin missing suggestedFilename'
			self._cdp_downloads_info[guid] = {
				'url': event.get('url', ''),
				'suggested_filename': suggested_filename,
				'handled': False,
			}
		except (AssertionError, KeyError):
			# Best-effort cache: missing metadata just means the progress
			# handler falls back to defaults.
			pass
		# Create and track the task so it can be cancelled on BrowserStoppedEvent
		task = asyncio.create_task(self._handle_cdp_download(event, target_id, session_id))
		self._cdp_event_tasks.add(task)
		# Remove from set when done
		task.add_done_callback(lambda t: self._cdp_event_tasks.discard(t))

	def download_progress_handler(event: DownloadProgressEvent, session_id: SessionID | None) -> None:
		# Fired repeatedly by CDP during a download; only the terminal
		# 'completed' state is acted upon here.
		if event.get('state') == 'completed':
			file_path = event.get('filePath')
			guid = event.get('guid', '')
			if self.browser_session.is_local:
				if file_path:
					self.logger.debug(f'[DownloadsWatchdog] Download completed: {file_path}')
					# Track the download
					self._track_download(file_path)
					# Mark as handled to prevent fallback duplicate dispatch
					try:
						if guid in self._cdp_downloads_info:
							self._cdp_downloads_info[guid]['handled'] = True
					except (KeyError, AttributeError):
						pass
				else:
					# No local file path provided, local polling in _handle_cdp_download will handle it
					self.logger.debug(
						'[DownloadsWatchdog] No filePath in progress event (local); polling will handle detection'
					)
			else:
				# Remote browser: do not touch local filesystem. Fallback to downloadPath+suggestedFilename
				info = self._cdp_downloads_info.get(guid, {})
				try:
					suggested_filename = info.get('suggested_filename') or (Path(file_path).name if file_path else 'download')
					downloads_path = str(self.browser_session.browser_profile.downloads_path or '')
					effective_path = file_path or str(Path(downloads_path) / suggested_filename)
					file_name = Path(effective_path).name
					file_ext = Path(file_name).suffix.lower().lstrip('.')
					# file_size is 0 because the remote file is never inspected locally.
					self.event_bus.dispatch(
						FileDownloadedEvent(
							url=info.get('url', ''),
							path=str(effective_path),
							file_name=file_name,
							file_size=0,
							file_type=file_ext if file_ext else None,
						)
					)
					self.logger.debug(f'[DownloadsWatchdog] ✅ (remote) Download completed: {effective_path}')
				finally:
					# Always drop the cached entry for this guid, even if dispatch raised.
					if guid in self._cdp_downloads_info:
						del self._cdp_downloads_info[guid]

	try:
		downloads_path_raw = self.browser_session.browser_profile.downloads_path
		if not downloads_path_raw:
			return  # No downloads path configured

		# Check if we already have a download listener on this session
		# to prevent duplicate listeners from being added
		# Note: Since download listeners are set up once per browser session, not per target,
		# we just track if we've set up the browser-level listener
		if self._download_cdp_session_setup:
			self.logger.debug('[DownloadsWatchdog] Download listener already set up for browser session')
			return

		# Use CDP session for download events but store reference in watchdog
		# NOTE(review): this inner check is always True here — the early return
		# above already fires when _download_cdp_session_setup is set. Kept as-is.
		if not self._download_cdp_session_setup:
			# Set up CDP session for downloads (only once per browser session)
			cdp_client = self.browser_session.cdp_client

			# Set download behavior to allow downloads and enable events
			downloads_path = self.browser_session.browser_profile.downloads_path
			if not downloads_path:
				self.logger.warning('[DownloadsWatchdog] No downloads path configured, skipping CDP download setup')
				return
			# Ensure path is properly expanded (~ -> absolute path)
			expanded_downloads_path = Path(downloads_path).expanduser().resolve()
			await cdp_client.send.Browser.setDownloadBehavior(
				params={
					'behavior': 'allow',
					'downloadPath': str(expanded_downloads_path),  # Use expanded absolute path
					'eventsEnabled': True,
				}
			)

			# Register the handlers with CDP (browser-level, not per-target)
			cdp_client.register.Browser.downloadWillBegin(download_will_begin_handler)  # type: ignore[arg-type]
			cdp_client.register.Browser.downloadProgress(download_progress_handler)  # type: ignore[arg-type]

			self._download_cdp_session_setup = True
			self.logger.debug('[DownloadsWatchdog] Set up CDP download listeners')

		# No need to track individual targets since download listener is browser-level

	except Exception as e:
		# Best-effort setup: network monitoring below still proceeds.
		self.logger.warning(f'[DownloadsWatchdog] Failed to set up CDP download listener for target {target_id}: {e}')

	# Set up network monitoring for this target (catches ALL download variants)
	await self._setup_network_monitoring(target_id)
|
|
284
|
+
|
|
285
|
+
async def _setup_network_monitoring(self, target_id: TargetID) -> None:
	"""Set up network monitoring to detect PDFs and downloads from ALL sources.

	This catches:
	- Direct PDF navigation
	- PDFs in iframes
	- PDFs with embed/object tags
	- JavaScript-triggered downloads
	- Any Content-Disposition: attachment headers

	The Network.responseReceived callback is registered once globally; each
	target then only needs Network.enable on its own CDP session.
	"""
	# Skip if already monitoring this target
	if target_id in self._network_monitored_targets:
		self.logger.debug(f'[DownloadsWatchdog] Network monitoring already enabled for target {target_id[-4:]}')
		return

	# Check if auto-download is enabled
	if not self._is_auto_download_enabled():
		self.logger.debug('[DownloadsWatchdog] Auto-download disabled, skipping network monitoring')
		return

	try:
		cdp_client = self.browser_session.cdp_client

		# Register the global callback once
		if not self._network_callback_registered:

			def on_response_received(event: ResponseReceivedEvent, session_id: str | None) -> None:
				"""Handle Network.responseReceived event to detect downloadable content.

				This callback is registered globally and uses session_id to determine the correct target.
				"""
				try:
					# Look up target_id from session_id
					event_target_id = self.browser_session.get_target_id_from_session_id(session_id)
					if not event_target_id:
						# Session not in pool - might be a stale session or not yet tracked
						return

					# Only process events for targets we're monitoring
					if event_target_id not in self._network_monitored_targets:
						return

					response = event.get('response', {})
					url = response.get('url', '')
					content_type = response.get('mimeType', '').lower()
					headers = response.get('headers', {})

					# Skip non-HTTP URLs (data:, about:, chrome-extension:, etc.)
					if not url.startswith('http'):
						return

					# Check if it's a PDF
					is_pdf = 'application/pdf' in content_type

					# Check if it's marked as download via Content-Disposition header
					content_disposition = headers.get('content-disposition', '').lower()
					is_download_attachment = 'attachment' in content_disposition

					# Filter out image/video/audio files even if marked as attachment
					# These are likely resources, not intentional downloads
					unwanted_content_types = [
						'image/',
						'video/',
						'audio/',
						'text/css',
						'text/javascript',
						'application/javascript',
						'application/x-javascript',
						'text/html',
						'application/json',
						'font/',
						'application/font',
						'application/x-font',
					]
					is_unwanted_type = any(content_type.startswith(prefix) for prefix in unwanted_content_types)
					if is_unwanted_type:
						return

					# Check URL extension to filter out obvious images/resources
					url_lower = url.lower().split('?')[0]  # Remove query params
					unwanted_extensions = [
						'.jpg',
						'.jpeg',
						'.png',
						'.gif',
						'.webp',
						'.svg',
						'.ico',
						'.css',
						'.js',
						'.woff',
						'.woff2',
						'.ttf',
						'.eot',
						'.mp4',
						'.webm',
						'.mp3',
						'.wav',
						'.ogg',
					]
					if any(url_lower.endswith(ext) for ext in unwanted_extensions):
						return

					# Only process if it's a PDF or download
					if not (is_pdf or is_download_attachment):
						return

					# Check if we've already processed this URL in this session
					if url in self._detected_downloads:
						self.logger.debug(f'[DownloadsWatchdog] Already detected download: {url[:80]}...')
						return

					# Mark as detected to avoid duplicates
					self._detected_downloads.add(url)

					# Extract filename from Content-Disposition if available
					suggested_filename = None
					if 'filename=' in content_disposition:
						# Parse filename from Content-Disposition header
						import re

						filename_match = re.search(r'filename[^;=\n]*=(([\'"]).*?\2|[^;\n]*)', content_disposition)
						if filename_match:
							suggested_filename = filename_match.group(1).strip('\'"')

					self.logger.info(f'[DownloadsWatchdog] 🔍 Detected downloadable content via network: {url[:80]}...')
					self.logger.debug(
						f'[DownloadsWatchdog] Content-Type: {content_type}, Is PDF: {is_pdf}, Is Attachment: {is_download_attachment}'
					)

					# Trigger download asynchronously in background (don't block event handler)
					async def download_in_background():
						try:
							download_path = await self.download_file_from_url(
								url=url,
								target_id=event_target_id,  # Use target_id from session_id lookup
								content_type=content_type,
								suggested_filename=suggested_filename,
							)

							if download_path:
								self.logger.info(f'[DownloadsWatchdog] ✅ Successfully downloaded: {download_path}')
							else:
								self.logger.warning(f'[DownloadsWatchdog] ⚠️ Failed to download: {url[:80]}...')
						except Exception as e:
							self.logger.error(f'[DownloadsWatchdog] Error downloading in background: {type(e).__name__}: {e}')

					# Create background task, tracked so BrowserStoppedEvent can cancel it
					task = asyncio.create_task(download_in_background())
					self._cdp_event_tasks.add(task)
					task.add_done_callback(lambda t: self._cdp_event_tasks.discard(t))

				except Exception as e:
					# Never let the CDP event dispatcher see an exception from this handler.
					self.logger.error(f'[DownloadsWatchdog] Error in network response handler: {type(e).__name__}: {e}')

			# Register the callback globally (once)
			cdp_client.register.Network.responseReceived(on_response_received)
			self._network_callback_registered = True
			self.logger.debug('[DownloadsWatchdog] ✅ Registered global network response callback')

		# Get or create CDP session for this target
		cdp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)

		# Enable Network domain to monitor HTTP responses (per-target/per-session)
		await cdp_client.send.Network.enable(session_id=cdp_session.session_id)
		self.logger.debug(f'[DownloadsWatchdog] Enabled Network domain for target {target_id[-4:]}')

		# Mark this target as monitored
		self._network_monitored_targets.add(target_id)
		self.logger.debug(f'[DownloadsWatchdog] ✅ Network monitoring enabled for target {target_id[-4:]}')

	except Exception as e:
		self.logger.warning(f'[DownloadsWatchdog] Failed to set up network monitoring for target {target_id}: {e}')
|
|
458
|
+
|
|
459
|
+
async def download_file_from_url(
|
|
460
|
+
self, url: str, target_id: TargetID, content_type: str | None = None, suggested_filename: str | None = None
|
|
461
|
+
) -> str | None:
|
|
462
|
+
"""Generic method to download any file from a URL.
|
|
463
|
+
|
|
464
|
+
Args:
|
|
465
|
+
url: The URL to download
|
|
466
|
+
target_id: The target ID for CDP session
|
|
467
|
+
content_type: Optional content type (e.g., 'application/pdf')
|
|
468
|
+
suggested_filename: Optional filename from Content-Disposition header
|
|
469
|
+
|
|
470
|
+
Returns:
|
|
471
|
+
Path to downloaded file, or None if download failed
|
|
472
|
+
"""
|
|
473
|
+
if not self.browser_session.browser_profile.downloads_path:
|
|
474
|
+
self.logger.warning('[DownloadsWatchdog] No downloads path configured')
|
|
475
|
+
return None
|
|
476
|
+
|
|
477
|
+
# Check if already downloaded in this session
|
|
478
|
+
if url in self._session_pdf_urls:
|
|
479
|
+
existing_path = self._session_pdf_urls[url]
|
|
480
|
+
self.logger.debug(f'[DownloadsWatchdog] File already downloaded in session: {existing_path}')
|
|
481
|
+
return existing_path
|
|
482
|
+
|
|
483
|
+
try:
|
|
484
|
+
# Get or create CDP session for this target
|
|
485
|
+
temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
|
|
486
|
+
|
|
487
|
+
# Determine filename
|
|
488
|
+
if suggested_filename:
|
|
489
|
+
filename = suggested_filename
|
|
490
|
+
else:
|
|
491
|
+
# Extract from URL
|
|
492
|
+
filename = os.path.basename(url.split('?')[0]) # Remove query params
|
|
493
|
+
if not filename or '.' not in filename:
|
|
494
|
+
# Fallback: use content type to determine extension
|
|
495
|
+
if content_type and 'pdf' in content_type:
|
|
496
|
+
filename = 'document.pdf'
|
|
497
|
+
else:
|
|
498
|
+
filename = 'download'
|
|
499
|
+
|
|
500
|
+
# Ensure downloads directory exists
|
|
501
|
+
downloads_dir = str(self.browser_session.browser_profile.downloads_path)
|
|
502
|
+
os.makedirs(downloads_dir, exist_ok=True)
|
|
503
|
+
|
|
504
|
+
# Generate unique filename if file exists
|
|
505
|
+
final_filename = filename
|
|
506
|
+
existing_files = os.listdir(downloads_dir)
|
|
507
|
+
if filename in existing_files:
|
|
508
|
+
base, ext = os.path.splitext(filename)
|
|
509
|
+
counter = 1
|
|
510
|
+
while f'{base} ({counter}){ext}' in existing_files:
|
|
511
|
+
counter += 1
|
|
512
|
+
final_filename = f'{base} ({counter}){ext}'
|
|
513
|
+
self.logger.debug(f'[DownloadsWatchdog] File exists, using: {final_filename}')
|
|
514
|
+
|
|
515
|
+
self.logger.debug(f'[DownloadsWatchdog] Downloading from: {url[:100]}...')
|
|
516
|
+
|
|
517
|
+
# Download using JavaScript fetch to leverage browser cache
|
|
518
|
+
escaped_url = json.dumps(url)
|
|
519
|
+
|
|
520
|
+
result = await asyncio.wait_for(
|
|
521
|
+
temp_session.cdp_client.send.Runtime.evaluate(
|
|
522
|
+
params={
|
|
523
|
+
'expression': f"""
|
|
524
|
+
(async () => {{
|
|
525
|
+
try {{
|
|
526
|
+
const response = await fetch({escaped_url}, {{
|
|
527
|
+
cache: 'force-cache'
|
|
528
|
+
}});
|
|
529
|
+
if (!response.ok) {{
|
|
530
|
+
throw new Error(`HTTP error! status: ${{response.status}}`);
|
|
531
|
+
}}
|
|
532
|
+
const blob = await response.blob();
|
|
533
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
534
|
+
const uint8Array = new Uint8Array(arrayBuffer);
|
|
535
|
+
|
|
536
|
+
return {{
|
|
537
|
+
data: Array.from(uint8Array),
|
|
538
|
+
responseSize: uint8Array.length
|
|
539
|
+
}};
|
|
540
|
+
}} catch (error) {{
|
|
541
|
+
throw new Error(`Fetch failed: ${{error.message}}`);
|
|
542
|
+
}}
|
|
543
|
+
}})()
|
|
544
|
+
""",
|
|
545
|
+
'awaitPromise': True,
|
|
546
|
+
'returnByValue': True,
|
|
547
|
+
},
|
|
548
|
+
session_id=temp_session.session_id,
|
|
549
|
+
),
|
|
550
|
+
timeout=15.0, # 15 second timeout
|
|
551
|
+
)
|
|
552
|
+
|
|
553
|
+
download_result = result.get('result', {}).get('value', {})
|
|
554
|
+
|
|
555
|
+
if download_result and download_result.get('data') and len(download_result['data']) > 0:
|
|
556
|
+
download_path = os.path.join(downloads_dir, final_filename)
|
|
557
|
+
|
|
558
|
+
# Save the file asynchronously
|
|
559
|
+
async with await anyio.open_file(download_path, 'wb') as f:
|
|
560
|
+
await f.write(bytes(download_result['data']))
|
|
561
|
+
|
|
562
|
+
# Verify file was written successfully
|
|
563
|
+
if os.path.exists(download_path):
|
|
564
|
+
actual_size = os.path.getsize(download_path)
|
|
565
|
+
self.logger.debug(f'[DownloadsWatchdog] File written: {download_path} ({actual_size} bytes)')
|
|
566
|
+
|
|
567
|
+
# Determine file type
|
|
568
|
+
file_ext = Path(final_filename).suffix.lower().lstrip('.')
|
|
569
|
+
mime_type = content_type or f'application/{file_ext}'
|
|
570
|
+
|
|
571
|
+
# Store URL->path mapping for this session
|
|
572
|
+
self._session_pdf_urls[url] = download_path
|
|
573
|
+
|
|
574
|
+
# Emit file downloaded event
|
|
575
|
+
self.logger.debug(f'[DownloadsWatchdog] Dispatching FileDownloadedEvent for {final_filename}')
|
|
576
|
+
self.event_bus.dispatch(
|
|
577
|
+
FileDownloadedEvent(
|
|
578
|
+
url=url,
|
|
579
|
+
path=download_path,
|
|
580
|
+
file_name=final_filename,
|
|
581
|
+
file_size=actual_size,
|
|
582
|
+
file_type=file_ext if file_ext else None,
|
|
583
|
+
mime_type=mime_type,
|
|
584
|
+
auto_download=True,
|
|
585
|
+
)
|
|
586
|
+
)
|
|
587
|
+
|
|
588
|
+
return download_path
|
|
589
|
+
else:
|
|
590
|
+
self.logger.error(f'[DownloadsWatchdog] Failed to write file: {download_path}')
|
|
591
|
+
return None
|
|
592
|
+
else:
|
|
593
|
+
self.logger.warning(f'[DownloadsWatchdog] No data received when downloading from {url}')
|
|
594
|
+
return None
|
|
595
|
+
|
|
596
|
+
except TimeoutError:
|
|
597
|
+
self.logger.warning(f'[DownloadsWatchdog] Download timed out: {url[:80]}...')
|
|
598
|
+
return None
|
|
599
|
+
except Exception as e:
|
|
600
|
+
self.logger.warning(f'[DownloadsWatchdog] Download failed: {type(e).__name__}: {e}')
|
|
601
|
+
return None
|
|
602
|
+
|
|
603
|
+
def _track_download(self, file_path: str) -> None:
	"""Track a completed download and dispatch a FileDownloadedEvent for it.

	Used for downloads observed only as files on disk (no originating request
	available), so the event's ``url`` field carries the local file path.

	Args:
		file_path: Path to the downloaded file on the local filesystem.
	"""
	try:
		path = Path(file_path)
		if path.exists():
			file_size = path.stat().st_size
			self.logger.debug(f'[DownloadsWatchdog] Tracked download: {path.name} ({file_size} bytes)')

			# Dispatch download event.
			# NOTE: FileDownloadedEvent is imported at module level (sibling
			# handlers use it directly); no local import needed here.
			self.event_bus.dispatch(
				FileDownloadedEvent(
					url=str(path),  # Use the file path as URL for local files
					path=str(path),
					file_name=path.name,
					file_size=file_size,
				)
			)
		else:
			self.logger.warning(f'[DownloadsWatchdog] Downloaded file not found: {file_path}')
	except Exception as e:
		# Best-effort tracking: never let bookkeeping failures propagate.
		self.logger.error(f'[DownloadsWatchdog] Error tracking download: {e}')
|
|
631
|
+
|
|
632
|
+
async def _handle_cdp_download(
	self, event: DownloadWillBeginEvent, target_id: TargetID, session_id: SessionID | None
) -> None:
	"""Handle a CDP Page.downloadWillBegin event.

	Strategy (in order):
	1. For local browsers with the JS-fetch fallback enabled, fetch the file
	   bytes via Runtime.evaluate in the originating frame and write them to
	   the downloads directory, dispatching FileDownloadedEvent on success.
	2. Otherwise — or if the fetch path fails — fall through to polling the
	   downloads directory for a new file written by the browser's native
	   download behavior (Browser.setDownloadBehavior is already configured).

	Duplicate dispatch across the fetch/progress/polling paths is prevented
	via a 'handled' flag keyed by the download's guid in self._cdp_downloads_info.

	Args:
		event: Raw CDP Page.downloadWillBegin payload (url, suggestedFilename, guid, frameId).
		target_id: CDP target the download originated from.
		session_id: CDP session id for the target, if known (not read by this handler).
	"""
	downloads_dir = (
		Path(
			self.browser_session.browser_profile.downloads_path
			or f'{tempfile.gettempdir()}/browser_use_downloads.{str(self.browser_session.id)[-4:]}'
		)
		.expanduser()
		.resolve()
	)  # Ensure path is properly expanded

	# Initialize variables that may be used outside try blocks
	unique_filename = None
	file_size = 0
	expected_path = None
	download_result = None
	download_url = event.get('url', '')
	suggested_filename = event.get('suggestedFilename', 'download')
	guid = event.get('guid', '')

	try:
		self.logger.debug(f'[DownloadsWatchdog] ⬇️ File download starting: {suggested_filename} from {download_url[:100]}...')
		self.logger.debug(f'[DownloadsWatchdog] Full CDP event: {event}')

		# Since Browser.setDownloadBehavior is already configured, the browser will download the file
		# We just need to wait for it to appear in the downloads directory
		expected_path = downloads_dir / suggested_filename

		# Debug: List current directory contents
		self.logger.debug(f'[DownloadsWatchdog] Downloads directory: {downloads_dir}')
		if downloads_dir.exists():
			files_before = list(downloads_dir.iterdir())
			self.logger.debug(f'[DownloadsWatchdog] Files before download: {[f.name for f in files_before]}')

		# Try manual JavaScript fetch as a fallback for local browsers (disabled for regular local downloads)
		if self.browser_session.is_local and self._use_js_fetch_for_local:
			self.logger.debug(f'[DownloadsWatchdog] Attempting JS fetch fallback for {download_url}')

			unique_filename = None
			file_size = None
			download_result = None
			try:
				# Escape the URL for JavaScript (json.dumps prevents JS injection via the URL)
				import json

				escaped_url = json.dumps(download_url)

				# Get the proper session for the frame that initiated the download
				cdp_session = await self.browser_session.cdp_client_for_frame(event.get('frameId'))
				assert cdp_session

				# Fetch the file's bytes inside the page and return them as a JS array.
				result = await cdp_session.cdp_client.send.Runtime.evaluate(
					params={
						'expression': f"""
							(async () => {{
								try {{
									const response = await fetch({escaped_url});
									if (!response.ok) {{
										throw new Error(`HTTP error! status: ${{response.status}}`);
									}}
									const blob = await response.blob();
									const arrayBuffer = await blob.arrayBuffer();
									const uint8Array = new Uint8Array(arrayBuffer);
									return {{
										data: Array.from(uint8Array),
										size: uint8Array.length,
										contentType: response.headers.get('content-type') || 'application/octet-stream'
									}};
								}} catch (error) {{
									throw new Error(`Fetch failed: ${{error.message}}`);
								}}
							}})()
						""",
						'awaitPromise': True,
						'returnByValue': True,
					},
					session_id=cdp_session.session_id,
				)
				download_result = result.get('result', {}).get('value')

				if download_result and download_result.get('data'):
					# Save the file
					file_data = bytes(download_result['data'])
					file_size = len(file_data)

					# Ensure unique filename
					unique_filename = await self._get_unique_filename(str(downloads_dir), suggested_filename)
					final_path = downloads_dir / unique_filename

					# Write the file
					import anyio

					async with await anyio.open_file(final_path, 'wb') as f:
						await f.write(file_data)

					self.logger.debug(f'[DownloadsWatchdog] ✅ Downloaded and saved file: {final_path} ({file_size} bytes)')
					expected_path = final_path
					# Emit download event immediately
					file_ext = expected_path.suffix.lower().lstrip('.')
					file_type = file_ext if file_ext else None
					self.event_bus.dispatch(
						FileDownloadedEvent(
							url=download_url,
							path=str(expected_path),
							file_name=unique_filename or expected_path.name,
							file_size=file_size or 0,
							file_type=file_type,
							mime_type=(download_result.get('contentType') if download_result else None),
							from_cache=False,
							auto_download=False,
						)
					)
					# Mark as handled to prevent duplicate dispatch from progress/polling paths
					try:
						if guid in self._cdp_downloads_info:
							self._cdp_downloads_info[guid]['handled'] = True
					except (KeyError, AttributeError):
						pass
					self.logger.debug(
						f'[DownloadsWatchdog] ✅ File download completed via CDP: {suggested_filename} ({file_size} bytes) saved to {expected_path}'
					)
					return
				else:
					self.logger.error('[DownloadsWatchdog] ❌ No data received from fetch')

			except Exception as fetch_error:
				# Fetch path failed; fall through to native-download polling below.
				self.logger.error(f'[DownloadsWatchdog] ❌ Failed to download file via fetch: {fetch_error}')

		# For remote browsers, don't poll local filesystem; downloadProgress handler will emit the event
		if not self.browser_session.is_local:
			return
	except Exception as e:
		self.logger.error(f'[DownloadsWatchdog] ❌ Error handling CDP download: {type(e).__name__} {e}')

	# If we reach here, the fetch method failed, so wait for native download
	# Poll the downloads directory for new files
	self.logger.debug(f'[DownloadsWatchdog] Checking if browser auto-download saved the file for us: {suggested_filename}')

	# Get initial list of files in downloads directory (baseline to detect new arrivals)
	initial_files = set()
	if Path(downloads_dir).exists():
		for f in Path(downloads_dir).iterdir():
			if f.is_file() and not f.name.startswith('.'):
				initial_files.add(f.name)

	# Poll for new files
	max_wait = 20  # seconds
	start_time = asyncio.get_event_loop().time()

	while asyncio.get_event_loop().time() - start_time < max_wait:
		await asyncio.sleep(5.0)  # Check every 5 seconds

		if Path(downloads_dir).exists():
			for file_path in Path(downloads_dir).iterdir():
				# Skip hidden files and files that were already there
				if file_path.is_file() and not file_path.name.startswith('.') and file_path.name not in initial_files:
					# Check if file has content (> 4 bytes)
					try:
						file_size = file_path.stat().st_size
						if file_size > 4:
							# Found a new download!
							self.logger.debug(
								f'[DownloadsWatchdog] ✅ Found downloaded file: {file_path} ({file_size} bytes)'
							)

							# Determine file type from extension
							file_ext = file_path.suffix.lower().lstrip('.')
							file_type = file_ext if file_ext else None

							# Dispatch download event
							# Skip if already handled by progress/JS fetch
							info = self._cdp_downloads_info.get(guid, {})
							if info.get('handled'):
								return
							self.event_bus.dispatch(
								FileDownloadedEvent(
									url=download_url,
									path=str(file_path),
									file_name=file_path.name,
									file_size=file_size,
									file_type=file_type,
								)
							)
							# Mark as handled after dispatch
							try:
								if guid in self._cdp_downloads_info:
									self._cdp_downloads_info[guid]['handled'] = True
							except (KeyError, AttributeError):
								pass
							return
					except Exception as e:
						self.logger.debug(f'[DownloadsWatchdog] Error checking file {file_path}: {e}')

	self.logger.warning(f'[DownloadsWatchdog] Download did not complete within {max_wait} seconds')
|
|
828
|
+
|
|
829
|
+
async def _handle_download(self, download: Any) -> None:
	"""Handle a Playwright download event.

	Saves the download into the profile's downloads directory (unless a file
	with the suggested name already exists there with content, in which case
	that file is reused), then dispatches a FileDownloadedEvent.

	Args:
		download: Playwright Download object — this handler reads .url and
			.suggested_filename and calls .failure() and .save_as().
			Typed Any because Playwright types are not imported here.
	"""
	download_id = f'{id(download)}'
	self._active_downloads[download_id] = download
	self.logger.debug(f'[DownloadsWatchdog] ⬇️ Handling download: {download.suggested_filename} from {download.url[:100]}...')

	# Debug: Check if download is already being handled elsewhere
	failure = (
		await download.failure()
	)  # TODO: it always fails for some reason, figure out why connect_over_cdp makes accept_downloads not work
	self.logger.warning(f'[DownloadsWatchdog] ❌ Download state - canceled: {failure}, url: {download.url}')
	# logger.info(f'[DownloadsWatchdog] Active downloads count: {len(self._active_downloads)}')

	try:
		# current_step tracks progress so the except block can report where we failed.
		current_step = 'getting_download_info'
		# Get download info immediately
		url = download.url
		suggested_filename = download.suggested_filename

		current_step = 'determining_download_directory'
		# Determine download directory from browser profile
		downloads_dir = self.browser_session.browser_profile.downloads_path
		if not downloads_dir:
			downloads_dir = str(Path.home() / 'Downloads')
		else:
			downloads_dir = str(downloads_dir)  # Ensure it's a string

		# Check if Playwright already auto-downloaded the file (due to CDP setup)
		original_path = Path(downloads_dir) / suggested_filename
		if original_path.exists() and original_path.stat().st_size > 0:
			self.logger.debug(
				f'[DownloadsWatchdog] File already downloaded by Playwright: {original_path} ({original_path.stat().st_size} bytes)'
			)

			# Use the existing file instead of creating a duplicate
			download_path = original_path
			file_size = original_path.stat().st_size
			unique_filename = suggested_filename
		else:
			current_step = 'generating_unique_filename'
			# Ensure unique filename
			unique_filename = await self._get_unique_filename(downloads_dir, suggested_filename)
			download_path = Path(downloads_dir) / unique_filename

			self.logger.debug(f'[DownloadsWatchdog] Download started: {unique_filename} from {url[:100]}...')

			current_step = 'calling_save_as'
			# Save the download using Playwright's save_as method
			self.logger.debug(f'[DownloadsWatchdog] Saving download to: {download_path}')
			self.logger.debug(f'[DownloadsWatchdog] Download path exists: {download_path.parent.exists()}')
			self.logger.debug(f'[DownloadsWatchdog] Download path writable: {os.access(download_path.parent, os.W_OK)}')

			try:
				self.logger.debug('[DownloadsWatchdog] About to call download.save_as()...')
				await download.save_as(str(download_path))
				self.logger.debug(f'[DownloadsWatchdog] Successfully saved download to: {download_path}')
				current_step = 'save_as_completed'
			except Exception as save_error:
				# Re-raise so the outer handler logs the failing step.
				self.logger.error(f'[DownloadsWatchdog] save_as() failed with error: {save_error}')
				raise save_error

		# Get file info
		file_size = download_path.stat().st_size if download_path.exists() else 0

		# Determine file type from extension
		file_ext = download_path.suffix.lower().lstrip('.')
		file_type = file_ext if file_ext else None

		# Try to get MIME type from response headers if available
		mime_type = None
		# Note: Playwright doesn't expose response headers directly from Download object

		# Check if this was a PDF auto-download
		auto_download = False
		if file_type == 'pdf':
			auto_download = self._is_auto_download_enabled()

		# Emit download event
		self.event_bus.dispatch(
			FileDownloadedEvent(
				url=url,
				path=str(download_path),
				file_name=suggested_filename,
				file_size=file_size,
				file_type=file_type,
				mime_type=mime_type,
				from_cache=False,
				auto_download=auto_download,
			)
		)

		self.logger.debug(
			f'[DownloadsWatchdog] ✅ Download completed: {suggested_filename} ({file_size} bytes) saved to {download_path}'
		)

		# File is now tracked on filesystem, no need to track in memory

	except Exception as e:
		self.logger.error(
			f'[DownloadsWatchdog] Error handling download at step "{locals().get("current_step", "unknown")}", error: {e}'
		)
		self.logger.error(
			f'[DownloadsWatchdog] Download state - URL: {download.url}, filename: {download.suggested_filename}'
		)
	finally:
		# Clean up tracking
		if download_id in self._active_downloads:
			del self._active_downloads[download_id]
|
|
937
|
+
|
|
938
|
+
async def check_for_pdf_viewer(self, target_id: TargetID) -> bool:
	"""Check if the current target is a PDF using network-based detection.

	This method avoids JavaScript execution that can crash WebSocket connections.
	Returns True if a PDF is detected and should be downloaded.
	"""
	self.logger.debug(f'[DownloadsWatchdog] Checking if target {target_id} is PDF viewer...')

	# Resolve this target's URL from the browser-wide target list.
	cdp_client = self.browser_session.cdp_client
	targets = await cdp_client.send.Target.getTargets()
	target_info = next((t for t in targets['targetInfos'] if t['targetId'] == target_id), None)
	if not target_info:
		self.logger.warning(f'[DownloadsWatchdog] No target info found for {target_id}')
		return False

	page_url = target_info.get('url', '')

	# Serve a previously computed verdict for this URL when available.
	if page_url in self._pdf_viewer_cache:
		cached_result = self._pdf_viewer_cache[page_url]
		self.logger.debug(f'[DownloadsWatchdog] Using cached PDF check result for {page_url}: {cached_result}')
		return cached_result

	try:
		# Method 1: Check URL patterns (fastest, most reliable)
		if self._check_url_for_pdf(page_url):
			self.logger.debug(f'[DownloadsWatchdog] PDF detected via URL pattern: {page_url}')
			self._pdf_viewer_cache[page_url] = True
			return True

		# Method 2: Check network response headers via CDP (safer than JavaScript)
		if await self._check_network_headers_for_pdf(target_id):
			self.logger.debug(f'[DownloadsWatchdog] PDF detected via network headers: {page_url}')
			self._pdf_viewer_cache[page_url] = True
			return True

		# Method 3: Check Chrome's PDF viewer specific URLs
		if self._is_chrome_pdf_viewer_url(page_url):
			self.logger.debug(f'[DownloadsWatchdog] Chrome PDF viewer detected: {page_url}')
			self._pdf_viewer_cache[page_url] = True
			return True

		# None of the detection strategies matched — remember the negative result too.
		self._pdf_viewer_cache[page_url] = False
		return False

	except Exception as e:
		self.logger.warning(f'[DownloadsWatchdog] ❌ Error checking for PDF viewer: {e}')
		self._pdf_viewer_cache[page_url] = False
		return False
|
|
992
|
+
|
|
993
|
+
def _check_url_for_pdf(self, url: str) -> bool:
|
|
994
|
+
"""Check if URL indicates a PDF file."""
|
|
995
|
+
if not url:
|
|
996
|
+
return False
|
|
997
|
+
|
|
998
|
+
url_lower = url.lower()
|
|
999
|
+
|
|
1000
|
+
# Direct PDF file extensions
|
|
1001
|
+
if url_lower.endswith('.pdf'):
|
|
1002
|
+
return True
|
|
1003
|
+
|
|
1004
|
+
# PDF in path
|
|
1005
|
+
if '.pdf' in url_lower:
|
|
1006
|
+
return True
|
|
1007
|
+
|
|
1008
|
+
# PDF MIME type in URL parameters
|
|
1009
|
+
if any(
|
|
1010
|
+
param in url_lower
|
|
1011
|
+
for param in [
|
|
1012
|
+
'content-type=application/pdf',
|
|
1013
|
+
'content-type=application%2fpdf',
|
|
1014
|
+
'mimetype=application/pdf',
|
|
1015
|
+
'type=application/pdf',
|
|
1016
|
+
]
|
|
1017
|
+
):
|
|
1018
|
+
return True
|
|
1019
|
+
|
|
1020
|
+
return False
|
|
1021
|
+
|
|
1022
|
+
def _is_chrome_pdf_viewer_url(self, url: str) -> bool:
|
|
1023
|
+
"""Check if this is Chrome's internal PDF viewer URL."""
|
|
1024
|
+
if not url:
|
|
1025
|
+
return False
|
|
1026
|
+
|
|
1027
|
+
url_lower = url.lower()
|
|
1028
|
+
|
|
1029
|
+
# Chrome PDF viewer uses chrome-extension:// URLs
|
|
1030
|
+
if 'chrome-extension://' in url_lower and 'pdf' in url_lower:
|
|
1031
|
+
return True
|
|
1032
|
+
|
|
1033
|
+
# Chrome PDF viewer internal URLs
|
|
1034
|
+
if url_lower.startswith('chrome://') and 'pdf' in url_lower:
|
|
1035
|
+
return True
|
|
1036
|
+
|
|
1037
|
+
return False
|
|
1038
|
+
|
|
1039
|
+
async def _check_network_headers_for_pdf(self, target_id: TargetID) -> bool:
	"""Infer PDF via navigation history/URL; headers are not available post-navigation in this context."""
	try:
		import asyncio

		# Attach to the target without stealing focus.
		session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)

		# Fetch navigation history so we can inspect the active entry's URL.
		nav_history = await asyncio.wait_for(
			session.cdp_client.send.Page.getNavigationHistory(session_id=session.session_id), timeout=3.0
		)

		entries = nav_history.get('entries', [])
		index = nav_history.get('currentIndex', 0)
		if entries and 0 <= index < len(entries):
			# Re-use the URL heuristic on the currently active history entry.
			if self._check_url_for_pdf(entries[index].get('url', '')):
				return True

		# Note: CDP doesn't easily expose response headers for completed navigations
		# For more complex cases, we'd need to set up Network.responseReceived listeners
		# before navigation, but that's overkill for most PDF detection cases
		return False

	except Exception as e:
		self.logger.debug(f'[DownloadsWatchdog] Network headers check failed (non-critical): {e}')
		return False
|
|
1071
|
+
|
|
1072
|
+
async def trigger_pdf_download(self, target_id: TargetID) -> str | None:
|
|
1073
|
+
"""Trigger download of a PDF from Chrome's PDF viewer.
|
|
1074
|
+
|
|
1075
|
+
Returns the download path if successful, None otherwise.
|
|
1076
|
+
"""
|
|
1077
|
+
self.logger.debug(f'[DownloadsWatchdog] trigger_pdf_download called for target_id={target_id}')
|
|
1078
|
+
|
|
1079
|
+
if not self.browser_session.browser_profile.downloads_path:
|
|
1080
|
+
self.logger.warning('[DownloadsWatchdog] ❌ No downloads path configured, cannot save PDF download')
|
|
1081
|
+
return None
|
|
1082
|
+
|
|
1083
|
+
downloads_path = self.browser_session.browser_profile.downloads_path
|
|
1084
|
+
self.logger.debug(f'[DownloadsWatchdog] Downloads path: {downloads_path}')
|
|
1085
|
+
|
|
1086
|
+
try:
|
|
1087
|
+
# Create a temporary CDP session for this target without switching focus
|
|
1088
|
+
import asyncio
|
|
1089
|
+
|
|
1090
|
+
self.logger.debug(f'[DownloadsWatchdog] Creating CDP session for PDF download from target {target_id}')
|
|
1091
|
+
temp_session = await self.browser_session.get_or_create_cdp_session(target_id, focus=False)
|
|
1092
|
+
|
|
1093
|
+
# Try to get the PDF URL with timeout
|
|
1094
|
+
result = await asyncio.wait_for(
|
|
1095
|
+
temp_session.cdp_client.send.Runtime.evaluate(
|
|
1096
|
+
params={
|
|
1097
|
+
'expression': """
|
|
1098
|
+
(() => {
|
|
1099
|
+
// For Chrome's PDF viewer, the actual URL is in window.location.href
|
|
1100
|
+
// The embed element's src is often "about:blank"
|
|
1101
|
+
const embedElement = document.querySelector('embed[type="application/x-google-chrome-pdf"]') ||
|
|
1102
|
+
document.querySelector('embed[type="application/pdf"]');
|
|
1103
|
+
if (embedElement) {
|
|
1104
|
+
// Chrome PDF viewer detected - use the page URL
|
|
1105
|
+
return { url: window.location.href };
|
|
1106
|
+
}
|
|
1107
|
+
// Fallback to window.location.href anyway
|
|
1108
|
+
return { url: window.location.href };
|
|
1109
|
+
})()
|
|
1110
|
+
""",
|
|
1111
|
+
'returnByValue': True,
|
|
1112
|
+
},
|
|
1113
|
+
session_id=temp_session.session_id,
|
|
1114
|
+
),
|
|
1115
|
+
timeout=5.0, # 5 second timeout to prevent hanging
|
|
1116
|
+
)
|
|
1117
|
+
pdf_info = result.get('result', {}).get('value', {})
|
|
1118
|
+
|
|
1119
|
+
pdf_url = pdf_info.get('url', '')
|
|
1120
|
+
if not pdf_url:
|
|
1121
|
+
self.logger.warning(f'[DownloadsWatchdog] ❌ Could not determine PDF URL for download {pdf_info}')
|
|
1122
|
+
return None
|
|
1123
|
+
|
|
1124
|
+
# Generate filename from URL
|
|
1125
|
+
pdf_filename = os.path.basename(pdf_url.split('?')[0]) # Remove query params
|
|
1126
|
+
if not pdf_filename or not pdf_filename.endswith('.pdf'):
|
|
1127
|
+
parsed = urlparse(pdf_url)
|
|
1128
|
+
pdf_filename = os.path.basename(parsed.path) or 'document.pdf'
|
|
1129
|
+
if not pdf_filename.endswith('.pdf'):
|
|
1130
|
+
pdf_filename += '.pdf'
|
|
1131
|
+
|
|
1132
|
+
self.logger.debug(f'[DownloadsWatchdog] Generated filename: {pdf_filename}')
|
|
1133
|
+
|
|
1134
|
+
# Check if already downloaded in this session
|
|
1135
|
+
self.logger.debug(f'[DownloadsWatchdog] PDF_URL: {pdf_url}, session_pdf_urls: {self._session_pdf_urls}')
|
|
1136
|
+
if pdf_url in self._session_pdf_urls:
|
|
1137
|
+
existing_path = self._session_pdf_urls[pdf_url]
|
|
1138
|
+
self.logger.debug(f'[DownloadsWatchdog] PDF already downloaded in session: {existing_path}')
|
|
1139
|
+
return existing_path
|
|
1140
|
+
|
|
1141
|
+
# Generate unique filename if file exists from previous run
|
|
1142
|
+
downloads_dir = str(self.browser_session.browser_profile.downloads_path)
|
|
1143
|
+
os.makedirs(downloads_dir, exist_ok=True)
|
|
1144
|
+
final_filename = pdf_filename
|
|
1145
|
+
existing_files = os.listdir(downloads_dir)
|
|
1146
|
+
if pdf_filename in existing_files:
|
|
1147
|
+
# Generate unique name with (1), (2), etc.
|
|
1148
|
+
base, ext = os.path.splitext(pdf_filename)
|
|
1149
|
+
counter = 1
|
|
1150
|
+
while f'{base} ({counter}){ext}' in existing_files:
|
|
1151
|
+
counter += 1
|
|
1152
|
+
final_filename = f'{base} ({counter}){ext}'
|
|
1153
|
+
self.logger.debug(f'[DownloadsWatchdog] File exists, using: {final_filename}')
|
|
1154
|
+
|
|
1155
|
+
self.logger.debug(f'[DownloadsWatchdog] Starting PDF download from: {pdf_url[:100]}...')
|
|
1156
|
+
|
|
1157
|
+
# Download using JavaScript fetch to leverage browser cache
|
|
1158
|
+
try:
|
|
1159
|
+
# Properly escape the URL to prevent JavaScript injection
|
|
1160
|
+
escaped_pdf_url = json.dumps(pdf_url)
|
|
1161
|
+
|
|
1162
|
+
result = await asyncio.wait_for(
|
|
1163
|
+
temp_session.cdp_client.send.Runtime.evaluate(
|
|
1164
|
+
params={
|
|
1165
|
+
'expression': f"""
|
|
1166
|
+
(async () => {{
|
|
1167
|
+
try {{
|
|
1168
|
+
// Use fetch with cache: 'force-cache' to prioritize cached version
|
|
1169
|
+
const response = await fetch({escaped_pdf_url}, {{
|
|
1170
|
+
cache: 'force-cache'
|
|
1171
|
+
}});
|
|
1172
|
+
if (!response.ok) {{
|
|
1173
|
+
throw new Error(`HTTP error! status: ${{response.status}}`);
|
|
1174
|
+
}}
|
|
1175
|
+
const blob = await response.blob();
|
|
1176
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
1177
|
+
const uint8Array = new Uint8Array(arrayBuffer);
|
|
1178
|
+
|
|
1179
|
+
// Check if served from cache
|
|
1180
|
+
const fromCache = response.headers.has('age') ||
|
|
1181
|
+
!response.headers.has('date');
|
|
1182
|
+
|
|
1183
|
+
return {{
|
|
1184
|
+
data: Array.from(uint8Array),
|
|
1185
|
+
fromCache: fromCache,
|
|
1186
|
+
responseSize: uint8Array.length,
|
|
1187
|
+
transferSize: response.headers.get('content-length') || 'unknown'
|
|
1188
|
+
}};
|
|
1189
|
+
}} catch (error) {{
|
|
1190
|
+
throw new Error(`Fetch failed: ${{error.message}}`);
|
|
1191
|
+
}}
|
|
1192
|
+
}})()
|
|
1193
|
+
""",
|
|
1194
|
+
'awaitPromise': True,
|
|
1195
|
+
'returnByValue': True,
|
|
1196
|
+
},
|
|
1197
|
+
session_id=temp_session.session_id,
|
|
1198
|
+
),
|
|
1199
|
+
timeout=10.0, # 10 second timeout for download operation
|
|
1200
|
+
)
|
|
1201
|
+
download_result = result.get('result', {}).get('value', {})
|
|
1202
|
+
|
|
1203
|
+
if download_result and download_result.get('data') and len(download_result['data']) > 0:
|
|
1204
|
+
# Ensure downloads directory exists
|
|
1205
|
+
downloads_dir = str(self.browser_session.browser_profile.downloads_path)
|
|
1206
|
+
os.makedirs(downloads_dir, exist_ok=True)
|
|
1207
|
+
download_path = os.path.join(downloads_dir, final_filename)
|
|
1208
|
+
|
|
1209
|
+
# Save the PDF asynchronously
|
|
1210
|
+
async with await anyio.open_file(download_path, 'wb') as f:
|
|
1211
|
+
await f.write(bytes(download_result['data']))
|
|
1212
|
+
|
|
1213
|
+
# Verify file was written successfully
|
|
1214
|
+
if os.path.exists(download_path):
|
|
1215
|
+
actual_size = os.path.getsize(download_path)
|
|
1216
|
+
self.logger.debug(
|
|
1217
|
+
f'[DownloadsWatchdog] PDF file written successfully: {download_path} ({actual_size} bytes)'
|
|
1218
|
+
)
|
|
1219
|
+
else:
|
|
1220
|
+
self.logger.error(f'[DownloadsWatchdog] ❌ Failed to write PDF file to: {download_path}')
|
|
1221
|
+
return None
|
|
1222
|
+
|
|
1223
|
+
# Log cache information
|
|
1224
|
+
cache_status = 'from cache' if download_result.get('fromCache') else 'from network'
|
|
1225
|
+
response_size = download_result.get('responseSize', 0)
|
|
1226
|
+
self.logger.debug(
|
|
1227
|
+
f'[DownloadsWatchdog] ✅ Auto-downloaded PDF ({cache_status}, {response_size:,} bytes): {download_path}'
|
|
1228
|
+
)
|
|
1229
|
+
|
|
1230
|
+
# Store URL->path mapping for this session
|
|
1231
|
+
self._session_pdf_urls[pdf_url] = download_path
|
|
1232
|
+
|
|
1233
|
+
# Emit file downloaded event
|
|
1234
|
+
self.logger.debug(f'[DownloadsWatchdog] Dispatching FileDownloadedEvent for {final_filename}')
|
|
1235
|
+
self.event_bus.dispatch(
|
|
1236
|
+
FileDownloadedEvent(
|
|
1237
|
+
url=pdf_url,
|
|
1238
|
+
path=download_path,
|
|
1239
|
+
file_name=final_filename,
|
|
1240
|
+
file_size=response_size,
|
|
1241
|
+
file_type='pdf',
|
|
1242
|
+
mime_type='application/pdf',
|
|
1243
|
+
from_cache=download_result.get('fromCache', False),
|
|
1244
|
+
auto_download=True,
|
|
1245
|
+
)
|
|
1246
|
+
)
|
|
1247
|
+
|
|
1248
|
+
# No need to detach - session is cached
|
|
1249
|
+
return download_path
|
|
1250
|
+
else:
|
|
1251
|
+
self.logger.warning(f'[DownloadsWatchdog] No data received when downloading PDF from {pdf_url}')
|
|
1252
|
+
return None
|
|
1253
|
+
|
|
1254
|
+
except Exception as e:
|
|
1255
|
+
self.logger.warning(f'[DownloadsWatchdog] Failed to auto-download PDF from {pdf_url}: {type(e).__name__}: {e}')
|
|
1256
|
+
return None
|
|
1257
|
+
|
|
1258
|
+
except TimeoutError:
|
|
1259
|
+
self.logger.debug('[DownloadsWatchdog] PDF download operation timed out')
|
|
1260
|
+
return None
|
|
1261
|
+
except Exception as e:
|
|
1262
|
+
self.logger.error(f'[DownloadsWatchdog] Error in PDF download: {type(e).__name__}: {e}')
|
|
1263
|
+
return None
|
|
1264
|
+
|
|
1265
|
+
@staticmethod
|
|
1266
|
+
async def _get_unique_filename(directory: str, filename: str) -> str:
|
|
1267
|
+
"""Generate a unique filename for downloads by appending (1), (2), etc., if a file already exists."""
|
|
1268
|
+
base, ext = os.path.splitext(filename)
|
|
1269
|
+
counter = 1
|
|
1270
|
+
new_filename = filename
|
|
1271
|
+
while os.path.exists(os.path.join(directory, new_filename)):
|
|
1272
|
+
new_filename = f'{base} ({counter}){ext}'
|
|
1273
|
+
counter += 1
|
|
1274
|
+
return new_filename
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
# Fix Pydantic circular dependency - this will be called from session.py after BrowserSession is defined
|