phantomwright 0.1.4__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phantomwright-0.1.4 → phantomwright-0.2.0}/PKG-INFO +7 -3
- phantomwright-0.2.0/phantomwright/_repo_version.py +1 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/scripts/observer.js +8 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/solver.py +45 -16
- phantomwright-0.2.0/phantomwright/cli/__init__.py +5 -0
- phantomwright-0.2.0/phantomwright/cli/client.py +201 -0
- phantomwright-0.2.0/phantomwright/cli/commands/__init__.py +1 -0
- phantomwright-0.2.0/phantomwright/cli/commands/extraction.py +137 -0
- phantomwright-0.2.0/phantomwright/cli/commands/info.py +169 -0
- phantomwright-0.2.0/phantomwright/cli/commands/interaction.py +46 -0
- phantomwright-0.2.0/phantomwright/cli/commands/navigation.py +70 -0
- phantomwright-0.2.0/phantomwright/cli/main.py +57 -0
- phantomwright-0.2.0/phantomwright/cli/output.py +51 -0
- phantomwright-0.2.0/phantomwright/cli/server.py +889 -0
- phantomwright-0.2.0/phantomwright/cli/session.py +63 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/pyproject.toml +10 -3
- phantomwright-0.1.4/phantomwright/_repo_version.py +0 -1
- {phantomwright-0.1.4 → phantomwright-0.2.0}/.gitignore +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/LICENSE +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/README.md +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/_impl/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/_impl/_core_debug_patch.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/_impl/_evaluate_patch.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/_impl/_inconsistency_patch.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/async_api/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/scripts/shadow_root.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/utils/build_js.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/utils/consts.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/utils/detection.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/utils/dom_helpers.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/utils/shadow_root.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/py.typed +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/chrome.app.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/chrome.csi.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/chrome.hairline.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/chrome.load.times.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/chrome.runtime.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/error.prototype.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/iframe.contentWindow.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/media.codecs.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.hardwareConcurrency.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.languages.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.permissions.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.platform.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.plugins.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.userAgent.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/navigator.vendor.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/evasions/webgl.vendor.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/generate.magic.arrays.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/js/utils.js +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/stealth/stealth.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/sync_api/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/README.md +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/__init__.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/async_basic.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/async_simulator.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/script.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/sync_basic.py +0 -0
- {phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/user_simulator/sync_simulator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: phantomwright
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Bridging playwright-core patch + extending playwright API for stealth injection & user simulation
|
|
5
5
|
Project-URL: homepage, https://github.com/ai-microsoft/phantom-wright
|
|
6
6
|
Project-URL: changelog, https://github.com/ai-microsoft/phantom-wright/blob/main/CHANGELOG.md
|
|
@@ -9,8 +9,12 @@ License-Expression: MIT
|
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Requires-Python: >=3.
|
|
13
|
-
Requires-Dist:
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
14
|
+
Requires-Dist: click>=8.0.0
|
|
15
|
+
Requires-Dist: httpx>=0.27.0
|
|
16
|
+
Requires-Dist: markitdown>=0.1.0
|
|
17
|
+
Requires-Dist: phantomwright-driver==1.58.3
|
|
14
18
|
Provides-Extra: black
|
|
15
19
|
Requires-Dist: black>=25.9.0; extra == 'black'
|
|
16
20
|
Provides-Extra: dev
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
version = 'v0.2.0'
|
{phantomwright-0.1.4 → phantomwright-0.2.0}/phantomwright/captcha/cloudfare/scripts/observer.js
RENAMED
|
@@ -41,6 +41,14 @@
|
|
|
41
41
|
}
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
+
// Expose function to trigger rescan from Python side
|
|
45
|
+
window.__triggerCfRescan = function() {
|
|
46
|
+
const root = document.body || document.documentElement;
|
|
47
|
+
if (root) {
|
|
48
|
+
scan(root);
|
|
49
|
+
}
|
|
50
|
+
};
|
|
51
|
+
|
|
44
52
|
function startObserve() {
|
|
45
53
|
const root = document.body || document.documentElement;
|
|
46
54
|
if (!root) return;
|
|
@@ -87,18 +87,27 @@ class CloudflareSolverAsync:
|
|
|
87
87
|
return self.page_solve_state.setdefault(
|
|
88
88
|
page,
|
|
89
89
|
{
|
|
90
|
-
"status": "idle", # idle | solving
|
|
90
|
+
"status": "idle", # idle | solving
|
|
91
91
|
"last_url": None,
|
|
92
92
|
},
|
|
93
93
|
)
|
|
94
94
|
|
|
95
95
|
# ---------------- js rebind ----------------
|
|
96
96
|
async def _rebind(self, page: Page):
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
window.
|
|
100
|
-
|
|
101
|
-
|
|
97
|
+
try:
|
|
98
|
+
await page.evaluate("""
|
|
99
|
+
window.onCloudflareDetected = function(sel, url) {
|
|
100
|
+
window.__cf_callback(sel, url);
|
|
101
|
+
};
|
|
102
|
+
""")
|
|
103
|
+
await asyncio.sleep(1)
|
|
104
|
+
await page.evaluate("""
|
|
105
|
+
if (typeof window.__triggerCfRescan === 'function') {
|
|
106
|
+
window.__triggerCfRescan();
|
|
107
|
+
}
|
|
108
|
+
""")
|
|
109
|
+
except Exception:
|
|
110
|
+
pass
|
|
102
111
|
|
|
103
112
|
# ---------------- report helper ----------------
|
|
104
113
|
def _log_final_report(self, report: dict):
|
|
@@ -113,7 +122,9 @@ class CloudflareSolverAsync:
|
|
|
113
122
|
"timestamp": time.time(),
|
|
114
123
|
}
|
|
115
124
|
|
|
116
|
-
self.log
|
|
125
|
+
if self.log:
|
|
126
|
+
log_str = json.dumps(data, ensure_ascii=False)
|
|
127
|
+
self.log(log_str)
|
|
117
128
|
|
|
118
129
|
# ---------------- core solve ----------------
|
|
119
130
|
async def _auto_solve_cf(self, page: Page):
|
|
@@ -140,7 +151,7 @@ class CloudflareSolverAsync:
|
|
|
140
151
|
report["challenge_type"] = challenge_type.name
|
|
141
152
|
|
|
142
153
|
if challenge_type is ChallengeType.TURNSTILE:
|
|
143
|
-
await page.locator("#cf-turnstile").wait_for(10000)
|
|
154
|
+
await page.locator("#cf-turnstile").wait_for(timeout=10000)
|
|
144
155
|
|
|
145
156
|
cf_iframes = await search_shadow_root_iframes(
|
|
146
157
|
captcha_container=page,
|
|
@@ -174,13 +185,22 @@ class CloudflareSolverAsync:
|
|
|
174
185
|
success_elements = await search_shadow_root_elements(
|
|
175
186
|
iframe, 'div[id="success"]'
|
|
176
187
|
)
|
|
177
|
-
|
|
188
|
+
|
|
189
|
+
# Check if success element is actually visible
|
|
190
|
+
solved = False
|
|
191
|
+
for el in success_elements:
|
|
192
|
+
try:
|
|
193
|
+
is_visible = await el.is_visible()
|
|
194
|
+
if is_visible:
|
|
195
|
+
solved = True
|
|
196
|
+
break
|
|
197
|
+
except:
|
|
198
|
+
pass
|
|
178
199
|
else:
|
|
179
200
|
solved = not await detect_cloudflare_challenge(page)
|
|
180
201
|
|
|
181
202
|
if solved:
|
|
182
|
-
state["status"] = "done"
|
|
183
|
-
state["last_url"] = page.url
|
|
203
|
+
state["status"] = "idle"
|
|
184
204
|
|
|
185
205
|
report["success"] = True
|
|
186
206
|
return
|
|
@@ -211,15 +231,17 @@ class CloudflareSolverAsync:
|
|
|
211
231
|
return
|
|
212
232
|
|
|
213
233
|
state = self._get_page_state(page)
|
|
234
|
+
current_url = page.url
|
|
235
|
+
|
|
236
|
+
# Skip if this URL was already attempted
|
|
237
|
+
if state["last_url"] == current_url:
|
|
238
|
+
return
|
|
214
239
|
|
|
215
|
-
if state["
|
|
216
|
-
state["status"] = "idle"
|
|
217
|
-
state["last_url"] = page.url
|
|
218
|
-
|
|
219
|
-
if state["status"] in ("solving", "done"):
|
|
240
|
+
if state["status"] == "solving":
|
|
220
241
|
return
|
|
221
242
|
|
|
222
243
|
state["status"] = "solving"
|
|
244
|
+
state["last_url"] = current_url
|
|
223
245
|
|
|
224
246
|
asyncio.create_task(self._auto_solve_cf(page))
|
|
225
247
|
|
|
@@ -234,7 +256,14 @@ class CloudflareSolverAsync:
|
|
|
234
256
|
await page.add_init_script(shadow_root_js)
|
|
235
257
|
|
|
236
258
|
await page.expose_function("__cf_callback", self._make_on_cf_detected(page))
|
|
259
|
+
|
|
260
|
+
# Listen to multiple events for more reliable detection
|
|
237
261
|
page.on("load", lambda: asyncio.create_task(self._rebind(page)))
|
|
262
|
+
page.on("domcontentloaded", lambda: asyncio.create_task(self._rebind(page)))
|
|
263
|
+
page.on("framenavigated", lambda frame: asyncio.create_task(self._rebind(page)) if frame == page.main_frame else None)
|
|
264
|
+
|
|
265
|
+
# Immediately rebind in case page is already loaded
|
|
266
|
+
asyncio.create_task(self._rebind(page))
|
|
238
267
|
|
|
239
268
|
# ---------------- public api ----------------
|
|
240
269
|
def start(self) -> None:
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""Client for communicating with the browser session server."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from .server import SESSION_FILE, DEFAULT_PORT
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SessionClient:
|
|
13
|
+
"""Client for communicating with the browser session server."""
|
|
14
|
+
|
|
15
|
+
def __init__(self, port: Optional[int] = None):
|
|
16
|
+
self._port = port
|
|
17
|
+
self._base_url: Optional[str] = None
|
|
18
|
+
|
|
19
|
+
def _get_base_url(self) -> Optional[str]:
|
|
20
|
+
"""Get the base URL for the server."""
|
|
21
|
+
if self._base_url:
|
|
22
|
+
return self._base_url
|
|
23
|
+
|
|
24
|
+
# Try to load port from session file
|
|
25
|
+
port = self._port
|
|
26
|
+
if port is None:
|
|
27
|
+
if SESSION_FILE.exists():
|
|
28
|
+
try:
|
|
29
|
+
with open(SESSION_FILE, "r") as f:
|
|
30
|
+
session_info = json.load(f)
|
|
31
|
+
port = session_info.get("port", DEFAULT_PORT)
|
|
32
|
+
except (json.JSONDecodeError, IOError):
|
|
33
|
+
port = DEFAULT_PORT
|
|
34
|
+
else:
|
|
35
|
+
port = DEFAULT_PORT
|
|
36
|
+
|
|
37
|
+
self._base_url = f"http://127.0.0.1:{port}"
|
|
38
|
+
return self._base_url
|
|
39
|
+
|
|
40
|
+
def is_server_running(self) -> bool:
|
|
41
|
+
"""Check if the server is running."""
|
|
42
|
+
base_url = self._get_base_url()
|
|
43
|
+
if not base_url:
|
|
44
|
+
return False
|
|
45
|
+
|
|
46
|
+
try:
|
|
47
|
+
with httpx.Client(timeout=2.0) as client:
|
|
48
|
+
response = client.get(f"{base_url}/status")
|
|
49
|
+
return response.status_code == 200
|
|
50
|
+
except httpx.RequestError:
|
|
51
|
+
return False
|
|
52
|
+
|
|
53
|
+
def _request(
|
|
54
|
+
self,
|
|
55
|
+
method: str,
|
|
56
|
+
endpoint: str,
|
|
57
|
+
data: Optional[dict] = None,
|
|
58
|
+
timeout: float = 60.0,
|
|
59
|
+
) -> dict:
|
|
60
|
+
"""Make a request to the server."""
|
|
61
|
+
base_url = self._get_base_url()
|
|
62
|
+
if not base_url:
|
|
63
|
+
return {"error": "NoSession", "message": "No active browser session. Run 'phantomwright start' first."}
|
|
64
|
+
|
|
65
|
+
try:
|
|
66
|
+
with httpx.Client(timeout=timeout) as client:
|
|
67
|
+
if method == "GET":
|
|
68
|
+
response = client.get(f"{base_url}{endpoint}")
|
|
69
|
+
else:
|
|
70
|
+
response = client.post(f"{base_url}{endpoint}", json=data or {})
|
|
71
|
+
|
|
72
|
+
return response.json()
|
|
73
|
+
except httpx.ConnectError:
|
|
74
|
+
return {"error": "NoSession", "message": "No active browser session. Run 'phantomwright start' first."}
|
|
75
|
+
except httpx.RequestError as e:
|
|
76
|
+
return {"error": "ConnectionError", "message": str(e)}
|
|
77
|
+
except json.JSONDecodeError:
|
|
78
|
+
return {"error": "InvalidResponse", "message": "Invalid response from server"}
|
|
79
|
+
|
|
80
|
+
def get_status(self) -> dict:
|
|
81
|
+
"""Get server status."""
|
|
82
|
+
return self._request("GET", "/status")
|
|
83
|
+
|
|
84
|
+
def stop(self) -> dict:
|
|
85
|
+
"""Stop the server."""
|
|
86
|
+
return self._request("POST", "/stop")
|
|
87
|
+
|
|
88
|
+
def navigate(
|
|
89
|
+
self,
|
|
90
|
+
url: str,
|
|
91
|
+
simulate: bool = True,
|
|
92
|
+
cool_down: bool = True,
|
|
93
|
+
wait_for: Optional[str] = None,
|
|
94
|
+
timeout: int = 30000,
|
|
95
|
+
) -> dict:
|
|
96
|
+
"""Navigate to a URL."""
|
|
97
|
+
return self._request("POST", "/navigate", {
|
|
98
|
+
"url": url,
|
|
99
|
+
"simulate": simulate,
|
|
100
|
+
"cool_down": cool_down,
|
|
101
|
+
"wait_for": wait_for,
|
|
102
|
+
"timeout": timeout,
|
|
103
|
+
}, timeout=timeout / 1000 + 30)
|
|
104
|
+
|
|
105
|
+
def scroll(self, duration: int = 2000) -> dict:
|
|
106
|
+
"""Scroll the page."""
|
|
107
|
+
return self._request("POST", "/scroll", {"duration": duration}, timeout=duration / 1000 + 10)
|
|
108
|
+
|
|
109
|
+
def browse(self, duration: int = 2000) -> dict:
|
|
110
|
+
"""Simulate browsing."""
|
|
111
|
+
return self._request("POST", "/browse", {"duration": duration}, timeout=duration / 1000 + 10)
|
|
112
|
+
|
|
113
|
+
def click(self, selector: str, simulate: bool = True) -> dict:
|
|
114
|
+
"""Click an element."""
|
|
115
|
+
return self._request("POST", "/click", {
|
|
116
|
+
"selector": selector,
|
|
117
|
+
"simulate": simulate,
|
|
118
|
+
})
|
|
119
|
+
|
|
120
|
+
def type_text(self, selector: str, text: str, simulate: bool = True, typos: bool = False) -> dict:
|
|
121
|
+
"""Type text into an element."""
|
|
122
|
+
return self._request("POST", "/type", {
|
|
123
|
+
"selector": selector,
|
|
124
|
+
"text": text,
|
|
125
|
+
"simulate": simulate,
|
|
126
|
+
"typos": typos,
|
|
127
|
+
}, timeout=len(text) * 0.5 + 30)
|
|
128
|
+
|
|
129
|
+
def screenshot(
|
|
130
|
+
self,
|
|
131
|
+
selector: Optional[str] = None,
|
|
132
|
+
full_page: bool = False,
|
|
133
|
+
) -> dict:
|
|
134
|
+
"""Take a screenshot."""
|
|
135
|
+
return self._request("POST", "/screenshot", {
|
|
136
|
+
"selector": selector,
|
|
137
|
+
"full_page": full_page,
|
|
138
|
+
})
|
|
139
|
+
|
|
140
|
+
def get_html(self, selector: Optional[str] = None, outer: bool = True) -> dict:
|
|
141
|
+
"""Get HTML content."""
|
|
142
|
+
return self._request("POST", "/html", {
|
|
143
|
+
"selector": selector,
|
|
144
|
+
"outer": outer,
|
|
145
|
+
})
|
|
146
|
+
|
|
147
|
+
def get_text(self, selector: str) -> dict:
|
|
148
|
+
"""Get text content."""
|
|
149
|
+
return self._request("POST", "/text", {"selector": selector})
|
|
150
|
+
|
|
151
|
+
def get_attr(self, selector: str, attribute: str) -> dict:
|
|
152
|
+
"""Get attribute value."""
|
|
153
|
+
return self._request("POST", "/attr", {
|
|
154
|
+
"selector": selector,
|
|
155
|
+
"attribute": attribute,
|
|
156
|
+
})
|
|
157
|
+
|
|
158
|
+
def query(
|
|
159
|
+
self,
|
|
160
|
+
selector: str,
|
|
161
|
+
limit: Optional[int] = None,
|
|
162
|
+
attrs: Optional[str] = None,
|
|
163
|
+
) -> dict:
|
|
164
|
+
"""Query elements."""
|
|
165
|
+
return self._request("POST", "/query", {
|
|
166
|
+
"selector": selector,
|
|
167
|
+
"limit": limit,
|
|
168
|
+
"attrs": attrs,
|
|
169
|
+
})
|
|
170
|
+
|
|
171
|
+
def get_url(self) -> dict:
|
|
172
|
+
"""Get current URL."""
|
|
173
|
+
return self._request("GET", "/url")
|
|
174
|
+
|
|
175
|
+
def get_title(self) -> dict:
|
|
176
|
+
"""Get page title."""
|
|
177
|
+
return self._request("GET", "/title")
|
|
178
|
+
|
|
179
|
+
def wait(self, selector: str, timeout: int = 30000, state: str = "visible") -> dict:
|
|
180
|
+
"""Wait for an element."""
|
|
181
|
+
return self._request("POST", "/wait", {
|
|
182
|
+
"selector": selector,
|
|
183
|
+
"timeout": timeout,
|
|
184
|
+
"state": state,
|
|
185
|
+
}, timeout=timeout / 1000 + 10)
|
|
186
|
+
|
|
187
|
+
def get_markdown(self, selector: Optional[str] = None) -> dict:
|
|
188
|
+
"""Get page content as markdown with selectors."""
|
|
189
|
+
return self._request("POST", "/markdown", {"selector": selector})
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# Global client instance
|
|
193
|
+
_client: Optional[SessionClient] = None
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def get_client() -> SessionClient:
|
|
197
|
+
"""Get the global session client."""
|
|
198
|
+
global _client
|
|
199
|
+
if _client is None:
|
|
200
|
+
_client = SessionClient()
|
|
201
|
+
return _client
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""PhantomWright CLI commands."""
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Extraction commands: screenshot, html, text, attr, query."""
|
|
2
|
+
|
|
3
|
+
import base64
|
|
4
|
+
import os
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from ..client import get_client
|
|
8
|
+
from ..output import output_success, output_error
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@click.command()
|
|
12
|
+
@click.option("--path", default=None, help="File path to save screenshot")
|
|
13
|
+
@click.option("--selector", default=None, help="CSS selector for element screenshot")
|
|
14
|
+
@click.option("--full-page/--no-full-page", default=False, help="Capture full scrollable page")
|
|
15
|
+
def screenshot(path: str, selector: str, full_page: bool):
|
|
16
|
+
"""Take a screenshot of the page or element."""
|
|
17
|
+
client = get_client()
|
|
18
|
+
result = client.screenshot(selector=selector, full_page=full_page)
|
|
19
|
+
|
|
20
|
+
if "error" in result:
|
|
21
|
+
output_error("screenshot", result["error"], result["message"], selector=selector)
|
|
22
|
+
return
|
|
23
|
+
|
|
24
|
+
screenshot_bytes = base64.b64decode(result["base64"])
|
|
25
|
+
|
|
26
|
+
if path:
|
|
27
|
+
# Ensure directory exists
|
|
28
|
+
dir_path = os.path.dirname(path)
|
|
29
|
+
if dir_path:
|
|
30
|
+
os.makedirs(dir_path, exist_ok=True)
|
|
31
|
+
with open(path, "wb") as f:
|
|
32
|
+
f.write(screenshot_bytes)
|
|
33
|
+
output_success("screenshot", {
|
|
34
|
+
"path": os.path.abspath(path),
|
|
35
|
+
"size_bytes": result["size_bytes"],
|
|
36
|
+
"selector": selector,
|
|
37
|
+
"full_page": full_page,
|
|
38
|
+
})
|
|
39
|
+
else:
|
|
40
|
+
# Return base64 encoded image
|
|
41
|
+
output_success("screenshot", {
|
|
42
|
+
"base64": result["base64"],
|
|
43
|
+
"size_bytes": result["size_bytes"],
|
|
44
|
+
"selector": selector,
|
|
45
|
+
"full_page": full_page,
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@click.command()
|
|
50
|
+
@click.option("--selector", default=None, help="CSS selector to get HTML from")
|
|
51
|
+
@click.option("--outer/--inner", default=True, help="Get outer HTML (default) or inner HTML")
|
|
52
|
+
def html(selector: str, outer: bool):
|
|
53
|
+
"""Get page or element HTML content."""
|
|
54
|
+
client = get_client()
|
|
55
|
+
result = client.get_html(selector=selector, outer=outer)
|
|
56
|
+
|
|
57
|
+
if "error" in result:
|
|
58
|
+
output_error("html", result["error"], result["message"], selector=selector)
|
|
59
|
+
else:
|
|
60
|
+
output_success("html", {
|
|
61
|
+
"html": result["html"],
|
|
62
|
+
"length": result["length"],
|
|
63
|
+
"selector": selector,
|
|
64
|
+
"type": result["type"],
|
|
65
|
+
})
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@click.command()
|
|
69
|
+
@click.argument("selector")
|
|
70
|
+
def text(selector: str):
|
|
71
|
+
"""Get text content of an element."""
|
|
72
|
+
client = get_client()
|
|
73
|
+
result = client.get_text(selector=selector)
|
|
74
|
+
|
|
75
|
+
if "error" in result:
|
|
76
|
+
output_error("text", result["error"], result["message"], selector=selector)
|
|
77
|
+
else:
|
|
78
|
+
output_success("text", {
|
|
79
|
+
"text": result["text"],
|
|
80
|
+
"length": result["length"],
|
|
81
|
+
"selector": selector,
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@click.command()
|
|
86
|
+
@click.argument("selector")
|
|
87
|
+
@click.argument("attribute")
|
|
88
|
+
def attr(selector: str, attribute: str):
|
|
89
|
+
"""Get an attribute value from an element."""
|
|
90
|
+
client = get_client()
|
|
91
|
+
result = client.get_attr(selector=selector, attribute=attribute)
|
|
92
|
+
|
|
93
|
+
if "error" in result:
|
|
94
|
+
output_error("attr", result["error"], result["message"], selector=selector)
|
|
95
|
+
else:
|
|
96
|
+
output_success("attr", {
|
|
97
|
+
"attribute": result["attribute"],
|
|
98
|
+
"value": result["value"],
|
|
99
|
+
"selector": selector,
|
|
100
|
+
})
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@click.command()
|
|
104
|
+
@click.argument("selector")
|
|
105
|
+
@click.option("--limit", default=None, type=int, help="Maximum number of elements to return")
|
|
106
|
+
@click.option("--attrs", default=None, help="Comma-separated list of attributes to extract")
|
|
107
|
+
def query(selector: str, limit: int, attrs: str):
|
|
108
|
+
"""Query all elements matching a selector."""
|
|
109
|
+
client = get_client()
|
|
110
|
+
result = client.query(selector=selector, limit=limit, attrs=attrs)
|
|
111
|
+
|
|
112
|
+
if "error" in result:
|
|
113
|
+
output_error("query", result["error"], result["message"], selector=selector)
|
|
114
|
+
else:
|
|
115
|
+
output_success("query", {
|
|
116
|
+
"elements": result["elements"],
|
|
117
|
+
"count": result["count"],
|
|
118
|
+
"total_matches": result.get("total_matches", result["count"]),
|
|
119
|
+
"selector": selector,
|
|
120
|
+
})
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@click.command()
|
|
124
|
+
@click.option("--selector", default=None, help="CSS selector to convert (default: entire page)")
|
|
125
|
+
def markdown(selector: str):
|
|
126
|
+
"""Get page content as markdown with embedded selectors for agent interaction."""
|
|
127
|
+
client = get_client()
|
|
128
|
+
result = client.get_markdown(selector=selector)
|
|
129
|
+
|
|
130
|
+
if "error" in result:
|
|
131
|
+
output_error("markdown", result["error"], result["message"], selector=selector)
|
|
132
|
+
else:
|
|
133
|
+
output_success("markdown", {
|
|
134
|
+
"markdown": result["markdown"],
|
|
135
|
+
"url": result["url"],
|
|
136
|
+
"title": result["title"],
|
|
137
|
+
})
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Info commands: url, title, wait, status, start, stop."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from ..session import load_session, clear_session, is_session_active
|
|
9
|
+
from ..client import get_client
|
|
10
|
+
from ..output import output_success, output_error
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@click.command()
|
|
14
|
+
def url():
|
|
15
|
+
"""Get the current page URL."""
|
|
16
|
+
client = get_client()
|
|
17
|
+
result = client.get_url()
|
|
18
|
+
|
|
19
|
+
if "error" in result:
|
|
20
|
+
output_error("url", result["error"], result["message"])
|
|
21
|
+
else:
|
|
22
|
+
output_success("url", {"url": result["url"]})
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@click.command()
|
|
26
|
+
def title():
|
|
27
|
+
"""Get the current page title."""
|
|
28
|
+
client = get_client()
|
|
29
|
+
result = client.get_title()
|
|
30
|
+
|
|
31
|
+
if "error" in result:
|
|
32
|
+
output_error("title", result["error"], result["message"])
|
|
33
|
+
else:
|
|
34
|
+
output_success("title", {
|
|
35
|
+
"title": result["title"],
|
|
36
|
+
"url": result["url"],
|
|
37
|
+
})
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@click.command()
|
|
41
|
+
@click.argument("selector")
|
|
42
|
+
@click.option("--timeout", default=30000, help="Timeout in milliseconds")
|
|
43
|
+
@click.option("--state", default="visible", type=click.Choice(["attached", "detached", "visible", "hidden"]), help="Element state to wait for")
|
|
44
|
+
def wait(selector: str, timeout: int, state: str):
|
|
45
|
+
"""Wait for an element to appear."""
|
|
46
|
+
client = get_client()
|
|
47
|
+
result = client.wait(selector, timeout=timeout, state=state)
|
|
48
|
+
|
|
49
|
+
if "error" in result:
|
|
50
|
+
output_error("wait", result["error"], result["message"], selector=selector, timeout=timeout)
|
|
51
|
+
else:
|
|
52
|
+
output_success("wait", {
|
|
53
|
+
"selector": selector,
|
|
54
|
+
"state": state,
|
|
55
|
+
"found": result["found"],
|
|
56
|
+
"count": result["count"],
|
|
57
|
+
})
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@click.command()
|
|
61
|
+
def status():
|
|
62
|
+
"""Get the current session status."""
|
|
63
|
+
client = get_client()
|
|
64
|
+
result = client.get_status()
|
|
65
|
+
output_success("status", result)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@click.command()
|
|
69
|
+
@click.option("--browser", default="chrome", type=click.Choice(["chrome", "msedge"]), help="Browser to use (default: chrome)")
|
|
70
|
+
@click.option("--headless/--no-headless", default=False, help="Run browser in headless mode")
|
|
71
|
+
@click.option("--stealth/--no-stealth", default=True, help="Enable stealth mode")
|
|
72
|
+
@click.option("--cloudflare-solver/--no-cloudflare-solver", default=True, help="Enable Cloudflare solver")
|
|
73
|
+
@click.option("--visualize-mouse/--no-visualize-mouse", default=None, help="Show visual cursor (default: on when not headless)")
|
|
74
|
+
@click.option("--port", default=9323, help="Server port (default: 9323)")
|
|
75
|
+
@click.option("--foreground/--no-foreground", default=False, help="Run server in foreground (default: background)")
|
|
76
|
+
@click.option("--user-data-dir", default=None, help="Browser user data directory for persistent sessions (preserves logins)")
|
|
77
|
+
def start(browser: str, headless: bool, stealth: bool, cloudflare_solver: bool, visualize_mouse: bool, port: int, foreground: bool, user_data_dir: str):
|
|
78
|
+
"""Start a new browser session."""
|
|
79
|
+
# Check if already running
|
|
80
|
+
if is_session_active():
|
|
81
|
+
output_error("start", "SessionExists", "A browser session is already active. Run 'phantomwright stop' first.")
|
|
82
|
+
return
|
|
83
|
+
|
|
84
|
+
if not foreground:
|
|
85
|
+
# Start server in background subprocess
|
|
86
|
+
args = [
|
|
87
|
+
sys.executable, "-m", "phantomwright.cli.server",
|
|
88
|
+
"--port", str(port),
|
|
89
|
+
"--browser", browser,
|
|
90
|
+
]
|
|
91
|
+
if headless:
|
|
92
|
+
args.append("--headless")
|
|
93
|
+
if not stealth:
|
|
94
|
+
args.append("--no-stealth")
|
|
95
|
+
if not cloudflare_solver:
|
|
96
|
+
args.append("--no-cloudflare-solver")
|
|
97
|
+
if visualize_mouse is not None:
|
|
98
|
+
args.append("--visualize-mouse" if visualize_mouse else "--no-visualize-mouse")
|
|
99
|
+
if user_data_dir:
|
|
100
|
+
args.extend(["--user-data-dir", user_data_dir])
|
|
101
|
+
|
|
102
|
+
# Use CREATE_NEW_PROCESS_GROUP on Windows, start_new_session on Unix
|
|
103
|
+
if sys.platform == "win32":
|
|
104
|
+
subprocess.Popen(
|
|
105
|
+
args,
|
|
106
|
+
creationflags=subprocess.CREATE_NEW_PROCESS_GROUP | subprocess.DETACHED_PROCESS,
|
|
107
|
+
stdout=subprocess.DEVNULL,
|
|
108
|
+
stderr=subprocess.DEVNULL,
|
|
109
|
+
)
|
|
110
|
+
else:
|
|
111
|
+
subprocess.Popen(
|
|
112
|
+
args,
|
|
113
|
+
start_new_session=True,
|
|
114
|
+
stdout=subprocess.DEVNULL,
|
|
115
|
+
stderr=subprocess.DEVNULL,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Wait for server to start
|
|
119
|
+
import time
|
|
120
|
+
from ..client import SessionClient
|
|
121
|
+
client = SessionClient(port=port)
|
|
122
|
+
for _ in range(300): # Wait up to 30 seconds
|
|
123
|
+
time.sleep(0.1)
|
|
124
|
+
if client.is_server_running():
|
|
125
|
+
result = client.get_status()
|
|
126
|
+
output_success("start", result)
|
|
127
|
+
return
|
|
128
|
+
|
|
129
|
+
output_error("start", "StartupTimeout", "Server did not start within timeout.")
|
|
130
|
+
else:
|
|
131
|
+
# Run server in foreground
|
|
132
|
+
asyncio.run(_start_foreground(browser, headless, stealth, cloudflare_solver, visualize_mouse, port, user_data_dir))
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
async def _start_foreground(browser: str, headless: bool, stealth: bool, cloudflare_solver: bool, visualize_mouse: bool, port: int, user_data_dir: str):
|
|
136
|
+
"""Start server in foreground."""
|
|
137
|
+
from ..server import run_server
|
|
138
|
+
await run_server(
|
|
139
|
+
port=port,
|
|
140
|
+
browser=browser,
|
|
141
|
+
headless=headless,
|
|
142
|
+
stealth=stealth,
|
|
143
|
+
cloudflare_solver=cloudflare_solver,
|
|
144
|
+
visualize_mouse=visualize_mouse,
|
|
145
|
+
user_data_dir=user_data_dir,
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@click.command()
|
|
150
|
+
def stop():
|
|
151
|
+
"""Stop the current browser session."""
|
|
152
|
+
client = get_client()
|
|
153
|
+
|
|
154
|
+
if not client.is_server_running():
|
|
155
|
+
# Check if there's a stale session file
|
|
156
|
+
stored_session = load_session()
|
|
157
|
+
if stored_session:
|
|
158
|
+
clear_session()
|
|
159
|
+
output_success("stop", {"message": "Cleaned up stale session file."})
|
|
160
|
+
else:
|
|
161
|
+
output_error("stop", "NoSession", "No active browser session to stop.")
|
|
162
|
+
return
|
|
163
|
+
|
|
164
|
+
result = client.stop()
|
|
165
|
+
|
|
166
|
+
if "error" in result:
|
|
167
|
+
output_error("stop", result["error"], result["message"])
|
|
168
|
+
else:
|
|
169
|
+
output_success("stop", {"message": "Browser session stopped."})
|