phantomwright 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {phantomwright-0.1.0 → phantomwright-0.1.2}/PKG-INFO +51 -4
  2. {phantomwright-0.1.0 → phantomwright-0.1.2}/README.md +50 -3
  3. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/__init__.py +1 -1
  4. phantomwright-0.1.2/phantomwright/_repo_version.py +1 -0
  5. phantomwright-0.1.2/phantomwright/captcha/__init__.py +5 -0
  6. phantomwright-0.1.2/phantomwright/captcha/cloudfare/scripts/observer.js +71 -0
  7. phantomwright-0.1.2/phantomwright/captcha/cloudfare/scripts/shadow_root.js +11 -0
  8. phantomwright-0.1.2/phantomwright/captcha/cloudfare/solver.py +284 -0
  9. phantomwright-0.1.2/phantomwright/captcha/cloudfare/utils/build_js.py +7 -0
  10. phantomwright-0.1.2/phantomwright/captcha/cloudfare/utils/consts.py +16 -0
  11. phantomwright-0.1.2/phantomwright/captcha/cloudfare/utils/detection.py +52 -0
  12. phantomwright-0.1.2/phantomwright/captcha/cloudfare/utils/dom_helpers.py +59 -0
  13. phantomwright-0.1.2/phantomwright/captcha/cloudfare/utils/shadow_root.py +122 -0
  14. {phantomwright-0.1.0 → phantomwright-0.1.2}/pyproject.toml +1 -1
  15. phantomwright-0.1.0/phantomwright/_repo_version.py +0 -1
  16. {phantomwright-0.1.0 → phantomwright-0.1.2}/.gitignore +0 -0
  17. {phantomwright-0.1.0 → phantomwright-0.1.2}/LICENSE +0 -0
  18. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/_impl/__init__.py +0 -0
  19. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/_impl/_core_debug_patch.py +0 -0
  20. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/_impl/_evaluate_patch.py +0 -0
  21. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/_impl/_inconsistency_patch.py +0 -0
  22. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/async_api/__init__.py +0 -0
  23. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/py.typed +0 -0
  24. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/__init__.py +0 -0
  25. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/chrome.app.js +0 -0
  26. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/chrome.csi.js +0 -0
  27. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/chrome.hairline.js +0 -0
  28. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/chrome.load.times.js +0 -0
  29. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/chrome.runtime.js +0 -0
  30. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/error.prototype.js +0 -0
  31. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/iframe.contentWindow.js +0 -0
  32. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/media.codecs.js +0 -0
  33. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.hardwareConcurrency.js +0 -0
  34. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.languages.js +0 -0
  35. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.permissions.js +0 -0
  36. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.platform.js +0 -0
  37. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.plugins.js +0 -0
  38. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.userAgent.js +0 -0
  39. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/navigator.vendor.js +0 -0
  40. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/evasions/webgl.vendor.js +0 -0
  41. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/generate.magic.arrays.js +0 -0
  42. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/js/utils.js +0 -0
  43. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/stealth/stealth.py +0 -0
  44. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/sync_api/__init__.py +0 -0
  45. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/README.md +0 -0
  46. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/__init__.py +0 -0
  47. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/async_basic.py +0 -0
  48. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/async_simulator.py +0 -0
  49. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/script.py +0 -0
  50. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/sync_basic.py +0 -0
  51. {phantomwright-0.1.0 → phantomwright-0.1.2}/phantomwright/user_simulator/sync_simulator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: phantomwright
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Bridging playwright-core patch + extending playwright API for stealth injection & user simulation
5
5
  Project-URL: homepage, https://github.com/ai-microsoft/phantom-wright
6
6
  Project-URL: changelog, https://github.com/ai-microsoft/phantom-wright/blob/main/CHANGELOG.md
@@ -31,9 +31,7 @@ Description-Content-Type: text/markdown
31
31
  - **Full Playwright API** — All APIs exported from Playwright, no learning curve
32
32
  - **Fingerprints Evasion** — Override browser fingerprints to better evade detection
33
33
  - **User Simulation** — Humanized page interactions for realistic behavior
34
-
35
- > [!NOTE]
36
- > Coming Soon: Captcha Resolver
34
+ - **Captcha Solver** — Automatic Cloudflare challenge solving with background monitoring
37
35
 
38
36
  ## Installation
39
37
 
@@ -136,6 +134,55 @@ with sync_playwright() as p:
136
134
  browser.close()
137
135
  ```
138
136
 
137
+ ### Cloudflare Captcha Solver
138
+
139
+ ```python
140
+ import logging
141
+ from phantomwright.async_api import async_playwright
142
+ from phantomwright.captcha.cloudfare.solver import CloudflareSolverAsync
143
+
144
+ logging.basicConfig(level=logging.INFO)
145
+ logger = logging.getLogger(__name__)
146
+
147
+ async def main():
148
+ async with async_playwright() as pw:
149
+ browser = await pw.chromium.launch(headless=False)
150
+ context = await browser.new_context()
151
+ solver = CloudflareSolverAsync(
152
+ context,
153
+ max_attempts=3,
154
+ attempt_delay=5,
155
+ log_callback=logger.info,
156
+ )
157
+
158
+ solver.start()
159
+ urls = [
160
+ "https://2captcha.com/demo/cloudflare-turnstile",
161
+ "https://2captcha.com/demo/cloudflare-turnstile-challenge"
162
+ ]
163
+ for url in urls:
164
+ page = await context.new_page()
165
+ await page.goto(url)
166
+ ```
167
+
168
+ **Key Features:**
169
+
170
+ - **Seamless Background Solving** — Once `start()` is called, the solver continuously monitors all pages in the context. No manual intervention required, even across navigations on the same page.
171
+ - **Dual Challenge Support** — Handles both Cloudflare **Turnstile** and **Interstitial** challenge types automatically.
172
+ - **Logging Callback** — Provides real-time visibility into captcha events via `log_callback`. Receives JSON strings containing:
173
+ ```json
174
+ {
175
+ "event": "cloudflare_captcha_solve",
176
+ "url": "https://example.com",
177
+ "challenge_type": "TURNSTILE",
178
+ "success": true,
179
+ "attempts": 1,
180
+ "duration_sec": 2.345,
181
+ "error": null,
182
+ "timestamp": 1736985600.123
183
+ }
184
+ ```
185
+
139
186
  ## Development
140
187
 
141
188
  ### Setup & Test
@@ -11,9 +11,7 @@
11
11
  - **Full Playwright API** — All APIs exported from Playwright, no learning curve
12
12
  - **Fingerprints Evasion** — Override browser fingerprints to better evade detection
13
13
  - **User Simulation** — Humanized page interactions for realistic behavior
14
-
15
- > [!NOTE]
16
- > Coming Soon: Captcha Resolver
14
+ - **Captcha Solver** — Automatic Cloudflare challenge solving with background monitoring
17
15
 
18
16
  ## Installation
19
17
 
@@ -116,6 +114,55 @@ with sync_playwright() as p:
116
114
  browser.close()
117
115
  ```
118
116
 
117
+ ### Cloudflare Captcha Solver
118
+
119
+ ```python
120
+ import logging
121
+ from phantomwright.async_api import async_playwright
122
+ from phantomwright.captcha.cloudfare.solver import CloudflareSolverAsync
123
+
124
+ logging.basicConfig(level=logging.INFO)
125
+ logger = logging.getLogger(__name__)
126
+
127
+ async def main():
128
+ async with async_playwright() as pw:
129
+ browser = await pw.chromium.launch(headless=False)
130
+ context = await browser.new_context()
131
+ solver = CloudflareSolverAsync(
132
+ context,
133
+ max_attempts=3,
134
+ attempt_delay=5,
135
+ log_callback=logger.info,
136
+ )
137
+
138
+ solver.start()
139
+ urls = [
140
+ "https://2captcha.com/demo/cloudflare-turnstile",
141
+ "https://2captcha.com/demo/cloudflare-turnstile-challenge"
142
+ ]
143
+ for url in urls:
144
+ page = await context.new_page()
145
+ await page.goto(url)
146
+ ```
147
+
148
+ **Key Features:**
149
+
150
+ - **Seamless Background Solving** — Once `start()` is called, the solver continuously monitors all pages in the context. No manual intervention required, even across navigations on the same page.
151
+ - **Dual Challenge Support** — Handles both Cloudflare **Turnstile** and **Interstitial** challenge types automatically.
152
+ - **Logging Callback** — Provides real-time visibility into captcha events via `log_callback`. Receives JSON strings containing:
153
+ ```json
154
+ {
155
+ "event": "cloudflare_captcha_solve",
156
+ "url": "https://example.com",
157
+ "challenge_type": "TURNSTILE",
158
+ "success": true,
159
+ "attempts": 1,
160
+ "duration_sec": 2.345,
161
+ "error": null,
162
+ "timestamp": 1736985600.123
163
+ }
164
+ ```
165
+
119
166
  ## Development
120
167
 
121
168
  ### Setup & Test
@@ -1,5 +1,5 @@
1
1
  """
2
- Patchright + Stealth Plugin + User Behavior Simulation = PhantomWright
2
+ Patchright + Stealth Plugin + User Behavior Simulation + Cloudflare Captcha Solver = PhantomWright
3
3
  """
4
4
 
5
5
  from ._impl import _core_debug_patch, _evaluate_patch, _inconsistency_patch
@@ -0,0 +1 @@
1
+ version = 'v0.1.2'
@@ -0,0 +1,5 @@
1
+ from .cloudfare.solver import CloudflareSolverAsync
2
+
3
+ __all__ = [
4
+ "CloudflareSolverAsync",
5
+ ]
@@ -0,0 +1,71 @@
1
+ (function () {
2
+ const selectors = __CF_SELECTORS__;
3
+ const reportedNodes = new WeakSet();
4
+ let pendingRescan = false;
5
+
6
+ function isRealCallbackReady() {
7
+ return (
8
+ typeof window.onCloudflareDetected === "function" &&
9
+ window.onCloudflareDetected.toString().includes("__cf_callback")
10
+ );
11
+ }
12
+
13
+ function tryReport(node, sel) {
14
+ if (reportedNodes.has(node)) return;
15
+
16
+ if (!isRealCallbackReady()) {
17
+ pendingRescan = true;
18
+ return;
19
+ }
20
+
21
+ reportedNodes.add(node);
22
+ try {
23
+ window.onCloudflareDetected(sel, location.href);
24
+ } catch (e) {
25
+ console.error("onCloudflareDetected failed:", e);
26
+ }
27
+ }
28
+
29
+ function scan(root) {
30
+ for (const sel of selectors) {
31
+ let nodes;
32
+ try {
33
+ nodes = root.querySelectorAll(sel);
34
+ } catch (e) {
35
+ continue;
36
+ }
37
+ for (const node of nodes) {
38
+ console.log("Found Cloudflare element for selector:", sel, node);
39
+ tryReport(node, sel);
40
+ }
41
+ }
42
+ }
43
+
44
+ function startObserve() {
45
+ const root = document.body || document.documentElement;
46
+ if (!root) return;
47
+
48
+ const observer = new MutationObserver(() => {
49
+ scan(root);
50
+ });
51
+
52
+ observer.observe(root, { childList: true, subtree: true });
53
+ scan(root);
54
+ }
55
+
56
+ const readyTimer = setInterval(() => {
57
+ if (pendingRescan && isRealCallbackReady()) {
58
+ const root = document.body || document.documentElement;
59
+ if (root) {
60
+ scan(root);
61
+ pendingRescan = false;
62
+ }
63
+ }
64
+ }, 300);
65
+
66
+ if (document.readyState === "loading") {
67
+ document.addEventListener("DOMContentLoaded", startObserve);
68
+ } else {
69
+ startObserve();
70
+ }
71
+ })();
@@ -0,0 +1,11 @@
1
+ (() => {
2
+ const originalAttachShadow = Element.prototype.attachShadow;
3
+ Element.prototype.attachShadow = function (init) {
4
+ const shadowRoot = originalAttachShadow.call(this, init);
5
+
6
+ // expose shadowRoot for later use
7
+ this.shadowRootUnl = shadowRoot;
8
+
9
+ return shadowRoot;
10
+ };
11
+ })();
@@ -0,0 +1,284 @@
1
+ import asyncio
2
+ import json
3
+ import time
4
+ from typing import Callable, Optional, Union
5
+
6
+ from phantomwright.async_api import Page, Browser, BrowserContext
7
+
8
+ from .utils.consts import ChallengeType
9
+ from .utils.detection import (
10
+ detect_cf_challenge_type,
11
+ detect_cloudflare_challenge,
12
+ )
13
+ from .utils.dom_helpers import get_ready_checkbox
14
+ from .utils.shadow_root import (
15
+ search_shadow_root_elements,
16
+ search_shadow_root_iframes,
17
+ )
18
+ from .utils.build_js import observer_js, shadow_root_js
19
+
20
+
21
+ class CloudflareSolverAsync:
22
+ """
23
+ Automatic Cloudflare challenge solver for Playwright async API.
24
+
25
+ This class automatically detects and solves Cloudflare Turnstile and Interstitial
26
+ challenges by monitoring page loads and clicking the verification checkbox.
27
+
28
+ Attributes:
29
+ context: Playwright BrowserContext to monitor for Cloudflare challenges.
30
+ max_attempts: Maximum number of solve attempts per challenge (default: 3).
31
+ attempt_delay: Delay in seconds between retry attempts (default: 5).
32
+ log: Optional callback function for logging solve events.
33
+
34
+ Example:
35
+ >>> from phantomwright.async_api import async_playwright
36
+ >>> from .solver import CloudflareSolverAsync
37
+ >>>
38
+ >>> async with async_playwright() as p:
39
+ ... browser = await p.chromium.launch(headless=False)
40
+ ... context = await browser.new_context()
41
+ ... solver = CloudflareSolverAsync(context, log_callback=print)
42
+ ... solver.solve() # Start monitoring for Cloudflare challenges
43
+ ... page = await context.new_page()
44
+ ... await page.goto("https://example.com") # Challenges will be auto-solved
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ context_or_browser: Union[BrowserContext, Browser],
50
+ *,
51
+ max_attempts: int = 3,
52
+ attempt_delay: int = 5,
53
+ log_callback: Optional[Callable[[str], None]] = None,
54
+ ):
55
+ """
56
+ Initialize the Cloudflare solver.
57
+
58
+ Args:
59
+ context_or_browser: Playwright BrowserContext or Browser to monitor for challenges.
60
+ If a Browser is provided, all its contexts will be monitored.
61
+ max_attempts: Maximum number of solve attempts per challenge.
62
+ attempt_delay: Delay in seconds between retry attempts.
63
+ log_callback: Optional callback function that receives a JSON string
64
+ containing solve event details. The JSON includes:
65
+ - event: "cloudflare_captcha_solve"
66
+ - url: The page URL where the challenge was detected
67
+ - challenge_type: "TURNSTILE" or "INTERSTITIAL"
68
+ - success: Whether the solve was successful
69
+ - attempts: Number of attempts made
70
+ - duration_sec: Time taken to solve
71
+ - error: Error message if failed, None otherwise
72
+ - timestamp: Unix timestamp of the event
73
+ """
74
+ self._is_browser = hasattr(context_or_browser, 'new_context')
75
+ self._browser: Optional[Browser] = context_or_browser if self._is_browser else None
76
+ self._context: Optional[BrowserContext] = None if self._is_browser else context_or_browser
77
+ self.max_attempts = max_attempts
78
+ self.attempt_delay = attempt_delay
79
+ self.log = log_callback
80
+
81
+ self.solve_click_delay = 6
82
+ self.wait_checkbox_attempts = 3
83
+ self.wait_checkbox_delay = 6
84
+ self.checkbox_click_attempts = 3
85
+
86
+ self.page_solve_state = {}
87
+
88
+ # ---------------- state ----------------
89
+ def _get_page_state(self, page: Page):
90
+ return self.page_solve_state.setdefault(
91
+ page,
92
+ {
93
+ "status": "idle", # idle | solving | done
94
+ "last_url": None,
95
+ },
96
+ )
97
+
98
+ # ---------------- js rebind ----------------
99
+ async def _rebind(self, page: Page):
100
+ await page.evaluate("""
101
+ window.onCloudflareDetected = function(sel, url) {
102
+ window.__cf_callback(sel, url);
103
+ };
104
+ """)
105
+
106
+ # ---------------- report helper ----------------
107
+ def _log_final_report(self, report: dict):
108
+ data = {
109
+ "event": "cloudflare_captcha_solve",
110
+ "url": report.get("url"),
111
+ "challenge_type": report.get("challenge_type"),
112
+ "success": report.get("success"),
113
+ "attempts": report.get("attempts"),
114
+ "duration_sec": round(report.get("duration", 0), 3),
115
+ "error": report.get("error"),
116
+ "timestamp": time.time(),
117
+ }
118
+
119
+ self.log(json.dumps(data, ensure_ascii=False))
120
+
121
+ # ---------------- core solve ----------------
122
+ async def _auto_solve_cf(self, page: Page):
123
+ state = self._get_page_state(page)
124
+
125
+ report = {
126
+ "url": page.url,
127
+ "challenge_type": None,
128
+ "success": False,
129
+ "attempts": 0,
130
+ "error": None,
131
+ "start_time": time.time(),
132
+ "duration": 0,
133
+ }
134
+
135
+ try:
136
+ for attempt in range(1, self.max_attempts + 1):
137
+ report["attempts"] = attempt
138
+ try:
139
+ challenge_type = await detect_cf_challenge_type(page)
140
+ if challenge_type is None:
141
+ raise Exception("Unknown challenge type")
142
+
143
+ report["challenge_type"] = challenge_type.name
144
+
145
+ if challenge_type is ChallengeType.TURNSTILE:
146
+ await page.locator("#cf-turnstile").wait_for()
147
+
148
+ cf_iframes = await search_shadow_root_iframes(
149
+ captcha_container=page,
150
+ src_filter="https://challenges.cloudflare.com/cdn-cgi/challenge-platform/",
151
+ )
152
+ if not cf_iframes:
153
+ raise Exception("Cloudflare iframe not found")
154
+
155
+ checkbox_data = await get_ready_checkbox(
156
+ iframes=cf_iframes,
157
+ delay=self.wait_checkbox_delay,
158
+ attempts=self.wait_checkbox_attempts,
159
+ )
160
+ if not checkbox_data:
161
+ raise Exception("Checkbox not ready")
162
+
163
+ iframe, checkbox = checkbox_data
164
+
165
+ click_errors = []
166
+ for i in range(self.checkbox_click_attempts):
167
+ try:
168
+ await checkbox.click()
169
+ break
170
+ except Exception as e:
171
+ click_errors.append(e)
172
+ else:
173
+ raise Exception(f"Failed to click checkbox. Errors: {click_errors}")
174
+
175
+ await asyncio.sleep(self.solve_click_delay)
176
+ if challenge_type is ChallengeType.TURNSTILE:
177
+ success_elements = await search_shadow_root_elements(
178
+ iframe, 'div[id="success"]'
179
+ )
180
+ solved = bool(success_elements)
181
+ else:
182
+ solved = not await detect_cloudflare_challenge(page)
183
+
184
+ if solved:
185
+ state["status"] = "done"
186
+ state["last_url"] = page.url
187
+
188
+ report["success"] = True
189
+ return
190
+ else:
191
+ raise Exception("Solve attempt did not pass verification")
192
+
193
+ except Exception as e:
194
+ report["error"] = str(e)
195
+ await asyncio.sleep(self.attempt_delay)
196
+
197
+ state["status"] = "idle"
198
+ raise Exception(f"Failed after {self.max_attempts} attempts")
199
+
200
+ except Exception as final_error:
201
+ report["error"] = str(final_error)
202
+ raise
203
+
204
+ finally:
205
+ report["duration"] = time.time() - report["start_time"]
206
+ self._log_final_report(report)
207
+
208
+ # ---------------- callback ----------------
209
+ def _make_on_cf_detected(self, page: Page):
210
+ async def on_cf_detected(selector, url):
211
+ if not selector:
212
+ return
213
+
214
+ state = self._get_page_state(page)
215
+
216
+ if state["last_url"] != page.url:
217
+ state["status"] = "idle"
218
+ state["last_url"] = page.url
219
+
220
+ if state["status"] in ("solving", "done"):
221
+ return
222
+
223
+ state["status"] = "solving"
224
+
225
+ asyncio.create_task(self._auto_solve_cf(page))
226
+
227
+ return on_cf_detected
228
+
229
+ # ---------------- page setup ----------------
230
+ async def _setup_page(self, page: Page):
231
+ await page.add_init_script("""
232
+ window.onCloudflareDetected = function(sel, url) {};
233
+ """)
234
+ await page.add_init_script(observer_js)
235
+ await page.add_init_script(shadow_root_js)
236
+
237
+ await page.expose_function("__cf_callback", self._make_on_cf_detected(page))
238
+ page.on("load", lambda: asyncio.create_task(self._rebind(page)))
239
+
240
+ def _setup_context(self, context: BrowserContext) -> None:
241
+ """Set up page monitoring for a single context."""
242
+ context.on(
243
+ "page",
244
+ lambda p: asyncio.create_task(self._setup_page(p)),
245
+ )
246
+
247
+ # ---------------- public api ----------------
248
+ def start(self) -> None:
249
+ """
250
+ Start monitoring the browser or context for Cloudflare challenges.
251
+
252
+ This method registers event listeners to automatically detect and solve
253
+ Cloudflare challenges on any new page. Call this once after creating
254
+ the solver instance.
255
+
256
+ If initialized with a Browser, monitors all existing and new contexts.
257
+ If initialized with a BrowserContext, monitors only that context.
258
+
259
+ Note:
260
+ This method is synchronous but sets up async handlers internally.
261
+ Challenges will be solved automatically in the background.
262
+ """
263
+ if self._is_browser:
264
+ # Track which contexts we've already set up
265
+ self._monitored_contexts: set = set()
266
+
267
+ def _check_and_setup_contexts():
268
+ for context in self._browser.contexts:
269
+ if context not in self._monitored_contexts:
270
+ self._monitored_contexts.add(context)
271
+ self._setup_context(context)
272
+
273
+ # Monitor all existing contexts
274
+ _check_and_setup_contexts()
275
+
276
+ # Start a background task to periodically check for new contexts
277
+ async def _monitor_contexts():
278
+ while self._browser.is_connected():
279
+ _check_and_setup_contexts()
280
+ await asyncio.sleep(0.1) # Check every 100ms for new contexts
281
+
282
+ self._monitor_task = asyncio.create_task(_monitor_contexts())
283
+ else:
284
+ self._setup_context(self._context)
@@ -0,0 +1,7 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from .consts import ALL_CF_SELECTORS
5
+
6
+ observer_js = Path(__file__).parent.parent.joinpath("scripts", "observer.js").read_text(encoding="utf-8").replace("__CF_SELECTORS__", json.dumps(ALL_CF_SELECTORS))
7
+ shadow_root_js = Path(__file__).parent.parent.joinpath("scripts", "shadow_root.js").read_text(encoding="utf-8")
@@ -0,0 +1,16 @@
1
+ from enum import Enum
2
+
3
+ CF_INTERSTITIAL_SELECTORS = [
4
+ 'script[src*="/cdn-cgi/challenge-platform/"]',
5
+ ]
6
+
7
+ CF_TURNSTILE_SELECTORS = [
8
+ 'input[name="cf-turnstile-response"]',
9
+ 'script[src*="challenges.cloudflare.com/turnstile/v0"]',
10
+ ]
11
+
12
+ ALL_CF_SELECTORS = CF_INTERSTITIAL_SELECTORS + CF_TURNSTILE_SELECTORS
13
+
14
+ class ChallengeType(Enum):
15
+ INTERSTITIAL = "interstitial"
16
+ TURNSTILE = "turnstile"
@@ -0,0 +1,52 @@
1
+ from typing import Literal, Union
2
+
3
+ from phantomwright.async_api import ElementHandle, Frame, Page
4
+ from .consts import CF_INTERSTITIAL_SELECTORS, CF_TURNSTILE_SELECTORS, ChallengeType
5
+
6
+ async def detect_cloudflare_challenge(
7
+ captcha_container: Union[Page, Frame, ElementHandle],
8
+ challenge_type: Literal['turnstile', 'interstitial'] = 'turnstile'
9
+ ) -> bool:
10
+ """
11
+ Detect if a Cloudflare challenge is present in the provided captcha container by checking for specific predefined selectors
12
+
13
+ :param captcha_container: Page, Frame, ElementHandle
14
+ :param challenge_type: Type of challenge to detect ('turnstile' or 'interstitial')
15
+
16
+ :return: True if Cloudflare challenge is detected, False otherwise
17
+ """
18
+
19
+ if challenge_type not in ('turnstile', 'interstitial'):
20
+ raise ValueError("Invalid challenge_type: it must be either 'turnstile' or 'interstitial'")
21
+
22
+ selectors = CF_TURNSTILE_SELECTORS if challenge_type == 'turnstile' else CF_INTERSTITIAL_SELECTORS
23
+ for selector in selectors:
24
+ try:
25
+ element = captcha_container.locator(selector)
26
+ if await element.count() == 0:
27
+ continue
28
+ except Exception as e:
29
+ if 'Execution context was destroyed, most likely because of a navigation' in str(e):
30
+ # logger.warning(
31
+ # 'Execution context was destroyed while detecting Cloudflare challenge - counting as not detected')
32
+ return False
33
+
34
+ return True
35
+
36
+ return False
37
+
38
+ async def detect_cf_challenge_type(page: Page):
39
+ async def any_selector_hit(selectors):
40
+ for sel in selectors:
41
+ try:
42
+ if await page.locator(sel).count() > 0:
43
+ return True
44
+ except:
45
+ continue
46
+ return False
47
+
48
+ if await any_selector_hit(CF_INTERSTITIAL_SELECTORS):
49
+ return ChallengeType.INTERSTITIAL
50
+ if await any_selector_hit(CF_TURNSTILE_SELECTORS):
51
+ return ChallengeType.TURNSTILE
52
+ return None
@@ -0,0 +1,59 @@
1
+ import asyncio
2
+ from typing import Optional, List, Tuple
3
+
4
+ from phantomwright.async_api import Frame, ElementHandle
5
+ from .shadow_root import search_shadow_root_elements
6
+
7
+ async def get_ready_checkbox(
8
+ iframes: List[Frame],
9
+ delay: int,
10
+ attempts: int
11
+ ) -> Optional[Tuple[Frame, ElementHandle]]:
12
+ """
13
+ Accepts a list of Cloudflare iframes, sorts out detached ones, collects checkboxes from the remaining iframes,
14
+ and waits until at least one checkbox is found and ready to be clicked (visible)
15
+
16
+ :param iframes: Cloudflare iframes
17
+ :param delay: Delay in seconds between attempts to find the checkbox
18
+ :param attempts: Maximum number of attempts to find the checkbox
19
+
20
+ :return: Tuple (checkboxes Frame, checkboxes ElementHandle) if checkbox is found and ready, None otherwise
21
+ """
22
+
23
+ # ensure at least one attempt
24
+ if attempts <= 0:
25
+ attempts = 1
26
+
27
+ for attempt in range(attempts):
28
+ try:
29
+ checkboxes = []
30
+
31
+ # search for checkboxes in each iframe
32
+ for iframe in iframes:
33
+ try:
34
+ if iframe.is_detached(): # skip detached iframes
35
+ continue
36
+
37
+ iframe_checkboxes = await search_shadow_root_elements(iframe, 'input[type="checkbox"]')
38
+
39
+ # add found checkboxes to the list with their parent iframe
40
+ checkboxes += [(iframe, iframe_checkbox) for iframe_checkbox in iframe_checkboxes]
41
+ except Exception as e:
42
+ pass
43
+ # logger.error(f'Error searching for checkboxes in iframe: {e}')
44
+
45
+ # filter checkboxes that are visible and ready to be clicked
46
+ visible_checkboxes = []
47
+ for iframe, checkbox in checkboxes:
48
+ if await checkbox.is_visible():
49
+ visible_checkboxes.append((iframe, checkbox))
50
+
51
+ if visible_checkboxes:
52
+ return visible_checkboxes[0] # return the first visible checkbox
53
+
54
+ await asyncio.sleep(delay)
55
+ except Exception as e:
56
+ pass
57
+ # logger.error(f'Error while waiting for checkbox: {e}')
58
+
59
+ return None
@@ -0,0 +1,122 @@
1
+ from typing import Union, List, Optional
2
+ from phantomwright.async_api import ElementHandle, Page, Frame
3
+
4
+ async def get_shadow_roots(
5
+ queryable: Union[Page, Frame, ElementHandle],
6
+ ) -> List[ElementHandle]:
7
+ """
8
+ Get all shadow roots on the page
9
+
10
+ :param queryable: Page, Frame, ElementHandle
11
+
12
+ :return: List of shadow roots ElementHandles
13
+ """
14
+
15
+ # script to collect all shadow roots
16
+ js_script = """
17
+ () => {
18
+ const roots = [];
19
+
20
+ function collectShadowRoots(node) {
21
+ console.log("captcha", "node", node);
22
+ if (!node) return;
23
+
24
+ if (node.shadowRootUnl) {
25
+ roots.push(node.shadowRootUnl);
26
+ node = node.shadowRootUnl;
27
+ }
28
+
29
+ for (const el of node.querySelectorAll("*")) {
30
+ if (el.shadowRootUnl) {
31
+ collectShadowRoots(el);
32
+ }
33
+ }
34
+ }
35
+
36
+ collectShadowRoots(document);
37
+ console.log(roots);
38
+ return roots;
39
+ }
40
+ """
41
+
42
+ handle = await queryable.evaluate_handle(js_script)
43
+
44
+ # convert JSHandle array to python list of ElementHandle
45
+ properties = await handle.get_properties()
46
+
47
+ shadow_roots = []
48
+ for prop_handle in properties.values():
49
+ element = prop_handle.as_element()
50
+ if element:
51
+ shadow_roots.append(element)
52
+
53
+ return shadow_roots
54
+
55
+
56
+ async def search_shadow_root_elements(
57
+ queryable: Union[Page, Frame, ElementHandle],
58
+ selector: str
59
+ ) -> List[ElementHandle]:
60
+ """
61
+ Search for elements by selector within the shadow DOM of the queryable object
62
+
63
+ :param queryable: Page, Frame, ElementHandle
64
+ :param selector: CSS selector to search for elements
65
+
66
+ :return: List of ElementHandles that match the selector
67
+ """
68
+
69
+ elements = []
70
+
71
+ try:
72
+ shadow_roots = await get_shadow_roots(queryable) # get all shadow roots in the queryable object
73
+ for shadow_root in shadow_roots:
74
+ # find all elements by selector within the shadow root
75
+ js_script = f"shadow => shadow.querySelector('{selector}')"
76
+
77
+ element_handle = await shadow_root.evaluate_handle(js_script)
78
+ if not element_handle:
79
+ continue
80
+
81
+ element = element_handle.as_element()
82
+ if element:
83
+ elements.append(element)
84
+ except Exception as e:
85
+ pass
86
+ # logger.error(f'Error searching for elements: {e}')
87
+
88
+ return elements
89
+
90
+
91
+ async def search_shadow_root_iframes(
92
+ captcha_container: Union[Page, Frame, ElementHandle],
93
+ src_filter: str
94
+ ) -> Optional[List[Frame]]:
95
+ """
96
+ Search for an iframe within the shadow DOM, src of which includes the src_filter
97
+
98
+ :param captcha_container: Page, Frame, ElementHandle
99
+ :param src_filter: String to filter the iframe's src attribute
100
+
101
+ :return: list of matched iframes or empty list if no iframes found
102
+ """
103
+
104
+ matched_iframes = []
105
+
106
+ try:
107
+ iframe_elements = await search_shadow_root_elements(captcha_container, 'iframe')
108
+ for iframe_element in iframe_elements:
109
+ src_prop = await iframe_element.get_property('src')
110
+ src = await src_prop.json_value()
111
+
112
+ if src_filter in src:
113
+ cf_iframe = await iframe_element.content_frame()
114
+ if cf_iframe and cf_iframe.is_detached(): # skip detached iframes
115
+ continue
116
+
117
+ matched_iframes.append(cf_iframe)
118
+ except Exception as e:
119
+ pass
120
+ # logger.error(f'Error searching for iframes: {e}')
121
+
122
+ return matched_iframes
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "phantomwright"
3
- version = "0.1.0"
3
+ version = "0.1.2"
4
4
  description = "Bridging playwright-core patch + extending playwright API for stealth injection & user simulation"
5
5
  authors = [
6
6
  {name="Hang Yin", email="hangyin@microsoft.com"},
@@ -1 +0,0 @@
1
- version = 'v0.1.0'
File without changes
File without changes