helixwright 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- helixwright/__init__.py +58 -0
- helixwright/_impl/__init__.py +1 -0
- helixwright/_impl/_assertions.py +184 -0
- helixwright/_impl/_consent.py +80 -0
- helixwright/_impl/_errors.py +44 -0
- helixwright/_impl/_forensics.py +202 -0
- helixwright/_impl/_humanize.py +250 -0
- helixwright/_impl/_input.py +238 -0
- helixwright/_impl/_locator.py +1322 -0
- helixwright/_impl/_network.py +265 -0
- helixwright/_impl/_operator.py +746 -0
- helixwright/_impl/_page.py +2585 -0
- helixwright/_impl/_search.py +2 -0
- helixwright/_impl/_selectors.py +322 -0
- helixwright/_impl/_settings.py +51 -0
- helixwright/_impl/_transport.py +127 -0
- helixwright/_impl/_waits.py +204 -0
- helixwright/client.py +504 -0
- helixwright/errors.py +50 -0
- helixwright/launcher.py +267 -0
- helixwright/models.py +117 -0
- helixwright/py.typed +1 -0
- helixwright/solver.py +107 -0
- helixwright-0.1.0.dist-info/METADATA +240 -0
- helixwright-0.1.0.dist-info/RECORD +27 -0
- helixwright-0.1.0.dist-info/WHEEL +5 -0
- helixwright-0.1.0.dist-info/top_level.txt +1 -0
helixwright/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Helixwright automation SDK.
|
|
2
|
+
|
|
3
|
+
The SDK talks to Helix Browser through the desktop Local API. It does not spawn
|
|
4
|
+
Chrome, generate fingerprints, or manage browser profile directories by itself.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .client import Client, Profile, discover_base_url
|
|
8
|
+
from .errors import (
|
|
9
|
+
AutomationEndpointUnavailable,
|
|
10
|
+
AutomationSessionNotFound,
|
|
11
|
+
DesktopNotLoggedIn,
|
|
12
|
+
ElementNotFoundError,
|
|
13
|
+
HelixwrightError,
|
|
14
|
+
LocalApiError,
|
|
15
|
+
LocalApiUnavailable,
|
|
16
|
+
LoginRejected,
|
|
17
|
+
ProfileNotFound,
|
|
18
|
+
ReachabilityError,
|
|
19
|
+
TransportError,
|
|
20
|
+
WaitTimeoutError,
|
|
21
|
+
)
|
|
22
|
+
from ._impl._network import Response, Route
|
|
23
|
+
from ._impl._operator import El, NoneEl, Op
|
|
24
|
+
from .launcher import HelixBrowser, attach, launch
|
|
25
|
+
from .models import AutomationRpc, AutomationSession, Fingerprint, LaunchConfig, Proxy
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"launch",
|
|
29
|
+
"attach",
|
|
30
|
+
"Client",
|
|
31
|
+
"Profile",
|
|
32
|
+
"Fingerprint",
|
|
33
|
+
"Proxy",
|
|
34
|
+
"LaunchConfig",
|
|
35
|
+
"AutomationRpc",
|
|
36
|
+
"AutomationSession",
|
|
37
|
+
"HelixBrowser",
|
|
38
|
+
"Op",
|
|
39
|
+
"El",
|
|
40
|
+
"NoneEl",
|
|
41
|
+
"Response",
|
|
42
|
+
"Route",
|
|
43
|
+
"discover_base_url",
|
|
44
|
+
"HelixwrightError",
|
|
45
|
+
"LocalApiError",
|
|
46
|
+
"LocalApiUnavailable",
|
|
47
|
+
"DesktopNotLoggedIn",
|
|
48
|
+
"ProfileNotFound",
|
|
49
|
+
"AutomationSessionNotFound",
|
|
50
|
+
"AutomationEndpointUnavailable",
|
|
51
|
+
"TransportError",
|
|
52
|
+
"ReachabilityError",
|
|
53
|
+
"ElementNotFoundError",
|
|
54
|
+
"WaitTimeoutError",
|
|
55
|
+
"LoginRejected",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Private implementation package (unstable internals). Import the public API from `helixwright`, not from here."""
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
from ._errors import HelixwrightError
|
|
5
|
+
|
|
6
|
+
# The accessibility matchers now read via NATIVE RPC helpers on the Locator (_aria_role /
|
|
7
|
+
# _accessible_name / _accessible_description over get_attribute + element_property('tagName')),
|
|
8
|
+
# so NO JS runs. They remain APPROXIMATE (heuristic, not Chromium's computed accessibility tree).
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _AssertionsBase:
|
|
12
|
+
"""Polling assertion base: shared timeout, negation (not_/is_not), and the poll loop."""
|
|
13
|
+
|
|
14
|
+
def __init__(self, target, timeout_ms=5000, is_not=False):
|
|
15
|
+
self._t = target
|
|
16
|
+
self._to = timeout_ms
|
|
17
|
+
self._not = is_not
|
|
18
|
+
|
|
19
|
+
@property
|
|
20
|
+
def not_(self):
|
|
21
|
+
return type(self)(self._t, self._to, not self._not)
|
|
22
|
+
|
|
23
|
+
is_not = not_
|
|
24
|
+
|
|
25
|
+
def _poll(self, pred, desc):
|
|
26
|
+
deadline = time.time() + self._to / 1000.0
|
|
27
|
+
last = None
|
|
28
|
+
while True:
|
|
29
|
+
try:
|
|
30
|
+
last = bool(pred())
|
|
31
|
+
except (HelixwrightError, TypeError, AttributeError):
|
|
32
|
+
# transient (element gone / null evaluate) -> failed-but-retryable, never crash
|
|
33
|
+
last = False
|
|
34
|
+
if last != self._not:
|
|
35
|
+
return
|
|
36
|
+
if time.time() >= deadline:
|
|
37
|
+
raise AssertionError("expect %s%s failed (last=%r)"
|
|
38
|
+
% ("not_." if self._not else "", desc, last))
|
|
39
|
+
time.sleep(0.08)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class LocatorAssertions(_AssertionsBase):
    """Assertions about a Locator — visibility, text, value, count, state.

    Every matcher hands a predicate and a description to ``_poll``. Descriptions are
    always formatted from an explicit tuple, so an expected value that is itself a
    tuple cannot be unpacked by the ``%`` operator (which would mislabel the message or
    raise TypeError before any polling happened).
    """

    def to_be_visible(self):
        """Pass once ``is_visible()`` is truthy."""
        self._poll(self._t.is_visible, "to_be_visible")

    def to_be_hidden(self):
        """Pass once ``is_visible()`` is falsy."""
        self._poll(lambda: not self._t.is_visible(), "to_be_hidden")

    def to_be_enabled(self):
        """Pass once ``is_enabled()`` is truthy."""
        self._poll(self._t.is_enabled, "to_be_enabled")

    def to_be_disabled(self):
        """Pass once ``is_enabled()`` is falsy."""
        self._poll(lambda: not self._t.is_enabled(), "to_be_disabled")

    def to_have_text(self, txt):
        """Pass once the stripped ``text()`` equals |txt|."""
        self._poll(lambda: self._t.text().strip() == txt, "to_have_text(%r)" % (txt,))

    def to_contain_text(self, txt):
        """Pass once |txt| is a substring of ``text()``."""
        self._poll(lambda: txt in self._t.text(), "to_contain_text(%r)" % (txt,))

    def to_have_value(self, v):
        """Pass once ``input_value()`` equals |v|."""
        self._poll(lambda: self._t.input_value() == v, "to_have_value(%r)" % (v,))

    def to_have_count(self, n):
        """Pass once ``count()`` equals |n| (|n| must be numeric: it is formatted with %d)."""
        self._poll(lambda: self._t.count() == n, "to_have_count(%d)" % (n,))

    def to_be_checked(self):
        """Pass once ``is_checked()`` is truthy."""
        self._poll(self._t.is_checked, "to_be_checked")

    def to_be_editable(self):
        """Pass once ``is_editable()`` is truthy."""
        self._poll(self._t.is_editable, "to_be_editable")

    def to_be_focused(self):
        """Pass once ``is_focused()`` is truthy."""
        self._poll(self._t.is_focused, "to_be_focused")

    def to_have_attribute(self, name, value=None):
        """Pass once attribute |name| is present (value=None) or equals |value|."""
        def _p():
            v = self._t.get_attribute(name)
            return (v is not None) if value is None else (v == value)
        self._poll(_p, "to_have_attribute(%r,%r)" % (name, value))

    # --- Playwright-parity matchers (pure-Python over Locator.evaluate/get_attribute/count) ----
    def to_have_class(self, value):
        """The full class attribute equals |value| (Playwright string form)."""
        self._poll(lambda: (self._t.get_attribute("class") or "") == value,
                   "to_have_class(%r)" % (value,))

    def to_contain_class(self, value):
        """The class token list contains |value|."""
        self._poll(lambda: value in (self._t.get_attribute("class") or "").split(),
                   "to_contain_class(%r)" % (value,))

    def to_have_id(self, id_):
        """Pass once the ``id`` attribute (missing counts as "") equals |id_|."""
        self._poll(lambda: (self._t.get_attribute("id") or "") == id_,
                   "to_have_id(%r)" % (id_,))

    def to_have_css(self, name, value):
        """Pass once ``computed_style(name)`` equals |value|."""
        # [no-JS] native WebElement::GetComputedValue via Locator.computed_style.
        self._poll(lambda: self._t.computed_style(name) == value,
                   "to_have_css(%r,%r)" % (name, value))

    def to_have_js_property(self, name, value):
        """Pass once the element's JS property |name| (read via ``evaluate``) equals |value|."""
        # json.dumps quotes/escapes the property name for the el[...] lookup.
        self._poll(lambda: self._t.evaluate("el[%s]" % json.dumps(name)) == value,
                   "to_have_js_property(%r,%r)" % (name, value))

    def to_have_values(self, values):
        """The selected <option> values of a multi-select equal |values| (list)."""
        want = list(values)
        self._poll(lambda: self._t.evaluate(
            "Array.prototype.map.call(el.selectedOptions||[],function(o){return o.value;})") == want,
            "to_have_values(%r)" % (want,))

    def to_be_empty(self):
        """No child elements and no text (Playwright to_be_empty). [no-JS] native InnerHTML()=="" check."""
        self._poll(lambda: (self._t.inner_html() or "").strip() == "", "to_be_empty")

    def to_be_attached(self):
        """Pass once ``count()`` is greater than zero."""
        self._poll(lambda: self._t.count() > 0, "to_be_attached")

    def to_be_in_viewport(self):
        """Pass once ``is_visible()`` is truthy (same predicate as to_be_visible)."""
        # [no-JS] VisibleBoundsInWidget() is viewport-clipped, so IsVisible() == visible-in-viewport.
        self._poll(self._t.is_visible, "to_be_in_viewport")

    def to_have_role(self, role):
        """APPROXIMATE computed ARIA role (explicit role attr or a tag heuristic — not the engine AOM).
        [no-JS] via Locator._aria_role (get_attribute + element_property('tagName'))."""
        self._poll(lambda: self._t._aria_role() == role, "to_have_role(%r)" % (role,))

    def to_have_accessible_name(self, name):
        """APPROXIMATE accessible name (aria-label/text/alt/title heuristic). [no-JS] via native RPCs."""
        self._poll(lambda: (self._t._accessible_name() or "").strip() == name,
                   "to_have_accessible_name(%r)" % (name,))

    def to_have_accessible_description(self, desc):
        """APPROXIMATE accessible description (title heuristic). [no-JS] via native RPC."""
        self._poll(lambda: (self._t._accessible_description() or "").strip() == desc,
                   "to_have_accessible_description(%r)" % (desc,))
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class PageAssertions(_AssertionsBase):
    """Page-level polling assertions: current URL and document title."""

    def _page(self):
        """Return the object to evaluate against.

        The target is used directly when it has a ``goto`` attribute (a Page). Otherwise
        its ``_page`` attribute is used when present (a Locator handed in through the
        combined Expect), falling back to the target itself.
        """
        target = self._t
        if hasattr(target, "goto"):
            return target
        return getattr(target, "_page", target)

    def to_have_url(self, url):
        """Pass once |url| is a substring of ``location.href``."""
        def _url_matches():
            current = self._page().evaluate("location.href")
            return url in current

        self._poll(_url_matches, "to_have_url(%r)" % url)

    def to_have_title(self, title):
        """Pass once |title| is a substring of ``document.title``."""
        def _title_matches():
            current = self._page().evaluate("document.title")
            return title in current

        self._poll(_title_matches, "to_have_title(%r)" % title)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
class Expect(LocatorAssertions, PageAssertions):
    """Combined locator + page assertions, kept for backward compatibility.

    This is the public ``Expect`` export. New code should call ``expect()``, which
    returns the precise LocatorAssertions / PageAssertions for the target.
    """
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def expect(target, timeout_ms=5000):
    """Build the polling assertion proxy that matches |target|.

    A Page yields PageAssertions (url/title); a Locator yields LocatorAssertions
    (visibility/text/value/state). Async facade objects (AsyncPage/AsyncLocator) are
    first unwrapped to the sync object they hold, because assertions are synchronous.
    For a bare Element use page.locator(css) instead.

    Raises:
        HelixwrightError: |target| is neither a Page nor something exposing
            ``_as_locator()``.
    """
    # AsyncPage keeps its sync Page in _p; AsyncLocator keeps its sync Locator in _l.
    for facade_attr in ("_p", "_l"):
        wrapped = getattr(target, facade_attr, None)
        if wrapped is not None:
            target = wrapped
            break

    # Anything with a goto attribute is treated as a Page.
    if hasattr(target, "goto"):
        return PageAssertions(target, timeout_ms)

    # Locator and Element both expose _as_locator(): a Locator returns itself, an Element
    # re-resolves to a Locator from its originating selector (or raises clearly when it has
    # none). Working on a Locator lets assertions POLL across re-render/detach, which a
    # fixed node_id snapshot cannot survive. The .text descriptor is deliberately not
    # inspected to tell the two apart: that property-vs-method hack breaks once they unify,
    # and reading .text on an instance would fire a get_text RPC on a navigating page.
    # _as_locator() is a plain method (no RPC) present on both.
    as_locator = getattr(target, "_as_locator", None)
    if not callable(as_locator):
        raise HelixwrightError(
            "expect() needs a Page, Locator, or selector-based Element; got %r" % type(target).__name__)
    return LocatorAssertions(as_locator(), timeout_ms)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Declarative cookie-consent rules (plan F7 / G3 accept_cookies).
|
|
2
|
+
|
|
3
|
+
Our advantage over autoconsent extensions: we execute these with our OWN isTrusted cross-frame
|
|
4
|
+
(+ shadow-piercing) click engine — no extension, no injected JS. Each entry is a Helixwright
|
|
5
|
+
selector (CMP-specific id/class first — fast + precise — then aria-label, then visible text as a
|
|
6
|
+
multilingual fallback). page.accept_cookies() tries them in order via find_anywhere (so a CMP
|
|
7
|
+
inside an OOPIF / shadow root is reached) and isTrusted-clicks the first match. Opt-in by
|
|
8
|
+
calling it; default action is 'accept'."""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Ordered most-specific -> most-general. CMP id/class are unique enough to be safe;
|
|
13
|
+
# text fallbacks are intentionally conservative (exact, common labels) to avoid mis-clicks.
|
|
14
|
+
_ACCEPT = [
|
|
15
|
+
# OneTrust
|
|
16
|
+
"#onetrust-accept-btn-handler",
|
|
17
|
+
"#accept-recommended-btn-handler",
|
|
18
|
+
".onetrust-close-btn-handler.accept",
|
|
19
|
+
# Cookiebot
|
|
20
|
+
"#CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll",
|
|
21
|
+
"#CybotCookiebotDialogBodyButtonAccept",
|
|
22
|
+
"#CybotCookiebotDialogBodyLevelButtonAccept",
|
|
23
|
+
# Usercentrics
|
|
24
|
+
'button[data-testid="uc-accept-all-button"]',
|
|
25
|
+
'button[data-testid="uc-accept-button"]',
|
|
26
|
+
# Didomi
|
|
27
|
+
"#didomi-notice-agree-button",
|
|
28
|
+
"button.didomi-button-highlight",
|
|
29
|
+
# Quantcast
|
|
30
|
+
'.qc-cmp2-summary-buttons button[mode="primary"]',
|
|
31
|
+
# Osano / TrustArc / Cookie-script / cookieconsent
|
|
32
|
+
".osano-cm-accept-all",
|
|
33
|
+
"#truste-consent-button",
|
|
34
|
+
"#cookiescript_accept",
|
|
35
|
+
".cc-allow",
|
|
36
|
+
".cc-btn.cc-allow",
|
|
37
|
+
# Google Funding Choices / IAB generic
|
|
38
|
+
".fc-cta-consent",
|
|
39
|
+
# aria-label fallbacks
|
|
40
|
+
'[aria-label="Accept all"]',
|
|
41
|
+
'[aria-label="Accept all cookies"]',
|
|
42
|
+
'[aria-label="Accept cookies"]',
|
|
43
|
+
# visible-text fallbacks (role=button restricts to actual buttons/links; @@name:
|
|
44
|
+
# filters by aria-label/text/value — the role grammar's supported text predicate).
|
|
45
|
+
'role=button@@name:Accept all',
|
|
46
|
+
'role=button@@name:Accept All Cookies',
|
|
47
|
+
'role=button@@name:Allow all',
|
|
48
|
+
'role=button@@name:I accept',
|
|
49
|
+
'role=button@@name:Agree',
|
|
50
|
+
'role=button@@name:Got it',
|
|
51
|
+
'role=button@@name:Accept',
|
|
52
|
+
'role=button@@name:Alle akzeptieren', # de
|
|
53
|
+
'role=button@@name:Tout accepter', # fr
|
|
54
|
+
'role=button@@name:Aceptar todo', # es
|
|
55
|
+
'role=button@@name:接受全部', # zh
|
|
56
|
+
'role=button@@name:同意', # zh/ja
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
_REJECT = [
|
|
60
|
+
"#onetrust-reject-all-handler",
|
|
61
|
+
".ot-pc-refuse-all-handler",
|
|
62
|
+
"#CybotCookiebotDialogBodyButtonDecline",
|
|
63
|
+
"#CybotCookiebotDialogBodyLevelButtonLevelOptinDeclineAll",
|
|
64
|
+
'button[data-testid="uc-deny-all-button"]',
|
|
65
|
+
"#didomi-notice-disagree-button",
|
|
66
|
+
".qc-cmp2-summary-buttons button[mode=\"secondary\"]",
|
|
67
|
+
".osano-cm-deny-all",
|
|
68
|
+
".cc-deny",
|
|
69
|
+
'[aria-label="Reject all"]',
|
|
70
|
+
'[aria-label="Decline all"]',
|
|
71
|
+
'role=button@@name:Reject all',
|
|
72
|
+
'role=button@@name:Decline all',
|
|
73
|
+
'role=button@@name:Necessary only',
|
|
74
|
+
'role=button@@name:Reject',
|
|
75
|
+
'role=button@@name:Decline',
|
|
76
|
+
'role=button@@name:Alle ablehnen', # de
|
|
77
|
+
'role=button@@name:Tout refuser', # fr
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
_CONSENT_SELECTORS = {"accept": _ACCEPT, "reject": _REJECT}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
|
|
2
|
+
class HelixwrightError(Exception):
    """Base for all Helixwright errors. Public API: callers can `except HelixwrightError`
    to catch everything; the subtypes below let internal code (and callers) discriminate
    failure CLASSES without inspecting the error message string. [B3]"""


class TransportError(HelixwrightError):
    """The local RPC transport failed (socket/HTTP/JSON, or the host process died). This is a
    SESSION-level failure -- distinct from a per-element miss -- and must NOT be swallowed by the
    best-effort `except` sites that mean 'element not present in this frame' (a browser crash
    mid frame-walk used to surface as a confusing 'element not found')."""


class ElementNotFoundError(HelixwrightError):
    """A query resolved to no element (the host's 'not found' reply). Frame-walk / non-strict
    lookups suppress exactly this class; everything else (incl. TransportError) propagates."""


class WaitTimeoutError(HelixwrightError):
    """A wait/poll exceeded its deadline. (Named WaitTimeoutError, not TimeoutError, to avoid
    shadowing the builtin OSError-derived TimeoutError.)"""


class ReachabilityError(HelixwrightError):
    """[3.0] A pre-launch connectivity check FAILED -- the proxy is dead/unreachable or there is no
    internet -- so the browser is NOT opened (fail BEFORE spending a launch on a dead exit). This is
    CONNECTIVITY only, not IP-reputation vetting (the framework does not vet proxy reputation)."""


class LoginRejected(HelixwrightError):
    """[Tier-1A] A submitted form/login was REJECTED by the site -- raised by fill_form/login with
    confirm=True when an unambiguous blocking inline error (aria-invalid / [role=alert] / a known
    error class, with non-empty text) renders after submit. `.message` carries the site's VERBATIM
    text so the failure surfaces at the cause ('Invalid password') instead of three steps later."""

    def __init__(self, message):
        self.message = message
        super().__init__("login/form rejected: %s" % (message,))

    def __reduce__(self):
        """Rebuild from the verbatim ``message`` when the exception is copied or pickled.

        The default exception reduction re-calls ``__init__`` with ``args``, which here
        holds the already-prefixed text; that would prefix it a second time and overwrite
        ``.message`` with the prefixed string.
        """
        return (type(self), (self.message,), dict(vars(self)))
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Failure forensics (plan G2) — the anti-detection debugging moat.
|
|
2
|
+
|
|
3
|
+
Competitors' debuggers (Playwright trace, DrissionPage, SeleniumBase) capture the DOM. They
|
|
4
|
+
cannot answer the question that matters for a fingerprint browser: whether the page saw the
|
|
5
|
+
fingerprint snapshot Helix Local API launched. capture_failure() writes that bundle locally.
|
|
6
|
+
|
|
7
|
+
Defensive by construction: capturing a failure must never raise (that would mask the original
|
|
8
|
+
error), so every step is individually guarded."""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
import time
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Compact live-fingerprint probe: the values the page really saw.
|
|
17
|
+
_FP_SNAPSHOT_JS = r"""(function(){
|
|
18
|
+
function gl(){try{var c=document.createElement('canvas');
|
|
19
|
+
var x=c.getContext('webgl')||c.getContext('experimental-webgl');if(!x)return {};
|
|
20
|
+
var e=x.getExtension('WEBGL_debug_renderer_info');if(!e)return {};
|
|
21
|
+
return {renderer:x.getParameter(e.UNMASKED_RENDERER_WEBGL),
|
|
22
|
+
vendor:x.getParameter(e.UNMASKED_VENDOR_WEBGL)};}catch(_){return {};}}
|
|
23
|
+
var o={platform:navigator.platform,vendor:navigator.vendor,
|
|
24
|
+
hardwareConcurrency:navigator.hardwareConcurrency,deviceMemory:navigator.deviceMemory,
|
|
25
|
+
maxTouchPoints:navigator.maxTouchPoints,language:navigator.language,
|
|
26
|
+
languages:navigator.languages,webdriver:navigator.webdriver,userAgent:navigator.userAgent,
|
|
27
|
+
screen:[screen.width,screen.height,screen.availWidth,screen.availHeight,screen.colorDepth],
|
|
28
|
+
devicePixelRatio:window.devicePixelRatio,
|
|
29
|
+
outer:[window.outerWidth,window.outerHeight,window.screenX,window.screenY]};
|
|
30
|
+
try{o.timezone=Intl.DateTimeFormat().resolvedOptions().timeZone;}catch(_){}
|
|
31
|
+
try{var g=gl();if(g.renderer)o.webglRenderer=g.renderer;if(g.vendor)o.webglVendor=g.vendor;}catch(_){}
|
|
32
|
+
return o;})()"""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _safe(fn, default=None):
|
|
36
|
+
try:
|
|
37
|
+
return fn()
|
|
38
|
+
except Exception:
|
|
39
|
+
return default
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _slug(s):
|
|
43
|
+
return "".join(c if (c.isalnum() or c in "-_") else "_" for c in str(s))[:48] or "x"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _live_fp_js():
    """Give back the JavaScript probe (``_FP_SNAPSHOT_JS``) that failure bundles evaluate
    to record the fingerprint values the page saw."""
    probe = _FP_SNAPSHOT_JS
    return probe
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _eq(a, b):
|
|
52
|
+
if a is None or b is None:
|
|
53
|
+
return a == b
|
|
54
|
+
if str(a).strip() == str(b).strip():
|
|
55
|
+
return True
|
|
56
|
+
try:
|
|
57
|
+
return abs(float(a) - float(b)) < 1e-6
|
|
58
|
+
except (ValueError, TypeError):
|
|
59
|
+
return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# fingerprint snapshot key -> live JS key.
|
|
63
|
+
_FINGERPRINT_TO_LIVE = [
|
|
64
|
+
("platform", "platform"), ("hwcc", "hardwareConcurrency"), ("devmem", "deviceMemory"),
|
|
65
|
+
("webglRenderer", "webglRenderer"), ("webglVendor", "webglVendor"),
|
|
66
|
+
("tz", "timezone"), ("sw", "screenWidth"), ("sh", "screenHeight"),
|
|
67
|
+
("dpr", "devicePixelRatio"), ("ua", "userAgent"), ("userAgent", "userAgent"),
|
|
68
|
+
("vendor", "vendor"), ("maxTouch", "maxTouchPoints"),
|
|
69
|
+
]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _delta(live, fingerprint_snapshot):
|
|
73
|
+
"""Return {key: {expected, actual, live_key}} for fingerprint snapshot mismatches."""
|
|
74
|
+
if not isinstance(live, dict) or not isinstance(fingerprint_snapshot, dict):
|
|
75
|
+
return {}
|
|
76
|
+
out = {}
|
|
77
|
+
for pk, lk in _FINGERPRINT_TO_LIVE:
|
|
78
|
+
if pk not in fingerprint_snapshot:
|
|
79
|
+
continue
|
|
80
|
+
exp, act = fingerprint_snapshot.get(pk), live.get(lk)
|
|
81
|
+
if not _eq(exp, act):
|
|
82
|
+
out[pk] = {"expected": exp, "actual": act, "live_key": lk}
|
|
83
|
+
if "langs" in fingerprint_snapshot:
|
|
84
|
+
exp = fingerprint_snapshot.get("langs")
|
|
85
|
+
expl = [s.strip() for s in (exp.split(",") if isinstance(exp, str) else list(exp or []))]
|
|
86
|
+
act = live.get("languages")
|
|
87
|
+
actl = [str(s).strip() for s in (act if isinstance(act, list) else [])]
|
|
88
|
+
if expl != actl:
|
|
89
|
+
out["langs"] = {"expected": expl, "actual": actl, "live_key": "languages"}
|
|
90
|
+
return out
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# [W6.1] WAF block/challenge fingerprints for a LOCAL scan over the already-captured DOM (NO
|
|
94
|
+
# network). A fingerprint that passes every readback check can still be IP/ASN-blocked; these
|
|
95
|
+
# markers say WHICH WAF challenged -- turning a "delta says ok but still blocked" bundle actionable.
|
|
96
|
+
_BLOCK_MARKERS = {
|
|
97
|
+
"cloudflare": ("just a moment", "checking your browser", "cf-browser-verification",
|
|
98
|
+
"challenge-platform", "__cf_chl", "cf-turnstile", "attention required",
|
|
99
|
+
"cf-error-details"),
|
|
100
|
+
"datadome": ("datadome", "captcha-delivery.com", "geo.captcha-delivery"),
|
|
101
|
+
"akamai": ("errors.edgesuite.net", "reference #", "access denied"),
|
|
102
|
+
"perimeterx_human": ("px-captcha", "perimeterx", "px-cdn", "press & hold", "press and hold"),
|
|
103
|
+
"imperva_incapsula": ("incapsula", "incident id", "_incap_"),
|
|
104
|
+
"generic": ("unusual traffic", "verify you are human", "are you a robot",
|
|
105
|
+
"pardon our interruption", "request blocked", "bot detection"),
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _scan_block_signals(html):
|
|
110
|
+
"""[W6.1] Pure LOCAL scan of the already-captured DOM for WAF block/challenge markers (no
|
|
111
|
+
network, no navigation). Returns sorted "waf:marker" hits (empty if the page looks unblocked)."""
|
|
112
|
+
if not isinstance(html, str) or not html:
|
|
113
|
+
return []
|
|
114
|
+
low = html.lower()
|
|
115
|
+
hits = set()
|
|
116
|
+
for waf, markers in _BLOCK_MARKERS.items():
|
|
117
|
+
for m in markers:
|
|
118
|
+
if m in low:
|
|
119
|
+
hits.add("%s:%s" % (waf, m))
|
|
120
|
+
return sorted(hits)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def capture_failure(page, label="failure", error=None):
    """Write a forensic bundle for |page| into settings.capture_dir.

    Args:
        page: the page being diagnosed; read for settings, screenshot, evaluate,
            console messages, content, url and a few private attributes.
        label: short tag used (slugged) in the bundle directory name and stored in meta.
        error: the original failure, stored stringified in meta (optional).

    Returns:
        The bundle directory, or None if capture is disabled (no capture_dir) or the
        directory can't be created.

    Never raises: every step, including building the directory path and each meta
    field, is individually guarded so capturing cannot mask the original error.
    """
    settings = _safe(lambda: getattr(page, "settings", None))
    base = _safe(lambda: getattr(settings, "capture_dir", None))
    if not base:
        return None

    def _make_bundle_dir():
        # Path building is guarded together with makedirs: a capture_dir that is not
        # path-like or a label that cannot be stringified must not escape as an error.
        stamp = time.strftime("%Y%m%d_%H%M%S")
        path = os.path.join(base, "%s_%s" % (stamp, _slug(label)))
        os.makedirs(path, exist_ok=True)
        return path

    bundle = _safe(_make_bundle_dir)
    if not bundle:
        return None

    # 1) screenshot (PNG) — the visual state at failure.
    _safe(lambda: page.screenshot(os.path.join(bundle, "screenshot.png")))

    # 2) live fingerprint snapshot — what the page actually saw.
    fp = _safe(lambda: page.evaluate(_live_fp_js()), {})
    _safe(lambda: _write_json(os.path.join(bundle, "fingerprint.json"), fp))

    # 2b) configured-vs-live delta.
    fingerprint_snapshot = _safe(lambda: getattr(settings, "fingerprint_snapshot", None))
    if not isinstance(fingerprint_snapshot, dict):
        # Missing, empty or not a dict: nothing to compare (and .get below stays safe).
        fingerprint_snapshot = {}
    delta = _safe(lambda: _delta(fp, fingerprint_snapshot), {})
    _safe(lambda: _write_json(os.path.join(bundle, "delta.json"),
                              {"mismatches": delta, "ok": not delta}))

    # 3) action trail (G10) — the behavior leading up to the failure.
    trail = _safe(lambda: list(getattr(page, "_trail", [])), [])
    _safe(lambda: _write_json(os.path.join(bundle, "trail.json"), trail))

    # 4) console messages (if console_listen() was active).
    console = _safe(lambda: page.console_messages(), None)
    if console:
        _safe(lambda: _write_json(os.path.join(bundle, "console.json"), console))

    # 5) DOM snapshot (for completeness; we already beat competitors on 1-4).
    html = _safe(lambda: page.content(), None)
    if isinstance(html, str):
        _safe(lambda: _write_text(os.path.join(bundle, "page.html"), html))

    # 5b) Local WAF-marker DOM scan (Cloudflare/DataDome/Akamai/PerimeterX cookie/script/global
    # markers in the ALREADY-captured HTML). Pure local scan — NO network/navigation (that would
    # destroy the captured failure state). Forensic signal only, NOT IP-reputation vetting:
    # IP/ASN reputation is the user's Plane B, not the framework's (no proxy pool / no preflight).
    block_signals = _safe(lambda: _scan_block_signals(html if isinstance(html, str) else ""), [])
    if block_signals:
        _safe(lambda: _write_json(os.path.join(bundle, "block_signals.json"), {
            "block_signals": block_signals,
            "note": "local DOM WAF-marker scan of the captured page (no network)"}))

    # 6) meta — url, error, fingerprint summary, think-time stats. fingerprint_summary carries the
    # full set the delta compares (so delta.json has complete expected values), not 7 keys.
    summary_keys = ("platform", "lang", "langs", "tz", "webglRenderer", "webglVendor",
                    "hwcc", "devmem", "sw", "sh", "dpr", "ua", "userAgent", "vendor",
                    "maxTouch", "webgpuArch", "voices", "color_scheme")
    meta = {
        "label": label,
        "error": _safe(lambda: str(error) if error is not None else None),
        "url": _safe(lambda: page.url, ""),
        "frame_depth": _safe(lambda: getattr(page, "_frame_depth", 0), 0),
        "humanize": _safe(lambda: getattr(settings, "humanize", None)),
        "behavior_seed": _safe(lambda: getattr(settings, "behavior_seed", 0), 0),
        "fingerprint_summary": {
            k: fingerprint_snapshot.get(k) for k in summary_keys if k in fingerprint_snapshot
        },
        "delta_ok": (not delta),
        "block_signals": block_signals,
        # round() fails on a non-numeric total_slept; fall back to 0.0 instead of raising.
        "think_time_total_s": _safe(
            lambda: round(getattr(getattr(page, "_hz", None), "total_slept", 0.0), 2), 0.0),
    }
    _safe(lambda: _write_json(os.path.join(bundle, "meta.json"), meta))
    return bundle
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _write_json(path, obj):
|
|
196
|
+
with open(path, "w", encoding="utf-8") as f:
|
|
197
|
+
json.dump(obj, f, ensure_ascii=False, indent=2)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _write_text(path, text):
|
|
201
|
+
with open(path, "w", encoding="utf-8") as f:
|
|
202
|
+
f.write(text)
|