crawlix-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlix/__init__.py +28 -0
- crawlix/_version.py +1 -0
- crawlix/async_api.py +82 -0
- crawlix/backends/__init__.py +79 -0
- crawlix/backends/httpx.py +92 -0
- crawlix/backends/playwright.py +223 -0
- crawlix/backends/protocol.py +176 -0
- crawlix/backends/requests.py +139 -0
- crawlix/backends/selenium.py +270 -0
- crawlix/browser.py +71 -0
- crawlix/element.py +171 -0
- crawlix/exceptions.py +26 -0
- crawlix/page.py +223 -0
- crawlix/utils.py +36 -0
- crawlix-0.1.0.dist-info/METADATA +189 -0
- crawlix-0.1.0.dist-info/RECORD +18 -0
- crawlix-0.1.0.dist-info/WHEEL +4 -0
- crawlix-0.1.0.dist-info/licenses/LICENSE +21 -0
crawlix/__init__.py
ADDED
@@ -0,0 +1,28 @@
+from crawlix.browser import Browser, browse, fetch, get
+from crawlix.element import Element
+from crawlix.exceptions import (
+    BackendError,
+    CrawlixError,
+    JavaScriptError,
+    NavigationError,
+    NetworkError,
+    SelectorError,
+    TimeoutError,
+)
+from crawlix.page import Page
+
+__all__ = [
+    "Browser",
+    "Page",
+    "Element",
+    "get",
+    "fetch",
+    "browse",
+    "CrawlixError",
+    "BackendError",
+    "TimeoutError",
+    "NavigationError",
+    "SelectorError",
+    "NetworkError",
+    "JavaScriptError",
+]
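For orientation, the exports above combine into a one-liner API. A minimal sketch, assuming the sync helpers mirror aget()/afetch() from async_api.py below; the actual signatures of get/fetch/browse live in crawlix/browser.py, which is listed but not expanded here, so the return types shown are assumptions:

    # Hypothetical sketch: assumes get() returns a Page and fetch() raw HTML.
    from crawlix import CrawlixError, fetch, get

    try:
        page = get("https://example.com")    # assumed: Page with .url/.html/.text
        html = fetch("https://example.com")  # assumed: raw HTML string
    except CrawlixError as exc:
        print(f"crawl failed: {exc}")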
crawlix/_version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.1.0"
crawlix/async_api.py
ADDED
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+from typing import Any
+
+from crawlix.backends import _create_backend
+from crawlix.backends.protocol import Backend, PageData
+
+
+class AsyncBrowser:
+    def __init__(
+        self,
+        backend: str = "httpx",
+        headless: bool = True,
+        stealth: bool = True,
+        timeout: int = 30,
+        proxy: str | None = None,
+        locale: str = "en-US",
+        user_agent: str | None = None,
+    ):
+        self._backend: Backend = _create_backend(
+            name=backend,
+            headless=headless,
+            stealth=stealth,
+            timeout=timeout,
+            proxy=proxy,
+            user_agent=user_agent,
+            locale=locale,
+        )
+
+    @property
+    def backend_name(self) -> str:
+        return self._backend.name
+
+    async def open(self, url: str) -> AsyncPage:
+        data = await self._backend.open(url)
+        return AsyncPage(self._backend, data)
+
+    async def close(self) -> None:
+        await self._backend.close()
+
+    async def __aenter__(self) -> AsyncBrowser:
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        await self.close()
+
+
+class AsyncPage:
+    def __init__(self, backend, data: PageData | None = None):
+        self._backend = backend
+        self._data = data or PageData()
+
+    @property
+    def url(self) -> str:
+        return self._data.url
+
+    @property
+    def html(self) -> str:
+        return self._data.html
+
+    @property
+    def text(self) -> str:
+        return self._data.text
+
+    @property
+    def status(self) -> int:
+        return self._data.status
+
+    async def json(self) -> dict:
+        import json as _json
+        return _json.loads(self.text)
+
+
+async def aget(url: str, **kwargs: Any) -> AsyncPage:
+    async with AsyncBrowser(**kwargs) as b:
+        return await b.open(url)
+
+
+async def afetch(url: str, **kwargs: Any) -> str:
+    async with AsyncBrowser(**kwargs) as b:
+        page = await b.open(url)
+        return page.html
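A usage sketch built only from what this file defines. Worth noting: aget() closes its AsyncBrowser before returning, which works because AsyncPage reads from the already-materialized PageData rather than the live backend.

    import asyncio

    from crawlix.async_api import AsyncBrowser, afetch, aget

    async def main() -> None:
        page = await aget("https://example.com")      # one-shot, AsyncPage
        print(page.status, page.url)

        html = await afetch("https://example.com")    # one-shot, raw HTML

        # Reuse a single client across requests instead:
        async with AsyncBrowser(backend="httpx", timeout=10) as browser:
            page = await browser.open("https://httpbin.org/json")
            data = await page.json()                  # json.loads(page.text)
            print(data)

    asyncio.run(main())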
crawlix/backends/__init__.py
ADDED
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from crawlix.backends.protocol import Backend
+
+
+def detect_backend(
+    explicit: str | None = None,
+    headless: bool = True,
+    stealth: bool = True,
+    timeout: int = 30,
+    proxy: str | None = None,
+    user_agent: str | None = None,
+    locale: str = "en-US",
+) -> Backend:
+    if explicit is not None:
+        return _create_backend(
+            explicit, headless, stealth, timeout, proxy, user_agent, locale
+        )
+
+    for name in ("playwright", "selenium"):
+        try:
+            return _create_backend(
+                name, headless, stealth, timeout, proxy, user_agent, locale
+            )
+        except ImportError:
+            continue
+
+    from crawlix.backends.requests import RequestsBackend
+    return RequestsBackend(
+        headless=headless,
+        stealth=stealth,
+        timeout=timeout,
+        proxy=proxy,
+        user_agent=user_agent,
+        locale=locale,
+    )
+
+
+def _create_backend(
+    name: str,
+    headless: bool = True,
+    stealth: bool = True,
+    timeout: int = 30,
+    proxy: str | None = None,
+    user_agent: str | None = None,
+    locale: str = "en-US",
+) -> Backend:
+    if name == "requests":
+        from crawlix.backends.requests import RequestsBackend
+        return RequestsBackend(
+            headless=headless, stealth=stealth, timeout=timeout,
+            proxy=proxy, user_agent=user_agent, locale=locale,
+        )
+    elif name == "playwright":
+        from crawlix.backends.playwright import PlaywrightBackend
+        return PlaywrightBackend(
+            headless=headless, stealth=stealth, timeout=timeout,
+            proxy=proxy, user_agent=user_agent, locale=locale,
+        )
+    elif name == "selenium":
+        from crawlix.backends.selenium import SeleniumBackend
+        return SeleniumBackend(
+            headless=headless, stealth=stealth, timeout=timeout,
+            proxy=proxy, user_agent=user_agent, locale=locale,
+        )
+    elif name == "httpx":
+        from crawlix.backends.httpx import HttpxBackend
+        return HttpxBackend(
+            headless=headless, stealth=stealth, timeout=timeout,
+            proxy=proxy, user_agent=user_agent, locale=locale,
+        )
+    else:
+        from crawlix.exceptions import BackendError
+        raise BackendError(
+            f"Unknown backend: {name!r}. Available: requests, playwright, selenium, httpx"
+        )
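One behavior worth calling out: the auto-detection chain tries playwright, then selenium, then falls back to requests; the httpx backend is never auto-selected and must be named explicitly. A short sketch (note that _create_backend is underscore-private API, used here only for illustration):

    from crawlix.backends import _create_backend, detect_backend

    backend = _create_backend("httpx")
    print(backend.name, backend.supports_js)   # -> httpx False

    auto = detect_backend()                    # playwright -> selenium -> requests
    print(auto.name)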
crawlix/backends/httpx.py
ADDED
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+from typing import Any
+
+from crawlix.backends.protocol import Backend, ElementData, PageData
+
+
+class HttpxBackend(Backend):
+    def __init__(
+        self,
+        headless: bool = True,
+        stealth: bool = True,
+        timeout: int = 30,
+        proxy: str | None = None,
+        user_agent: str | None = None,
+        locale: str = "en-US",
+        **kwargs: Any,
+    ):
+        self._timeout = timeout
+        self._proxy = proxy
+        self._user_agent = user_agent
+        self._locale = locale
+        self._stealth = stealth
+        self._client = None
+
+    async def _get_client(self):
+        if self._client is None:
+            try:
+                import httpx
+            except ImportError as e:
+                raise ImportError(
+                    "httpx is required for async support.\n"
+                    "Install: pip install crawlix[async]"
+                ) from e
+
+            headers = {}
+            if self._stealth:
+                from crawlix.utils import stealth_headers
+                headers = stealth_headers(self._user_agent, self._locale)
+
+            client_kwargs: dict[str, Any] = {
+                "headers": headers,
+                "timeout": self._timeout,
+            }
+            if self._proxy:
+                client_kwargs["proxies"] = self._proxy
+            self._client = httpx.AsyncClient(**client_kwargs)
+        return self._client
+
+    @property
+    def name(self) -> str:
+        return "httpx"
+
+    @property
+    def supports_js(self) -> bool:
+        return False
+
+    async def open(self, url: str) -> PageData:
+        import httpx
+        from bs4 import BeautifulSoup
+
+        client = await self._get_client()
+        try:
+            resp = await client.get(url)
+        except httpx.RequestError as e:
+            from crawlix.exceptions import NavigationError
+            raise NavigationError(f"Failed to load {url}: {e}") from e
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+        title_tag = soup.find("title")
+        title = title_tag.get_text(strip=True) if title_tag else ""
+        text = soup.get_text(separator=" ", strip=True)
+
+        return PageData(
+            url=str(resp.url),
+            html=resp.text,
+            text=text,
+            title=title,
+            status=resp.status_code,
+            headers=dict(resp.headers),
+        )
+
+    async def close(self) -> None:
+        if self._client is not None:
+            await self._client.aclose()
+            self._client = None
+
+    async def find(self, selector: str) -> ElementData | None:
+        return None
+
+    async def find_all(self, selector: str) -> list[ElementData]:
+        return []
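A direct-use sketch (requires the httpx and beautifulsoup4 dependencies). One caveat: client_kwargs["proxies"] matches the keyword httpx accepted before 0.26; newer httpx releases renamed it to proxy, so proxy support here likely depends on which httpx version gets pinned.

    import asyncio

    from crawlix.backends.httpx import HttpxBackend

    async def main() -> None:
        backend = HttpxBackend(timeout=10, stealth=True)
        try:
            page = await backend.open("https://example.com")
            print(page.status, page.title)
        finally:
            await backend.close()

    asyncio.run(main())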
crawlix/backends/playwright.py
ADDED
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+from typing import Any
+
+from crawlix.backends.protocol import Backend, ElementData, PageData
+from crawlix.utils import random_user_agent
+
+
+class PlaywrightBackend(Backend):
+    def __init__(
+        self,
+        headless: bool = True,
+        stealth: bool = True,
+        timeout: int = 30,
+        proxy: str | None = None,
+        user_agent: str | None = None,
+        locale: str = "en-US",
+        **kwargs: Any,
+    ):
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError as e:
+            raise ImportError(
+                "playwright is not installed.\n"
+                "Install: pip install crawlix[playwright]"
+            ) from e
+
+        self._headless = headless
+        self._stealth = stealth
+        self._timeout = timeout * 1000
+        self._proxy = proxy
+        self._user_agent = user_agent or random_user_agent()
+        self._locale = locale
+        self._pw = None
+        self._browser = None
+        self._context = None
+        self._page = None
+
+        self._pw = sync_playwright().start()
+        launch_kwargs: dict[str, Any] = {"headless": headless, "timeout": self._timeout}
+        if proxy:
+            launch_kwargs["proxy"] = {"server": proxy}
+        self._browser = self._pw.chromium.launch(**launch_kwargs)
+
+        context_kwargs: dict[str, Any] = {
+            "user_agent": self._user_agent,
+            "locale": locale,
+        }
+        if stealth:
+            context_kwargs.update(self._stealth_context())
+        self._context = self._browser.new_context(**context_kwargs)
+        self._page = self._context.new_page()
+
+    @staticmethod
+    def _stealth_context() -> dict:
+        return {
+            "viewport": {"width": 1920, "height": 1080},
+            "screen": {"width": 1920, "height": 1080},
+        }
+
+    @property
+    def name(self) -> str:
+        return "playwright"
+
+    @property
+    def supports_js(self) -> bool:
+        return True
+
+    def open(self, url: str) -> PageData:
+        self._page.goto(url, timeout=self._timeout)
+        return self._to_page_data()
+
+    def new_page(self) -> PageData:
+        self._page = self._context.new_page()
+        return PageData(url="about:blank")
+
+    def close(self) -> None:
+        try:
+            if self._pw:
+                self._pw.stop()
+        except Exception:
+            pass
+
+    def _to_page_data(self) -> PageData:
+        return PageData(
+            url=self._page.url,
+            html=self._page.content(),
+            text=self._page.inner_text("body") if self._page.query_selector("body") else "",
+            title=self._page.title(),
+            status=200,
+        )
+
+    def goto(self, url: str) -> None:
+        self._page.goto(url, timeout=self._timeout)
+
+    def reload(self) -> None:
+        self._page.reload()
+
+    def back(self) -> None:
+        self._page.go_back()
+
+    def forward(self) -> None:
+        self._page.go_forward()
+
+    def find(self, selector: str) -> ElementData | None:
+        el = self._page.query_selector(selector)
+        if el is None:
+            return None
+        return self._element_to_data(el)
+
+    def find_all(self, selector: str) -> list[ElementData]:
+        els = self._page.query_selector_all(selector)
+        return [self._element_to_data(el) for el in els]
+
+    def _element_to_data(self, el) -> ElementData:
+        try:
+            tag = el.evaluate("el => el.tagName.toLowerCase()")
+        except Exception:
+            tag = ""
+        return ElementData(
+            tag=tag,
+            text=el.inner_text() if el else "",
+            html=el.inner_html() if el else "",
+            outer_html=el.evaluate("el => el.outerHTML") if el else "",
+            attributes=el.evaluate(
+                "el => { const a = {}; for (const attr of el.attributes) a[attr.name] = attr.value; return a; }"
+            ) if el else {},
+            classes=el.evaluate("el => [...el.classList]") if el else [],
+            element_id=el.evaluate("el => el.id") if el else "",
+        )
+
+    def click(self, selector: str) -> None:
+        self._page.click(selector)
+
+    def double_click(self, selector: str) -> None:
+        self._page.dblclick(selector)
+
+    def right_click(self, selector: str) -> None:
+        self._page.click(selector, button="right")
+
+    def type(self, selector: str, text: str) -> None:
+        self._page.fill(selector, text)
+
+    def clear(self, selector: str) -> None:
+        self._page.fill(selector, "")
+
+    def submit(self, selector: str = "form") -> None:
+        el = self._page.query_selector(selector)
+        if el:
+            el.evaluate("el => el.submit()")
+
+    def select(self, selector: str, value: str) -> None:
+        self._page.select_option(selector, value)
+
+    def hover(self, selector: str) -> None:
+        self._page.hover(selector)
+
+    def focus(self, selector: str) -> None:
+        self._page.focus(selector)
+
+    def blur(self, selector: str) -> None:
+        self._page.evaluate(f"document.querySelector('{selector}')?.blur()")
+
+    def scroll_to(self, selector: str) -> None:
+        self._page.evaluate(f"document.querySelector('{selector}')?.scrollIntoView()")
+
+    def key(self, key: str) -> None:
+        self._page.keyboard.press(key)
+
+    def upload(self, selector: str, path: str) -> None:
+        self._page.set_input_files(selector, path)
+
+    def wait_for(self, selector: str, timeout: int = 10) -> None:
+        self._page.wait_for_selector(selector, timeout=timeout * 1000)
+
+    def wait_for_text(self, text: str, timeout: int = 10) -> None:
+        self._page.wait_for_function(
+            f'document.body.innerText.includes("{text}")',
+            timeout=timeout * 1000,
+        )
+
+    def wait_for_url(self, pattern: str, timeout: int = 10) -> None:
+        self._page.wait_for_url(pattern, timeout=timeout * 1000)
+
+    def wait_for_load(self, timeout: int = 30) -> None:
+        self._page.wait_for_load_state("load", timeout=timeout * 1000)
+
+    def wait_for_network_idle(self, timeout: int = 30) -> None:
+        self._page.wait_for_load_state("networkidle", timeout=timeout * 1000)
+
+    def sleep(self, seconds: float) -> None:
+        import time
+        time.sleep(seconds)
+
+    def evaluate(self, js_code: str) -> Any:
+        return self._page.evaluate(js_code)
+
+    def evaluate_on(self, selector: str, js: str) -> Any:
+        return self._page.evaluate(f"document.querySelector('{selector}')?.{js}")
+
+    def screenshot(self, path: str | None = None) -> bytes:
+        return self._page.screenshot(path=path)
+
+    def pdf(self, path: str | None = None) -> bytes:
+        return self._page.pdf(path=path)
+
+    def set_headers(self, headers: dict[str, str]) -> None:
+        self._context.set_extra_http_headers(headers)
+
+    def get_cookies(self) -> list[dict]:
+        return [
+            {"name": c["name"], "value": c["value"], "domain": c.get("domain", ""), "path": c.get("path", "")}
+            for c in self._context.cookies()
+        ]
+
+    def set_cookies(self, cookies: list[dict]) -> None:
+        self._context.add_cookies(cookies)
+
+    def clear_cookies(self) -> None:
+        self._context.clear_cookies()
+
+    def intercept(self, pattern: str, handler) -> None:
+        self._page.route(pattern, handler)
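Two things stand out here: instantiating PlaywrightBackend launches Chromium immediately (the constructor calls sync_playwright().start()), so construction itself has side effects; and blur(), scroll_to(), and evaluate_on() interpolate the raw selector into a JavaScript string, so selectors containing single quotes will break. Also note _to_page_data() hardcodes status=200 regardless of the actual response. A usage sketch, assuming `playwright install chromium` has been run:

    from crawlix.backends.playwright import PlaywrightBackend

    backend = PlaywrightBackend(headless=True, timeout=15)  # launches Chromium
    try:
        data = backend.open("https://example.com")          # PageData snapshot
        print(data.title)
        link = backend.find("a")                            # ElementData | None
        if link is not None:
            print(link.tag, link.attributes.get("href"))
    finally:
        backend.close()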
crawlix/backends/protocol.py
ADDED
@@ -0,0 +1,176 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from crawlix.exceptions import BackendError
+
+
+@dataclass
+class PageData:
+    url: str = ""
+    html: str = ""
+    text: str = ""
+    title: str = ""
+    status: int = 200
+    headers: dict[str, str] = field(default_factory=dict)
+    cookies: list[dict[str, str]] = field(default_factory=list)
+
+
+@dataclass
+class ElementData:
+    tag: str = ""
+    text: str = ""
+    html: str = ""
+    outer_html: str = ""
+    attributes: dict[str, str] = field(default_factory=dict)
+    classes: list[str] = field(default_factory=list)
+    element_id: str = ""
+    parent_data: ElementData | None = None
+    children_data: list[ElementData] = field(default_factory=list)
+
+
+class Backend(ABC):
+
+    @property
+    @abstractmethod
+    def name(self) -> str: ...
+
+    @property
+    @abstractmethod
+    def supports_js(self) -> bool: ...
+
+    @abstractmethod
+    def open(self, url: str) -> PageData: ...
+
+    @abstractmethod
+    def close(self) -> None: ...
+
+    # Navigation
+    def goto(self, url: str) -> None:
+        raise BackendError(self._browser_only("goto"))
+
+    def reload(self) -> None:
+        raise BackendError(self._browser_only("reload"))
+
+    def back(self) -> None:
+        raise BackendError(self._browser_only("back"))
+
+    def forward(self) -> None:
+        raise BackendError(self._browser_only("forward"))
+
+    # Querying
+    @abstractmethod
+    def find(self, selector: str) -> ElementData | None: ...
+
+    @abstractmethod
+    def find_all(self, selector: str) -> list[ElementData]: ...
+
+    def find_text(self, text: str) -> ElementData | None:
+        matches = [el for el in self.find_all("*") if text in (el.text or "")]
+        if not matches:
+            return None
+        matches.sort(key=lambda el: len(el.text or ""))
+        return matches[0]
+
+    def xpath(self, expr: str) -> list[ElementData]:
+        raise BackendError(self._browser_only("xpath"))
+
+    # Interaction
+    def click(self, selector: str) -> None:
+        raise BackendError(self._browser_only("click"))
+
+    def double_click(self, selector: str) -> None:
+        raise BackendError(self._browser_only("double_click"))
+
+    def type(self, selector: str, text: str) -> None:
+        raise BackendError(self._browser_only("type"))
+
+    def clear(self, selector: str) -> None:
+        raise BackendError(self._browser_only("clear"))
+
+    def submit(self, selector: str = "form") -> None:
+        raise BackendError(self._browser_only("submit"))
+
+    def select(self, selector: str, value: str) -> None:
+        raise BackendError(self._browser_only("select"))
+
+    def hover(self, selector: str) -> None:
+        raise BackendError(self._browser_only("hover"))
+
+    def focus(self, selector: str) -> None:
+        raise BackendError(self._browser_only("focus"))
+
+    def blur(self, selector: str) -> None:
+        raise BackendError(self._browser_only("blur"))
+
+    def right_click(self, selector: str) -> None:
+        raise BackendError(self._browser_only("right_click"))
+
+    def scroll_to(self, selector: str) -> None:
+        raise BackendError(self._browser_only("scroll_to"))
+
+    def key(self, key: str) -> None:
+        raise BackendError(self._browser_only("key"))
+
+    def upload(self, selector: str, path: str) -> None:
+        raise BackendError(self._browser_only("upload"))
+
+    # Waiting
+    def wait_for(self, selector: str, timeout: int = 10) -> None:
+        raise BackendError(self._browser_only("wait_for"))
+
+    def wait_for_text(self, text: str, timeout: int = 10) -> None:
+        raise BackendError(self._browser_only("wait_for_text"))
+
+    def wait_for_url(self, pattern: str, timeout: int = 10) -> None:
+        raise BackendError(self._browser_only("wait_for_url"))
+
+    def wait_for_load(self, timeout: int = 30) -> None:
+        raise BackendError(self._browser_only("wait_for_load"))
+
+    def wait_for_network_idle(self, timeout: int = 30) -> None:
+        raise BackendError(self._browser_only("wait_for_network_idle"))
+
+    def sleep(self, seconds: float) -> None:
+        raise BackendError(self._browser_only("sleep"))
+
+    # JavaScript
+    def evaluate(self, js_code: str) -> Any:
+        raise BackendError(self._browser_only("evaluate"))
+
+    def evaluate_on(self, selector: str, js: str) -> Any:
+        raise BackendError(self._browser_only("evaluate_on"))
+
+    # Network
+    def set_headers(self, headers: dict[str, str]) -> None:
+        raise BackendError(self._browser_only("set_headers"))
+
+    def set_cookies(self, cookies: list[dict]) -> None:
+        raise BackendError(self._browser_only("set_cookies"))
+
+    def get_cookies(self) -> list[dict]:
+        raise BackendError(self._browser_only("get_cookies"))
+
+    def clear_cookies(self) -> None:
+        raise BackendError(self._browser_only("clear_cookies"))
+
+    def intercept(self, pattern: str, handler) -> None:
+        raise BackendError(self._browser_only("intercept"))
+
+    # Output
+    def screenshot(self, path: str | None = None) -> bytes:
+        raise BackendError(self._browser_only("screenshot"))
+
+    def pdf(self, path: str | None = None) -> bytes:
+        raise BackendError(self._browser_only("pdf"))
+
+    @staticmethod
+    def _browser_only(method: str) -> str:
+        return (
+            f"{method}() requires a browser backend.\n"
+            "Install one:\n"
+            "  pip install crawlix[playwright]  ← recommended\n"
+            "  pip install crawlix[selenium]"
+        )
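The ABC leaves only name, supports_js, open, close, find, and find_all abstract; every other method raises BackendError with an install hint. A minimal custom backend is therefore small. A sketch of a hypothetical test double (StaticBackend is not part of the package):

    from crawlix.backends.protocol import Backend, ElementData, PageData

    class StaticBackend(Backend):
        """Hypothetical stub: serves one canned page, no network access."""

        def __init__(self, html: str = "<title>stub</title>"):
            self._html = html

        @property
        def name(self) -> str:
            return "static"

        @property
        def supports_js(self) -> bool:
            return False

        def open(self, url: str) -> PageData:
            return PageData(url=url, html=self._html)

        def close(self) -> None:
            pass

        def find(self, selector: str) -> ElementData | None:
            return None

        def find_all(self, selector: str) -> list[ElementData]:
            return []

    stub = StaticBackend()
    print(stub.open("https://example.test").url)
    # stub.goto(...) would raise BackendError: browser-backend-only method.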