crawlix 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
crawlix/__init__.py ADDED
@@ -0,0 +1,28 @@
1
"""crawlix public API: browser entry points, page/element types and errors."""

from crawlix.browser import Browser, browse, fetch, get
from crawlix.element import Element
from crawlix.exceptions import (
    BackendError,
    CrawlixError,
    JavaScriptError,
    NavigationError,
    NetworkError,
    SelectorError,
    TimeoutError,
)
from crawlix.page import Page

# Explicit public surface for `from crawlix import *`.
__all__ = [
    # core objects
    "Browser",
    "Page",
    "Element",
    # convenience functions
    "get",
    "fetch",
    "browse",
    # exception hierarchy
    "CrawlixError",
    "BackendError",
    "TimeoutError",
    "NavigationError",
    "SelectorError",
    "NetworkError",
    "JavaScriptError",
]
crawlix/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
crawlix/async_api.py ADDED
@@ -0,0 +1,82 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from crawlix.backends import _create_backend
6
+ from crawlix.backends.protocol import Backend, PageData
7
+
8
+
9
class AsyncBrowser:
    """Asynchronous browser facade over a pluggable backend.

    All page operations are delegated to the backend chosen at
    construction time (httpx by default). Usable as an async context
    manager; ``close()`` releases backend resources.
    """

    def __init__(
        self,
        backend: str = "httpx",
        headless: bool = True,
        stealth: bool = True,
        timeout: int = 30,
        proxy: str | None = None,
        locale: str = "en-US",
        user_agent: str | None = None,
    ):
        # Resolve the backend implementation eagerly so configuration
        # errors surface at construction rather than on first request.
        self._backend: Backend = _create_backend(
            name=backend,
            headless=headless,
            stealth=stealth,
            timeout=timeout,
            proxy=proxy,
            user_agent=user_agent,
            locale=locale,
        )

    @property
    def backend_name(self) -> str:
        """Name of the backend in use (e.g. ``"httpx"``)."""
        return self._backend.name

    async def open(self, url: str) -> AsyncPage:
        """Load *url* and wrap the result in an :class:`AsyncPage`."""
        snapshot = await self._backend.open(url)
        return AsyncPage(self._backend, snapshot)

    async def close(self) -> None:
        """Release backend resources (HTTP client, browser process, ...)."""
        await self._backend.close()

    async def __aenter__(self) -> AsyncBrowser:
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self.close()
46
+
47
+
48
class AsyncPage:
    """Read-only view over a fetched page snapshot.

    Wraps a ``PageData`` produced by a backend and exposes its fields
    as properties; ``json()`` parses the body on demand.
    """

    def __init__(self, backend, data: PageData | None = None):
        self._backend = backend
        # Fall back to an empty PageData so attribute access never fails.
        self._data = data if data is not None else PageData()

    @property
    def url(self) -> str:
        """Final URL of the page (after any redirects the backend followed)."""
        return self._data.url

    @property
    def html(self) -> str:
        """Raw document markup."""
        return self._data.html

    @property
    def text(self) -> str:
        """Extracted visible text content."""
        return self._data.text

    @property
    def status(self) -> int:
        """HTTP status code reported by the backend."""
        return self._data.status

    async def json(self) -> dict:
        """Parse the page text as JSON and return the result."""
        import json as _json

        return _json.loads(self.text)
72
+
73
+
74
async def aget(url: str, **kwargs: Any) -> AsyncPage:
    """Open *url* with a throwaway AsyncBrowser and return the page.

    Keyword arguments are forwarded to :class:`AsyncBrowser`
    (backend, timeout, proxy, ...). The browser is always closed, so
    the returned page is a detached snapshot.
    """
    browser = AsyncBrowser(**kwargs)
    try:
        return await browser.open(url)
    finally:
        await browser.close()
77
+
78
+
79
async def afetch(url: str, **kwargs: Any) -> str:
    """Fetch *url* and return only its raw HTML.

    Keyword arguments are forwarded to :class:`AsyncBrowser`.
    """
    browser = AsyncBrowser(**kwargs)
    try:
        result = await browser.open(url)
        return result.html
    finally:
        await browser.close()
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from crawlix.backends.protocol import Backend
7
+
8
+
9
def detect_backend(
    explicit: str | None = None,
    headless: bool = True,
    stealth: bool = True,
    timeout: int = 30,
    proxy: str | None = None,
    user_agent: str | None = None,
    locale: str = "en-US",
) -> Backend:
    """Choose and construct the best available backend.

    An explicitly named backend always wins. Otherwise JS-capable
    backends are probed in preference order (playwright, then
    selenium), and the dependency-free requests backend is the final
    fallback.
    """
    opts = {
        "headless": headless,
        "stealth": stealth,
        "timeout": timeout,
        "proxy": proxy,
        "user_agent": user_agent,
        "locale": locale,
    }

    if explicit is not None:
        return _create_backend(explicit, **opts)

    for candidate in ("playwright", "selenium"):
        try:
            return _create_backend(candidate, **opts)
        except ImportError:
            # Optional dependency not installed; try the next candidate.
            continue

    # Always importable last resort: plain HTTP via requests.
    from crawlix.backends.requests import RequestsBackend

    return RequestsBackend(**opts)
40
+
41
+
42
def _create_backend(
    name: str,
    headless: bool = True,
    stealth: bool = True,
    timeout: int = 30,
    proxy: str | None = None,
    user_agent: str | None = None,
    locale: str = "en-US",
) -> Backend:
    """Instantiate the backend registered under *name*.

    Backend modules are imported lazily so optional dependencies are
    only required when actually selected (an ImportError from a missing
    dependency propagates to the caller).

    Raises:
        BackendError: if *name* is not a recognised backend.
    """
    opts = {
        "headless": headless,
        "stealth": stealth,
        "timeout": timeout,
        "proxy": proxy,
        "user_agent": user_agent,
        "locale": locale,
    }

    if name == "requests":
        from crawlix.backends.requests import RequestsBackend as backend_cls
    elif name == "playwright":
        from crawlix.backends.playwright import PlaywrightBackend as backend_cls
    elif name == "selenium":
        from crawlix.backends.selenium import SeleniumBackend as backend_cls
    elif name == "httpx":
        from crawlix.backends.httpx import HttpxBackend as backend_cls
    else:
        from crawlix.exceptions import BackendError
        raise BackendError(
            f"Unknown backend: {name!r}. Available: requests, playwright, selenium, httpx"
        )

    return backend_cls(**opts)
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from crawlix.backends.protocol import Backend, ElementData, PageData
6
+
7
+
8
class HttpxBackend(Backend):
    """Async HTTP backend built on httpx. No JavaScript support.

    The underlying ``httpx.AsyncClient`` is created lazily on first use
    and shared across requests until :meth:`close` is called.
    """

    def __init__(
        self,
        headless: bool = True,
        stealth: bool = True,
        timeout: int = 30,
        proxy: str | None = None,
        user_agent: str | None = None,
        locale: str = "en-US",
        **kwargs: Any,
    ):
        # `headless` is accepted for interface parity with browser
        # backends but has no effect for a plain HTTP client.
        self._timeout = timeout
        self._proxy = proxy
        self._user_agent = user_agent
        self._locale = locale
        self._stealth = stealth
        self._client = None  # created lazily by _get_client()

    async def _get_client(self):
        """Create (once) and return the shared ``httpx.AsyncClient``.

        Raises:
            ImportError: if httpx is not installed.
        """
        if self._client is None:
            try:
                import httpx
            except ImportError as e:
                raise ImportError(
                    "httpx is required for async support.\n"
                    "Install: pip install crawlix[async]"
                ) from e

            headers = {}
            if self._stealth:
                from crawlix.utils import stealth_headers
                headers = stealth_headers(self._user_agent, self._locale)

            client_kwargs: dict[str, Any] = {
                "headers": headers,
                "timeout": self._timeout,
            }
            if self._proxy:
                # httpx >= 0.26 takes `proxy=`; the old `proxies=` kwarg
                # was removed in 0.28 and would raise TypeError here.
                # Try the modern spelling first, fall back for old httpx.
                try:
                    self._client = httpx.AsyncClient(proxy=self._proxy, **client_kwargs)
                except TypeError:
                    self._client = httpx.AsyncClient(proxies=self._proxy, **client_kwargs)
            else:
                self._client = httpx.AsyncClient(**client_kwargs)
        return self._client

    @property
    def name(self) -> str:
        return "httpx"

    @property
    def supports_js(self) -> bool:
        return False

    async def open(self, url: str) -> PageData:
        """GET *url* and return a parsed :class:`PageData` snapshot.

        Raises:
            NavigationError: on any transport-level failure.
        """
        import httpx
        from bs4 import BeautifulSoup

        client = await self._get_client()
        try:
            # NOTE(review): httpx does not follow redirects by default —
            # confirm whether parity with the requests backend is intended.
            resp = await client.get(url)
        except httpx.RequestError as e:
            from crawlix.exceptions import NavigationError
            raise NavigationError(f"Failed to load {url}: {e}") from e

        soup = BeautifulSoup(resp.text, "html.parser")
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else ""
        text = soup.get_text(separator=" ", strip=True)

        return PageData(
            url=str(resp.url),
            html=resp.text,
            text=text,
            title=title,
            status=resp.status_code,
            headers=dict(resp.headers),
        )

    async def close(self) -> None:
        """Dispose of the client; the backend may be reused afterwards."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    async def find(self, selector: str) -> ElementData | None:
        # No live DOM without a browser: selector queries always miss.
        return None

    async def find_all(self, selector: str) -> list[ElementData]:
        return []
@@ -0,0 +1,223 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from crawlix.backends.protocol import Backend, ElementData, PageData
6
+ from crawlix.utils import random_user_agent
7
+
8
+
9
class PlaywrightBackend(Backend):
    """Full browser backend driven by Playwright's synchronous API.

    Owns one Chromium browser, context and page for its lifetime.
    Public timeouts are seconds; they are converted to the milliseconds
    Playwright expects.
    """

    def __init__(
        self,
        headless: bool = True,
        stealth: bool = True,
        timeout: int = 30,
        proxy: str | None = None,
        user_agent: str | None = None,
        locale: str = "en-US",
        **kwargs: Any,
    ):
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise ImportError(
                "playwright is not installed.\n"
                "Install: pip install crawlix[playwright]"
            ) from e

        self._headless = headless
        self._stealth = stealth
        self._timeout = timeout * 1000  # Playwright APIs take milliseconds
        self._proxy = proxy
        self._user_agent = user_agent or random_user_agent()
        self._locale = locale
        self._pw = None
        self._browser = None
        self._context = None
        self._page = None

        # Eagerly start the driver and open one shared page so setup
        # failures surface to the caller at construction time.
        self._pw = sync_playwright().start()
        launch_kwargs: dict[str, Any] = {"headless": headless, "timeout": self._timeout}
        if proxy:
            launch_kwargs["proxy"] = {"server": proxy}
        self._browser = self._pw.chromium.launch(**launch_kwargs)

        context_kwargs: dict[str, Any] = {
            "user_agent": self._user_agent,
            "locale": locale,
        }
        if stealth:
            context_kwargs.update(self._stealth_context())
        self._context = self._browser.new_context(**context_kwargs)
        self._page = self._context.new_page()

    @staticmethod
    def _stealth_context() -> dict:
        # A common desktop viewport/screen so the headless browser looks
        # less like an automation default.
        return {
            "viewport": {"width": 1920, "height": 1080},
            "screen": {"width": 1920, "height": 1080},
        }

    @property
    def name(self) -> str:
        return "playwright"

    @property
    def supports_js(self) -> bool:
        return True

    # --- lifecycle / navigation ---------------------------------------

    def open(self, url: str) -> PageData:
        """Navigate to *url* and return a snapshot of the resulting page."""
        self._page.goto(url, timeout=self._timeout)
        return self._to_page_data()

    def new_page(self) -> PageData:
        """Replace the active page with a fresh blank one."""
        self._page = self._context.new_page()
        return PageData(url="about:blank")

    def close(self) -> None:
        """Tear down context, browser and driver, best-effort.

        Closing the context and browser explicitly before stopping the
        driver releases the Chromium process even if a later step fails.
        """
        for shutdown in (
            lambda: self._context.close() if self._context else None,
            lambda: self._browser.close() if self._browser else None,
            lambda: self._pw.stop() if self._pw else None,
        ):
            try:
                shutdown()
            except Exception:
                pass  # already closed / crashed — nothing sensible to do

    def _to_page_data(self) -> PageData:
        # Playwright's Page does not expose the response status here, so
        # 200 is reported unconditionally.
        return PageData(
            url=self._page.url,
            html=self._page.content(),
            text=self._page.inner_text("body") if self._page.query_selector("body") else "",
            title=self._page.title(),
            status=200,
        )

    def goto(self, url: str) -> None:
        self._page.goto(url, timeout=self._timeout)

    def reload(self) -> None:
        self._page.reload()

    def back(self) -> None:
        self._page.go_back()

    def forward(self) -> None:
        self._page.go_forward()

    # --- querying -----------------------------------------------------

    def find(self, selector: str) -> ElementData | None:
        """First element matching *selector*, or None."""
        el = self._page.query_selector(selector)
        return None if el is None else self._element_to_data(el)

    def find_all(self, selector: str) -> list[ElementData]:
        """All elements matching *selector* (possibly empty)."""
        return [self._element_to_data(el) for el in self._page.query_selector_all(selector)]

    def _element_to_data(self, el) -> ElementData:
        """Snapshot a live ElementHandle into a plain ElementData."""
        try:
            tag = el.evaluate("el => el.tagName.toLowerCase()")
        except Exception:
            tag = ""  # element may have detached between query and evaluate
        return ElementData(
            tag=tag,
            text=el.inner_text(),
            html=el.inner_html(),
            outer_html=el.evaluate("el => el.outerHTML"),
            attributes=el.evaluate(
                "el => { const a = {}; for (const attr of el.attributes) a[attr.name] = attr.value; return a; }"
            ),
            classes=el.evaluate("el => [...el.classList]"),
            element_id=el.evaluate("el => el.id"),
        )

    # --- interaction --------------------------------------------------

    def click(self, selector: str) -> None:
        self._page.click(selector)

    def double_click(self, selector: str) -> None:
        self._page.dblclick(selector)

    def right_click(self, selector: str) -> None:
        self._page.click(selector, button="right")

    def type(self, selector: str, text: str) -> None:
        # fill() replaces the field's value: clear-then-type semantics.
        self._page.fill(selector, text)

    def clear(self, selector: str) -> None:
        self._page.fill(selector, "")

    def submit(self, selector: str = "form") -> None:
        el = self._page.query_selector(selector)
        if el:
            # Call the DOM submit() directly so no submit button is needed.
            el.evaluate("el => el.submit()")

    def select(self, selector: str, value: str) -> None:
        self._page.select_option(selector, value)

    def hover(self, selector: str) -> None:
        self._page.hover(selector)

    def focus(self, selector: str) -> None:
        self._page.focus(selector)

    def blur(self, selector: str) -> None:
        # Pass the selector as an evaluate() argument instead of splicing
        # it into the JS source: quotes in the selector can no longer
        # break the script or inject code.
        self._page.evaluate("sel => document.querySelector(sel)?.blur()", selector)

    def scroll_to(self, selector: str) -> None:
        self._page.evaluate(
            "sel => document.querySelector(sel)?.scrollIntoView()", selector
        )

    def key(self, key: str) -> None:
        self._page.keyboard.press(key)

    def upload(self, selector: str, path: str) -> None:
        self._page.set_input_files(selector, path)

    # --- waiting ------------------------------------------------------

    def wait_for(self, selector: str, timeout: int = 10) -> None:
        self._page.wait_for_selector(selector, timeout=timeout * 1000)

    def wait_for_text(self, text: str, timeout: int = 10) -> None:
        # *text* travels as the function argument so quotes/backslashes
        # in it cannot corrupt the polled expression.
        self._page.wait_for_function(
            "needle => document.body.innerText.includes(needle)",
            arg=text,
            timeout=timeout * 1000,
        )

    def wait_for_url(self, pattern: str, timeout: int = 10) -> None:
        self._page.wait_for_url(pattern, timeout=timeout * 1000)

    def wait_for_load(self, timeout: int = 30) -> None:
        self._page.wait_for_load_state("load", timeout=timeout * 1000)

    def wait_for_network_idle(self, timeout: int = 30) -> None:
        self._page.wait_for_load_state("networkidle", timeout=timeout * 1000)

    def sleep(self, seconds: float) -> None:
        import time
        time.sleep(seconds)

    # --- JavaScript ---------------------------------------------------

    def evaluate(self, js_code: str) -> Any:
        return self._page.evaluate(js_code)

    def evaluate_on(self, selector: str, js: str) -> Any:
        # The selector travels as data; *js* remains trusted caller-supplied
        # code and is interpolated into the script verbatim.
        return self._page.evaluate(
            f"sel => document.querySelector(sel)?.{js}", selector
        )

    # --- output -------------------------------------------------------

    def screenshot(self, path: str | None = None) -> bytes:
        return self._page.screenshot(path=path)

    def pdf(self, path: str | None = None) -> bytes:
        return self._page.pdf(path=path)

    # --- network ------------------------------------------------------

    def set_headers(self, headers: dict[str, str]) -> None:
        self._context.set_extra_http_headers(headers)

    def get_cookies(self) -> list[dict]:
        # Normalise Playwright's cookie records to the subset crawlix uses.
        return [
            {"name": c["name"], "value": c["value"], "domain": c.get("domain", ""), "path": c.get("path", "")}
            for c in self._context.cookies()
        ]

    def set_cookies(self, cookies: list[dict]) -> None:
        self._context.add_cookies(cookies)

    def clear_cookies(self) -> None:
        self._context.clear_cookies()

    def intercept(self, pattern: str, handler) -> None:
        self._page.route(pattern, handler)
@@ -0,0 +1,176 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+ from typing import Any
6
+
7
+ from crawlix.exceptions import BackendError
8
+
9
+
10
@dataclass
class PageData:
    """Backend-agnostic snapshot of a loaded page."""

    url: str = ""      # final URL after any redirects
    html: str = ""     # raw document markup
    text: str = ""     # extracted visible text
    title: str = ""    # document title
    status: int = 200  # HTTP status code
    headers: dict[str, str] = field(default_factory=dict)
    cookies: list[dict[str, str]] = field(default_factory=list)
19
+
20
+
21
@dataclass
class ElementData:
    """Backend-agnostic snapshot of a single DOM element."""

    tag: str = ""         # lowercase tag name, e.g. "div"
    text: str = ""        # rendered inner text
    html: str = ""        # inner HTML
    outer_html: str = ""  # element including its own tag
    attributes: dict[str, str] = field(default_factory=dict)
    classes: list[str] = field(default_factory=list)
    element_id: str = ""  # the element's id attribute
    parent_data: ElementData | None = None
    children_data: list[ElementData] = field(default_factory=list)
32
+
33
+
34
class Backend(ABC):
    """Abstract interface implemented by every crawlix backend.

    Only ``name``, ``supports_js``, ``open``, ``close``, ``find`` and
    ``find_all`` are mandatory. Every other operation needs a real
    browser; the defaults raise BackendError with an installation hint
    so HTTP-only backends fail with a clear message.
    """

    # --- required interface -------------------------------------------

    @property
    @abstractmethod
    def name(self) -> str: ...

    @property
    @abstractmethod
    def supports_js(self) -> bool: ...

    @abstractmethod
    def open(self, url: str) -> PageData: ...

    @abstractmethod
    def close(self) -> None: ...

    @abstractmethod
    def find(self, selector: str) -> ElementData | None: ...

    @abstractmethod
    def find_all(self, selector: str) -> list[ElementData]: ...

    # --- generic helpers ----------------------------------------------

    def find_text(self, text: str) -> ElementData | None:
        """Return the smallest element whose text contains *text*.

        Scans every element and prefers the shortest matching text,
        i.e. the most specific node. None when nothing matches.
        """
        hits = [el for el in self.find_all("*") if text in (el.text or "")]
        if not hits:
            return None
        return min(hits, key=lambda el: len(el.text or ""))

    # --- browser-only operations --------------------------------------
    # Each default raises BackendError; browser backends override them.

    # Navigation
    def goto(self, url: str) -> None: raise BackendError(self._browser_only("goto"))
    def reload(self) -> None: raise BackendError(self._browser_only("reload"))
    def back(self) -> None: raise BackendError(self._browser_only("back"))
    def forward(self) -> None: raise BackendError(self._browser_only("forward"))

    # Querying
    def xpath(self, expr: str) -> list[ElementData]: raise BackendError(self._browser_only("xpath"))

    # Interaction
    def click(self, selector: str) -> None: raise BackendError(self._browser_only("click"))
    def double_click(self, selector: str) -> None: raise BackendError(self._browser_only("double_click"))
    def type(self, selector: str, text: str) -> None: raise BackendError(self._browser_only("type"))
    def clear(self, selector: str) -> None: raise BackendError(self._browser_only("clear"))
    def submit(self, selector: str = "form") -> None: raise BackendError(self._browser_only("submit"))
    def select(self, selector: str, value: str) -> None: raise BackendError(self._browser_only("select"))
    def hover(self, selector: str) -> None: raise BackendError(self._browser_only("hover"))
    def focus(self, selector: str) -> None: raise BackendError(self._browser_only("focus"))
    def blur(self, selector: str) -> None: raise BackendError(self._browser_only("blur"))
    def right_click(self, selector: str) -> None: raise BackendError(self._browser_only("right_click"))
    def scroll_to(self, selector: str) -> None: raise BackendError(self._browser_only("scroll_to"))
    def key(self, key: str) -> None: raise BackendError(self._browser_only("key"))
    def upload(self, selector: str, path: str) -> None: raise BackendError(self._browser_only("upload"))

    # Waiting
    def wait_for(self, selector: str, timeout: int = 10) -> None: raise BackendError(self._browser_only("wait_for"))
    def wait_for_text(self, text: str, timeout: int = 10) -> None: raise BackendError(self._browser_only("wait_for_text"))
    def wait_for_url(self, pattern: str, timeout: int = 10) -> None: raise BackendError(self._browser_only("wait_for_url"))
    def wait_for_load(self, timeout: int = 30) -> None: raise BackendError(self._browser_only("wait_for_load"))
    def wait_for_network_idle(self, timeout: int = 30) -> None: raise BackendError(self._browser_only("wait_for_network_idle"))
    def sleep(self, seconds: float) -> None: raise BackendError(self._browser_only("sleep"))

    # JavaScript
    def evaluate(self, js_code: str) -> Any: raise BackendError(self._browser_only("evaluate"))
    def evaluate_on(self, selector: str, js: str) -> Any: raise BackendError(self._browser_only("evaluate_on"))

    # Network
    def set_headers(self, headers: dict[str, str]) -> None: raise BackendError(self._browser_only("set_headers"))
    def set_cookies(self, cookies: list[dict]) -> None: raise BackendError(self._browser_only("set_cookies"))
    def get_cookies(self) -> list[dict]: raise BackendError(self._browser_only("get_cookies"))
    def clear_cookies(self) -> None: raise BackendError(self._browser_only("clear_cookies"))
    def intercept(self, pattern: str, handler) -> None: raise BackendError(self._browser_only("intercept"))

    # Output
    def screenshot(self, path: str | None = None) -> bytes: raise BackendError(self._browser_only("screenshot"))
    def pdf(self, path: str | None = None) -> bytes: raise BackendError(self._browser_only("pdf"))

    @staticmethod
    def _browser_only(method: str) -> str:
        """Build the explanatory message for an unsupported operation."""
        return (
            f"{method}() requires a browser backend.\n"
            "Install one:\n"
            " pip install crawlix[playwright] \u2190 recommended\n"
            " pip install crawlix[selenium]"
        )