webskrap 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webskrap/__init__.py +26 -0
- webskrap/cli.py +170 -0
- webskrap/client.py +252 -0
- webskrap/models.py +206 -0
- webskrap/profiles.py +67 -0
- webskrap/py.typed +1 -0
- webskrap/stealth.py +117 -0
- webskrap-0.1.0.dist-info/METADATA +278 -0
- webskrap-0.1.0.dist-info/RECORD +11 -0
- webskrap-0.1.0.dist-info/WHEEL +4 -0
- webskrap-0.1.0.dist-info/entry_points.txt +2 -0
webskrap/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from webskrap.client import WebSkrapClient, WebSkrapError, WebSkrapSession
|
|
2
|
+
from webskrap.models import (
|
|
3
|
+
BrowserProfile,
|
|
4
|
+
FetchResult,
|
|
5
|
+
ProxyConfig,
|
|
6
|
+
ResourcePolicy,
|
|
7
|
+
SessionConfig,
|
|
8
|
+
StealthConfig,
|
|
9
|
+
Viewport,
|
|
10
|
+
)
|
|
11
|
+
from webskrap.profiles import get_profile, list_profiles
|
|
12
|
+
|
|
13
|
+
# Names re-exported as the package's public API: the model types first, then
# the client types, then the profile lookup helpers.
__all__ = [
    "BrowserProfile",
    "FetchResult",
    "ProxyConfig",
    "ResourcePolicy",
    "SessionConfig",
    "StealthConfig",
    "Viewport",
    "WebSkrapClient",
    "WebSkrapError",
    "WebSkrapSession",
    "get_profile",
    "list_profiles",
]
|
webskrap/cli.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Annotated, cast
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from rich.console import Console
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
from webskrap.client import WaitUntil, WebSkrapClient
|
|
12
|
+
from webskrap.models import ResourcePolicy, SessionConfig
|
|
13
|
+
from webskrap.profiles import get_profile, list_profiles
|
|
14
|
+
|
|
15
|
+
# Typer application object; the command functions in this module register on it.
app = typer.Typer(help="WebSkrap browser scraping toolkit.")

# Shared Rich console used for all terminal output of the CLI.
console = Console()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@app.command("profiles")
def profiles_command() -> None:
    # Documented with comments rather than a docstring: Typer would show a
    # docstring as the command's help text, which would change CLI output.
    #
    # Prints one table row per bundled browser profile.
    table = Table(title="WebSkrap Profiles")
    for heading in ("Name", "Viewport", "Locale", "Timezone", "Mobile"):
        table.add_column(heading)

    for profile in list_profiles():
        dimensions = f"{profile.viewport.width}x{profile.viewport.height}"
        mobile_flag = "yes" if profile.is_mobile else "no"
        table.add_row(
            profile.name,
            dimensions,
            profile.locale,
            profile.timezone_id,
            mobile_flag,
        )

    console.print(table)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@app.command("doctor")
def doctor_command() -> None:
    # Synchronous Typer entry point: drives the async environment check to
    # completion on a fresh event loop. (Comment instead of docstring so the
    # CLI help text stays unchanged.)
    check = _doctor()
    asyncio.run(check)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def _doctor() -> None:
    """Check that Playwright imports and that headless Chromium can launch.

    Prints a diagnostic through ``console`` for each failure mode and a
    success line when both checks pass.

    Raises:
        typer.Exit: with ``code=1`` when the Playwright import fails or when
            Chromium cannot be launched.
    """
    try:
        # Imported lazily so a missing/broken Playwright install is reported
        # as a diagnostic instead of an import-time crash.
        from playwright.async_api import async_playwright
    except Exception as exc:
        console.print(f"[red]Playwright import failed:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    try:
        # The context manager stops the Playwright driver on every exit path,
        # including when the browser launch below raises.
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)
            await browser.close()
    except Exception as exc:
        console.print("[yellow]Playwright is installed, but Chromium did not launch.[/yellow]")
        console.print(str(exc))
        console.print("Run: python -m playwright install chromium")
        raise typer.Exit(code=1) from exc

    console.print("[green]Playwright and Chromium are ready.[/green]")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@app.command("fetch")
def fetch_command(
    url: Annotated[str, typer.Argument(help="URL to fetch.")],
    profile: Annotated[
        str,
        typer.Option("--profile", "-p", help="Bundled profile name."),
    ] = "desktop-chrome",
    headed: Annotated[bool, typer.Option("--headed", help="Run with a visible browser.")] = False,
    channel: Annotated[
        str | None,
        typer.Option("--channel", help="Browser channel, e.g. chrome."),
    ] = None,
    screenshot: Annotated[
        Path | None,
        typer.Option("--screenshot", help="Write a full-page screenshot to this path."),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option("--output", "-o", help="Write HTML to this file."),
    ] = None,
    wait_until: Annotated[
        str,
        typer.Option("--wait-until", help="commit, domcontentloaded, load, or networkidle."),
    ] = "domcontentloaded",
    timeout_ms: Annotated[
        float,
        typer.Option("--timeout-ms", min=1, help="Navigation timeout."),
    ] = 30_000,
    resource_policy: Annotated[
        ResourcePolicy,
        typer.Option("--resource-policy", help="Resource routing preset."),
    ] = ResourcePolicy.ALL,
    no_stealth: Annotated[
        bool,
        typer.Option("--no-stealth", help="Disable browser hardening."),
    ] = False,
) -> None:
    # Synchronous Typer entry point for `fetch`: forwards every parsed option
    # unchanged to the async implementation. (Comment instead of docstring so
    # the CLI help text stays unchanged.)
    forwarded = {
        "url": url,
        "profile": profile,
        "headed": headed,
        "channel": channel,
        "screenshot": screenshot,
        "output": output,
        "wait_until": wait_until,
        "timeout_ms": timeout_ms,
        "resource_policy": resource_policy,
        "no_stealth": no_stealth,
    }
    asyncio.run(_fetch(**forwarded))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
async def _fetch(
    *,
    url: str,
    profile: str,
    headed: bool,
    channel: str | None,
    screenshot: Path | None,
    output: Path | None,
    wait_until: str,
    timeout_ms: float,
    resource_policy: ResourcePolicy,
    no_stealth: bool,
) -> None:
    """Fetch ``url`` once with a throwaway client and print a short summary.

    Args:
        url: Address to navigate to.
        profile: Name of a bundled browser profile.
        headed: Run with a visible browser window when true.
        channel: Optional browser channel forwarded to the session config.
        screenshot: Destination for a full-page screenshot, or ``None``.
        output: File that receives the page HTML, or ``None``.
        wait_until: Navigation wait condition, validated here.
        timeout_ms: Navigation timeout, used for both the session config and
            the individual fetch.
        resource_policy: Resource routing preset for the session.
        no_stealth: Disable the stealth init script when true.

    Raises:
        typer.BadParameter: if ``wait_until`` or ``profile`` is not a known
            value.
    """
    # Validate user input first, so a typo is reported before the Playwright
    # driver and a browser process are started.
    parsed_wait_until = _parse_wait_until(wait_until)
    try:
        selected_profile = get_profile(profile)
    except ValueError as exc:
        # Surface an unknown profile as a CLI usage error, not a traceback.
        raise typer.BadParameter(str(exc), param_hint="--profile") from exc

    config = SessionConfig(
        headless=not headed,
        channel=channel,
        navigation_timeout_ms=timeout_ms,
        resource_policy=resource_policy,
    )
    config.stealth.enabled = not no_stealth

    async with WebSkrapClient() as client:
        result = await client.fetch(
            url,
            profile=selected_profile,
            config=config,
            wait_until=parsed_wait_until,
            # The client expects False (not None) for "no screenshot".
            screenshot=screenshot or False,
            timeout_ms=timeout_ms,
        )

    if output:
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(result.text, encoding="utf-8")

    console.print(f"[bold]Status:[/bold] {result.status}")
    console.print(f"[bold]Final URL:[/bold] {result.final_url}")
    console.print(f"[bold]Title:[/bold] {result.title}")
    if result.screenshot_path:
        console.print(f"[bold]Screenshot:[/bold] {result.screenshot_path}")
    if output:
        console.print(f"[bold]HTML:[/bold] {output}")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _parse_wait_until(value: str) -> WaitUntil:
    """Return ``value`` typed as ``WaitUntil`` if it names a known wait state.

    Raises:
        typer.BadParameter: if ``value`` is not one of ``commit``,
            ``domcontentloaded``, ``load`` or ``networkidle``.
    """
    valid = ("commit", "domcontentloaded", "load", "networkidle")
    if value in valid:
        return cast(WaitUntil, value)
    raise typer.BadParameter(f"must be one of: {', '.join(valid)}")
|
webskrap/client.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from collections.abc import Mapping
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
from uuid import uuid4
|
|
8
|
+
|
|
9
|
+
from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
|
|
10
|
+
|
|
11
|
+
from webskrap.models import BrowserProfile, FetchResult, ResourcePolicy, SessionConfig
|
|
12
|
+
from webskrap.profiles import get_profile
|
|
13
|
+
from webskrap.stealth import apply_stealth
|
|
14
|
+
|
|
15
|
+
# Accepted values for the ``wait_until`` argument that ``fetch`` forwards to
# ``page.goto``.
WaitUntil = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class WebSkrapError(RuntimeError):
    """Raised for invalid client or session state (closed session, unstarted client)."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class WebSkrapSession:
    """A named browser context that fetches pages, one new page per fetch.

    The session holds its ``BrowserContext`` and, when one was launched for
    it, the ``Browser``. Closing the session closes both. It can be used as
    an async context manager.
    """

    def __init__(
        self,
        *,
        name: str,
        context: BrowserContext,
        config: SessionConfig,
        profile: BrowserProfile,
        browser: Browser | None = None,
    ) -> None:
        """Store the already-created context (and optional browser).

        Args:
            name: Identifier used in error messages.
            context: Browser context that pages are opened in.
            config: Session settings; supplies the default timeouts.
            profile: Browser profile the context was created with.
            browser: Browser to close together with the context, or ``None``
                when there is no separate browser object to close.
        """
        self.name = name
        self.context = context
        self.config = config
        self.profile = profile
        self.browser = browser
        self._closed = False

    async def __aenter__(self) -> WebSkrapSession:
        return self

    async def __aexit__(self, exc_type: object, exc: object, traceback: object) -> None:
        await self.close()

    async def fetch(
        self,
        url: str,
        *,
        wait_until: WaitUntil = "domcontentloaded",
        screenshot: bool | str | Path = False,
        timeout_ms: float | None = None,
    ) -> FetchResult:
        """Navigate a fresh page to ``url`` and capture the result.

        Args:
            url: Address to navigate to.
            wait_until: Navigation event to wait for.
            screenshot: ``False`` for none, ``True`` for an auto-named file,
                or an explicit destination path.
            timeout_ms: Navigation timeout for this call. ``None`` uses the
                session's ``navigation_timeout_ms``; ``0`` is passed through
                unchanged (Playwright documents 0 as "no timeout").

        Returns:
            A ``FetchResult`` with status, headers, HTML, title, cookies,
            elapsed time and the screenshot path (if any).

        Raises:
            WebSkrapError: if the session is already closed.
        """
        self._ensure_open()
        started = time.perf_counter()
        # Compare against None explicitly: ``timeout_ms or default`` would
        # silently replace an explicit 0 with the session default.
        navigation_timeout = (
            self.config.navigation_timeout_ms if timeout_ms is None else timeout_ms
        )
        page = await self.context.new_page()
        try:
            _configure_page_timeouts(page, self.config)
            response = await page.goto(
                url,
                wait_until=wait_until,
                timeout=navigation_timeout,
            )
            title = await page.title()
            text = await page.content()
            screenshot_path = await _maybe_screenshot(page, screenshot)
            cookies = await self.context.cookies()
            elapsed_ms = (time.perf_counter() - started) * 1000
            # goto() may return no response object; report that as status None.
            status = response.status if response else None
            headers = dict(response.headers) if response else {}
            return FetchResult(
                url=url,
                final_url=page.url,
                status=status,
                ok=status is not None and 200 <= status < 400,
                headers=headers,
                text=text,
                title=title,
                cookies=cookies,
                timings={"elapsed_ms": elapsed_ms},
                screenshot_path=screenshot_path,
            )
        finally:
            await page.close()

    async def close(self) -> None:
        """Close the context and the owned browser; later calls are no-ops."""
        if self._closed:
            return
        self._closed = True
        try:
            await self.context.close()
        finally:
            # Still close the browser when closing the context raised, so the
            # browser process is not leaked.
            if self.browser is not None:
                await self.browser.close()

    def _ensure_open(self) -> None:
        """Raise ``WebSkrapError`` if the session has been closed."""
        if self._closed:
            msg = f"session '{self.name}' is closed"
            raise WebSkrapError(msg)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class WebSkrapClient:
    """Entry point that owns the Playwright driver and a registry of sessions.

    Usable as an async context manager: entering starts Playwright, exiting
    closes every registered session and stops Playwright.
    """

    def __init__(
        self,
        *,
        default_config: SessionConfig | None = None,
        profiles: Mapping[str, BrowserProfile] | None = None,
    ) -> None:
        """Create an unstarted client.

        Args:
            default_config: Config used for sessions created without one;
                a default ``SessionConfig()`` when omitted.
            profiles: Extra named profiles that take precedence over the
                bundled ones when a profile is requested by name.
        """
        self.default_config = default_config or SessionConfig()
        self.profiles = dict(profiles or {})
        self._playwright_manager = None
        self._playwright: Playwright | None = None
        self._sessions: dict[str, WebSkrapSession] = {}

    async def __aenter__(self) -> WebSkrapClient:
        await self.start()
        return self

    async def __aexit__(self, exc_type: object, exc: object, traceback: object) -> None:
        await self.close()

    async def start(self) -> None:
        """Start the Playwright driver; a no-op when already started."""
        if self._playwright is not None:
            return
        self._playwright_manager = async_playwright()
        self._playwright = await self._playwright_manager.start()

    async def close(self) -> None:
        """Close all registered sessions, then stop Playwright.

        Every session is attempted and Playwright is stopped even when a
        session fails to close; the first such failure is re-raised at the
        end.
        """
        sessions = list(self._sessions.values())
        self._sessions.clear()

        first_error: Exception | None = None
        for session in sessions:
            try:
                await session.close()
            except Exception as exc:
                # Keep going so one bad session cannot leak the others or
                # the Playwright driver; re-raised below.
                if first_error is None:
                    first_error = exc

        try:
            if self._playwright is not None:
                await self._playwright.stop()
        finally:
            self._playwright_manager = None
            self._playwright = None

        if first_error is not None:
            raise first_error

    async def fetch(
        self,
        url: str,
        *,
        profile: str | BrowserProfile | None = None,
        config: SessionConfig | None = None,
        wait_until: WaitUntil = "domcontentloaded",
        screenshot: bool | str | Path = False,
        timeout_ms: float | None = None,
    ) -> FetchResult:
        """Fetch ``url`` in a one-shot session that is closed afterwards.

        Arguments are forwarded to ``session()`` (``profile``, ``config``)
        and to ``WebSkrapSession.fetch`` (the rest).
        """
        name = f"_single_{uuid4().hex}"
        session = await self.session(name, config=config, profile=profile)
        try:
            return await session.fetch(
                url,
                wait_until=wait_until,
                screenshot=screenshot,
                timeout_ms=timeout_ms,
            )
        finally:
            # Unregister first so a failing close() cannot leave a stale
            # entry behind in the registry.
            self._sessions.pop(name, None)
            await session.close()

    async def session(
        self,
        name: str,
        *,
        config: SessionConfig | None = None,
        profile: str | BrowserProfile | None = None,
    ) -> WebSkrapSession:
        """Return the open session registered as ``name``, creating it if needed.

        ``config`` and ``profile`` are only used when a new session is
        created. Starts the client if it is not started yet.
        """
        existing = self._sessions.get(name)
        if existing is not None:
            if not existing._closed:
                return existing
            # The caller closed this session directly; handing it back would
            # only produce "session is closed" errors, so replace it.
            del self._sessions[name]

        await self.start()
        resolved_config = config or self.default_config
        resolved_profile = self._resolve_profile(profile)
        session = await self._create_session(name, resolved_config, resolved_profile)
        self._sessions[name] = session
        return session

    def _resolve_profile(self, profile: str | BrowserProfile | None) -> BrowserProfile:
        """Turn a profile instance, name, or ``None`` into a ``BrowserProfile``.

        Instances are returned as-is; names are looked up in this client's
        own profiles first (returned as a deep copy) and otherwise delegated
        to ``get_profile``.
        """
        if isinstance(profile, BrowserProfile):
            return profile
        if profile in self.profiles:
            return self.profiles[profile].model_copy(deep=True)
        return get_profile(profile)

    async def _create_session(
        self,
        name: str,
        config: SessionConfig,
        profile: BrowserProfile,
    ) -> WebSkrapSession:
        """Launch the browser/context described by ``config`` and wrap it.

        With ``config.user_data_dir`` set, a persistent context is launched
        in that directory (created if missing) and no separate browser is
        tracked; otherwise a browser is launched and a new context opened.

        Raises:
            WebSkrapError: if the client has not been started.
        """
        if self._playwright is None:
            msg = "client is not started"
            raise WebSkrapError(msg)

        browser_type = getattr(self._playwright, config.browser)
        context_options = config.context_options(profile)

        browser: Browser | None = None
        context: BrowserContext | None = None
        try:
            if config.user_data_dir is not None:
                config.user_data_dir.mkdir(parents=True, exist_ok=True)
                context = await browser_type.launch_persistent_context(
                    str(config.user_data_dir),
                    **config.launch_options(),
                    **context_options,
                )
            else:
                browser = await browser_type.launch(**config.launch_options())
                context = await browser.new_context(**context_options)

            context.set_default_timeout(config.default_timeout_ms)
            context.set_default_navigation_timeout(config.navigation_timeout_ms)

            if config.resource_policy != ResourcePolicy.ALL:
                await context.route("**/*", _resource_route_handler(config.resource_policy))
            if config.stealth.enabled:
                await apply_stealth(context, profile, config.stealth)
        except BaseException:
            # Setup failed part-way: release whatever was already launched so
            # no browser process outlives the error, then re-raise.
            if context is not None:
                await context.close()
            if browser is not None:
                await browser.close()
            raise

        return WebSkrapSession(
            name=name,
            context=context,
            config=config,
            profile=profile,
            browser=browser,
        )
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _resource_route_handler(policy: ResourcePolicy):
    """Build an async route handler that aborts the resource types ``policy`` blocks.

    ``LITE`` blocks images, fonts and media; ``DOCUMENTS`` additionally
    blocks stylesheets; ``ALL`` blocks nothing, so every request continues.

    Returns:
        A coroutine function taking a Playwright route; it aborts requests
        whose ``resource_type`` is blocked and continues all others.
    """
    if policy == ResourcePolicy.ALL:
        # "Allow everything" has no blocked set; handle it explicitly rather
        # than failing the table lookup with a KeyError.
        blocked: frozenset[str] = frozenset()
    else:
        blocked = {
            ResourcePolicy.LITE: frozenset({"image", "font", "media"}),
            ResourcePolicy.DOCUMENTS: frozenset({"image", "font", "media", "stylesheet"}),
        }[policy]

    async def handle(route) -> None:
        if route.request.resource_type in blocked:
            await route.abort()
        else:
            await route.continue_()

    return handle
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _configure_page_timeouts(page: Page, config: SessionConfig) -> None:
    """Apply the config's default and navigation timeouts to ``page``."""
    default_ms = config.default_timeout_ms
    navigation_ms = config.navigation_timeout_ms
    page.set_default_timeout(default_ms)
    page.set_default_navigation_timeout(navigation_ms)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
async def _maybe_screenshot(page: Page, screenshot: bool | str | Path) -> Path | None:
    """Take a full-page screenshot of ``page`` when one was requested.

    Args:
        page: Page to capture.
        screenshot: Falsy for no screenshot; ``True`` for an auto-generated
            ``webskrap-<epoch-ms>.png`` in the working directory; otherwise
            the destination path.

    Returns:
        The path written to, or ``None`` when no screenshot was requested.
    """
    if not screenshot:
        return None

    if screenshot is True:
        target = Path(f"webskrap-{int(time.time() * 1000)}.png")
    else:
        target = Path(screenshot)

    # Create missing parent directories before the screenshot is written.
    target.parent.mkdir(parents=True, exist_ok=True)
    await page.screenshot(path=str(target), full_page=True)
    return target
|
webskrap/models.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ResourcePolicy(StrEnum):
|
|
11
|
+
ALL = "all"
|
|
12
|
+
LITE = "lite"
|
|
13
|
+
DOCUMENTS = "documents"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Viewport(BaseModel):
    """A width/height pair; both dimensions must be positive integers."""

    width: int = Field(gt=0)
    height: int = Field(gt=0)

    def to_playwright(self) -> dict[str, int]:
        """Return the size as the ``{"width": ..., "height": ...}`` dict Playwright takes."""
        return dict(width=self.width, height=self.height)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ProxyConfig(BaseModel):
    """Proxy settings for a session: server URL plus optional bypass and credentials."""

    server: str
    bypass: str | None = None
    username: str | None = None
    password: str | None = None

    @field_validator("server")
    @classmethod
    def validate_server(cls, value: str) -> str:
        """Accept only server URLs with an http, https, socks4 or socks5 scheme."""
        if value.startswith(("http://", "https://", "socks4://", "socks5://")):
            return value
        msg = "proxy server must start with http://, https://, socks4://, or socks5://"
        raise ValueError(msg)

    def to_playwright(self) -> dict[str, str]:
        """Return the proxy dict for Playwright, omitting unset or empty optional fields."""
        optional = {
            "bypass": self.bypass,
            "username": self.username,
            "password": self.password,
        }
        payload = {"server": self.server}
        payload.update({key: value for key, value in optional.items() if value})
        return payload
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class StealthConfig(BaseModel):
    """Switches for the individual patches applied by the stealth init script."""

    # Master switch: when false, no init script is added to the context.
    enabled: bool = True
    # Make navigator.webdriver read as undefined.
    patch_webdriver: bool = True
    # Define a minimal window.chrome object when the page has none.
    patch_chrome_runtime: bool = True
    # Wrap navigator.permissions.query for the "notifications" permission.
    patch_permissions: bool = True
    # Replace navigator.plugins and navigator.mimeTypes.
    patch_plugins: bool = True
    # Report the profile's WebGL vendor/renderer from getParameter.
    patch_webgl: bool = True
    # Perturb one pixel before HTMLCanvasElement.toDataURL returns.
    patch_canvas: bool = True
    # Report the profile's hardwareConcurrency and deviceMemory.
    patch_hardware: bool = True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class BrowserProfile(BaseModel):
    """Browser identity used to build context options and the stealth payload.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    name: str
    # Only sent to Playwright when set (truthy).
    user_agent: str | None = None
    viewport: Viewport = Field(default_factory=lambda: Viewport(width=1365, height=768))
    screen: Viewport = Field(default_factory=lambda: Viewport(width=1440, height=900))
    locale: str = "en-US"
    timezone_id: str = "Europe/Paris"
    device_scale_factor: float = Field(default=1.0, gt=0)
    is_mobile: bool = False
    has_touch: bool = False
    color_scheme: Literal["dark", "light", "no-preference", "null"] = "light"
    reduced_motion: Literal["reduce", "no-preference", "null"] = "no-preference"
    # Merged over the generated default headers; same-named keys win.
    extra_http_headers: dict[str, str] = Field(default_factory=dict)
    # Drives both the Accept-Language header and navigator.languages.
    navigator_languages: list[str] = Field(default_factory=lambda: ["en-US", "en"])
    hardware_concurrency: int = Field(default=8, ge=1, le=64)
    device_memory: int = Field(default=8, ge=1, le=128)
    webgl_vendor: str = "Google Inc. (Intel)"
    webgl_renderer: str = "ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)"

    @field_validator("name")
    @classmethod
    def validate_name(cls, value: str) -> str:
        """Reject names that are empty or whitespace-only."""
        if value.strip():
            return value
        msg = "profile name cannot be empty"
        raise ValueError(msg)

    @field_validator("locale", "timezone_id")
    @classmethod
    def validate_non_empty(cls, value: str) -> str:
        """Reject empty or whitespace-only locale / timezone values."""
        if value.strip():
            return value
        msg = "value cannot be empty"
        raise ValueError(msg)

    @model_validator(mode="after")
    def ensure_language_consistency(self) -> BrowserProfile:
        """Guarantee that ``navigator_languages`` contains ``locale``.

        An empty list becomes ``[locale]``; otherwise a missing locale is
        inserted at the front.
        """
        if not self.navigator_languages:
            self.navigator_languages = [self.locale]
        elif self.locale not in self.navigator_languages:
            self.navigator_languages.insert(0, self.locale)
        return self

    def accept_language(self) -> str:
        """Build the ``Accept-Language`` value from ``navigator_languages``.

        The first language carries no weight; each following one gets a
        ``q`` value that drops by 0.1 per position and never goes below 0.1.
        """
        languages = self.navigator_languages or [self.locale]
        primary, *fallbacks = languages
        parts = [primary]
        for position, language in enumerate(fallbacks, start=1):
            quality = max(0.1, 1 - position * 0.1)
            parts.append(f"{language};q={quality:.1f}")
        return ",".join(parts)

    def headers(self) -> dict[str, str]:
        """Return the default request headers overlaid with ``extra_http_headers``."""
        defaults = {
            "Accept": (
                "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,*/*;q=0.8"
            ),
            "Accept-Language": self.accept_language(),
            "Upgrade-Insecure-Requests": "1",
        }
        return {**defaults, **self.extra_http_headers}

    def to_context_options(self) -> dict[str, Any]:
        """Return this profile as keyword options for creating a browser context."""
        options: dict[str, Any] = dict(
            viewport=self.viewport.to_playwright(),
            screen=self.screen.to_playwright(),
            locale=self.locale,
            timezone_id=self.timezone_id,
            device_scale_factor=self.device_scale_factor,
            is_mobile=self.is_mobile,
            has_touch=self.has_touch,
            color_scheme=self.color_scheme,
            reduced_motion=self.reduced_motion,
            extra_http_headers=self.headers(),
        )
        # Leave the key out entirely when no user agent override is configured.
        if self.user_agent:
            options["user_agent"] = self.user_agent
        return options
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class SessionConfig(BaseModel):
    """Launch and context settings for one session.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")

    browser: Literal["chromium", "firefox", "webkit"] = "chromium"
    channel: str | None = None
    headless: bool = True
    # When set, the session uses a persistent context in this directory.
    user_data_dir: Path | None = None
    # Only applied when no user_data_dir is configured.
    storage_state: Path | dict[str, Any] | None = None
    proxy: ProxyConfig | None = None
    resource_policy: ResourcePolicy = ResourcePolicy.ALL
    stealth: StealthConfig = Field(default_factory=StealthConfig)
    ignore_https_errors: bool = False
    java_script_enabled: bool = True
    service_workers: Literal["allow", "block"] = "allow"
    # Passed as the browser launch ``timeout``.
    timeout_ms: float = Field(default=30_000, gt=0)
    navigation_timeout_ms: float = Field(default=30_000, gt=0)
    default_timeout_ms: float = Field(default=30_000, gt=0)
    slow_mo_ms: float | None = Field(default=None, ge=0)
    launch_args: list[str] = Field(default_factory=list)

    def launch_options(self) -> dict[str, Any]:
        """Return keyword options for launching the browser.

        ``channel``, ``slow_mo`` and ``args`` are only included when set.
        """
        options: dict[str, Any] = dict(headless=self.headless, timeout=self.timeout_ms)
        if self.channel:
            options["channel"] = self.channel
        # 0 is a valid slow-mo value, so test against None, not truthiness.
        if self.slow_mo_ms is not None:
            options["slow_mo"] = self.slow_mo_ms
        if self.launch_args:
            options["args"] = self.launch_args
        return options

    def context_options(self, profile: BrowserProfile) -> dict[str, Any]:
        """Return context options: the profile's options plus session-level settings."""
        options = profile.to_context_options()
        options["ignore_https_errors"] = self.ignore_https_errors
        options["java_script_enabled"] = self.java_script_enabled
        options["service_workers"] = self.service_workers
        if self.proxy:
            options["proxy"] = self.proxy.to_playwright()
        # storage_state is skipped for persistent contexts (user_data_dir set).
        if self.user_data_dir is None and self.storage_state is not None:
            options["storage_state"] = self.storage_state
        return options
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class FetchResult(BaseModel):
    """Outcome of one page fetch."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # URL that was requested.
    url: str
    # URL of the page once navigation finished.
    final_url: str
    # HTTP status of the navigation response; None when there was no response.
    status: int | None
    # True when a status is present and 200 <= status < 400.
    ok: bool
    # Response headers; empty when there was no response.
    headers: dict[str, str]
    # Page content as returned by the page after navigation.
    text: str
    title: str
    # Cookies of the browser context after the fetch.
    cookies: list[dict[str, Any]]
    # Timing measurements; the fetch records "elapsed_ms".
    timings: dict[str, float]
    # Where the screenshot was written, if one was taken.
    screenshot_path: Path | None = None
|
webskrap/profiles.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from webskrap.models import BrowserProfile, Viewport
|
|
4
|
+
|
|
5
|
+
# Bundled profiles, keyed by each profile's own ``name``.
_PROFILES: dict[str, BrowserProfile] = {
    bundled.name: bundled
    for bundled in (
        BrowserProfile(
            name="desktop-chrome",
            viewport=Viewport(width=1365, height=768),
            screen=Viewport(width=1440, height=900),
            locale="en-US",
            timezone_id="Europe/Paris",
            device_scale_factor=1,
            navigator_languages=["en-US", "en"],
            hardware_concurrency=8,
            device_memory=8,
            webgl_vendor="Google Inc. (Intel)",
            webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
        ),
        BrowserProfile(
            name="desktop-edge",
            viewport=Viewport(width=1440, height=810),
            screen=Viewport(width=1536, height=864),
            locale="en-US",
            timezone_id="Europe/Paris",
            device_scale_factor=1,
            navigator_languages=["en-US", "en"],
            hardware_concurrency=8,
            device_memory=8,
            webgl_vendor="Google Inc. (NVIDIA)",
            webgl_renderer="ANGLE (NVIDIA, NVIDIA GeForce RTX 3060 Direct3D11 vs_5_0 ps_5_0)",
        ),
        BrowserProfile(
            name="mobile-chrome",
            user_agent=(
                "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36"
            ),
            viewport=Viewport(width=412, height=915),
            screen=Viewport(width=412, height=915),
            locale="en-US",
            timezone_id="Europe/Paris",
            device_scale_factor=2.625,
            is_mobile=True,
            has_touch=True,
            navigator_languages=["en-US", "en"],
            hardware_concurrency=8,
            device_memory=8,
            webgl_vendor="Google Inc. (Qualcomm)",
            webgl_renderer="ANGLE (Qualcomm, Adreno (TM) 730, OpenGL ES 3.2)",
        ),
    )
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def list_profiles() -> tuple[BrowserProfile, ...]:
    """Return deep copies of all bundled profiles, in registry order.

    Copies are returned (as ``get_profile`` does) so that callers mutating a
    listed profile cannot alter the shared bundled definitions.
    """
    return tuple(profile.model_copy(deep=True) for profile in _PROFILES.values())
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_profile(name: str | BrowserProfile | None = None) -> BrowserProfile:
    """Resolve ``name`` to a browser profile.

    Args:
        name: A ``BrowserProfile`` (returned unchanged), the name of a
            bundled profile, or a falsy value for ``"desktop-chrome"``.

    Returns:
        The given instance, or a deep copy of the bundled profile.

    Raises:
        ValueError: if no bundled profile has that name; the message lists
            the available names.
    """
    if isinstance(name, BrowserProfile):
        return name

    key = name or "desktop-chrome"
    try:
        bundled = _PROFILES[key]
    except KeyError as exc:
        available = ", ".join(sorted(_PROFILES))
        raise ValueError(f"unknown profile '{key}'. Available profiles: {available}") from exc
    return bundled.model_copy(deep=True)
|
webskrap/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
webskrap/stealth.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from playwright.async_api import BrowserContext
|
|
6
|
+
|
|
7
|
+
from webskrap.models import BrowserProfile, StealthConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def apply_stealth(
    context: BrowserContext,
    profile: BrowserProfile,
    config: StealthConfig,
) -> None:
    """Add the stealth init script to ``context``; do nothing when ``config.enabled`` is false."""
    if config.enabled:
        script = build_stealth_script(profile, config)
        await context.add_init_script(script=script)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_stealth_script(profile: BrowserProfile, config: StealthConfig) -> str:
    """Render the JavaScript init script that applies the enabled patches.

    The profile values and the per-patch switches are embedded as one JSON
    object (``profile`` inside the script). ``navigator.languages`` and
    ``navigator.language`` are always overridden; every other patch is
    guarded by its ``patch_*`` switch.

    Args:
        profile: Source of the language, hardware and WebGL values.
        config: Source of the ``patch_*`` switches.

    Returns:
        The script source as a self-invoking function expression.
    """
    payload = {
        "languages": profile.navigator_languages,
        "hardwareConcurrency": profile.hardware_concurrency,
        "deviceMemory": profile.device_memory,
        "webglVendor": profile.webgl_vendor,
        "webglRenderer": profile.webgl_renderer,
        "patchWebdriver": config.patch_webdriver,
        "patchChromeRuntime": config.patch_chrome_runtime,
        "patchPermissions": config.patch_permissions,
        "patchPlugins": config.patch_plugins,
        "patchWebgl": config.patch_webgl,
        "patchCanvas": config.patch_canvas,
        "patchHardware": config.patch_hardware,
    }
    encoded = json.dumps(payload, separators=(",", ":"))
    # Doubled braces are literal JS braces; only {encoded} is interpolated.
    # The canvas patch wraps the red channel modulo 256 so the full byte
    # range 0-255 stays reachable.
    return f"""
(() => {{
  const profile = {encoded};
  const defineGetter = (target, prop, getter) => {{
    try {{
      Object.defineProperty(target, prop, {{ get: getter, configurable: true }});
    }} catch (_) {{}}
  }};

  if (profile.patchWebdriver) {{
    defineGetter(Navigator.prototype, "webdriver", () => undefined);
  }}

  defineGetter(Navigator.prototype, "languages", () => profile.languages.slice());
  defineGetter(Navigator.prototype, "language", () => profile.languages[0]);

  if (profile.patchHardware) {{
    defineGetter(Navigator.prototype, "hardwareConcurrency", () => profile.hardwareConcurrency);
    defineGetter(Navigator.prototype, "deviceMemory", () => profile.deviceMemory);
  }}

  if (profile.patchPlugins) {{
    defineGetter(Navigator.prototype, "plugins", () => [1, 2, 3, 4, 5]);
    defineGetter(Navigator.prototype, "mimeTypes", () => [1, 2, 3]);
  }}

  if (profile.patchChromeRuntime && !window.chrome) {{
    try {{
      Object.defineProperty(window, "chrome", {{
        configurable: true,
        value: {{
          app: {{ isInstalled: false }},
          csi: () => ({{}}),
          loadTimes: () => ({{}}),
          runtime: {{}}
        }}
      }});
    }} catch (_) {{}}
  }}

  if (profile.patchPermissions && navigator.permissions && navigator.permissions.query) {{
    const originalQuery = navigator.permissions.query.bind(navigator.permissions);
    navigator.permissions.query = (parameters) => {{
      if (parameters && parameters.name === "notifications") {{
        return Promise.resolve({{ state: Notification.permission }});
      }}
      return originalQuery(parameters);
    }};
  }}

  if (profile.patchWebgl) {{
    const patchWebGL = (proto) => {{
      if (!proto || !proto.getParameter) return;
      const original = proto.getParameter;
      proto.getParameter = function(parameter) {{
        if (parameter === 37445) return profile.webglVendor;
        if (parameter === 37446) return profile.webglRenderer;
        return original.apply(this, arguments);
      }};
    }};
    patchWebGL(window.WebGLRenderingContext && window.WebGLRenderingContext.prototype);
    patchWebGL(window.WebGL2RenderingContext && window.WebGL2RenderingContext.prototype);
  }}

  if (profile.patchCanvas && window.HTMLCanvasElement) {{
    const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
    HTMLCanvasElement.prototype.toDataURL = function() {{
      try {{
        const context = this.getContext("2d");
        if (context && this.width && this.height) {{
          const x = Math.min(1, this.width - 1);
          const y = Math.min(1, this.height - 1);
          const imageData = context.getImageData(x, y, 1, 1);
          imageData.data[0] = (imageData.data[0] + 1) % 256;
          context.putImageData(imageData, x, y);
        }}
      }} catch (_) {{}}
      return originalToDataURL.apply(this, arguments);
    }};
  }}
}})();
"""
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: webskrap
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Playwright-based Python scraping framework with coherent browser profiles and session controls.
|
|
5
|
+
Project-URL: Homepage, https://github.com/kacigaya/webskrap
|
|
6
|
+
Project-URL: Repository, https://github.com/kacigaya/webskrap
|
|
7
|
+
Project-URL: Issues, https://github.com/kacigaya/webskrap/issues
|
|
8
|
+
Author: WebSkrap contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: browser-automation,crawler,playwright,scraping
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: playwright>=1.49
|
|
21
|
+
Requires-Dist: pydantic>=2.8
|
|
22
|
+
Requires-Dist: rich>=13.9
|
|
23
|
+
Requires-Dist: typer>=0.15
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8.3; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
<p align="center">
|
|
31
|
+
<img src="https://raw.githubusercontent.com/kacigaya/webskrap/HEAD/assets/webskrap-logo.png" alt="WebSkrap logo" width="200">
|
|
32
|
+
</p>
|
|
33
|
+
|
|
34
|
+
<h1 align="center">WebSkrap</h1>
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<strong>Async-first Python scraping framework built on Playwright.</strong><br>
|
|
38
|
+
<em>It provides coherent browser profiles, persistent sessions, resource routing, and configurable browser hardening for data collection workflows that need realistic browser behavior.</em>
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
WebSkrap does not include CAPTCHA solving, login-wall bypassing, credential bypassing, or access-control circumvention. Use it only on targets you are allowed to access.
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install webskrap
|
|
47
|
+
python -m playwright install chromium
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Quick Start
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import asyncio
|
|
54
|
+
|
|
55
|
+
from webskrap import WebSkrapClient
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def main() -> None:
|
|
59
|
+
async with WebSkrapClient() as client:
|
|
60
|
+
result = await client.fetch("https://example.com")
|
|
61
|
+
print(result.status)
|
|
62
|
+
print(result.title)
|
|
63
|
+
print(result.text[:200])
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
asyncio.run(main())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Persistent Session
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import asyncio
|
|
73
|
+
from pathlib import Path
|
|
74
|
+
|
|
75
|
+
from webskrap import SessionConfig, WebSkrapClient
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def main() -> None:
|
|
79
|
+
config = SessionConfig(
|
|
80
|
+
user_data_dir=Path(".webskrap/sessions/shop"),
|
|
81
|
+
headless=True,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
async with WebSkrapClient() as client:
|
|
85
|
+
session = await client.session("shop", config=config, profile="desktop-chrome")
|
|
86
|
+
first = await session.fetch("https://example.com")
|
|
87
|
+
second = await session.fetch("https://example.com/account")
|
|
88
|
+
print(first.final_url, second.final_url)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
asyncio.run(main())
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Headed Browser
|
|
95
|
+
|
|
96
|
+
Run a persistent session with `headless=False` when you want a visible browser window that stays open while you work with it.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import asyncio
|
|
100
|
+
from pathlib import Path
|
|
101
|
+
|
|
102
|
+
from webskrap import SessionConfig, WebSkrapClient
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
async def main() -> None:
|
|
106
|
+
config = SessionConfig(
|
|
107
|
+
headless=False,
|
|
108
|
+
user_data_dir=Path(".webskrap/dev-session"),
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
async with WebSkrapClient() as client:
|
|
112
|
+
session = await client.session("dev", config=config)
|
|
113
|
+
page = await session.context.new_page()
|
|
114
|
+
await page.goto("https://example.com", wait_until="domcontentloaded")
|
|
115
|
+
|
|
116
|
+
await asyncio.to_thread(input, "Press Enter to close browser...")
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
asyncio.run(main())
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The following example opens a headed Chrome session with a French desktop profile:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
import asyncio
|
|
126
|
+
from pathlib import Path
|
|
127
|
+
|
|
128
|
+
from webskrap import BrowserProfile, SessionConfig, Viewport, WebSkrapClient
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def main() -> None:
|
|
132
|
+
config = SessionConfig(
|
|
133
|
+
headless=False,
|
|
134
|
+
channel="chrome",
|
|
135
|
+
user_data_dir=Path(".webskrap/gmf"),
|
|
136
|
+
navigation_timeout_ms=90_000,
|
|
137
|
+
default_timeout_ms=90_000,
|
|
138
|
+
slow_mo_ms=50,
|
|
139
|
+
launch_args=[
|
|
140
|
+
"--start-maximized",
|
|
141
|
+
"--disable-blink-features=AutomationControlled",
|
|
142
|
+
"--no-first-run",
|
|
143
|
+
"--no-default-browser-check",
|
|
144
|
+
],
|
|
145
|
+
)
|
|
146
|
+
profile = BrowserProfile(
|
|
147
|
+
name="fr-desktop",
|
|
148
|
+
viewport=Viewport(width=1440, height=900),
|
|
149
|
+
screen=Viewport(width=1440, height=900),
|
|
150
|
+
locale="fr-FR",
|
|
151
|
+
timezone_id="Europe/Paris",
|
|
152
|
+
navigator_languages=["fr-FR", "fr", "en-US", "en"],
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
async with WebSkrapClient() as client:
|
|
156
|
+
session = await client.session("gmf", config=config, profile=profile)
|
|
157
|
+
page = await session.context.new_page()
|
|
158
|
+
await page.goto("https://www.gmf.fr/habitation/devis", wait_until="domcontentloaded")
|
|
159
|
+
|
|
160
|
+
await asyncio.to_thread(input, "Press Enter to close browser...")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
asyncio.run(main())
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Custom Profile
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from webskrap import BrowserProfile, Viewport
|
|
170
|
+
|
|
171
|
+
profile = BrowserProfile(
|
|
172
|
+
name="workstation",
|
|
173
|
+
viewport=Viewport(width=1440, height=900),
|
|
174
|
+
screen=Viewport(width=1440, height=900),
|
|
175
|
+
locale="en-US",
|
|
176
|
+
timezone_id="Europe/Paris",
|
|
177
|
+
)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Session Options
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from pathlib import Path
|
|
184
|
+
|
|
185
|
+
from webskrap import ProxyConfig, ResourcePolicy, SessionConfig
|
|
186
|
+
|
|
187
|
+
config = SessionConfig(
|
|
188
|
+
browser="chromium",
|
|
189
|
+
channel="chrome",
|
|
190
|
+
headless=False,
|
|
191
|
+
user_data_dir=Path(".webskrap/session"),
|
|
192
|
+
storage_state=None,
|
|
193
|
+
proxy=ProxyConfig(server="http://127.0.0.1:8080"),
|
|
194
|
+
resource_policy=ResourcePolicy.LITE,
|
|
195
|
+
ignore_https_errors=True,
|
|
196
|
+
java_script_enabled=True,
|
|
197
|
+
service_workers="allow",
|
|
198
|
+
timeout_ms=30_000,
|
|
199
|
+
navigation_timeout_ms=90_000,
|
|
200
|
+
default_timeout_ms=90_000,
|
|
201
|
+
slow_mo_ms=50,
|
|
202
|
+
launch_args=[
|
|
203
|
+
"--start-maximized",
|
|
204
|
+
"--disable-blink-features=AutomationControlled",
|
|
205
|
+
"--disable-dev-shm-usage",
|
|
206
|
+
"--no-first-run",
|
|
207
|
+
"--no-default-browser-check",
|
|
208
|
+
],
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
`resource_policy` values:
|
|
213
|
+
|
|
214
|
+
- `ResourcePolicy.ALL`: allow all resources.
|
|
215
|
+
- `ResourcePolicy.LITE`: block images, fonts, and media.
|
|
216
|
+
- `ResourcePolicy.DOCUMENTS`: block images, fonts, media, and stylesheets.
|
|
217
|
+
|
|
218
|
+
## Profile Options
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from webskrap import BrowserProfile, Viewport
|
|
222
|
+
|
|
223
|
+
profile = BrowserProfile(
|
|
224
|
+
name="fr-desktop",
|
|
225
|
+
user_agent=None,
|
|
226
|
+
viewport=Viewport(width=1440, height=900),
|
|
227
|
+
screen=Viewport(width=1440, height=900),
|
|
228
|
+
locale="fr-FR",
|
|
229
|
+
timezone_id="Europe/Paris",
|
|
230
|
+
device_scale_factor=1,
|
|
231
|
+
is_mobile=False,
|
|
232
|
+
has_touch=False,
|
|
233
|
+
color_scheme="light",
|
|
234
|
+
reduced_motion="no-preference",
|
|
235
|
+
extra_http_headers={},
|
|
236
|
+
navigator_languages=["fr-FR", "fr", "en-US", "en"],
|
|
237
|
+
hardware_concurrency=8,
|
|
238
|
+
device_memory=8,
|
|
239
|
+
webgl_vendor="Google Inc. (Intel)",
|
|
240
|
+
webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
|
|
241
|
+
)
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Stealth Options
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from webskrap import SessionConfig, StealthConfig
|
|
248
|
+
|
|
249
|
+
config = SessionConfig(
|
|
250
|
+
stealth=StealthConfig(
|
|
251
|
+
enabled=True,
|
|
252
|
+
patch_webdriver=True,
|
|
253
|
+
patch_chrome_runtime=True,
|
|
254
|
+
patch_permissions=True,
|
|
255
|
+
patch_plugins=True,
|
|
256
|
+
patch_webgl=True,
|
|
257
|
+
patch_canvas=True,
|
|
258
|
+
patch_hardware=True,
|
|
259
|
+
)
|
|
260
|
+
)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## CLI
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
webskrap profiles
|
|
267
|
+
webskrap doctor
|
|
268
|
+
webskrap fetch https://example.com --profile desktop-chrome
|
|
269
|
+
webskrap fetch https://example.com --headed --screenshot example.png
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## Development
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
pip install -e ".[dev]"
|
|
276
|
+
pytest
|
|
277
|
+
ruff check .
|
|
278
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
webskrap/__init__.py,sha256=JSQts2TI_3G41T5avnltIK3GRttbSZ8YtfcCxC-zG6k,551
|
|
2
|
+
webskrap/cli.py,sha256=TnKdDYQCE8zJgsPTXQGKwwXXwlzAl437o-dk9CLXplc,5187
|
|
3
|
+
webskrap/client.py,sha256=wVcipoyX-EsMPD7tv08lDojS_nA1zTIczU_4RmyArII,8232
|
|
4
|
+
webskrap/models.py,sha256=1Hhbzt_GJsv2SCWIK6msy04gmgrL-H-XnwC4wieng4M,7159
|
|
5
|
+
webskrap/profiles.py,sha256=jRPqsSRf-bpTLsBVQ9w_Fb6UU7v3Gmgc0b3h2dRhUrg,2366
|
|
6
|
+
webskrap/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
7
|
+
webskrap/stealth.py,sha256=btCeOFJTGIKTGZpUjp4zyD-bxmf3CQOSoX4o3qAMaVE,4082
|
|
8
|
+
webskrap-0.1.0.dist-info/METADATA,sha256=W9iPOKgXCig4TAVNBVAQStw3Fc6dyjjy0EP_wAqI2HU,7260
|
|
9
|
+
webskrap-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
webskrap-0.1.0.dist-info/entry_points.txt,sha256=nKdAJqYIMy8Ql7TH1MUER8IgiaUcBi7hqU8p4bhIU7A,46
|
|
11
|
+
webskrap-0.1.0.dist-info/RECORD,,
|