webskrap 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
webskrap/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ from webskrap.client import WebSkrapClient, WebSkrapError, WebSkrapSession
2
+ from webskrap.models import (
3
+ BrowserProfile,
4
+ FetchResult,
5
+ ProxyConfig,
6
+ ResourcePolicy,
7
+ SessionConfig,
8
+ StealthConfig,
9
+ Viewport,
10
+ )
11
+ from webskrap.profiles import get_profile, list_profiles
12
+
13
# Names re-exported as the package's public API; each one is imported above
# from webskrap.client, webskrap.models or webskrap.profiles.
__all__ = [
    "BrowserProfile",
    "FetchResult",
    "ProxyConfig",
    "ResourcePolicy",
    "SessionConfig",
    "StealthConfig",
    "Viewport",
    "WebSkrapClient",
    "WebSkrapError",
    "WebSkrapSession",
    "get_profile",
    "list_profiles",
]
webskrap/cli.py ADDED
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from pathlib import Path
5
+ from typing import Annotated, cast
6
+
7
+ import typer
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+
11
+ from webskrap.client import WaitUntil, WebSkrapClient
12
+ from webskrap.models import ResourcePolicy, SessionConfig
13
+ from webskrap.profiles import get_profile, list_profiles
14
+
15
# Typer application on which the commands in this module register themselves.
app = typer.Typer(help="WebSkrap browser scraping toolkit.")
# Shared Rich console used for all output printed by the commands.
console = Console()
17
+
18
+
19
# CLI command `profiles`: print every bundled browser profile as a table.
# Comments are used instead of a docstring so the command's help output is
# left exactly as it was.
@app.command("profiles")
def profiles_command() -> None:
    listing = Table(title="WebSkrap Profiles")
    for heading in ("Name", "Viewport", "Locale", "Timezone", "Mobile"):
        listing.add_column(heading)

    # One row per profile, in the order list_profiles() yields them.
    for entry in list_profiles():
        size = entry.viewport
        mobile_label = "yes" if entry.is_mobile else "no"
        listing.add_row(
            entry.name,
            f"{size.width}x{size.height}",
            entry.locale,
            entry.timezone_id,
            mobile_label,
        )

    console.print(listing)
38
+
39
+
40
# CLI command `doctor`: synchronous entry point that drives the async check.
# Comments are used instead of a docstring so the command's help output is
# left exactly as it was.
@app.command("doctor")
def doctor_command() -> None:
    check = _doctor()
    asyncio.run(check)
43
+
44
+
45
async def _doctor() -> None:
    """Check that Playwright imports and that headless Chromium can launch.

    Prints a success message when both steps work; otherwise prints a
    diagnostic (including the install hint for Chromium) and exits.

    Raises:
        typer.Exit: With code 1 when the import or the browser launch fails.
    """
    try:
        from playwright.async_api import async_playwright
    except Exception as exc:
        console.print(f"[red]Playwright import failed:[/red] {exc}")
        raise typer.Exit(code=1) from exc

    try:
        # The context manager stops the Playwright driver on every exit path,
        # including a failed launch, so no driver process is left behind.
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)
            await browser.close()
    except Exception as exc:
        console.print("[yellow]Playwright is installed, but Chromium did not launch.[/yellow]")
        console.print(str(exc))
        console.print("Run: python -m playwright install chromium")
        raise typer.Exit(code=1) from exc

    console.print("[green]Playwright and Chromium are ready.[/green]")
65
+
66
+
67
# CLI command `fetch`: load one URL in a browser session and report the result.
# Comments are used instead of a docstring so the command's help output is
# left exactly as it was.
@app.command("fetch")
def fetch_command(
    url: Annotated[str, typer.Argument(help="URL to fetch.")],
    profile: Annotated[
        str,
        typer.Option("--profile", "-p", help="Bundled profile name."),
    ] = "desktop-chrome",
    headed: Annotated[bool, typer.Option("--headed", help="Run with a visible browser.")] = False,
    channel: Annotated[
        str | None,
        typer.Option("--channel", help="Browser channel, e.g. chrome."),
    ] = None,
    screenshot: Annotated[
        Path | None,
        typer.Option("--screenshot", help="Write a full-page screenshot to this path."),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option("--output", "-o", help="Write HTML to this file."),
    ] = None,
    wait_until: Annotated[
        str,
        typer.Option("--wait-until", help="commit, domcontentloaded, load, or networkidle."),
    ] = "domcontentloaded",
    timeout_ms: Annotated[
        float,
        typer.Option("--timeout-ms", min=1, help="Navigation timeout."),
    ] = 30_000,
    resource_policy: Annotated[
        ResourcePolicy,
        typer.Option("--resource-policy", help="Resource routing preset."),
    ] = ResourcePolicy.ALL,
    no_stealth: Annotated[
        bool,
        typer.Option("--no-stealth", help="Disable browser hardening."),
    ] = False,
) -> None:
    # Every option is forwarded unchanged to the async implementation.
    job = _fetch(
        url=url,
        profile=profile,
        headed=headed,
        channel=channel,
        screenshot=screenshot,
        output=output,
        wait_until=wait_until,
        timeout_ms=timeout_ms,
        resource_policy=resource_policy,
        no_stealth=no_stealth,
    )
    asyncio.run(job)
118
+
119
+
120
async def _fetch(
    *,
    url: str,
    profile: str,
    headed: bool,
    channel: str | None,
    screenshot: Path | None,
    output: Path | None,
    wait_until: str,
    timeout_ms: float,
    resource_policy: ResourcePolicy,
    no_stealth: bool,
) -> None:
    """Fetch ``url`` with a one-off client and print a summary of the result.

    Args:
        url: Address to navigate to.
        profile: Name of the profile to look up with ``get_profile``.
        headed: Run the browser with a visible window when true.
        channel: Optional browser channel passed to the session config.
        screenshot: Where to write a screenshot, or ``None`` for no screenshot.
        output: Where to write the page HTML, or ``None`` to skip writing it.
        wait_until: Navigation wait condition; validated by ``_parse_wait_until``.
        timeout_ms: Used both as the session's navigation timeout and as the
            timeout of this single fetch.
        resource_policy: Resource routing preset for the session.
        no_stealth: Disable the stealth init script when true.

    Raises:
        ValueError: If ``profile`` is rejected by ``get_profile``.
        typer.BadParameter: If ``wait_until`` is not an accepted value.
    """
    selected_profile = get_profile(profile)
    # Validate up front so a bad value is rejected before the client starts
    # Playwright, instead of after the driver is already running.
    parsed_wait_until = _parse_wait_until(wait_until)

    config = SessionConfig(
        headless=not headed,
        channel=channel,
        navigation_timeout_ms=timeout_ms,
        resource_policy=resource_policy,
    )
    config.stealth.enabled = not no_stealth

    async with WebSkrapClient() as client:
        result = await client.fetch(
            url,
            profile=selected_profile,
            config=config,
            wait_until=parsed_wait_until,
            # The client expects False (not None) to mean "no screenshot".
            screenshot=screenshot or False,
            timeout_ms=timeout_ms,
        )

    if output is not None:
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(result.text, encoding="utf-8")

    console.print(f"[bold]Status:[/bold] {result.status}")
    console.print(f"[bold]Final URL:[/bold] {result.final_url}")
    console.print(f"[bold]Title:[/bold] {result.title}")
    if result.screenshot_path:
        console.print(f"[bold]Screenshot:[/bold] {result.screenshot_path}")
    if output is not None:
        console.print(f"[bold]HTML:[/bold] {output}")
163
+
164
+
165
def _parse_wait_until(value: str) -> WaitUntil:
    """Return ``value`` typed as ``WaitUntil`` after checking it is accepted.

    Raises:
        typer.BadParameter: If ``value`` is not one of the accepted names.
    """
    valid = ("commit", "domcontentloaded", "load", "networkidle")
    if value in valid:
        # cast() only informs the type checker; the string is returned as-is.
        return cast(WaitUntil, value)
    raise typer.BadParameter(f"must be one of: {', '.join(valid)}")
webskrap/client.py ADDED
@@ -0,0 +1,252 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from collections.abc import Mapping
5
+ from pathlib import Path
6
+ from typing import Literal
7
+ from uuid import uuid4
8
+
9
+ from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
10
+
11
+ from webskrap.models import BrowserProfile, FetchResult, ResourcePolicy, SessionConfig
12
+ from webskrap.profiles import get_profile
13
+ from webskrap.stealth import apply_stealth
14
+
15
# Accepted values for the `wait_until` argument that is passed to page.goto().
WaitUntil = Literal["commit", "domcontentloaded", "load", "networkidle"]
16
+
17
+
18
class WebSkrapError(RuntimeError):
    """Error raised by WebSkrap itself, e.g. when a closed session is used."""
20
+
21
+
22
class WebSkrapSession:
    """A named browser context, plus the browser that owns it when there is one.

    Instances are created by ``WebSkrapClient``. ``browser`` is ``None`` when
    the context was not created from a separately launched browser; in that
    case only the context is closed.
    """

    def __init__(
        self,
        *,
        name: str,
        context: BrowserContext,
        config: SessionConfig,
        profile: BrowserProfile,
        browser: Browser | None = None,
    ) -> None:
        self.name = name
        self.context = context
        self.config = config
        self.profile = profile
        self.browser = browser
        self._closed = False

    async def __aenter__(self) -> WebSkrapSession:
        return self

    async def __aexit__(self, exc_type: object, exc: object, traceback: object) -> None:
        await self.close()

    async def fetch(
        self,
        url: str,
        *,
        wait_until: WaitUntil = "domcontentloaded",
        screenshot: bool | str | Path = False,
        timeout_ms: float | None = None,
    ) -> FetchResult:
        """Navigate a fresh page to ``url`` and capture the outcome.

        Args:
            url: Address to navigate to.
            wait_until: Navigation wait condition handed to ``page.goto``.
            screenshot: ``False`` for none, ``True`` for a generated file
                name, or an explicit path.
            timeout_ms: Navigation timeout for this call. ``None`` uses the
                session's ``navigation_timeout_ms``; any other value,
                including ``0``, is passed through to Playwright unchanged.

        Returns:
            A ``FetchResult`` with status, headers, HTML, title, cookies,
            elapsed time and the screenshot path (if any).

        Raises:
            WebSkrapError: If the session has already been closed.
        """
        self._ensure_open()
        started = time.perf_counter()
        # Compare against None rather than using `or`: an explicit 0 is falsy
        # and would otherwise be silently replaced by the configured default.
        navigation_timeout = (
            self.config.navigation_timeout_ms if timeout_ms is None else timeout_ms
        )
        page = await self.context.new_page()
        try:
            _configure_page_timeouts(page, self.config)
            response = await page.goto(
                url,
                wait_until=wait_until,
                timeout=navigation_timeout,
            )
            title = await page.title()
            text = await page.content()
            screenshot_path = await _maybe_screenshot(page, screenshot)
            cookies = await self.context.cookies()
            elapsed_ms = (time.perf_counter() - started) * 1000
            # goto() may yield no response object; report that as "no status".
            status = response.status if response else None
            headers = dict(response.headers) if response else {}
            return FetchResult(
                url=url,
                final_url=page.url,
                status=status,
                ok=status is not None and 200 <= status < 400,
                headers=headers,
                text=text,
                title=title,
                cookies=cookies,
                timings={"elapsed_ms": elapsed_ms},
                screenshot_path=screenshot_path,
            )
        finally:
            # The page is single-use; close it whether or not the fetch worked.
            await page.close()

    async def close(self) -> None:
        """Close the context and, if present, the browser. Safe to call twice."""
        if self._closed:
            return
        self._closed = True
        try:
            await self.context.close()
        finally:
            # Still release the browser when closing the context raised,
            # otherwise the browser would be leaked.
            if self.browser is not None:
                await self.browser.close()

    def _ensure_open(self) -> None:
        """Raise ``WebSkrapError`` if the session has been closed."""
        if self._closed:
            msg = f"session '{self.name}' is closed"
            raise WebSkrapError(msg)
97
+
98
+
99
class WebSkrapClient:
    """Owns one Playwright instance and the named sessions created from it.

    Use it as an async context manager, or call ``start()`` / ``close()``
    explicitly.
    """

    def __init__(
        self,
        *,
        default_config: SessionConfig | None = None,
        profiles: Mapping[str, BrowserProfile] | None = None,
    ) -> None:
        self.default_config = default_config or SessionConfig()
        # Copied so later changes to the caller's mapping do not affect the client.
        self.profiles = dict(profiles or {})
        self._playwright_manager = None
        self._playwright: Playwright | None = None
        self._sessions: dict[str, WebSkrapSession] = {}

    async def __aenter__(self) -> WebSkrapClient:
        await self.start()
        return self

    async def __aexit__(self, exc_type: object, exc: object, traceback: object) -> None:
        await self.close()

    async def start(self) -> None:
        """Start Playwright if it is not already running."""
        if self._playwright is not None:
            return
        manager = async_playwright()
        self._playwright = await manager.start()
        # Recorded only after a successful start so a failed start leaves the
        # client in its initial state.
        self._playwright_manager = manager

    async def close(self) -> None:
        """Close every session and stop Playwright.

        All sessions are attempted even if one of them fails to close, and
        Playwright is stopped regardless. The first session error, if any,
        is re-raised once cleanup has finished.
        """
        first_error: Exception | None = None
        for session in list(self._sessions.values()):
            try:
                await session.close()
            except Exception as exc:
                if first_error is None:
                    first_error = exc
        self._sessions.clear()

        playwright, self._playwright = self._playwright, None
        self._playwright_manager = None
        if playwright is not None:
            await playwright.stop()
        if first_error is not None:
            raise first_error

    async def fetch(
        self,
        url: str,
        *,
        profile: str | BrowserProfile | None = None,
        config: SessionConfig | None = None,
        wait_until: WaitUntil = "domcontentloaded",
        screenshot: bool | str | Path = False,
        timeout_ms: float | None = None,
    ) -> FetchResult:
        """Fetch ``url`` in a throwaway session that is closed afterwards.

        Args:
            url: Address to navigate to.
            profile: Profile instance or name; ``None`` selects the default.
            config: Session configuration; ``None`` uses ``default_config``.
            wait_until: Navigation wait condition.
            screenshot: Forwarded to ``WebSkrapSession.fetch``.
            timeout_ms: Forwarded to ``WebSkrapSession.fetch``.

        Returns:
            The ``FetchResult`` produced by the session.
        """
        # A random name keeps one-off sessions from colliding with named ones.
        name = f"_single_{uuid4().hex}"
        session = await self.session(name, config=config, profile=profile)
        try:
            return await session.fetch(
                url,
                wait_until=wait_until,
                screenshot=screenshot,
                timeout_ms=timeout_ms,
            )
        finally:
            try:
                await session.close()
            finally:
                # Unregister even when closing raised, so the dead session is
                # not kept in the registry.
                self._sessions.pop(name, None)

    async def session(
        self,
        name: str,
        *,
        config: SessionConfig | None = None,
        profile: str | BrowserProfile | None = None,
    ) -> WebSkrapSession:
        """Return the open session called ``name``, creating it if needed.

        ``config`` and ``profile`` are only used when a session is created.
        A cached session that has since been closed is replaced by a new one.
        """
        existing = self._sessions.get(name)
        if existing is not None and not existing._closed:
            return existing
        await self.start()
        resolved_config = config or self.default_config
        resolved_profile = self._resolve_profile(profile)
        session = await self._create_session(name, resolved_config, resolved_profile)
        self._sessions[name] = session
        return session

    def _resolve_profile(self, profile: str | BrowserProfile | None) -> BrowserProfile:
        """Turn a profile instance, name or ``None`` into a ``BrowserProfile``.

        Instances are returned as given. Names are looked up in the
        client-local ``profiles`` first (returned as a deep copy); anything
        else is delegated to ``get_profile``.
        """
        if isinstance(profile, BrowserProfile):
            return profile
        if profile in self.profiles:
            return self.profiles[profile].model_copy(deep=True)
        return get_profile(profile)

    async def _create_session(
        self,
        name: str,
        config: SessionConfig,
        profile: BrowserProfile,
    ) -> WebSkrapSession:
        """Launch the browser/context described by ``config`` and wrap it.

        With ``config.user_data_dir`` set a persistent context is launched
        and no separate browser object is kept; otherwise a browser is
        launched and a new context is created from it. Anything already
        launched is closed again if a later setup step fails.

        Raises:
            WebSkrapError: If the client has not been started.
        """
        if self._playwright is None:
            msg = "client is not started"
            raise WebSkrapError(msg)

        browser_type = getattr(self._playwright, config.browser)
        context_options = config.context_options(profile)

        browser = None
        if config.user_data_dir is not None:
            config.user_data_dir.mkdir(parents=True, exist_ok=True)
            context = await browser_type.launch_persistent_context(
                str(config.user_data_dir),
                **config.launch_options(),
                **context_options,
            )
        else:
            browser = await browser_type.launch(**config.launch_options())
            try:
                context = await browser.new_context(**context_options)
            except BaseException:
                # No context exists yet, so the browser is all there is to release.
                await browser.close()
                raise

        try:
            context.set_default_timeout(config.default_timeout_ms)
            context.set_default_navigation_timeout(config.navigation_timeout_ms)

            if config.resource_policy != ResourcePolicy.ALL:
                await context.route("**/*", _resource_route_handler(config.resource_policy))
            if config.stealth.enabled:
                await apply_stealth(context, profile, config.stealth)
        except BaseException:
            # Setup failed after launch: release what was opened, then re-raise.
            try:
                await context.close()
            finally:
                if browser is not None:
                    await browser.close()
            raise

        return WebSkrapSession(
            name=name,
            context=context,
            config=config,
            profile=profile,
            browser=browser,
        )
220
+
221
+
222
def _resource_route_handler(policy: ResourcePolicy):
    """Build an async route handler that aborts the resource types ``policy`` blocks.

    Args:
        policy: Routing preset. ``LITE`` blocks images, fonts and media;
            ``DOCUMENTS`` additionally blocks stylesheets. A policy without
            an entry (``ALL``) blocks nothing.

    Returns:
        An async callable taking a route object; it aborts blocked requests
        and continues all others.
    """
    blocked_by_policy = {
        ResourcePolicy.LITE: frozenset({"image", "font", "media"}),
        ResourcePolicy.DOCUMENTS: frozenset({"image", "font", "media", "stylesheet"}),
    }
    # .get() instead of indexing: an unlisted policy yields a pass-through
    # handler rather than a KeyError.
    blocked = blocked_by_policy.get(policy, frozenset())

    async def handle(route) -> None:
        if route.request.resource_type in blocked:
            await route.abort()
        else:
            await route.continue_()

    return handle
235
+
236
+
237
+ def _configure_page_timeouts(page: Page, config: SessionConfig) -> None:
238
+ page.set_default_timeout(config.default_timeout_ms)
239
+ page.set_default_navigation_timeout(config.navigation_timeout_ms)
240
+
241
+
242
+ async def _maybe_screenshot(page: Page, screenshot: bool | str | Path) -> Path | None:
243
+ if not screenshot:
244
+ return None
245
+ path = (
246
+ Path(f"webskrap-{int(time.time() * 1000)}.png")
247
+ if screenshot is True
248
+ else Path(screenshot)
249
+ )
250
+ path.parent.mkdir(parents=True, exist_ok=True)
251
+ await page.screenshot(path=str(path), full_page=True)
252
+ return path
webskrap/models.py ADDED
@@ -0,0 +1,206 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import StrEnum
4
+ from pathlib import Path
5
+ from typing import Any, Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
8
+
9
+
10
class ResourcePolicy(StrEnum):
    """Named presets selecting which resource types a session may load."""

    # No routing is installed for this policy; every request goes through.
    ALL = "all"
    # The client's route handler aborts image, font and media requests.
    LITE = "lite"
    # As LITE, and stylesheet requests are aborted as well.
    DOCUMENTS = "documents"
14
+
15
+
16
# Width/height pair; both values must be greater than zero.
class Viewport(BaseModel):
    width: int = Field(gt=0)
    height: int = Field(gt=0)

    def to_playwright(self) -> dict[str, int]:
        """Return the size as a ``{"width": ..., "height": ...}`` mapping."""
        return dict(width=self.width, height=self.height)
22
+
23
+
24
# Proxy settings; `server` must carry an explicit http/https/socks4/socks5 scheme.
class ProxyConfig(BaseModel):
    server: str
    bypass: str | None = None
    username: str | None = None
    password: str | None = None

    @field_validator("server")
    @classmethod
    def validate_server(cls, value: str) -> str:
        """Accept ``value`` only if it starts with a supported proxy scheme."""
        allowed_prefixes = ("http://", "https://", "socks4://", "socks5://")
        if value.startswith(allowed_prefixes):
            return value
        raise ValueError(
            "proxy server must start with http://, https://, socks4://, or socks5://"
        )

    def to_playwright(self) -> dict[str, str]:
        """Return the proxy settings as a mapping, leaving out unset or empty fields."""
        payload = {"server": self.server}
        for field_name in ("bypass", "username", "password"):
            field_value = getattr(self, field_name)
            if field_value:
                payload[field_name] = field_value
        return payload
48
+
49
+
50
# Switches for the stealth init script. Each patch_* flag is forwarded into the
# generated JavaScript, where it enables or disables one group of overrides.
class StealthConfig(BaseModel):
    # Master switch: when False no init script is added to the context at all.
    enabled: bool = True
    # Override the navigator.webdriver getter.
    patch_webdriver: bool = True
    # Define a window.chrome object when the page does not already have one.
    patch_chrome_runtime: bool = True
    # Wrap navigator.permissions.query for the "notifications" permission.
    patch_permissions: bool = True
    # Override the navigator.plugins and navigator.mimeTypes getters.
    patch_plugins: bool = True
    # Report the profile's WebGL vendor/renderer strings from getParameter.
    patch_webgl: bool = True
    # Wrap HTMLCanvasElement.prototype.toDataURL.
    patch_canvas: bool = True
    # Override navigator.hardwareConcurrency and navigator.deviceMemory.
    patch_hardware: bool = True
59
+
60
+
61
# Browser identity (viewport, locale, languages, hardware and WebGL strings)
# from which context options and default request headers are derived.
# Unknown fields are rejected.
class BrowserProfile(BaseModel):
    model_config = ConfigDict(extra="forbid")

    name: str
    user_agent: str | None = None
    viewport: Viewport = Field(default_factory=lambda: Viewport(width=1365, height=768))
    screen: Viewport = Field(default_factory=lambda: Viewport(width=1440, height=900))
    locale: str = "en-US"
    timezone_id: str = "Europe/Paris"
    device_scale_factor: float = Field(default=1.0, gt=0)
    is_mobile: bool = False
    has_touch: bool = False
    color_scheme: Literal["dark", "light", "no-preference", "null"] = "light"
    reduced_motion: Literal["reduce", "no-preference", "null"] = "no-preference"
    extra_http_headers: dict[str, str] = Field(default_factory=dict)
    navigator_languages: list[str] = Field(default_factory=lambda: ["en-US", "en"])
    hardware_concurrency: int = Field(default=8, ge=1, le=64)
    device_memory: int = Field(default=8, ge=1, le=128)
    webgl_vendor: str = "Google Inc. (Intel)"
    webgl_renderer: str = "ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)"

    @field_validator("name")
    @classmethod
    def validate_name(cls, value: str) -> str:
        """Reject a name that is empty or whitespace only."""
        if value.strip():
            return value
        raise ValueError("profile name cannot be empty")

    @field_validator("locale", "timezone_id")
    @classmethod
    def validate_non_empty(cls, value: str) -> str:
        """Reject a locale or timezone that is empty or whitespace only."""
        if value.strip():
            return value
        raise ValueError("value cannot be empty")

    @model_validator(mode="after")
    def ensure_language_consistency(self) -> BrowserProfile:
        """Make sure ``navigator_languages`` is non-empty and contains ``locale``."""
        if not self.navigator_languages:
            self.navigator_languages = [self.locale]
        elif self.locale not in self.navigator_languages:
            # A missing locale is placed first.
            self.navigator_languages.insert(0, self.locale)
        return self

    def accept_language(self) -> str:
        """Build an Accept-Language value from ``navigator_languages``.

        The first language carries no weight; each later one gets a ``q``
        value that drops by 0.1 per position and never goes below 0.1.
        """
        primary, *fallbacks = self.navigator_languages or [self.locale]
        parts = [primary]
        for rank, language in enumerate(fallbacks, start=1):
            quality = max(0.1, 1 - rank * 0.1)
            parts.append(f"{language};q={quality:.1f}")
        return ",".join(parts)

    def headers(self) -> dict[str, str]:
        """Return the default request headers, with ``extra_http_headers`` applied last."""
        accept = (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        )
        defaults = {
            "Accept": accept,
            "Accept-Language": self.accept_language(),
            "Upgrade-Insecure-Requests": "1",
        }
        # Later entries win, so profile-specific headers override the defaults.
        return {**defaults, **self.extra_http_headers}

    def to_context_options(self) -> dict[str, Any]:
        """Return this profile as keyword options for creating a browser context."""
        options: dict[str, Any] = dict(
            viewport=self.viewport.to_playwright(),
            screen=self.screen.to_playwright(),
            locale=self.locale,
            timezone_id=self.timezone_id,
            device_scale_factor=self.device_scale_factor,
            is_mobile=self.is_mobile,
            has_touch=self.has_touch,
            color_scheme=self.color_scheme,
            reduced_motion=self.reduced_motion,
            extra_http_headers=self.headers(),
        )
        # The user agent is only included when the profile sets one.
        if self.user_agent:
            options["user_agent"] = self.user_agent
        return options
143
+
144
+
145
# Launch and context settings for one session. Unknown fields are rejected.
class SessionConfig(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid")

    browser: Literal["chromium", "firefox", "webkit"] = "chromium"
    channel: str | None = None
    headless: bool = True
    user_data_dir: Path | None = None
    storage_state: Path | dict[str, Any] | None = None
    proxy: ProxyConfig | None = None
    resource_policy: ResourcePolicy = ResourcePolicy.ALL
    stealth: StealthConfig = Field(default_factory=StealthConfig)
    ignore_https_errors: bool = False
    java_script_enabled: bool = True
    service_workers: Literal["allow", "block"] = "allow"
    timeout_ms: float = Field(default=30_000, gt=0)
    navigation_timeout_ms: float = Field(default=30_000, gt=0)
    default_timeout_ms: float = Field(default=30_000, gt=0)
    slow_mo_ms: float | None = Field(default=None, ge=0)
    launch_args: list[str] = Field(default_factory=list)

    def launch_options(self) -> dict[str, Any]:
        """Return the keyword options used to launch the browser.

        ``headless`` and ``timeout`` are always present; ``channel``,
        ``slow_mo`` and ``args`` are only added when configured.
        """
        extras: dict[str, Any] = {}
        if self.channel:
            extras["channel"] = self.channel
        if self.slow_mo_ms is not None:
            extras["slow_mo"] = self.slow_mo_ms
        if self.launch_args:
            extras["args"] = self.launch_args
        return {"headless": self.headless, "timeout": self.timeout_ms, **extras}

    def context_options(self, profile: BrowserProfile) -> dict[str, Any]:
        """Return the profile's context options extended with session settings."""
        options = profile.to_context_options()
        options["ignore_https_errors"] = self.ignore_https_errors
        options["java_script_enabled"] = self.java_script_enabled
        options["service_workers"] = self.service_workers
        if self.proxy:
            options["proxy"] = self.proxy.to_playwright()
        # storage_state is only passed along when no user_data_dir is set.
        if self.user_data_dir is None and self.storage_state is not None:
            options["storage_state"] = self.storage_state
        return options
192
+
193
+
194
# Outcome of a single page fetch as assembled by WebSkrapSession.fetch().
class FetchResult(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # URL that was requested.
    url: str
    # URL of the page once navigation had finished.
    final_url: str
    # Status of the navigation response, or None when there was no response object.
    status: int | None
    # True when a status is present and lies in the range 200-399.
    ok: bool
    # Headers of the navigation response; empty when there was no response object.
    headers: dict[str, str]
    # Page content as returned by page.content().
    text: str
    # Page title.
    title: str
    # Cookies of the browser context after the page was loaded.
    cookies: list[dict[str, Any]]
    # Timing measurements; the session records "elapsed_ms".
    timings: dict[str, float]
    # Where the screenshot was written, if one was taken.
    screenshot_path: Path | None = None
webskrap/profiles.py ADDED
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from webskrap.models import BrowserProfile, Viewport
4
+
5
# Bundled profiles keyed by name. Each key equals the profile's own `name`.
# get_profile() hands out deep copies of these entries.
_PROFILES: dict[str, BrowserProfile] = {
    # Default desktop profile; no explicit user agent is set.
    "desktop-chrome": BrowserProfile(
        name="desktop-chrome",
        viewport=Viewport(width=1365, height=768),
        screen=Viewport(width=1440, height=900),
        locale="en-US",
        timezone_id="Europe/Paris",
        device_scale_factor=1,
        navigator_languages=["en-US", "en"],
        hardware_concurrency=8,
        device_memory=8,
        webgl_vendor="Google Inc. (Intel)",
        webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
    ),
    # Desktop profile with a larger viewport and NVIDIA WebGL strings.
    "desktop-edge": BrowserProfile(
        name="desktop-edge",
        viewport=Viewport(width=1440, height=810),
        screen=Viewport(width=1536, height=864),
        locale="en-US",
        timezone_id="Europe/Paris",
        device_scale_factor=1,
        navigator_languages=["en-US", "en"],
        hardware_concurrency=8,
        device_memory=8,
        webgl_vendor="Google Inc. (NVIDIA)",
        webgl_renderer="ANGLE (NVIDIA, NVIDIA GeForce RTX 3060 Direct3D11 vs_5_0 ps_5_0)",
    ),
    # Mobile profile: the only entry with an explicit user agent, touch
    # support and the mobile flag enabled.
    "mobile-chrome": BrowserProfile(
        name="mobile-chrome",
        user_agent=(
            "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36"
        ),
        viewport=Viewport(width=412, height=915),
        screen=Viewport(width=412, height=915),
        locale="en-US",
        timezone_id="Europe/Paris",
        device_scale_factor=2.625,
        is_mobile=True,
        has_touch=True,
        navigator_languages=["en-US", "en"],
        hardware_concurrency=8,
        device_memory=8,
        webgl_vendor="Google Inc. (Qualcomm)",
        webgl_renderer="ANGLE (Qualcomm, Adreno (TM) 730, OpenGL ES 3.2)",
    ),
}
52
+
53
+
54
def list_profiles() -> tuple[BrowserProfile, ...]:
    """Return a copy of every bundled profile, in registry order.

    Deep copies are returned, as ``get_profile`` does, so that callers
    cannot modify the shared bundled profiles through the result.
    """
    return tuple(profile.model_copy(deep=True) for profile in _PROFILES.values())
56
+
57
+
58
def get_profile(name: str | BrowserProfile | None = None) -> BrowserProfile:
    """Resolve ``name`` to a profile.

    Args:
        name: A profile instance (returned as given), the name of a bundled
            profile, or ``None``/empty for ``"desktop-chrome"``.

    Returns:
        The given instance, or a deep copy of the bundled profile.

    Raises:
        ValueError: If no bundled profile has that name.
    """
    if isinstance(name, BrowserProfile):
        return name

    key = name or "desktop-chrome"
    try:
        template = _PROFILES[key]
    except KeyError as exc:
        available = ", ".join(sorted(_PROFILES))
        raise ValueError(f"unknown profile '{key}'. Available profiles: {available}") from exc
    return template.model_copy(deep=True)
webskrap/py.typed ADDED
@@ -0,0 +1 @@
1
+
webskrap/stealth.py ADDED
@@ -0,0 +1,117 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from playwright.async_api import BrowserContext
6
+
7
+ from webskrap.models import BrowserProfile, StealthConfig
8
+
9
+
10
async def apply_stealth(
    context: BrowserContext,
    profile: BrowserProfile,
    config: StealthConfig,
) -> None:
    """Add the generated init script to ``context`` unless stealth is disabled."""
    if config.enabled:
        script = build_stealth_script(profile, config)
        await context.add_init_script(script=script)
18
+
19
+
20
def build_stealth_script(profile: BrowserProfile, config: StealthConfig) -> str:
    """Render the JavaScript init script for the given profile and flags.

    The profile values and the ``patch_*`` flags are serialized to JSON and
    embedded as the ``profile`` constant of an immediately-invoked function.

    Args:
        profile: Source of the navigator languages, hardware values and
            WebGL vendor/renderer strings.
        config: Source of the per-patch on/off flags.

    Returns:
        The JavaScript source as a string.
    """
    # Keys are camelCase because the script reads them as `profile.<key>`.
    payload = {
        "languages": profile.navigator_languages,
        "hardwareConcurrency": profile.hardware_concurrency,
        "deviceMemory": profile.device_memory,
        "webglVendor": profile.webgl_vendor,
        "webglRenderer": profile.webgl_renderer,
        "patchWebdriver": config.patch_webdriver,
        "patchChromeRuntime": config.patch_chrome_runtime,
        "patchPermissions": config.patch_permissions,
        "patchPlugins": config.patch_plugins,
        "patchWebgl": config.patch_webgl,
        "patchCanvas": config.patch_canvas,
        "patchHardware": config.patch_hardware,
    }
    # Compact separators: the JSON is emitted without spaces after "," and ":".
    encoded = json.dumps(payload, separators=(",", ":"))
    # This is an f-string, so literal JavaScript braces are doubled ({{ and }});
    # {encoded} is the only interpolated value.
    return f"""
(() => {{
const profile = {encoded};
const defineGetter = (target, prop, getter) => {{
try {{
Object.defineProperty(target, prop, {{ get: getter, configurable: true }});
}} catch (_) {{}}
}};

if (profile.patchWebdriver) {{
defineGetter(Navigator.prototype, "webdriver", () => undefined);
}}

defineGetter(Navigator.prototype, "languages", () => profile.languages.slice());
defineGetter(Navigator.prototype, "language", () => profile.languages[0]);

if (profile.patchHardware) {{
defineGetter(Navigator.prototype, "hardwareConcurrency", () => profile.hardwareConcurrency);
defineGetter(Navigator.prototype, "deviceMemory", () => profile.deviceMemory);
}}

if (profile.patchPlugins) {{
defineGetter(Navigator.prototype, "plugins", () => [1, 2, 3, 4, 5]);
defineGetter(Navigator.prototype, "mimeTypes", () => [1, 2, 3]);
}}

if (profile.patchChromeRuntime && !window.chrome) {{
try {{
Object.defineProperty(window, "chrome", {{
configurable: true,
value: {{
app: {{ isInstalled: false }},
csi: () => ({{}}),
loadTimes: () => ({{}}),
runtime: {{}}
}}
}});
}} catch (_) {{}}
}}

if (profile.patchPermissions && navigator.permissions && navigator.permissions.query) {{
const originalQuery = navigator.permissions.query.bind(navigator.permissions);
navigator.permissions.query = (parameters) => {{
if (parameters && parameters.name === "notifications") {{
return Promise.resolve({{ state: Notification.permission }});
}}
return originalQuery(parameters);
}};
}}

if (profile.patchWebgl) {{
const patchWebGL = (proto) => {{
if (!proto || !proto.getParameter) return;
const original = proto.getParameter;
proto.getParameter = function(parameter) {{
if (parameter === 37445) return profile.webglVendor;
if (parameter === 37446) return profile.webglRenderer;
return original.apply(this, arguments);
}};
}};
patchWebGL(window.WebGLRenderingContext && window.WebGLRenderingContext.prototype);
patchWebGL(window.WebGL2RenderingContext && window.WebGL2RenderingContext.prototype);
}}

if (profile.patchCanvas && window.HTMLCanvasElement) {{
const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
HTMLCanvasElement.prototype.toDataURL = function() {{
try {{
const context = this.getContext("2d");
if (context && this.width && this.height) {{
const x = Math.min(1, this.width - 1);
const y = Math.min(1, this.height - 1);
const imageData = context.getImageData(x, y, 1, 1);
imageData.data[0] = (imageData.data[0] + 1) % 255;
context.putImageData(imageData, x, y);
}}
}} catch (_) {{}}
return originalToDataURL.apply(this, arguments);
}};
}}
}})();
"""
@@ -0,0 +1,278 @@
1
+ Metadata-Version: 2.4
2
+ Name: webskrap
3
+ Version: 0.1.0
4
+ Summary: A Playwright-based Python scraping framework with coherent browser profiles and session controls.
5
+ Project-URL: Homepage, https://github.com/kacigaya/webskrap
6
+ Project-URL: Repository, https://github.com/kacigaya/webskrap
7
+ Project-URL: Issues, https://github.com/kacigaya/webskrap/issues
8
+ Author: WebSkrap contributors
9
+ License-Expression: MIT
10
+ Keywords: browser-automation,crawler,playwright,scraping
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Typing :: Typed
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: playwright>=1.49
21
+ Requires-Dist: pydantic>=2.8
22
+ Requires-Dist: rich>=13.9
23
+ Requires-Dist: typer>=0.15
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
26
+ Requires-Dist: pytest>=8.3; extra == 'dev'
27
+ Requires-Dist: ruff>=0.8; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ <p align="center">
31
+ <img src="assets/webskrap-logo.png" alt="WebSkrap logo" width="200">
32
+ </p>
33
+
34
+ <h1 align="center">WebSkrap</h1>
35
+
36
+ <p align="center">
37
+ <strong>Async-first Python scraping framework built on Playwright.</strong><br>
38
+ <em>It provides coherent browser profiles, persistent sessions, resource routing, and configurable browser hardening for data collection workflows that need realistic browser behavior.</em>
39
+ </p>
40
+
41
+ WebSkrap does not include CAPTCHA solving, login-wall bypassing, credential bypassing, or access-control circumvention. Use it only on targets you are allowed to access.
42
+
43
+ ## Install
44
+
45
+ ```bash
46
+ pip install webskrap
47
+ python -m playwright install chromium
48
+ ```
49
+
50
+ ## Quick Start
51
+
52
+ ```python
53
+ import asyncio
54
+
55
+ from webskrap import WebSkrapClient
56
+
57
+
58
+ async def main() -> None:
59
+ async with WebSkrapClient() as client:
60
+ result = await client.fetch("https://example.com")
61
+ print(result.status)
62
+ print(result.title)
63
+ print(result.text[:200])
64
+
65
+
66
+ asyncio.run(main())
67
+ ```
68
+
69
+ ## Persistent Session
70
+
71
+ ```python
72
+ import asyncio
73
+ from pathlib import Path
74
+
75
+ from webskrap import SessionConfig, WebSkrapClient
76
+
77
+
78
+ async def main() -> None:
79
+ config = SessionConfig(
80
+ user_data_dir=Path(".webskrap/sessions/shop"),
81
+ headless=True,
82
+ )
83
+
84
+ async with WebSkrapClient() as client:
85
+ session = await client.session("shop", config=config, profile="desktop-chrome")
86
+ first = await session.fetch("https://example.com")
87
+ second = await session.fetch("https://example.com/account")
88
+ print(first.final_url, second.final_url)
89
+
90
+
91
+ asyncio.run(main())
92
+ ```
93
+
94
+ ## Headed Browser
95
+
96
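+ Use a persistent session with `headless=False` when you want a visible browser window that stays open; in the example below, the `input()` call keeps the script waiting, and the browser open, until you press Enter.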
97
+
98
+ ```python
99
+ import asyncio
100
+ from pathlib import Path
101
+
102
+ from webskrap import SessionConfig, WebSkrapClient
103
+
104
+
105
+ async def main() -> None:
106
+ config = SessionConfig(
107
+ headless=False,
108
+ user_data_dir=Path(".webskrap/dev-session"),
109
+ )
110
+
111
+ async with WebSkrapClient() as client:
112
+ session = await client.session("dev", config=config)
113
+ page = await session.context.new_page()
114
+ await page.goto("https://example.com", wait_until="domcontentloaded")
115
+
116
+ input("Press Enter to close browser...")
117
+
118
+
119
+ asyncio.run(main())
120
+ ```
121
+
122
+ The following example opens a headed Chrome session (`channel="chrome"`, `headless=False`) with a custom French desktop profile:
123
+
124
+ ```python
125
+ import asyncio
126
+ from pathlib import Path
127
+
128
+ from webskrap import BrowserProfile, SessionConfig, Viewport, WebSkrapClient
129
+
130
+
131
+ async def main() -> None:
132
+ config = SessionConfig(
133
+ headless=False,
134
+ channel="chrome",
135
+ user_data_dir=Path(".webskrap/gmf"),
136
+ navigation_timeout_ms=90_000,
137
+ default_timeout_ms=90_000,
138
+ slow_mo_ms=50,
139
+ launch_args=[
140
+ "--start-maximized",
141
+ "--disable-blink-features=AutomationControlled",
142
+ "--no-first-run",
143
+ "--no-default-browser-check",
144
+ ],
145
+ )
146
+ profile = BrowserProfile(
147
+ name="fr-desktop",
148
+ viewport=Viewport(width=1440, height=900),
149
+ screen=Viewport(width=1440, height=900),
150
+ locale="fr-FR",
151
+ timezone_id="Europe/Paris",
152
+ navigator_languages=["fr-FR", "fr", "en-US", "en"],
153
+ )
154
+
155
+ async with WebSkrapClient() as client:
156
+ session = await client.session("gmf", config=config, profile=profile)
157
+ page = await session.context.new_page()
158
+ await page.goto("https://www.gmf.fr/habitation/devis", wait_until="domcontentloaded")
159
+
160
+ input("Press Enter to close browser...")
161
+
162
+
163
+ asyncio.run(main())
164
+ ```
165
+
166
+ ## Custom Profile
167
+
168
+ ```python
169
+ from webskrap import BrowserProfile, Viewport
170
+
171
+ profile = BrowserProfile(
172
+ name="workstation",
173
+ viewport=Viewport(width=1440, height=900),
174
+ screen=Viewport(width=1440, height=900),
175
+ locale="en-US",
176
+ timezone_id="Europe/Paris",
177
+ )
178
+ ```
179
+
180
+ ## Session Options
181
+
182
+ ```python
183
+ from pathlib import Path
184
+
185
+ from webskrap import ProxyConfig, ResourcePolicy, SessionConfig
186
+
187
+ config = SessionConfig(
188
+ browser="chromium",
189
+ channel="chrome",
190
+ headless=False,
191
+ user_data_dir=Path(".webskrap/session"),
192
+ storage_state=None,
193
+ proxy=ProxyConfig(server="http://127.0.0.1:8080"),
194
+ resource_policy=ResourcePolicy.LITE,
195
+ ignore_https_errors=True,
196
+ java_script_enabled=True,
197
+ service_workers="allow",
198
+ timeout_ms=30_000,
199
+ navigation_timeout_ms=90_000,
200
+ default_timeout_ms=90_000,
201
+ slow_mo_ms=50,
202
+ launch_args=[
203
+ "--start-maximized",
204
+ "--disable-blink-features=AutomationControlled",
205
+ "--disable-dev-shm-usage",
206
+ "--no-first-run",
207
+ "--no-default-browser-check",
208
+ ],
209
+ )
210
+ ```
211
+
212
+ `resource_policy` values:
213
+
214
+ - `ResourcePolicy.ALL`: allow all resources.
215
+ - `ResourcePolicy.LITE`: block images, fonts, and media.
216
+ - `ResourcePolicy.DOCUMENTS`: block images, fonts, media, and stylesheets.
217
+
218
+ ## Profile Options
219
+
220
+ ```python
221
+ from webskrap import BrowserProfile, Viewport
222
+
223
+ profile = BrowserProfile(
224
+ name="fr-desktop",
225
+ user_agent=None,
226
+ viewport=Viewport(width=1440, height=900),
227
+ screen=Viewport(width=1440, height=900),
228
+ locale="fr-FR",
229
+ timezone_id="Europe/Paris",
230
+ device_scale_factor=1,
231
+ is_mobile=False,
232
+ has_touch=False,
233
+ color_scheme="light",
234
+ reduced_motion="no-preference",
235
+ extra_http_headers={},
236
+ navigator_languages=["fr-FR", "fr", "en-US", "en"],
237
+ hardware_concurrency=8,
238
+ device_memory=8,
239
+ webgl_vendor="Google Inc. (Intel)",
240
+ webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
241
+ )
242
+ ```
243
+
244
+ ## Stealth Options
245
+
246
+ ```python
247
+ from webskrap import SessionConfig, StealthConfig
248
+
249
+ config = SessionConfig(
250
+ stealth=StealthConfig(
251
+ enabled=True,
252
+ patch_webdriver=True,
253
+ patch_chrome_runtime=True,
254
+ patch_permissions=True,
255
+ patch_plugins=True,
256
+ patch_webgl=True,
257
+ patch_canvas=True,
258
+ patch_hardware=True,
259
+ )
260
+ )
261
+ ```
262
+
263
+ ## CLI
264
+
265
+ ```bash
266
+ webskrap profiles
267
+ webskrap doctor
268
+ webskrap fetch https://example.com --profile desktop-chrome
269
+ webskrap fetch https://example.com --headed --screenshot example.png
270
+ ```
271
+
272
+ ## Development
273
+
274
+ ```bash
275
+ pip install -e ".[dev]"
276
+ pytest
277
+ ruff check .
278
+ ```
@@ -0,0 +1,11 @@
1
+ webskrap/__init__.py,sha256=JSQts2TI_3G41T5avnltIK3GRttbSZ8YtfcCxC-zG6k,551
2
+ webskrap/cli.py,sha256=TnKdDYQCE8zJgsPTXQGKwwXXwlzAl437o-dk9CLXplc,5187
3
+ webskrap/client.py,sha256=wVcipoyX-EsMPD7tv08lDojS_nA1zTIczU_4RmyArII,8232
4
+ webskrap/models.py,sha256=1Hhbzt_GJsv2SCWIK6msy04gmgrL-H-XnwC4wieng4M,7159
5
+ webskrap/profiles.py,sha256=jRPqsSRf-bpTLsBVQ9w_Fb6UU7v3Gmgc0b3h2dRhUrg,2366
6
+ webskrap/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
7
+ webskrap/stealth.py,sha256=btCeOFJTGIKTGZpUjp4zyD-bxmf3CQOSoX4o3qAMaVE,4082
8
+ webskrap-0.1.0.dist-info/METADATA,sha256=W9iPOKgXCig4TAVNBVAQStw3Fc6dyjjy0EP_wAqI2HU,7260
9
+ webskrap-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
10
+ webskrap-0.1.0.dist-info/entry_points.txt,sha256=nKdAJqYIMy8Ql7TH1MUER8IgiaUcBi7hqU8p4bhIU7A,46
11
+ webskrap-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ webskrap = webskrap.cli:app