webskrap 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ name: Publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ jobs:
12
+ publish:
13
+ runs-on: ubuntu-latest
14
+ environment: pypi
15
+ permissions:
16
+ id-token: write
17
+ contents: read
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.13"
23
+ - name: Install build
24
+ run: python -m pip install --upgrade build
25
+ - name: Build package
26
+ run: python -m build
27
+ - name: Publish to PyPI
28
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,13 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.py[cod]
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .mypy_cache/
7
+ .coverage
8
+ htmlcov/
9
+ dist/
10
+ build/
11
+ *.egg-info/
12
+ .webskrap/
13
+ test.py
@@ -0,0 +1,278 @@
1
+ Metadata-Version: 2.4
2
+ Name: webskrap
3
+ Version: 0.1.0
4
+ Summary: A Playwright-based Python scraping framework with coherent browser profiles and session controls.
5
+ Project-URL: Homepage, https://github.com/kacigaya/webskrap
6
+ Project-URL: Repository, https://github.com/kacigaya/webskrap
7
+ Project-URL: Issues, https://github.com/kacigaya/webskrap/issues
8
+ Author: WebSkrap contributors
9
+ License-Expression: MIT
10
+ Keywords: browser-automation,crawler,playwright,scraping
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Typing :: Typed
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: playwright>=1.49
21
+ Requires-Dist: pydantic>=2.8
22
+ Requires-Dist: rich>=13.9
23
+ Requires-Dist: typer>=0.15
24
+ Provides-Extra: dev
25
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
26
+ Requires-Dist: pytest>=8.3; extra == 'dev'
27
+ Requires-Dist: ruff>=0.8; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ <p align="center">
31
+ <img src="assets/webskrap-logo.png" alt="WebSkrap logo" width="200">
32
+ </p>
33
+
34
+ <h1 align="center">WebSkrap</h1>
35
+
36
+ <p align="center">
37
+ <strong>Async-first Python scraping framework built on Playwright.</strong><br>
38
+ <em>It provides coherent browser profiles, persistent sessions, resource routing, and configurable browser hardening for data collection workflows that need realistic browser behavior.</em>
39
+ </p>
40
+
41
+ WebSkrap does not include CAPTCHA solving, login-wall bypassing, credential bypassing, or access-control circumvention. Use it only on targets you are allowed to access.
42
+
43
+ ## Install
44
+
45
+ ```bash
46
+ pip install webskrap
47
+ python -m playwright install chromium
48
+ ```
49
+
50
+ ## Quick Start
51
+
52
+ ```python
53
+ import asyncio
54
+
55
+ from webskrap import WebSkrapClient
56
+
57
+
58
+ async def main() -> None:
59
+ async with WebSkrapClient() as client:
60
+ result = await client.fetch("https://example.com")
61
+ print(result.status)
62
+ print(result.title)
63
+ print(result.text[:200])
64
+
65
+
66
+ asyncio.run(main())
67
+ ```
68
+
69
+ ## Persistent Session
70
+
71
+ ```python
72
+ import asyncio
73
+ from pathlib import Path
74
+
75
+ from webskrap import SessionConfig, WebSkrapClient
76
+
77
+
78
+ async def main() -> None:
79
+ config = SessionConfig(
80
+ user_data_dir=Path(".webskrap/sessions/shop"),
81
+ headless=True,
82
+ )
83
+
84
+ async with WebSkrapClient() as client:
85
+ session = await client.session("shop", config=config, profile="desktop-chrome")
86
+ first = await session.fetch("https://example.com")
87
+ second = await session.fetch("https://example.com/account")
88
+ print(first.final_url, second.final_url)
89
+
90
+
91
+ asyncio.run(main())
92
+ ```
93
+
94
+ ## Headed Browser
95
+
96
+ Use a persistent session when you want the browser to stay open.
97
+
98
+ ```python
99
+ import asyncio
100
+ from pathlib import Path
101
+
102
+ from webskrap import SessionConfig, WebSkrapClient
103
+
104
+
105
+ async def main() -> None:
106
+ config = SessionConfig(
107
+ headless=False,
108
+ user_data_dir=Path(".webskrap/dev-session"),
109
+ )
110
+
111
+ async with WebSkrapClient() as client:
112
+ session = await client.session("dev", config=config)
113
+ page = await session.context.new_page()
114
+ await page.goto("https://example.com", wait_until="domcontentloaded")
115
+
116
+ input("Press Enter to close browser...")
117
+
118
+
119
+ asyncio.run(main())
120
+ ```
121
+
122
+ Example for a headed Chrome session with a French desktop profile:
123
+
124
+ ```python
125
+ import asyncio
126
+ from pathlib import Path
127
+
128
+ from webskrap import BrowserProfile, SessionConfig, Viewport, WebSkrapClient
129
+
130
+
131
+ async def main() -> None:
132
+ config = SessionConfig(
133
+ headless=False,
134
+ channel="chrome",
135
+ user_data_dir=Path(".webskrap/gmf"),
136
+ navigation_timeout_ms=90_000,
137
+ default_timeout_ms=90_000,
138
+ slow_mo_ms=50,
139
+ launch_args=[
140
+ "--start-maximized",
141
+ "--disable-blink-features=AutomationControlled",
142
+ "--no-first-run",
143
+ "--no-default-browser-check",
144
+ ],
145
+ )
146
+ profile = BrowserProfile(
147
+ name="fr-desktop",
148
+ viewport=Viewport(width=1440, height=900),
149
+ screen=Viewport(width=1440, height=900),
150
+ locale="fr-FR",
151
+ timezone_id="Europe/Paris",
152
+ navigator_languages=["fr-FR", "fr", "en-US", "en"],
153
+ )
154
+
155
+ async with WebSkrapClient() as client:
156
+ session = await client.session("gmf", config=config, profile=profile)
157
+ page = await session.context.new_page()
158
+ await page.goto("https://www.gmf.fr/habitation/devis", wait_until="domcontentloaded")
159
+
160
+ input("Press Enter to close browser...")
161
+
162
+
163
+ asyncio.run(main())
164
+ ```
165
+
166
+ ## Custom Profile
167
+
168
+ ```python
169
+ from webskrap import BrowserProfile, Viewport
170
+
171
+ profile = BrowserProfile(
172
+ name="workstation",
173
+ viewport=Viewport(width=1440, height=900),
174
+ screen=Viewport(width=1440, height=900),
175
+ locale="en-US",
176
+ timezone_id="Europe/Paris",
177
+ )
178
+ ```
179
+
180
+ ## Session Options
181
+
182
+ ```python
183
+ from pathlib import Path
184
+
185
+ from webskrap import ProxyConfig, ResourcePolicy, SessionConfig
186
+
187
+ config = SessionConfig(
188
+ browser="chromium",
189
+ channel="chrome",
190
+ headless=False,
191
+ user_data_dir=Path(".webskrap/session"),
192
+ storage_state=None,
193
+ proxy=ProxyConfig(server="http://127.0.0.1:8080"),
194
+ resource_policy=ResourcePolicy.LITE,
195
+ ignore_https_errors=True,
196
+ java_script_enabled=True,
197
+ service_workers="allow",
198
+ timeout_ms=30_000,
199
+ navigation_timeout_ms=90_000,
200
+ default_timeout_ms=90_000,
201
+ slow_mo_ms=50,
202
+ launch_args=[
203
+ "--start-maximized",
204
+ "--disable-blink-features=AutomationControlled",
205
+ "--disable-dev-shm-usage",
206
+ "--no-first-run",
207
+ "--no-default-browser-check",
208
+ ],
209
+ )
210
+ ```
211
+
212
+ `resource_policy` values:
213
+
214
+ - `ResourcePolicy.ALL`: allow all resources.
215
+ - `ResourcePolicy.LITE`: block images, fonts, and media.
216
+ - `ResourcePolicy.DOCUMENTS`: block images, fonts, media, and stylesheets.
217
+
218
+ ## Profile Options
219
+
220
+ ```python
221
+ from webskrap import BrowserProfile, Viewport
222
+
223
+ profile = BrowserProfile(
224
+ name="fr-desktop",
225
+ user_agent=None,
226
+ viewport=Viewport(width=1440, height=900),
227
+ screen=Viewport(width=1440, height=900),
228
+ locale="fr-FR",
229
+ timezone_id="Europe/Paris",
230
+ device_scale_factor=1,
231
+ is_mobile=False,
232
+ has_touch=False,
233
+ color_scheme="light",
234
+ reduced_motion="no-preference",
235
+ extra_http_headers={},
236
+ navigator_languages=["fr-FR", "fr", "en-US", "en"],
237
+ hardware_concurrency=8,
238
+ device_memory=8,
239
+ webgl_vendor="Google Inc. (Intel)",
240
+ webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
241
+ )
242
+ ```
243
+
244
+ ## Stealth Options
245
+
246
+ ```python
247
+ from webskrap import SessionConfig, StealthConfig
248
+
249
+ config = SessionConfig(
250
+ stealth=StealthConfig(
251
+ enabled=True,
252
+ patch_webdriver=True,
253
+ patch_chrome_runtime=True,
254
+ patch_permissions=True,
255
+ patch_plugins=True,
256
+ patch_webgl=True,
257
+ patch_canvas=True,
258
+ patch_hardware=True,
259
+ )
260
+ )
261
+ ```
262
+
263
+ ## CLI
264
+
265
+ ```bash
266
+ webskrap profiles
267
+ webskrap doctor
268
+ webskrap fetch https://example.com --profile desktop-chrome
269
+ webskrap fetch https://example.com --headed --screenshot example.png
270
+ ```
271
+
272
+ ## Development
273
+
274
+ ```bash
275
+ pip install -e ".[dev]"
276
+ pytest
277
+ ruff check .
278
+ ```
@@ -0,0 +1,249 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/kacigaya/webskrap/HEAD/assets/webskrap-logo.png" alt="WebSkrap logo" width="200">
3
+ </p>
4
+
5
+ <h1 align="center">WebSkrap</h1>
6
+
7
+ <p align="center">
8
+ <strong>Async-first Python scraping framework built on Playwright.</strong><br>
9
+ <em>It provides coherent browser profiles, persistent sessions, resource routing, and configurable browser hardening for data collection workflows that need realistic browser behavior.</em>
10
+ </p>
11
+
12
+ WebSkrap does not include CAPTCHA solving, login-wall bypassing, credential bypassing, or access-control circumvention. Use it only on targets you are allowed to access.
13
+
14
+ ## Install
15
+
16
+ ```bash
17
+ pip install webskrap
18
+ python -m playwright install chromium
19
+ ```
20
+
21
+ ## Quick Start
22
+
23
+ ```python
24
+ import asyncio
25
+
26
+ from webskrap import WebSkrapClient
27
+
28
+
29
+ async def main() -> None:
30
+ async with WebSkrapClient() as client:
31
+ result = await client.fetch("https://example.com")
32
+ print(result.status)
33
+ print(result.title)
34
+ print(result.text[:200])
35
+
36
+
37
+ asyncio.run(main())
38
+ ```
39
+
40
+ ## Persistent Session
41
+
42
+ ```python
43
+ import asyncio
44
+ from pathlib import Path
45
+
46
+ from webskrap import SessionConfig, WebSkrapClient
47
+
48
+
49
+ async def main() -> None:
50
+ config = SessionConfig(
51
+ user_data_dir=Path(".webskrap/sessions/shop"),
52
+ headless=True,
53
+ )
54
+
55
+ async with WebSkrapClient() as client:
56
+ session = await client.session("shop", config=config, profile="desktop-chrome")
57
+ first = await session.fetch("https://example.com")
58
+ second = await session.fetch("https://example.com/account")
59
+ print(first.final_url, second.final_url)
60
+
61
+
62
+ asyncio.run(main())
63
+ ```
64
+
65
+ ## Headed Browser
66
+
67
+ Set `headless=False` to show the browser window, and use a persistent session (`user_data_dir`) when you want the browser to stay open until you close it.
68
+
69
+ ```python
70
+ import asyncio
71
+ from pathlib import Path
72
+
73
+ from webskrap import SessionConfig, WebSkrapClient
74
+
75
+
76
+ async def main() -> None:
77
+ config = SessionConfig(
78
+ headless=False,
79
+ user_data_dir=Path(".webskrap/dev-session"),
80
+ )
81
+
82
+ async with WebSkrapClient() as client:
83
+ session = await client.session("dev", config=config)
84
+ page = await session.context.new_page()
85
+ await page.goto("https://example.com", wait_until="domcontentloaded")
86
+
87
+ input("Press Enter to close browser...")
88
+
89
+
90
+ asyncio.run(main())
91
+ ```
92
+
93
+ Example of a headed Chrome session with a French desktop profile:
94
+
95
+ ```python
96
+ import asyncio
97
+ from pathlib import Path
98
+
99
+ from webskrap import BrowserProfile, SessionConfig, Viewport, WebSkrapClient
100
+
101
+
102
+ async def main() -> None:
103
+ config = SessionConfig(
104
+ headless=False,
105
+ channel="chrome",
106
+ user_data_dir=Path(".webskrap/gmf"),
107
+ navigation_timeout_ms=90_000,
108
+ default_timeout_ms=90_000,
109
+ slow_mo_ms=50,
110
+ launch_args=[
111
+ "--start-maximized",
112
+ "--disable-blink-features=AutomationControlled",
113
+ "--no-first-run",
114
+ "--no-default-browser-check",
115
+ ],
116
+ )
117
+ profile = BrowserProfile(
118
+ name="fr-desktop",
119
+ viewport=Viewport(width=1440, height=900),
120
+ screen=Viewport(width=1440, height=900),
121
+ locale="fr-FR",
122
+ timezone_id="Europe/Paris",
123
+ navigator_languages=["fr-FR", "fr", "en-US", "en"],
124
+ )
125
+
126
+ async with WebSkrapClient() as client:
127
+ session = await client.session("gmf", config=config, profile=profile)
128
+ page = await session.context.new_page()
129
+ await page.goto("https://www.gmf.fr/habitation/devis", wait_until="domcontentloaded")
130
+
131
+ input("Press Enter to close browser...")
132
+
133
+
134
+ asyncio.run(main())
135
+ ```
136
+
137
+ ## Custom Profile
138
+
139
+ ```python
140
+ from webskrap import BrowserProfile, Viewport
141
+
142
+ profile = BrowserProfile(
143
+ name="workstation",
144
+ viewport=Viewport(width=1440, height=900),
145
+ screen=Viewport(width=1440, height=900),
146
+ locale="en-US",
147
+ timezone_id="Europe/Paris",
148
+ )
149
+ ```
150
+
151
+ ## Session Options
152
+
153
+ ```python
154
+ from pathlib import Path
155
+
156
+ from webskrap import ProxyConfig, ResourcePolicy, SessionConfig
157
+
158
+ config = SessionConfig(
159
+ browser="chromium",
160
+ channel="chrome",
161
+ headless=False,
162
+ user_data_dir=Path(".webskrap/session"),
163
+ storage_state=None,
164
+ proxy=ProxyConfig(server="http://127.0.0.1:8080"),
165
+ resource_policy=ResourcePolicy.LITE,
166
+ ignore_https_errors=True,
167
+ java_script_enabled=True,
168
+ service_workers="allow",
169
+ timeout_ms=30_000,
170
+ navigation_timeout_ms=90_000,
171
+ default_timeout_ms=90_000,
172
+ slow_mo_ms=50,
173
+ launch_args=[
174
+ "--start-maximized",
175
+ "--disable-blink-features=AutomationControlled",
176
+ "--disable-dev-shm-usage",
177
+ "--no-first-run",
178
+ "--no-default-browser-check",
179
+ ],
180
+ )
181
+ ```
182
+
183
+ `resource_policy` values:
184
+
185
+ - `ResourcePolicy.ALL`: allow all resources.
186
+ - `ResourcePolicy.LITE`: block images, fonts, and media.
187
+ - `ResourcePolicy.DOCUMENTS`: block images, fonts, media, and stylesheets.
188
+
189
+ ## Profile Options
190
+
191
+ ```python
192
+ from webskrap import BrowserProfile, Viewport
193
+
194
+ profile = BrowserProfile(
195
+ name="fr-desktop",
196
+ user_agent=None,
197
+ viewport=Viewport(width=1440, height=900),
198
+ screen=Viewport(width=1440, height=900),
199
+ locale="fr-FR",
200
+ timezone_id="Europe/Paris",
201
+ device_scale_factor=1,
202
+ is_mobile=False,
203
+ has_touch=False,
204
+ color_scheme="light",
205
+ reduced_motion="no-preference",
206
+ extra_http_headers={},
207
+ navigator_languages=["fr-FR", "fr", "en-US", "en"],
208
+ hardware_concurrency=8,
209
+ device_memory=8,
210
+ webgl_vendor="Google Inc. (Intel)",
211
+ webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
212
+ )
213
+ ```
214
+
215
+ ## Stealth Options
216
+
217
+ ```python
218
+ from webskrap import SessionConfig, StealthConfig
219
+
220
+ config = SessionConfig(
221
+ stealth=StealthConfig(
222
+ enabled=True,
223
+ patch_webdriver=True,
224
+ patch_chrome_runtime=True,
225
+ patch_permissions=True,
226
+ patch_plugins=True,
227
+ patch_webgl=True,
228
+ patch_canvas=True,
229
+ patch_hardware=True,
230
+ )
231
+ )
232
+ ```
233
+
234
+ ## CLI
235
+
236
+ ```bash
237
+ webskrap profiles
238
+ webskrap doctor
239
+ webskrap fetch https://example.com --profile desktop-chrome
240
+ webskrap fetch https://example.com --headed --screenshot example.png
241
+ ```
242
+
243
+ ## Development
244
+
245
+ ```bash
246
+ pip install -e ".[dev]"
247
+ pytest
248
+ ruff check .
249
+ ```
Binary file
@@ -0,0 +1,66 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.26"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "webskrap"
7
+ version = "0.1.0"
8
+ description = "A Playwright-based Python scraping framework with coherent browser profiles and session controls."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "WebSkrap contributors" }
14
+ ]
15
+ keywords = ["browser-automation", "playwright", "scraping", "crawler"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = [
27
+ "playwright>=1.49",
28
+ "pydantic>=2.8",
29
+ "rich>=13.9",
30
+ "typer>=0.15",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.3",
36
+ "pytest-asyncio>=0.24",
37
+ "ruff>=0.8",
38
+ ]
39
+
40
+ [project.scripts]
41
+ webskrap = "webskrap.cli:app"
42
+
43
+ [project.urls]
44
+ Homepage = "https://github.com/kacigaya/webskrap"
45
+ Repository = "https://github.com/kacigaya/webskrap"
46
+ Issues = "https://github.com/kacigaya/webskrap/issues"
47
+
48
+ [tool.hatch.build.targets.wheel]
49
+ packages = ["src/webskrap"]
50
+
51
+ [tool.pytest.ini_options]
52
+ asyncio_mode = "auto"
53
+ testpaths = ["tests"]
54
+ markers = [
55
+ "browser: requires installed Playwright browsers",
56
+ ]
57
+
58
+ [tool.ruff]
59
+ line-length = 100
60
+ target-version = "py311"
61
+
62
+ [tool.ruff.lint]
63
+ select = ["E", "F", "I", "UP", "B", "SIM"]
64
+
65
+ [tool.ruff.format]
66
+ quote-style = "double"
@@ -0,0 +1,26 @@
1
+ from webskrap.client import WebSkrapClient, WebSkrapError, WebSkrapSession
2
+ from webskrap.models import (
3
+ BrowserProfile,
4
+ FetchResult,
5
+ ProxyConfig,
6
+ ResourcePolicy,
7
+ SessionConfig,
8
+ StealthConfig,
9
+ Viewport,
10
+ )
11
+ from webskrap.profiles import get_profile, list_profiles
12
+
13
+ __all__ = [
14
+ "BrowserProfile",
15
+ "FetchResult",
16
+ "ProxyConfig",
17
+ "ResourcePolicy",
18
+ "SessionConfig",
19
+ "StealthConfig",
20
+ "Viewport",
21
+ "WebSkrapClient",
22
+ "WebSkrapError",
23
+ "WebSkrapSession",
24
+ "get_profile",
25
+ "list_profiles",
26
+ ]