webskrap 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webskrap-0.1.0/.github/workflows/workflow.yml +28 -0
- webskrap-0.1.0/.gitignore +13 -0
- webskrap-0.1.0/PKG-INFO +278 -0
- webskrap-0.1.0/README.md +249 -0
- webskrap-0.1.0/assets/webskrap-logo.png +0 -0
- webskrap-0.1.0/pyproject.toml +66 -0
- webskrap-0.1.0/src/webskrap/__init__.py +26 -0
- webskrap-0.1.0/src/webskrap/cli.py +170 -0
- webskrap-0.1.0/src/webskrap/client.py +252 -0
- webskrap-0.1.0/src/webskrap/models.py +206 -0
- webskrap-0.1.0/src/webskrap/profiles.py +67 -0
- webskrap-0.1.0/src/webskrap/py.typed +1 -0
- webskrap-0.1.0/src/webskrap/stealth.py +117 -0
- webskrap-0.1.0/tests/test_browser_integration.py +82 -0
- webskrap-0.1.0/tests/test_client_unit.py +46 -0
- webskrap-0.1.0/tests/test_models.py +51 -0
- webskrap-0.1.0/tests/test_profiles.py +25 -0
- webskrap-0.1.0/tests/test_stealth.py +30 -0
- webskrap-0.1.0/uv.lock +454 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
publish:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment: pypi
|
|
15
|
+
permissions:
|
|
16
|
+
id-token: write
|
|
17
|
+
contents: read
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
- uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.13"
|
|
23
|
+
- name: Install build
|
|
24
|
+
run: python -m pip install --upgrade build
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: python -m build
|
|
27
|
+
- name: Publish to PyPI
|
|
28
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
webskrap-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: webskrap
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Playwright-based Python scraping framework with coherent browser profiles and session controls.
|
|
5
|
+
Project-URL: Homepage, https://github.com/kacigaya/webskrap
|
|
6
|
+
Project-URL: Repository, https://github.com/kacigaya/webskrap
|
|
7
|
+
Project-URL: Issues, https://github.com/kacigaya/webskrap/issues
|
|
8
|
+
Author: WebSkrap contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: browser-automation,crawler,playwright,scraping
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: playwright>=1.49
|
|
21
|
+
Requires-Dist: pydantic>=2.8
|
|
22
|
+
Requires-Dist: rich>=13.9
|
|
23
|
+
Requires-Dist: typer>=0.15
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8.3; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
<p align="center">
|
|
31
|
+
<img src="https://raw.githubusercontent.com/kacigaya/webskrap/HEAD/assets/webskrap-logo.png" alt="WebSkrap logo" width="200">
|
|
32
|
+
</p>
|
|
33
|
+
|
|
34
|
+
<h1 align="center">WebSkrap</h1>
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<strong>Async-first Python scraping framework built on Playwright.</strong><br>
|
|
38
|
+
<em>It provides coherent browser profiles, persistent sessions, resource routing, and configurable browser hardening for data collection workflows that need realistic browser behavior.</em>
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
WebSkrap does not include CAPTCHA solving, login-wall bypassing, credential bypassing, or access-control circumvention. Use it only on targets you are allowed to access.
|
|
42
|
+
|
|
43
|
+
## Install
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install webskrap
|
|
47
|
+
python -m playwright install chromium
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Quick Start
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import asyncio
|
|
54
|
+
|
|
55
|
+
from webskrap import WebSkrapClient
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def main() -> None:
|
|
59
|
+
async with WebSkrapClient() as client:
|
|
60
|
+
result = await client.fetch("https://example.com")
|
|
61
|
+
print(result.status)
|
|
62
|
+
print(result.title)
|
|
63
|
+
print(result.text[:200])
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
asyncio.run(main())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Persistent Session
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import asyncio
|
|
73
|
+
from pathlib import Path
|
|
74
|
+
|
|
75
|
+
from webskrap import SessionConfig, WebSkrapClient
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def main() -> None:
|
|
79
|
+
config = SessionConfig(
|
|
80
|
+
user_data_dir=Path(".webskrap/sessions/shop"),
|
|
81
|
+
headless=True,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
async with WebSkrapClient() as client:
|
|
85
|
+
session = await client.session("shop", config=config, profile="desktop-chrome")
|
|
86
|
+
first = await session.fetch("https://example.com")
|
|
87
|
+
second = await session.fetch("https://example.com/account")
|
|
88
|
+
print(first.final_url, second.final_url)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
asyncio.run(main())
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Headed Browser
|
|
95
|
+
|
|
96
|
+
To keep a visible browser window open, set `headless=False` and use a persistent session (configured with `user_data_dir`), as in the example below.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import asyncio
|
|
100
|
+
from pathlib import Path
|
|
101
|
+
|
|
102
|
+
from webskrap import SessionConfig, WebSkrapClient
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
async def main() -> None:
|
|
106
|
+
config = SessionConfig(
|
|
107
|
+
headless=False,
|
|
108
|
+
user_data_dir=Path(".webskrap/dev-session"),
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
async with WebSkrapClient() as client:
|
|
112
|
+
session = await client.session("dev", config=config)
|
|
113
|
+
page = await session.context.new_page()
|
|
114
|
+
await page.goto("https://example.com", wait_until="domcontentloaded")
|
|
115
|
+
|
|
116
|
+
input("Press Enter to close browser...")
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
asyncio.run(main())
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Example for a headed Chrome session with a French desktop profile:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
import asyncio
|
|
126
|
+
from pathlib import Path
|
|
127
|
+
|
|
128
|
+
from webskrap import BrowserProfile, SessionConfig, Viewport, WebSkrapClient
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def main() -> None:
|
|
132
|
+
config = SessionConfig(
|
|
133
|
+
headless=False,
|
|
134
|
+
channel="chrome",
|
|
135
|
+
user_data_dir=Path(".webskrap/gmf"),
|
|
136
|
+
navigation_timeout_ms=90_000,
|
|
137
|
+
default_timeout_ms=90_000,
|
|
138
|
+
slow_mo_ms=50,
|
|
139
|
+
launch_args=[
|
|
140
|
+
"--start-maximized",
|
|
141
|
+
"--disable-blink-features=AutomationControlled",
|
|
142
|
+
"--no-first-run",
|
|
143
|
+
"--no-default-browser-check",
|
|
144
|
+
],
|
|
145
|
+
)
|
|
146
|
+
profile = BrowserProfile(
|
|
147
|
+
name="fr-desktop",
|
|
148
|
+
viewport=Viewport(width=1440, height=900),
|
|
149
|
+
screen=Viewport(width=1440, height=900),
|
|
150
|
+
locale="fr-FR",
|
|
151
|
+
timezone_id="Europe/Paris",
|
|
152
|
+
navigator_languages=["fr-FR", "fr", "en-US", "en"],
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
async with WebSkrapClient() as client:
|
|
156
|
+
session = await client.session("gmf", config=config, profile=profile)
|
|
157
|
+
page = await session.context.new_page()
|
|
158
|
+
await page.goto("https://www.gmf.fr/habitation/devis", wait_until="domcontentloaded")
|
|
159
|
+
|
|
160
|
+
input("Press Enter to close browser...")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
asyncio.run(main())
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Custom Profile
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from webskrap import BrowserProfile, Viewport
|
|
170
|
+
|
|
171
|
+
profile = BrowserProfile(
|
|
172
|
+
name="workstation",
|
|
173
|
+
viewport=Viewport(width=1440, height=900),
|
|
174
|
+
screen=Viewport(width=1440, height=900),
|
|
175
|
+
locale="en-US",
|
|
176
|
+
timezone_id="Europe/Paris",
|
|
177
|
+
)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Session Options
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from pathlib import Path
|
|
184
|
+
|
|
185
|
+
from webskrap import ProxyConfig, ResourcePolicy, SessionConfig
|
|
186
|
+
|
|
187
|
+
config = SessionConfig(
|
|
188
|
+
browser="chromium",
|
|
189
|
+
channel="chrome",
|
|
190
|
+
headless=False,
|
|
191
|
+
user_data_dir=Path(".webskrap/session"),
|
|
192
|
+
storage_state=None,
|
|
193
|
+
proxy=ProxyConfig(server="http://127.0.0.1:8080"),
|
|
194
|
+
resource_policy=ResourcePolicy.LITE,
|
|
195
|
+
ignore_https_errors=True,
|
|
196
|
+
java_script_enabled=True,
|
|
197
|
+
service_workers="allow",
|
|
198
|
+
timeout_ms=30_000,
|
|
199
|
+
navigation_timeout_ms=90_000,
|
|
200
|
+
default_timeout_ms=90_000,
|
|
201
|
+
slow_mo_ms=50,
|
|
202
|
+
launch_args=[
|
|
203
|
+
"--start-maximized",
|
|
204
|
+
"--disable-blink-features=AutomationControlled",
|
|
205
|
+
"--disable-dev-shm-usage",
|
|
206
|
+
"--no-first-run",
|
|
207
|
+
"--no-default-browser-check",
|
|
208
|
+
],
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
`resource_policy` values:
|
|
213
|
+
|
|
214
|
+
- `ResourcePolicy.ALL`: allow all resources.
|
|
215
|
+
- `ResourcePolicy.LITE`: block images, fonts, and media.
|
|
216
|
+
- `ResourcePolicy.DOCUMENTS`: block images, fonts, media, and stylesheets.
|
|
217
|
+
|
|
218
|
+
## Profile Options
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from webskrap import BrowserProfile, Viewport
|
|
222
|
+
|
|
223
|
+
profile = BrowserProfile(
|
|
224
|
+
name="fr-desktop",
|
|
225
|
+
user_agent=None,
|
|
226
|
+
viewport=Viewport(width=1440, height=900),
|
|
227
|
+
screen=Viewport(width=1440, height=900),
|
|
228
|
+
locale="fr-FR",
|
|
229
|
+
timezone_id="Europe/Paris",
|
|
230
|
+
device_scale_factor=1,
|
|
231
|
+
is_mobile=False,
|
|
232
|
+
has_touch=False,
|
|
233
|
+
color_scheme="light",
|
|
234
|
+
reduced_motion="no-preference",
|
|
235
|
+
extra_http_headers={},
|
|
236
|
+
navigator_languages=["fr-FR", "fr", "en-US", "en"],
|
|
237
|
+
hardware_concurrency=8,
|
|
238
|
+
device_memory=8,
|
|
239
|
+
webgl_vendor="Google Inc. (Intel)",
|
|
240
|
+
webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
|
|
241
|
+
)
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Stealth Options
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
from webskrap import SessionConfig, StealthConfig
|
|
248
|
+
|
|
249
|
+
config = SessionConfig(
|
|
250
|
+
stealth=StealthConfig(
|
|
251
|
+
enabled=True,
|
|
252
|
+
patch_webdriver=True,
|
|
253
|
+
patch_chrome_runtime=True,
|
|
254
|
+
patch_permissions=True,
|
|
255
|
+
patch_plugins=True,
|
|
256
|
+
patch_webgl=True,
|
|
257
|
+
patch_canvas=True,
|
|
258
|
+
patch_hardware=True,
|
|
259
|
+
)
|
|
260
|
+
)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## CLI
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
webskrap profiles
|
|
267
|
+
webskrap doctor
|
|
268
|
+
webskrap fetch https://example.com --profile desktop-chrome
|
|
269
|
+
webskrap fetch https://example.com --headed --screenshot example.png
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## Development
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
pip install -e ".[dev]"
|
|
276
|
+
pytest
|
|
277
|
+
ruff check .
|
|
278
|
+
```
|
webskrap-0.1.0/README.md
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/kacigaya/webskrap/HEAD/assets/webskrap-logo.png" alt="WebSkrap logo" width="200">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">WebSkrap</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Async-first Python scraping framework built on Playwright.</strong><br>
|
|
9
|
+
<em>It provides coherent browser profiles, persistent sessions, resource routing, and configurable browser hardening for data collection workflows that need realistic browser behavior.</em>
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
WebSkrap does not include CAPTCHA solving, login-wall bypassing, credential bypassing, or access-control circumvention. Use it only on targets you are allowed to access.
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install webskrap
|
|
18
|
+
python -m playwright install chromium
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
import asyncio
|
|
25
|
+
|
|
26
|
+
from webskrap import WebSkrapClient
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def main() -> None:
|
|
30
|
+
async with WebSkrapClient() as client:
|
|
31
|
+
result = await client.fetch("https://example.com")
|
|
32
|
+
print(result.status)
|
|
33
|
+
print(result.title)
|
|
34
|
+
print(result.text[:200])
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
asyncio.run(main())
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Persistent Session
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import asyncio
|
|
44
|
+
from pathlib import Path
|
|
45
|
+
|
|
46
|
+
from webskrap import SessionConfig, WebSkrapClient
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
async def main() -> None:
|
|
50
|
+
config = SessionConfig(
|
|
51
|
+
user_data_dir=Path(".webskrap/sessions/shop"),
|
|
52
|
+
headless=True,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
async with WebSkrapClient() as client:
|
|
56
|
+
session = await client.session("shop", config=config, profile="desktop-chrome")
|
|
57
|
+
first = await session.fetch("https://example.com")
|
|
58
|
+
second = await session.fetch("https://example.com/account")
|
|
59
|
+
print(first.final_url, second.final_url)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
asyncio.run(main())
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Headed Browser
|
|
66
|
+
|
|
67
|
+
To keep a visible browser window open, set `headless=False` and use a persistent session (configured with `user_data_dir`), as in the example below.
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import asyncio
|
|
71
|
+
from pathlib import Path
|
|
72
|
+
|
|
73
|
+
from webskrap import SessionConfig, WebSkrapClient
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def main() -> None:
|
|
77
|
+
config = SessionConfig(
|
|
78
|
+
headless=False,
|
|
79
|
+
user_data_dir=Path(".webskrap/dev-session"),
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
async with WebSkrapClient() as client:
|
|
83
|
+
session = await client.session("dev", config=config)
|
|
84
|
+
page = await session.context.new_page()
|
|
85
|
+
await page.goto("https://example.com", wait_until="domcontentloaded")
|
|
86
|
+
|
|
87
|
+
input("Press Enter to close browser...")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
asyncio.run(main())
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Example for a headed Chrome session with a French desktop profile:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import asyncio
|
|
97
|
+
from pathlib import Path
|
|
98
|
+
|
|
99
|
+
from webskrap import BrowserProfile, SessionConfig, Viewport, WebSkrapClient
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
async def main() -> None:
|
|
103
|
+
config = SessionConfig(
|
|
104
|
+
headless=False,
|
|
105
|
+
channel="chrome",
|
|
106
|
+
user_data_dir=Path(".webskrap/gmf"),
|
|
107
|
+
navigation_timeout_ms=90_000,
|
|
108
|
+
default_timeout_ms=90_000,
|
|
109
|
+
slow_mo_ms=50,
|
|
110
|
+
launch_args=[
|
|
111
|
+
"--start-maximized",
|
|
112
|
+
"--disable-blink-features=AutomationControlled",
|
|
113
|
+
"--no-first-run",
|
|
114
|
+
"--no-default-browser-check",
|
|
115
|
+
],
|
|
116
|
+
)
|
|
117
|
+
profile = BrowserProfile(
|
|
118
|
+
name="fr-desktop",
|
|
119
|
+
viewport=Viewport(width=1440, height=900),
|
|
120
|
+
screen=Viewport(width=1440, height=900),
|
|
121
|
+
locale="fr-FR",
|
|
122
|
+
timezone_id="Europe/Paris",
|
|
123
|
+
navigator_languages=["fr-FR", "fr", "en-US", "en"],
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
async with WebSkrapClient() as client:
|
|
127
|
+
session = await client.session("gmf", config=config, profile=profile)
|
|
128
|
+
page = await session.context.new_page()
|
|
129
|
+
await page.goto("https://www.gmf.fr/habitation/devis", wait_until="domcontentloaded")
|
|
130
|
+
|
|
131
|
+
input("Press Enter to close browser...")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
asyncio.run(main())
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Custom Profile
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from webskrap import BrowserProfile, Viewport
|
|
141
|
+
|
|
142
|
+
profile = BrowserProfile(
|
|
143
|
+
name="workstation",
|
|
144
|
+
viewport=Viewport(width=1440, height=900),
|
|
145
|
+
screen=Viewport(width=1440, height=900),
|
|
146
|
+
locale="en-US",
|
|
147
|
+
timezone_id="Europe/Paris",
|
|
148
|
+
)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Session Options
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from pathlib import Path
|
|
155
|
+
|
|
156
|
+
from webskrap import ProxyConfig, ResourcePolicy, SessionConfig
|
|
157
|
+
|
|
158
|
+
config = SessionConfig(
|
|
159
|
+
browser="chromium",
|
|
160
|
+
channel="chrome",
|
|
161
|
+
headless=False,
|
|
162
|
+
user_data_dir=Path(".webskrap/session"),
|
|
163
|
+
storage_state=None,
|
|
164
|
+
proxy=ProxyConfig(server="http://127.0.0.1:8080"),
|
|
165
|
+
resource_policy=ResourcePolicy.LITE,
|
|
166
|
+
ignore_https_errors=True,
|
|
167
|
+
java_script_enabled=True,
|
|
168
|
+
service_workers="allow",
|
|
169
|
+
timeout_ms=30_000,
|
|
170
|
+
navigation_timeout_ms=90_000,
|
|
171
|
+
default_timeout_ms=90_000,
|
|
172
|
+
slow_mo_ms=50,
|
|
173
|
+
launch_args=[
|
|
174
|
+
"--start-maximized",
|
|
175
|
+
"--disable-blink-features=AutomationControlled",
|
|
176
|
+
"--disable-dev-shm-usage",
|
|
177
|
+
"--no-first-run",
|
|
178
|
+
"--no-default-browser-check",
|
|
179
|
+
],
|
|
180
|
+
)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
`resource_policy` values:
|
|
184
|
+
|
|
185
|
+
- `ResourcePolicy.ALL`: allow all resources.
|
|
186
|
+
- `ResourcePolicy.LITE`: block images, fonts, and media.
|
|
187
|
+
- `ResourcePolicy.DOCUMENTS`: block images, fonts, media, and stylesheets.
|
|
188
|
+
|
|
189
|
+
## Profile Options
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from webskrap import BrowserProfile, Viewport
|
|
193
|
+
|
|
194
|
+
profile = BrowserProfile(
|
|
195
|
+
name="fr-desktop",
|
|
196
|
+
user_agent=None,
|
|
197
|
+
viewport=Viewport(width=1440, height=900),
|
|
198
|
+
screen=Viewport(width=1440, height=900),
|
|
199
|
+
locale="fr-FR",
|
|
200
|
+
timezone_id="Europe/Paris",
|
|
201
|
+
device_scale_factor=1,
|
|
202
|
+
is_mobile=False,
|
|
203
|
+
has_touch=False,
|
|
204
|
+
color_scheme="light",
|
|
205
|
+
reduced_motion="no-preference",
|
|
206
|
+
extra_http_headers={},
|
|
207
|
+
navigator_languages=["fr-FR", "fr", "en-US", "en"],
|
|
208
|
+
hardware_concurrency=8,
|
|
209
|
+
device_memory=8,
|
|
210
|
+
webgl_vendor="Google Inc. (Intel)",
|
|
211
|
+
webgl_renderer="ANGLE (Intel, Intel(R) Iris(TM) Plus Graphics, OpenGL 4.1)",
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Stealth Options
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from webskrap import SessionConfig, StealthConfig
|
|
219
|
+
|
|
220
|
+
config = SessionConfig(
|
|
221
|
+
stealth=StealthConfig(
|
|
222
|
+
enabled=True,
|
|
223
|
+
patch_webdriver=True,
|
|
224
|
+
patch_chrome_runtime=True,
|
|
225
|
+
patch_permissions=True,
|
|
226
|
+
patch_plugins=True,
|
|
227
|
+
patch_webgl=True,
|
|
228
|
+
patch_canvas=True,
|
|
229
|
+
patch_hardware=True,
|
|
230
|
+
)
|
|
231
|
+
)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## CLI
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
webskrap profiles
|
|
238
|
+
webskrap doctor
|
|
239
|
+
webskrap fetch https://example.com --profile desktop-chrome
|
|
240
|
+
webskrap fetch https://example.com --headed --screenshot example.png
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Development
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
pip install -e ".[dev]"
|
|
247
|
+
pytest
|
|
248
|
+
ruff check .
|
|
249
|
+
```
|
|
Binary file
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.26"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "webskrap"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A Playwright-based Python scraping framework with coherent browser profiles and session controls."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "WebSkrap contributors" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["browser-automation", "playwright", "scraping", "crawler"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Typing :: Typed",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"playwright>=1.49",
|
|
28
|
+
"pydantic>=2.8",
|
|
29
|
+
"rich>=13.9",
|
|
30
|
+
"typer>=0.15",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=8.3",
|
|
36
|
+
"pytest-asyncio>=0.24",
|
|
37
|
+
"ruff>=0.8",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
webskrap = "webskrap.cli:app"
|
|
42
|
+
|
|
43
|
+
[project.urls]
|
|
44
|
+
Homepage = "https://github.com/kacigaya/webskrap"
|
|
45
|
+
Repository = "https://github.com/kacigaya/webskrap"
|
|
46
|
+
Issues = "https://github.com/kacigaya/webskrap/issues"
|
|
47
|
+
|
|
48
|
+
[tool.hatch.build.targets.wheel]
|
|
49
|
+
packages = ["src/webskrap"]
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
asyncio_mode = "auto"
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
markers = [
|
|
55
|
+
"browser: requires installed Playwright browsers",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
[tool.ruff]
|
|
59
|
+
line-length = 100
|
|
60
|
+
target-version = "py311"
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint]
|
|
63
|
+
select = ["E", "F", "I", "UP", "B", "SIM"]
|
|
64
|
+
|
|
65
|
+
[tool.ruff.format]
|
|
66
|
+
quote-style = "double"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from webskrap.client import WebSkrapClient, WebSkrapError, WebSkrapSession
|
|
2
|
+
from webskrap.models import (
|
|
3
|
+
BrowserProfile,
|
|
4
|
+
FetchResult,
|
|
5
|
+
ProxyConfig,
|
|
6
|
+
ResourcePolicy,
|
|
7
|
+
SessionConfig,
|
|
8
|
+
StealthConfig,
|
|
9
|
+
Viewport,
|
|
10
|
+
)
|
|
11
|
+
from webskrap.profiles import get_profile, list_profiles
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"BrowserProfile",
|
|
15
|
+
"FetchResult",
|
|
16
|
+
"ProxyConfig",
|
|
17
|
+
"ResourcePolicy",
|
|
18
|
+
"SessionConfig",
|
|
19
|
+
"StealthConfig",
|
|
20
|
+
"Viewport",
|
|
21
|
+
"WebSkrapClient",
|
|
22
|
+
"WebSkrapError",
|
|
23
|
+
"WebSkrapSession",
|
|
24
|
+
"get_profile",
|
|
25
|
+
"list_profiles",
|
|
26
|
+
]
|