web2api 0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web2api-0.1.0/PKG-INFO +19 -0
- web2api-0.1.0/README.md +409 -0
- web2api-0.1.0/pyproject.toml +55 -0
- web2api-0.1.0/setup.cfg +4 -0
- web2api-0.1.0/web2api/__init__.py +10 -0
- web2api-0.1.0/web2api/bundled/plugins/catalog.yaml +15 -0
- web2api-0.1.0/web2api/bundled/recipes/deepl/recipe.yaml +33 -0
- web2api-0.1.0/web2api/bundled/recipes/deepl/scraper.py +112 -0
- web2api-0.1.0/web2api/bundled/recipes/hackernews/recipe.yaml +97 -0
- web2api-0.1.0/web2api/bundled/recipes/x/plugin.yaml +17 -0
- web2api-0.1.0/web2api/bundled/recipes/x/recipe.yaml +19 -0
- web2api-0.1.0/web2api/bundled/recipes/x/scraper.py +110 -0
- web2api-0.1.0/web2api/cache.py +150 -0
- web2api-0.1.0/web2api/cli.py +974 -0
- web2api-0.1.0/web2api/config.py +165 -0
- web2api-0.1.0/web2api/engine.py +502 -0
- web2api-0.1.0/web2api/logging_utils.py +54 -0
- web2api-0.1.0/web2api/main.py +412 -0
- web2api-0.1.0/web2api/plugin.py +248 -0
- web2api-0.1.0/web2api/plugin_manager.py +530 -0
- web2api-0.1.0/web2api/pool.py +312 -0
- web2api-0.1.0/web2api/registry.py +221 -0
- web2api-0.1.0/web2api/schemas.py +85 -0
- web2api-0.1.0/web2api/scraper.py +50 -0
- web2api-0.1.0/web2api/self_update.py +164 -0
- web2api-0.1.0/web2api/templates/index.html +576 -0
- web2api-0.1.0/web2api.egg-info/PKG-INFO +19 -0
- web2api-0.1.0/web2api.egg-info/SOURCES.txt +30 -0
- web2api-0.1.0/web2api.egg-info/dependency_links.txt +1 -0
- web2api-0.1.0/web2api.egg-info/entry_points.txt +2 -0
- web2api-0.1.0/web2api.egg-info/requires.txt +15 -0
- web2api-0.1.0/web2api.egg-info/top_level.txt +1 -0
web2api-0.1.0/PKG-INFO
ADDED
@@ -0,0 +1,19 @@
Metadata-Version: 2.4
Name: web2api
Version: 0.1.0
Summary: Turn websites into REST APIs via live Playwright scraping.
Requires-Python: >=3.12
Requires-Dist: fastapi<1.0,>=0.115
Requires-Dist: jinja2<4.0,>=3.1
Requires-Dist: playwright<2.0,>=1.50
Requires-Dist: pydantic<3.0,>=2.10
Requires-Dist: pyyaml<7.0,>=6.0
Requires-Dist: typer<1.0,>=0.12
Requires-Dist: uvicorn[standard]<1.0,>=0.34
Provides-Extra: dev
Requires-Dist: httpx<1.0,>=0.28; extra == "dev"
Requires-Dist: pytest<9.0,>=8.3; extra == "dev"
Requires-Dist: pytest-asyncio<1.0,>=0.25; extra == "dev"
Requires-Dist: pytest-cov<7.0,>=6.0; extra == "dev"
Requires-Dist: pytest-timeout<3.0,>=2.3; extra == "dev"
Requires-Dist: ruff<1.0,>=0.9; extra == "dev"
web2api-0.1.0/README.md
ADDED
@@ -0,0 +1,409 @@
# Web2API

Turn any website into a REST API by scraping it live with Playwright.

Web2API loads recipe folders from `recipes/` at startup. Each recipe defines endpoints with selectors, actions, fields, and pagination in YAML. Optional Python scrapers handle interactive or complex sites. Optional plugin metadata can declare external dependencies and required env vars. Drop a folder — get an API.

## Features

- **Arbitrary named endpoints** — recipes define as many endpoints as needed (not limited to read/search)
- **Declarative YAML recipes** with selectors, actions, transforms, and pagination
- **Custom Python scrapers** for interactive sites (e.g. typing text, waiting for dynamic content)
- **Optional plugin metadata** (`plugin.yaml`) for recipe-specific dependency requirements
- **Shared browser/context pool** for concurrent Playwright requests
- **In-memory response cache** with stale-while-revalidate
- **Unified JSON response schema** across all recipes and endpoints
- **Docker deployment** with auto-restart

## Quickstart (Docker)

```bash
docker compose up --build -d
```

Service: `http://localhost:8010`

### Verify

```bash
curl -s http://localhost:8010/health | jq
curl -s http://localhost:8010/api/sites | jq
```

## CLI

Web2API ships with a management CLI:

```bash
web2api --help
```

### Plugin Commands

```bash
# List all recipe folders with plugin readiness
web2api plugins list

# Check missing env vars/commands/packages
web2api plugins doctor
web2api plugins doctor x
web2api plugins doctor x --no-run-healthchecks
web2api plugins doctor x --allow-untrusted

# Install plugin recipe from source
web2api plugins add ./my-recipe
web2api plugins add https://github.com/acme/web2api-recipes.git --ref v1.2.0 --subdir recipes/news

# Update managed plugin from recorded source
web2api plugins update x --yes
web2api plugins update x --ref v1.3.0 --subdir recipes/x --yes

# Install plugin recipe from catalog
web2api plugins catalog list
web2api plugins catalog add hackernews --yes

# Install declared dependencies for a plugin recipe (host)
web2api plugins install x --yes
web2api plugins install x --apt --yes  # include apt packages

# Generate Dockerfile snippet for plugin dependencies
web2api plugins install x --target docker --apt

# Remove plugin recipe + manifest record
web2api plugins uninstall x --yes

# Disable/enable a recipe (writes/removes recipes/<slug>/.disabled)
web2api plugins disable x --yes
web2api plugins enable x
```

`plugins install` does not run `apt` installs unless `--apt` is explicitly passed.
Install-state records are stored in `recipes/.web2api_plugins.json`.
Default catalog path is `plugins/catalog.yaml` in a source checkout, with a bundled fallback
inside the installed package.
`plugins update` works only for plugins tracked in the manifest.

Plugins installed from untrusted sources (for example git URLs) are blocked from executing
install/healthcheck commands unless `--allow-untrusted` is passed.

### Self Update Commands

```bash
# Show current version + recommended update method
web2api self update check

# Apply update using auto-detected method (pip/git/docker)
web2api self update apply --yes

# Pin explicit method or target version/ref
web2api self update apply --method pip --to 0.1.0 --yes
web2api self update apply --method git --to v0.1.0 --yes
```

For `--method git`, `self update apply` checks out a tag:
- if `--to` is provided, that tag/ref is used
- if `--to` is omitted, the latest sortable git tag is used

After `self update apply`, the CLI automatically runs `web2api plugins doctor`.

## Discover Recipes

Recipe availability is dynamic. Use discovery endpoints instead of relying on a static README list.

```bash
# List all discovered sites and endpoint metadata
curl -s "http://localhost:8010/api/sites" | jq

# Print endpoint paths with required params
curl -s "http://localhost:8010/api/sites" | jq -r '
  .[] as $site
  | $site.endpoints[]
  | "/\($site.slug)/\(.name) params: page" + (if .requires_query then ", q" else "" end)
'

# Print ready-to-run URL templates
curl -s "http://localhost:8010/api/sites" | jq -r '
  .[] as $site
  | $site.endpoints[]
  | "http://localhost:8010/\($site.slug)/\(.name)?"
    + (if .requires_query then "q=<query>&" else "" end)
    + "page=1"
'

# Example call pattern (no query endpoint)
curl -s "http://localhost:8010/{slug}/{endpoint}?page=1" | jq

# Example call pattern (query endpoint)
curl -s "http://localhost:8010/{slug}/{endpoint}?q=hello&page=1" | jq
```

For custom scraper parameters beyond `page` and `q`, check the specific recipe folder
(`recipes/<slug>/scraper.py`).

## API

### Discovery

| Endpoint | Description |
|---|---|
| `GET /` | HTML index listing all recipes and endpoints |
| `GET /health` | Service, browser pool, and cache health |
| `GET /api/sites` | JSON list of all recipes with endpoint metadata |

### Recipe Endpoints

All recipe endpoints follow the pattern: `GET /{slug}/{endpoint}?page=1&q=...`

- `page` — pagination (default: 1)
- `q` — query text (required when `requires_query: true`)
- additional query params are passed to custom scrapers
- extra query param names must match `[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}` and values are capped at 512 chars (see the validation sketch below)
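
As a rough sketch of what those limits imply (the real validation lives inside web2api and may differ in detail):

```python
import re

# Assumed constants mirroring the documented limits; names are illustrative.
_PARAM_NAME_RE = re.compile(r"[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}")
_MAX_VALUE_LEN = 512


def validate_extra_params(raw: dict[str, str]) -> dict[str, str]:
    """Reject extra query params that violate the documented limits."""
    for name, value in raw.items():
        if not _PARAM_NAME_RE.fullmatch(name):
            raise ValueError(f"invalid extra param name: {name!r}")
        if len(value) > _MAX_VALUE_LEN:
            raise ValueError(f"value for {name!r} exceeds {_MAX_VALUE_LEN} chars")
    return raw
```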

### Error Codes

| HTTP | Code | When |
|---|---|---|
| 400 | `INVALID_PARAMS` | Missing required `q` or invalid extra query parameters |
| 404 | — | Unknown recipe or endpoint |
| 502 | `SCRAPE_FAILED` | Browser/upstream failure |
| 504 | `SCRAPE_TIMEOUT` | Scrape exceeded timeout |

### Caching

- Successful responses are cached in-memory by `(slug, endpoint, page, q, extra params)`.
- Cache hits return `metadata.cached: true`.
- Stale entries can be served immediately while a background refresh updates the cache.
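
A minimal sketch of the stale-while-revalidate pattern described above (illustrative only; the actual logic lives in `web2api/cache.py`):

```python
import asyncio
import time
from collections.abc import Awaitable, Callable
from typing import Any


class SWRCache:
    """Illustrative SWR cache; web2api keys entries by (slug, endpoint, page, q, extra params)."""

    def __init__(self, ttl: float = 30.0, stale_ttl: float = 120.0) -> None:
        self.ttl = ttl              # CACHE_TTL_SECONDS: how long an entry stays fresh
        self.stale_ttl = stale_ttl  # CACHE_STALE_TTL_SECONDS: extra window for stale serving
        self._store: dict[Any, tuple[float, Any]] = {}

    async def get(self, key: Any, refresh: Callable[[], Awaitable[Any]]) -> Any:
        now = time.monotonic()
        if key in self._store:
            stored_at, value = self._store[key]
            age = now - stored_at
            if age < self.ttl:
                return value  # fresh hit: metadata.cached would be true
            if age < self.ttl + self.stale_ttl:
                # Stale hit: serve the old value immediately, refresh in the background.
                asyncio.create_task(self._refresh(key, refresh))
                return value
        value = await refresh()  # miss (or too stale): scrape synchronously
        self._store[key] = (time.monotonic(), value)
        return value

    async def _refresh(self, key: Any, refresh: Callable[[], Awaitable[Any]]) -> None:
        self._store[key] = (time.monotonic(), await refresh())
```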

### Response Shape

```json
{
  "site": { "name": "...", "slug": "...", "url": "..." },
  "endpoint": "read",
  "query": null,
  "items": [
    {
      "title": "Example title",
      "url": "https://example.com",
      "fields": { "score": 153, "author": "pg" }
    }
  ],
  "pagination": {
    "current_page": 1,
    "has_next": true,
    "has_prev": false,
    "total_pages": null,
    "total_items": null
  },
  "metadata": {
    "scraped_at": "2026-02-18T12:34:56Z",
    "response_time_ms": 1832,
    "item_count": 30,
    "cached": false
  },
  "error": null
}
```
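
For example, consuming an endpoint from Python (the slug and endpoint below are illustrative; discover real ones via `GET /api/sites`):

```python
import httpx

resp = httpx.get(
    "http://localhost:8010/hackernews/read",  # hypothetical path
    params={"page": 1},
    timeout=60.0,
)
resp.raise_for_status()
data = resp.json()

for item in data["items"]:
    print(item["title"], item["url"], item.get("fields", {}))

if data["pagination"]["has_next"]:
    print("next page:", data["pagination"]["current_page"] + 1)
print("served from cache:", data["metadata"]["cached"])
```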

## Recipe Authoring

### Layout

```
recipes/
  <slug>/
    recipe.yaml   # required — endpoint definitions
    scraper.py    # optional — custom Python scraper
    plugin.yaml   # optional — dependency metadata and runtime checks
    README.md     # optional — documentation
```

- Folder name must match `slug`
- `slug` cannot be a reserved system route (`api`, `health`, `docs`, `openapi`, `redoc`)
- Recipe folders containing `.disabled` are skipped by discovery
- Restart the service to pick up new or changed recipes
- Invalid recipes are skipped with warning logs

### Example: Declarative Endpoints

```yaml
name: "Example Site"
slug: "examplesite"
base_url: "https://example.com"
description: "Scrapes example.com listings and search"
endpoints:
  read:
    description: "Browse listings"
    url: "https://example.com/list?page={page}"
    actions:
      - type: wait
        selector: ".item"
        timeout: 10000
    items:
      container: ".item"
      fields:
        title:
          selector: "a.title"
          attribute: "text"
        url:
          selector: "a.title"
          attribute: "href"
          transform: "absolute_url"
    pagination:
      type: "page_param"
      param: "page"
      start: 1

  search:
    description: "Search listings"
    requires_query: true
    url: "https://example.com/search?q={query}&page={page_zero}"
    items:
      container: ".result"
      fields:
        title:
          selector: "a"
          attribute: "text"
    pagination:
      type: "page_param"
      param: "page"
      start: 0
```

### Endpoint Config Fields

| Field | Required | Description |
|---|---|---|
| `url` | yes | URL template with `{page}`, `{page_zero}`, `{query}` placeholders |
| `description` | no | Human-readable endpoint description |
| `requires_query` | no | If `true`, the `q` parameter is mandatory (default: `false`) |
| `actions` | no | Playwright actions to run before extraction |
| `items` | yes | Container selector + field definitions |
| `pagination` | yes | Pagination strategy (`page_param`, `offset_param`, or `next_link`) |

Pagination notes:
`{page}` resolves to `start + ((api_page - 1) * step)`.
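
A quick sketch of that resolution (assuming `step` defaults to 1 when a recipe omits it):

```python
def resolve_page_param(api_page: int, start: int = 1, step: int = 1) -> int:
    """Value substituted for {page} in the URL template."""
    return start + (api_page - 1) * step


resolve_page_param(api_page=1, start=1)           # 1
resolve_page_param(api_page=2, start=1)           # 2
resolve_page_param(api_page=2, start=0, step=30)  # 30 (an offset-style param, for example)
```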

### Actions

| Type | Parameters |
|---|---|
| `wait` | `selector`, `timeout` (optional) |
| `click` | `selector` |
| `scroll` | `direction` (down/up), `amount` (pixels or "bottom") |
| `type` | `selector`, `text` |
| `sleep` | `ms` |
| `evaluate` | `script` |

### Transforms

`strip` · `strip_html` · `regex_int` · `regex_float` · `iso_date` · `absolute_url`
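
Roughly, these behave as follows (a sketch of plausible semantics only; the actual implementations live in `web2api/engine.py`):

```python
import re
from urllib.parse import urljoin


def regex_int(value: str) -> int | None:
    """Plausible regex_int: first integer found in the text."""
    m = re.search(r"-?\d+", value.replace(",", ""))
    return int(m.group()) if m else None


def absolute_url(value: str, base_url: str) -> str:
    """Plausible absolute_url: resolve a relative href against the recipe base_url."""
    return urljoin(base_url, value)


regex_int("153 points")                            # 153
absolute_url("/item?id=1", "https://example.com")  # "https://example.com/item?id=1"
```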

### Field Context

`self` (default) · `next_sibling` · `parent`

### Custom Scraper

For interactive or complex sites, add a `scraper.py` with a `Scraper` class:

```python
from playwright.async_api import Page
from web2api.scraper import BaseScraper, ScrapeResult


class Scraper(BaseScraper):
    def supports(self, endpoint: str) -> bool:
        return endpoint in {"de-en", "en-de"}

    async def scrape(self, endpoint: str, page: Page, params: dict) -> ScrapeResult:
        # page is BLANK — navigate yourself
        await page.goto("https://example.com")
        # ... interact with the page ...
        return ScrapeResult(
            items=[{"title": "result", "fields": {"key": "value"}}],
            current_page=params["page"],
            has_next=False,
        )
```

- `supports(endpoint)` — declare which endpoints use custom scraping
- `scrape(endpoint, page, params)` — `page` is blank, you must `goto()` yourself
- `params` always contains `page` (int) and `query` (str | None)
- `params` also includes validated extra query params (for example `count`)
- Endpoints not handled by the scraper fall back to declarative YAML

### Plugin Metadata (Optional)

Use `plugin.yaml` to declare install/runtime requirements for a recipe:

```yaml
version: "1.0.0"
web2api:
  min: "0.2.0"
  max: "1.0.0"
requires_env:
  - BIRD_AUTH_TOKEN
  - BIRD_CT0
dependencies:
  commands:
    - bird
  python:
    - httpx
  apt:
    - nodejs
  npm:
    - "@steipete/bird"
healthcheck:
  command: ["bird", "--version"]
```

Version bounds in `web2api.min` / `web2api.max` use numeric `major.minor.patch` format.
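
Checking a bound therefore amounts to a numeric tuple comparison, along these lines (a sketch, not web2api's actual check):

```python
def parse_version(v: str) -> tuple[int, int, int]:
    major, minor, patch = (int(part) for part in v.split("."))
    return major, minor, patch


def in_bounds(current: str, min_v: str | None, max_v: str | None) -> bool:
    cur = parse_version(current)
    if min_v and cur < parse_version(min_v):
        return False
    if max_v and cur > parse_version(max_v):
        return False
    return True


in_bounds("0.1.0", "0.2.0", "1.0.0")  # False: below min, so reported as not ready (or skipped)
```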

`GET /api/sites` now includes a `plugin` block (or `null`) with:

- declared metadata from `plugin.yaml`
- computed `status.ready` plus missing env vars/commands/python packages
- unverified package declarations (`apt`, `npm`) for operators

Compatibility enforcement:
- `PLUGIN_ENFORCE_COMPATIBILITY=false` (default): incompatible plugins are loaded but reported as not ready.
- `PLUGIN_ENFORCE_COMPATIBILITY=true`: incompatible plugins are skipped at discovery time.

## Configuration

Environment variables (with defaults):

| Variable | Default | Description |
|---|---|---|
| `POOL_MAX_CONTEXTS` | 5 | Max browser contexts in pool |
| `POOL_CONTEXT_TTL` | 50 | Requests per context before recycling |
| `POOL_ACQUIRE_TIMEOUT` | 30 | Seconds to wait for a context |
| `POOL_PAGE_TIMEOUT` | 15000 | Page navigation timeout (ms) |
| `POOL_QUEUE_SIZE` | 20 | Max queued requests |
| `SCRAPE_TIMEOUT` | 30 | Overall scrape timeout (seconds) |
| `CACHE_ENABLED` | true | Enable in-memory response caching |
| `CACHE_TTL_SECONDS` | 30 | Fresh cache duration in seconds |
| `CACHE_STALE_TTL_SECONDS` | 120 | Stale-while-revalidate window in seconds |
| `CACHE_MAX_ENTRIES` | 500 | Maximum cached request variants |
| `RECIPES_DIR` | `./recipes` (or bundled defaults in installed package) | Path to recipes directory |
| `PLUGIN_ENFORCE_COMPATIBILITY` | false | Skip plugin recipes outside declared `web2api` version bounds |
| `BIRD_AUTH_TOKEN` | empty | X/Twitter auth token for `x` recipe |
| `BIRD_CT0` | empty | X/Twitter ct0 token for `x` recipe |

## Testing

```bash
# Inside the container or with deps installed:
pytest tests/unit tests/integration --timeout=30 -x -q
```

## Tech Stack

- Python 3.12 + FastAPI + Playwright (Chromium)
- Pydantic for config validation
- Docker for deployment

## License

MIT
web2api-0.1.0/pyproject.toml
ADDED
@@ -0,0 +1,55 @@
[build-system]
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "web2api"
version = "0.1.0"
description = "Turn websites into REST APIs via live Playwright scraping."
requires-python = ">=3.12"
dependencies = [
    "fastapi>=0.115,<1.0",
    "jinja2>=3.1,<4.0",
    "playwright>=1.50,<2.0",
    "pydantic>=2.10,<3.0",
    "pyyaml>=6.0,<7.0",
    "typer>=0.12,<1.0",
    "uvicorn[standard]>=0.34,<1.0",
]

[project.optional-dependencies]
dev = [
    "httpx>=0.28,<1.0",
    "pytest>=8.3,<9.0",
    "pytest-asyncio>=0.25,<1.0",
    "pytest-cov>=6.0,<7.0",
    "pytest-timeout>=2.3,<3.0",
    "ruff>=0.9,<1.0",
]

[project.scripts]
web2api = "web2api.cli:main"

[tool.setuptools]
include-package-data = true

[tool.setuptools.packages.find]
include = ["web2api*"]

[tool.setuptools.package-data]
web2api = [
    "templates/*.html",
    "bundled/plugins/*.yaml",
    "bundled/recipes/*/*.yaml",
    "bundled/recipes/*/*.py",
]

[tool.ruff]
line-length = 100
target-version = "py312"

[tool.ruff.lint]
select = ["E", "F", "I", "UP"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
web2api-0.1.0/web2api/bundled/plugins/catalog.yaml
ADDED
@@ -0,0 +1,15 @@
plugins:
  hackernews:
    description: "Built-in Hacker News recipe."
    source: "../recipes/hackernews"
    trusted: true

  deepl:
    description: "Built-in DeepL translation recipe."
    source: "../recipes/deepl"
    trusted: true

  x:
    description: "Built-in X/Twitter recipe (requires bird CLI and auth env vars)."
    source: "../recipes/x"
    trusted: true
web2api-0.1.0/web2api/bundled/recipes/deepl/recipe.yaml
ADDED
@@ -0,0 +1,33 @@
name: "DeepL Translator"
slug: "deepl"
base_url: "https://www.deepl.com"
description: "Translate text between German and English using DeepL"
endpoints:
  de-en:
    description: "German to English"
    requires_query: true
    url: "https://www.deepl.com/en/translator#de/en/"
    items:
      container: "d-textarea"
      fields:
        text:
          selector: ""
          attribute: "text"
    pagination:
      type: "page_param"
      param: "p"
      start: 1
  en-de:
    description: "English to German"
    requires_query: true
    url: "https://www.deepl.com/en/translator#en/de/"
    items:
      container: "d-textarea"
      fields:
        text:
          selector: ""
          attribute: "text"
    pagination:
      type: "page_param"
      param: "p"
      start: 1
web2api-0.1.0/web2api/bundled/recipes/deepl/scraper.py
ADDED
@@ -0,0 +1,112 @@
"""DeepL Translator scraper — supports multiple language pairs."""

from __future__ import annotations

import asyncio
from typing import Any

from playwright.async_api import Page

from web2api.scraper import BaseScraper, ScrapeResult

# Map endpoint names to (source_lang, target_lang) pairs
_LANG_PAIRS: dict[str, tuple[str, str]] = {
    "de-en": ("de", "en"),
    "en-de": ("en", "de"),
}


class Scraper(BaseScraper):
    """Translate text via DeepL's web translator."""

    def supports(self, endpoint: str) -> bool:
        return endpoint in _LANG_PAIRS

    async def scrape(self, endpoint: str, page: Page, params: dict[str, Any]) -> ScrapeResult:
        source_lang, target_lang = _LANG_PAIRS[endpoint]
        query = params.get("query") or ""

        if not query.strip():
            return ScrapeResult(
                items=[{
                    "source_text": "",
                    "translated_text": "",
                    "source_lang": source_lang,
                    "target_lang": target_lang,
                }]
            )

        await page.goto(f"https://www.deepl.com/en/translator#{source_lang}/{target_lang}/")

        source_area = await page.wait_for_selector(
            'd-textarea[data-testid="translator-source-input"]',
            timeout=15000,
        )
        if source_area is None:
            raise RuntimeError("Could not find DeepL source input")

        await source_area.click()
        await page.keyboard.press("Control+a")
        await page.keyboard.press("Backspace")
        await page.keyboard.type(query, delay=10)

        # Wait for translation to appear and stabilize.
        # DeepL streams results progressively, so we wait until the
        # target text stops changing for a few consecutive checks.
        translated = ""
        stable_count = 0
        required_stable = 6  # must be unchanged for 6 consecutive checks (3s)

        for _ in range(80):  # up to 40 seconds total
            await asyncio.sleep(0.5)
            current = await self._read_target(page)

            if not current or current == query.strip():
                stable_count = 0
                continue

            if current == translated:
                stable_count += 1
                if stable_count >= required_stable:
                    break
            else:
                translated = current
                stable_count = 0

        if not translated:
            raise RuntimeError("Translation did not appear within timeout")

        return ScrapeResult(
            items=[{
                "source_text": query,
                "translated_text": translated,
                "source_lang": source_lang,
                "target_lang": target_lang,
            }],
        )

    @staticmethod
    async def _read_target(page: Page) -> str:
        """Extract the current translation text from the target area."""
        # Try the value attribute first
        target_area = await page.query_selector(
            'd-textarea[data-testid="translator-target-input"]'
        )
        if target_area is not None:
            text = await target_area.get_attribute("value")
            if text and text.strip():
                return text.strip()
            text = await target_area.text_content()
            if text and text.strip():
                return text.strip()

        # Fallback: paragraph inside the target
        target_p = await page.query_selector(
            '[data-testid="translator-target-input"] p'
        )
        if target_p is not None:
            text = await target_p.text_content()
            if text and text.strip():
                return text.strip()

        return ""