web2api-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. web2api-0.1.0/PKG-INFO +19 -0
  2. web2api-0.1.0/README.md +409 -0
  3. web2api-0.1.0/pyproject.toml +55 -0
  4. web2api-0.1.0/setup.cfg +4 -0
  5. web2api-0.1.0/web2api/__init__.py +10 -0
  6. web2api-0.1.0/web2api/bundled/plugins/catalog.yaml +15 -0
  7. web2api-0.1.0/web2api/bundled/recipes/deepl/recipe.yaml +33 -0
  8. web2api-0.1.0/web2api/bundled/recipes/deepl/scraper.py +112 -0
  9. web2api-0.1.0/web2api/bundled/recipes/hackernews/recipe.yaml +97 -0
  10. web2api-0.1.0/web2api/bundled/recipes/x/plugin.yaml +17 -0
  11. web2api-0.1.0/web2api/bundled/recipes/x/recipe.yaml +19 -0
  12. web2api-0.1.0/web2api/bundled/recipes/x/scraper.py +110 -0
  13. web2api-0.1.0/web2api/cache.py +150 -0
  14. web2api-0.1.0/web2api/cli.py +974 -0
  15. web2api-0.1.0/web2api/config.py +165 -0
  16. web2api-0.1.0/web2api/engine.py +502 -0
  17. web2api-0.1.0/web2api/logging_utils.py +54 -0
  18. web2api-0.1.0/web2api/main.py +412 -0
  19. web2api-0.1.0/web2api/plugin.py +248 -0
  20. web2api-0.1.0/web2api/plugin_manager.py +530 -0
  21. web2api-0.1.0/web2api/pool.py +312 -0
  22. web2api-0.1.0/web2api/registry.py +221 -0
  23. web2api-0.1.0/web2api/schemas.py +85 -0
  24. web2api-0.1.0/web2api/scraper.py +50 -0
  25. web2api-0.1.0/web2api/self_update.py +164 -0
  26. web2api-0.1.0/web2api/templates/index.html +576 -0
  27. web2api-0.1.0/web2api.egg-info/PKG-INFO +19 -0
  28. web2api-0.1.0/web2api.egg-info/SOURCES.txt +30 -0
  29. web2api-0.1.0/web2api.egg-info/dependency_links.txt +1 -0
  30. web2api-0.1.0/web2api.egg-info/entry_points.txt +2 -0
  31. web2api-0.1.0/web2api.egg-info/requires.txt +15 -0
  32. web2api-0.1.0/web2api.egg-info/top_level.txt +1 -0
web2api-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: web2api
3
+ Version: 0.1.0
4
+ Summary: Turn websites into REST APIs via live Playwright scraping.
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: fastapi<1.0,>=0.115
7
+ Requires-Dist: jinja2<4.0,>=3.1
8
+ Requires-Dist: playwright<2.0,>=1.50
9
+ Requires-Dist: pydantic<3.0,>=2.10
10
+ Requires-Dist: pyyaml<7.0,>=6.0
11
+ Requires-Dist: typer<1.0,>=0.12
12
+ Requires-Dist: uvicorn[standard]<1.0,>=0.34
13
+ Provides-Extra: dev
14
+ Requires-Dist: httpx<1.0,>=0.28; extra == "dev"
15
+ Requires-Dist: pytest<9.0,>=8.3; extra == "dev"
16
+ Requires-Dist: pytest-asyncio<1.0,>=0.25; extra == "dev"
17
+ Requires-Dist: pytest-cov<7.0,>=6.0; extra == "dev"
18
+ Requires-Dist: pytest-timeout<3.0,>=2.3; extra == "dev"
19
+ Requires-Dist: ruff<1.0,>=0.9; extra == "dev"
web2api-0.1.0/README.md ADDED
@@ -0,0 +1,409 @@
1
+ # Web2API
2
+
3
+ Turn any website into a REST API by scraping it live with Playwright.
4
+
5
+ Web2API loads recipe folders from `recipes/` at startup. Each recipe defines endpoints with selectors, actions, fields, and pagination in YAML. Optional Python scrapers handle interactive or complex sites. Optional plugin metadata can declare external dependencies and required env vars. Drop a folder — get an API.
6
+
7
+ ## Features
8
+
9
+ - **Arbitrary named endpoints** — recipes define as many endpoints as needed (not limited to read/search)
10
+ - **Declarative YAML recipes** with selectors, actions, transforms, and pagination
11
+ - **Custom Python scrapers** for interactive sites (e.g. typing text, waiting for dynamic content)
12
+ - **Optional plugin metadata** (`plugin.yaml`) for recipe-specific dependency requirements
13
+ - **Shared browser/context pool** for concurrent Playwright requests
14
+ - **In-memory response cache** with stale-while-revalidate
15
+ - **Unified JSON response schema** across all recipes and endpoints
16
+ - **Docker deployment** with auto-restart
17
+
18
+ ## Quickstart (Docker)
19
+
20
+ ```bash
21
+ docker compose up --build -d
22
+ ```
23
+
24
+ Service: `http://localhost:8010`
25
+
26
+ ### Verify
27
+
28
+ ```bash
29
+ curl -s http://localhost:8010/health | jq
30
+ curl -s http://localhost:8010/api/sites | jq
31
+ ```
32
+
33
+ ## CLI
34
+
35
+ Web2API ships with a management CLI:
36
+
37
+ ```bash
38
+ web2api --help
39
+ ```
40
+
41
+ ### Plugin Commands
42
+
43
+ ```bash
44
+ # List all recipe folders with plugin readiness
45
+ web2api plugins list
46
+
47
+ # Check missing env vars/commands/packages
48
+ web2api plugins doctor
49
+ web2api plugins doctor x
50
+ web2api plugins doctor x --no-run-healthchecks
51
+ web2api plugins doctor x --allow-untrusted
52
+
53
+ # Install plugin recipe from source
54
+ web2api plugins add ./my-recipe
55
+ web2api plugins add https://github.com/acme/web2api-recipes.git --ref v1.2.0 --subdir recipes/news
56
+
57
+ # Update managed plugin from recorded source
58
+ web2api plugins update x --yes
59
+ web2api plugins update x --ref v1.3.0 --subdir recipes/x --yes
60
+
61
+ # Install plugin recipe from catalog
62
+ web2api plugins catalog list
63
+ web2api plugins catalog add hackernews --yes
64
+
65
+ # Install declared dependencies for a plugin recipe (host)
66
+ web2api plugins install x --yes
67
+ web2api plugins install x --apt --yes # include apt packages
68
+
69
+ # Generate Dockerfile snippet for plugin dependencies
70
+ web2api plugins install x --target docker --apt
71
+
72
+ # Remove plugin recipe + manifest record
73
+ web2api plugins uninstall x --yes
74
+
75
+ # Disable/enable a recipe (writes/removes recipes/<slug>/.disabled)
76
+ web2api plugins disable x --yes
77
+ web2api plugins enable x
78
+ ```
79
+
80
+ `plugins install` does not run `apt` installs unless `--apt` is explicitly passed.
81
+ Install-state records are stored in `recipes/.web2api_plugins.json`.
82
+ Default catalog path is `plugins/catalog.yaml` in a source checkout, with a bundled fallback
83
+ inside the installed package.
84
+ `plugins update` works only for plugins tracked in the manifest.
85
+
86
+ Plugins installed from untrusted sources (for example git URLs) are blocked from executing
87
+ install/healthcheck commands unless `--allow-untrusted` is passed.
88
+
89
+ ### Self Update Commands
90
+
91
+ ```bash
92
+ # Show current version + recommended update method
93
+ web2api self update check
94
+
95
+ # Apply update using auto-detected method (pip/git/docker)
96
+ web2api self update apply --yes
97
+
98
+ # Pin explicit method or target version/ref
99
+ web2api self update apply --method pip --to 0.1.0 --yes
100
+ web2api self update apply --method git --to v0.1.0 --yes
101
+ ```
102
+
103
+ For `--method git`, `self update apply` checks out a tag:
104
+ - if `--to` is provided, that tag/ref is used
105
+ - if `--to` is omitted, the latest sortable git tag is used
106
+
107
+ After `self update apply`, the CLI automatically runs `web2api plugins doctor`.
108
+
109
+ ## Discover Recipes
110
+
111
+ Recipe availability is dynamic. Use discovery endpoints instead of relying on a static README list.
112
+
113
+ ```bash
114
+ # List all discovered sites and endpoint metadata
115
+ curl -s "http://localhost:8010/api/sites" | jq
116
+
117
+ # Print endpoint paths with required params
118
+ curl -s "http://localhost:8010/api/sites" | jq -r '
119
+ .[] as $site
120
+ | $site.endpoints[]
121
+ | "/\($site.slug)/\(.name) params: page" + (if .requires_query then ", q" else "" end)
122
+ '
123
+
124
+ # Print ready-to-run URL templates
125
+ curl -s "http://localhost:8010/api/sites" | jq -r '
126
+ .[] as $site
127
+ | $site.endpoints[]
128
+ | "http://localhost:8010/\($site.slug)/\(.name)?"
129
+ + (if .requires_query then "q=<query>&" else "" end)
130
+ + "page=1"
131
+ '
132
+
133
+ # Example call pattern (no query endpoint)
134
+ curl -s "http://localhost:8010/{slug}/{endpoint}?page=1" | jq
135
+
136
+ # Example call pattern (query endpoint)
137
+ curl -s "http://localhost:8010/{slug}/{endpoint}?q=hello&page=1" | jq
138
+ ```
139
+
140
+ For custom scraper parameters beyond `page` and `q`, check the specific recipe folder
141
+ (`recipes/<slug>/scraper.py`).
142
+
143
+ ## API
144
+
145
+ ### Discovery
146
+
147
+ | Endpoint | Description |
148
+ |---|---|
149
+ | `GET /` | HTML index listing all recipes and endpoints |
150
+ | `GET /health` | Service, browser pool, and cache health |
151
+ | `GET /api/sites` | JSON list of all recipes with endpoint metadata |
152
+
153
+ ### Recipe Endpoints
154
+
155
+ All recipe endpoints follow the pattern: `GET /{slug}/{endpoint}?page=1&q=...`
156
+
157
+ - `page` — pagination (default: 1)
158
+ - `q` — query text (required when `requires_query: true`)
159
+ - additional query params are passed to custom scrapers
160
+ - extra query param names must match `[a-zA-Z0-9][a-zA-Z0-9_-]{0,63}` and values are capped at 512 chars
161
+
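+ Parameter names are validated before any scraping happens; a minimal sketch of a rejected extra param, using the same `{slug}/{endpoint}` placeholders as the discovery examples:
+
+ ```bash
+ # A leading underscore fails the [a-zA-Z0-9][a-zA-Z0-9_-]{0,63} name pattern,
+ # so this should return HTTP 400 with code INVALID_PARAMS
+ curl -s -o /dev/null -w "%{http_code}\n" "http://localhost:8010/{slug}/{endpoint}?page=1&_bad=1"
+ ```
+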
162
+ ### Error Codes
163
+
164
+ | HTTP | Code | When |
165
+ |---|---|---|
166
+ | 400 | `INVALID_PARAMS` | Missing required `q` or invalid extra query parameters |
167
+ | 404 | — | Unknown recipe or endpoint |
168
+ | 502 | `SCRAPE_FAILED` | Browser/upstream failure |
169
+ | 504 | `SCRAPE_TIMEOUT` | Scrape exceeded timeout |
170
+
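+ For instance, calling a `requires_query` endpoint without `q` exercises the `INVALID_PARAMS` path (a sketch assuming the bundled `deepl` recipe is present and enabled):
+
+ ```bash
+ # deepl/de-en declares requires_query: true, so omitting q should yield HTTP 400
+ curl -s -o /dev/null -w "%{http_code}\n" "http://localhost:8010/deepl/de-en"
+ ```
+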
171
+ ### Caching
172
+
173
+ - Successful responses are cached in-memory by `(slug, endpoint, page, q, extra params)`.
174
+ - Cache hits return `metadata.cached: true`.
175
+ - Stale entries can be served immediately while a background refresh updates the cache.
176
+
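+ To see caching in action, repeat an identical request and compare `metadata.cached` (a sketch using the placeholder pattern from the discovery section; substitute a real recipe and endpoint):
+
+ ```bash
+ # First call scrapes live; a second call within CACHE_TTL_SECONDS should report cached: true
+ curl -s "http://localhost:8010/{slug}/{endpoint}?page=1" | jq '.metadata.cached'
+ curl -s "http://localhost:8010/{slug}/{endpoint}?page=1" | jq '.metadata.cached'
+ ```
+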
177
+ ### Response Shape
178
+
179
+ ```json
180
+ {
181
+ "site": { "name": "...", "slug": "...", "url": "..." },
182
+ "endpoint": "read",
183
+ "query": null,
184
+ "items": [
185
+ {
186
+ "title": "Example title",
187
+ "url": "https://example.com",
188
+ "fields": { "score": 153, "author": "pg" }
189
+ }
190
+ ],
191
+ "pagination": {
192
+ "current_page": 1,
193
+ "has_next": true,
194
+ "has_prev": false,
195
+ "total_pages": null,
196
+ "total_items": null
197
+ },
198
+ "metadata": {
199
+ "scraped_at": "2026-02-18T12:34:56Z",
200
+ "response_time_ms": 1832,
201
+ "item_count": 30,
202
+ "cached": false
203
+ },
204
+ "error": null
205
+ }
206
+ ```
207
+
208
+ ## Recipe Authoring
209
+
210
+ ### Layout
211
+
212
+ ```
213
+ recipes/
214
+ <slug>/
215
+ recipe.yaml # required — endpoint definitions
216
+ scraper.py # optional — custom Python scraper
217
+ plugin.yaml # optional — dependency metadata and runtime checks
218
+ README.md # optional — documentation
219
+ ```
220
+
221
+ - Folder name must match `slug`
222
+ - `slug` cannot be a reserved system route (`api`, `health`, `docs`, `openapi`, `redoc`)
223
+ - Recipe folders containing `.disabled` are skipped by discovery
224
+ - Restart the service to pick up new or changed recipes
225
+ - Invalid recipes are skipped with warning logs
226
+
227
+ ### Example: Declarative Endpoints
228
+
229
+ ```yaml
230
+ name: "Example Site"
231
+ slug: "examplesite"
232
+ base_url: "https://example.com"
233
+ description: "Scrapes example.com listings and search"
234
+ endpoints:
235
+ read:
236
+ description: "Browse listings"
237
+ url: "https://example.com/list?page={page}"
238
+ actions:
239
+ - type: wait
240
+ selector: ".item"
241
+ timeout: 10000
242
+ items:
243
+ container: ".item"
244
+ fields:
245
+ title:
246
+ selector: "a.title"
247
+ attribute: "text"
248
+ url:
249
+ selector: "a.title"
250
+ attribute: "href"
251
+ transform: "absolute_url"
252
+ pagination:
253
+ type: "page_param"
254
+ param: "page"
255
+ start: 1
256
+
257
+ search:
258
+ description: "Search listings"
259
+ requires_query: true
260
+ url: "https://example.com/search?q={query}&page={page_zero}"
261
+ items:
262
+ container: ".result"
263
+ fields:
264
+ title:
265
+ selector: "a"
266
+ attribute: "text"
267
+ pagination:
268
+ type: "page_param"
269
+ param: "page"
270
+ start: 0
271
+ ```
272
+
273
+ ### Endpoint Config Fields
274
+
275
+ | Field | Required | Description |
276
+ |---|---|---|
277
+ | `url` | yes | URL template with `{page}`, `{page_zero}`, `{query}` placeholders |
278
+ | `description` | no | Human-readable endpoint description |
279
+ | `requires_query` | no | If `true`, the `q` parameter is mandatory (default: `false`) |
280
+ | `actions` | no | Playwright actions to run before extraction |
281
+ | `items` | yes | Container selector + field definitions |
282
+ | `pagination` | yes | Pagination strategy (`page_param`, `offset_param`, or `next_link`) |
283
+
284
+ Pagination notes:
285
+ `{page}` resolves to `start + ((api_page - 1) * step)`.
286
+
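+ For example, with `start: 1` and a step of 1 (step appears to default to 1; the bundled recipes only set `start`), an API request for `page=3` renders `{page}` as `1 + (3 - 1) * 1 = 3`, while `start: 0` renders it as `2`.
+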
287
+ ### Actions
288
+
289
+ | Type | Parameters |
290
+ |---|---|
291
+ | `wait` | `selector`, `timeout` (optional) |
292
+ | `click` | `selector` |
293
+ | `scroll` | `direction` (down/up), `amount` (pixels or "bottom") |
294
+ | `type` | `selector`, `text` |
295
+ | `sleep` | `ms` |
296
+ | `evaluate` | `script` |
297
+
298
+ ### Transforms
299
+
300
+ `strip` · `strip_html` · `regex_int` · `regex_float` · `iso_date` · `absolute_url`
301
+
302
+ ### Field Context
303
+
304
+ `self` (default) · `next_sibling` · `parent`
305
+
306
+ ### Custom Scraper
307
+
308
+ For interactive or complex sites, add a `scraper.py` with a `Scraper` class:
309
+
310
+ ```python
311
+ from playwright.async_api import Page
312
+ from web2api.scraper import BaseScraper, ScrapeResult
313
+
314
+
315
+ class Scraper(BaseScraper):
316
+ def supports(self, endpoint: str) -> bool:
317
+ return endpoint in {"de-en", "en-de"}
318
+
319
+ async def scrape(self, endpoint: str, page: Page, params: dict) -> ScrapeResult:
320
+ # page is BLANK — navigate yourself
321
+ await page.goto("https://example.com")
322
+ # ... interact with the page ...
323
+ return ScrapeResult(
324
+ items=[{"title": "result", "fields": {"key": "value"}}],
325
+ current_page=params["page"],
326
+ has_next=False,
327
+ )
328
+ ```
329
+
330
+ - `supports(endpoint)` — declare which endpoints use custom scraping
331
+ - `scrape(endpoint, page, params)` — `page` is blank, you must `goto()` yourself
332
+ - `params` always contains `page` (int) and `query` (str | None)
333
+ - `params` also includes validated extra query params (for example `count`)
334
+ - Endpoints not handled by the scraper fall back to declarative YAML
335
+
336
+ ### Plugin Metadata (Optional)
337
+
338
+ Use `plugin.yaml` to declare install/runtime requirements for a recipe:
339
+
340
+ ```yaml
341
+ version: "1.0.0"
342
+ web2api:
343
+ min: "0.2.0"
344
+ max: "1.0.0"
345
+ requires_env:
346
+ - BIRD_AUTH_TOKEN
347
+ - BIRD_CT0
348
+ dependencies:
349
+ commands:
350
+ - bird
351
+ python:
352
+ - httpx
353
+ apt:
354
+ - nodejs
355
+ npm:
356
+ - "@steipete/bird"
357
+ healthcheck:
358
+ command: ["bird", "--version"]
359
+ ```
360
+
361
+ Version bounds in `web2api.min` / `web2api.max` use numeric `major.minor.patch` format.
362
+
363
+ `GET /api/sites` includes a `plugin` block (or `null`) with:
364
+
365
+ - declared metadata from `plugin.yaml`
366
+ - computed `status.ready` plus missing env vars/commands/python packages
367
+ - unverified package declarations (`apt`, `npm`) for operators
368
+
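+ A quick way to check readiness per recipe (a sketch; field names beyond `plugin.status.ready` may differ from what is shown here):
+
+ ```bash
+ curl -s http://localhost:8010/api/sites \
+   | jq '.[] | {slug, ready: .plugin.status.ready?}'
+ ```
+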
369
+ Compatibility enforcement:
370
+ - `PLUGIN_ENFORCE_COMPATIBILITY=false` (default): incompatible plugins are loaded but reported as not ready.
371
+ - `PLUGIN_ENFORCE_COMPATIBILITY=true`: incompatible plugins are skipped at discovery time.
372
+
373
+ ## Configuration
374
+
375
+ Environment variables (with defaults):
376
+
377
+ | Variable | Default | Description |
378
+ |---|---|---|
379
+ | `POOL_MAX_CONTEXTS` | 5 | Max browser contexts in pool |
380
+ | `POOL_CONTEXT_TTL` | 50 | Requests per context before recycling |
381
+ | `POOL_ACQUIRE_TIMEOUT` | 30 | Seconds to wait for a context |
382
+ | `POOL_PAGE_TIMEOUT` | 15000 | Page navigation timeout (ms) |
383
+ | `POOL_QUEUE_SIZE` | 20 | Max queued requests |
384
+ | `SCRAPE_TIMEOUT` | 30 | Overall scrape timeout (seconds) |
385
+ | `CACHE_ENABLED` | true | Enable in-memory response caching |
386
+ | `CACHE_TTL_SECONDS` | 30 | Fresh cache duration in seconds |
387
+ | `CACHE_STALE_TTL_SECONDS` | 120 | Stale-while-revalidate window in seconds |
388
+ | `CACHE_MAX_ENTRIES` | 500 | Maximum cached request variants |
389
+ | `RECIPES_DIR` | `./recipes` (or bundled defaults in installed package) | Path to recipes directory |
390
+ | `PLUGIN_ENFORCE_COMPATIBILITY` | false | Skip plugin recipes outside declared `web2api` version bounds |
391
+ | `BIRD_AUTH_TOKEN` | empty | X/Twitter auth token for `x` recipe |
392
+ | `BIRD_CT0` | empty | X/Twitter ct0 token for `x` recipe |
393
+
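+ Overrides are plain environment variables. A minimal sketch for a local (non-Docker) run; the `web2api.main:app` module path is an assumption, so adjust it, or set the same variables in `docker-compose.yml`, to match your deployment:
+
+ ```bash
+ export POOL_MAX_CONTEXTS=3          # smaller browser pool for local development
+ export CACHE_TTL_SECONDS=60         # serve cached responses for up to a minute
+ export CACHE_STALE_TTL_SECONDS=300  # allow stale-while-revalidate for 5 minutes
+ uvicorn web2api.main:app --host 0.0.0.0 --port 8010   # app path is an assumption
+ ```
+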
394
+ ## Testing
395
+
396
+ ```bash
397
+ # Inside the container or with deps installed:
398
+ pytest tests/unit tests/integration --timeout=30 -x -q
399
+ ```
400
+
401
+ ## Tech Stack
402
+
403
+ - Python 3.12 + FastAPI + Playwright (Chromium)
404
+ - Pydantic for config validation
405
+ - Docker for deployment
406
+
407
+ ## License
408
+
409
+ MIT
web2api-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,55 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "web2api"
7
+ version = "0.1.0"
8
+ description = "Turn websites into REST APIs via live Playwright scraping."
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "fastapi>=0.115,<1.0",
12
+ "jinja2>=3.1,<4.0",
13
+ "playwright>=1.50,<2.0",
14
+ "pydantic>=2.10,<3.0",
15
+ "pyyaml>=6.0,<7.0",
16
+ "typer>=0.12,<1.0",
17
+ "uvicorn[standard]>=0.34,<1.0",
18
+ ]
19
+
20
+ [project.optional-dependencies]
21
+ dev = [
22
+ "httpx>=0.28,<1.0",
23
+ "pytest>=8.3,<9.0",
24
+ "pytest-asyncio>=0.25,<1.0",
25
+ "pytest-cov>=6.0,<7.0",
26
+ "pytest-timeout>=2.3,<3.0",
27
+ "ruff>=0.9,<1.0",
28
+ ]
29
+
30
+ [project.scripts]
31
+ web2api = "web2api.cli:main"
32
+
33
+ [tool.setuptools]
34
+ include-package-data = true
35
+
36
+ [tool.setuptools.packages.find]
37
+ include = ["web2api*"]
38
+
39
+ [tool.setuptools.package-data]
40
+ web2api = [
41
+ "templates/*.html",
42
+ "bundled/plugins/*.yaml",
43
+ "bundled/recipes/*/*.yaml",
44
+ "bundled/recipes/*/*.py",
45
+ ]
46
+
47
+ [tool.ruff]
48
+ line-length = 100
49
+ target-version = "py312"
50
+
51
+ [tool.ruff.lint]
52
+ select = ["E", "F", "I", "UP"]
53
+
54
+ [tool.pytest.ini_options]
55
+ asyncio_mode = "auto"
web2api-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
web2api-0.1.0/web2api/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Web2API package."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ __all__ = ["__version__"]
6
+
7
+ try:
8
+ __version__ = version("web2api")
9
+ except PackageNotFoundError:
10
+ __version__ = "0.1.0"
web2api-0.1.0/web2api/bundled/plugins/catalog.yaml ADDED
@@ -0,0 +1,15 @@
1
+ plugins:
2
+ hackernews:
3
+ description: "Built-in Hacker News recipe."
4
+ source: "../recipes/hackernews"
5
+ trusted: true
6
+
7
+ deepl:
8
+ description: "Built-in DeepL translation recipe."
9
+ source: "../recipes/deepl"
10
+ trusted: true
11
+
12
+ x:
13
+ description: "Built-in X/Twitter recipe (requires bird CLI and auth env vars)."
14
+ source: "../recipes/x"
15
+ trusted: true
web2api-0.1.0/web2api/bundled/recipes/deepl/recipe.yaml ADDED
@@ -0,0 +1,33 @@
1
+ name: "DeepL Translator"
2
+ slug: "deepl"
3
+ base_url: "https://www.deepl.com"
4
+ description: "Translate text between German and English using DeepL"
5
+ endpoints:
6
+ de-en:
7
+ description: "German to English"
8
+ requires_query: true
9
+ url: "https://www.deepl.com/en/translator#de/en/"
10
+ items:
11
+ container: "d-textarea"
12
+ fields:
13
+ text:
14
+ selector: ""
15
+ attribute: "text"
16
+ pagination:
17
+ type: "page_param"
18
+ param: "p"
19
+ start: 1
20
+ en-de:
21
+ description: "English to German"
22
+ requires_query: true
23
+ url: "https://www.deepl.com/en/translator#en/de/"
24
+ items:
25
+ container: "d-textarea"
26
+ fields:
27
+ text:
28
+ selector: ""
29
+ attribute: "text"
30
+ pagination:
31
+ type: "page_param"
32
+ param: "p"
33
+ start: 1
web2api-0.1.0/web2api/bundled/recipes/deepl/scraper.py ADDED
@@ -0,0 +1,112 @@
1
+ """DeepL Translator scraper — supports multiple language pairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any
7
+
8
+ from playwright.async_api import Page
9
+
10
+ from web2api.scraper import BaseScraper, ScrapeResult
11
+
12
+ # Map endpoint names to (source_lang, target_lang) pairs
13
+ _LANG_PAIRS: dict[str, tuple[str, str]] = {
14
+ "de-en": ("de", "en"),
15
+ "en-de": ("en", "de"),
16
+ }
17
+
18
+
19
+ class Scraper(BaseScraper):
20
+ """Translate text via DeepL's web translator."""
21
+
22
+ def supports(self, endpoint: str) -> bool:
23
+ return endpoint in _LANG_PAIRS
24
+
25
+ async def scrape(self, endpoint: str, page: Page, params: dict[str, Any]) -> ScrapeResult:
26
+ source_lang, target_lang = _LANG_PAIRS[endpoint]
27
+ query = params.get("query") or ""
28
+
29
+ if not query.strip():
30
+ return ScrapeResult(
31
+ items=[{
32
+ "source_text": "",
33
+ "translated_text": "",
34
+ "source_lang": source_lang,
35
+ "target_lang": target_lang,
36
+ }]
37
+ )
38
+
39
+ await page.goto(f"https://www.deepl.com/en/translator#{source_lang}/{target_lang}/")
40
+
41
+ source_area = await page.wait_for_selector(
42
+ 'd-textarea[data-testid="translator-source-input"]',
43
+ timeout=15000,
44
+ )
45
+ if source_area is None:
46
+ raise RuntimeError("Could not find DeepL source input")
47
+
48
+ await source_area.click()
49
+ await page.keyboard.press("Control+a")
50
+ await page.keyboard.press("Backspace")
51
+ await page.keyboard.type(query, delay=10)
52
+
53
+ # Wait for translation to appear and stabilize.
54
+ # DeepL streams results progressively, so we wait until the
55
+ # target text stops changing for a few consecutive checks.
56
+ translated = ""
57
+ stable_count = 0
58
+ required_stable = 6 # must be unchanged for 6 consecutive checks (3s)
59
+
60
+ for _ in range(80): # up to 40 seconds total
61
+ await asyncio.sleep(0.5)
62
+ current = await self._read_target(page)
63
+
64
+ if not current or current == query.strip():
65
+ stable_count = 0
66
+ continue
67
+
68
+ if current == translated:
69
+ stable_count += 1
70
+ if stable_count >= required_stable:
71
+ break
72
+ else:
73
+ translated = current
74
+ stable_count = 0
75
+
76
+ if not translated:
77
+ raise RuntimeError("Translation did not appear within timeout")
78
+
79
+ return ScrapeResult(
80
+ items=[{
81
+ "source_text": query,
82
+ "translated_text": translated,
83
+ "source_lang": source_lang,
84
+ "target_lang": target_lang,
85
+ }],
86
+ )
87
+
88
+ @staticmethod
89
+ async def _read_target(page: Page) -> str:
90
+ """Extract the current translation text from the target area."""
91
+ # Try the value attribute first
92
+ target_area = await page.query_selector(
93
+ 'd-textarea[data-testid="translator-target-input"]'
94
+ )
95
+ if target_area is not None:
96
+ text = await target_area.get_attribute("value")
97
+ if text and text.strip():
98
+ return text.strip()
99
+ text = await target_area.text_content()
100
+ if text and text.strip():
101
+ return text.strip()
102
+
103
+ # Fallback: paragraph inside the target
104
+ target_p = await page.query_selector(
105
+ '[data-testid="translator-target-input"] p'
106
+ )
107
+ if target_p is not None:
108
+ text = await target_p.text_content()
109
+ if text and text.strip():
110
+ return text.strip()
111
+
112
+ return ""