web2api 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
web2api/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Web2API package."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ __all__ = ["__version__"]
6
+
7
+ try:
8
+ __version__ = version("web2api")
9
+ except PackageNotFoundError:
10
+ __version__ = "0.1.0"
@@ -0,0 +1,15 @@
1
# Registry of built-in plugins bundled with web2api. `source` paths are
# relative to this file. NOTE(review): `trusted: true` appears to mark
# first-party recipes — confirm semantics against the plugin loader.
plugins:
  hackernews:
    description: "Built-in Hacker News recipe."
    source: "../recipes/hackernews"
    trusted: true

  deepl:
    description: "Built-in DeepL translation recipe."
    source: "../recipes/deepl"
    trusted: true

  x:
    description: "Built-in X/Twitter recipe (requires bird CLI and auth env vars)."
    source: "../recipes/x"
    trusted: true
@@ -0,0 +1,33 @@
1
+ name: "DeepL Translator"
2
+ slug: "deepl"
3
+ base_url: "https://www.deepl.com"
4
+ description: "Translate text between German and English using DeepL"
5
+ endpoints:
6
+ de-en:
7
+ description: "German to English"
8
+ requires_query: true
9
+ url: "https://www.deepl.com/en/translator#de/en/"
10
+ items:
11
+ container: "d-textarea"
12
+ fields:
13
+ text:
14
+ selector: ""
15
+ attribute: "text"
16
+ pagination:
17
+ type: "page_param"
18
+ param: "p"
19
+ start: 1
20
+ en-de:
21
+ description: "English to German"
22
+ requires_query: true
23
+ url: "https://www.deepl.com/en/translator#en/de/"
24
+ items:
25
+ container: "d-textarea"
26
+ fields:
27
+ text:
28
+ selector: ""
29
+ attribute: "text"
30
+ pagination:
31
+ type: "page_param"
32
+ param: "p"
33
+ start: 1
@@ -0,0 +1,112 @@
1
+ """DeepL Translator scraper — supports multiple language pairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any
7
+
8
+ from playwright.async_api import Page
9
+
10
+ from web2api.scraper import BaseScraper, ScrapeResult
11
+
12
# Map endpoint names to (source_lang, target_lang) pairs.
# Keys must match the endpoint slugs declared in the recipe manifest.
_LANG_PAIRS: dict[str, tuple[str, str]] = {
    "de-en": ("de", "en"),
    "en-de": ("en", "de"),
}
17
+
18
+
19
class Scraper(BaseScraper):
    """Drive DeepL's web translator to translate a query string."""

    def supports(self, endpoint: str) -> bool:
        """An endpoint is supported iff it has a configured language pair."""
        return endpoint in _LANG_PAIRS

    async def scrape(self, endpoint: str, page: Page, params: dict[str, Any]) -> ScrapeResult:
        """Type the query into DeepL and poll until the output settles.

        Raises:
            RuntimeError: when the input pane cannot be located or no
                translation appears within the polling window.
        """
        src, dst = _LANG_PAIRS[endpoint]
        text = params.get("query") or ""

        # Empty query: return a single well-formed but empty item.
        if not text.strip():
            return ScrapeResult(
                items=[{
                    "source_text": "",
                    "translated_text": "",
                    "source_lang": src,
                    "target_lang": dst,
                }]
            )

        await page.goto(f"https://www.deepl.com/en/translator#{src}/{dst}/")

        source_area = await page.wait_for_selector(
            'd-textarea[data-testid="translator-source-input"]',
            timeout=15000,
        )
        if source_area is None:
            raise RuntimeError("Could not find DeepL source input")

        # Clear any pre-filled content, then type the query text.
        await source_area.click()
        await page.keyboard.press("Control+a")
        await page.keyboard.press("Backspace")
        await page.keyboard.type(text, delay=10)

        # DeepL streams the translation progressively; poll twice a second
        # and accept the text only once it has been identical for six
        # consecutive reads (~3 seconds of stability).
        last_seen = ""
        streak = 0
        needed_streak = 6

        for _ in range(80):  # hard cap: ~40 seconds of polling
            await asyncio.sleep(0.5)
            snapshot = await self._read_target(page)

            if not snapshot or snapshot == text.strip():
                # Nothing rendered yet, or the target is echoing the input.
                streak = 0
            elif snapshot == last_seen:
                streak += 1
                if streak >= needed_streak:
                    break
            else:
                last_seen = snapshot
                streak = 0

        if not last_seen:
            raise RuntimeError("Translation did not appear within timeout")

        return ScrapeResult(
            items=[{
                "source_text": text,
                "translated_text": last_seen,
                "source_lang": src,
                "target_lang": dst,
            }],
        )

    @staticmethod
    async def _read_target(page: Page) -> str:
        """Return the current translation text, or '' when none is visible."""
        area = await page.query_selector(
            'd-textarea[data-testid="translator-target-input"]'
        )
        if area is not None:
            # The custom element exposes its text via `value`; fall back to
            # the rendered text content when the attribute is empty.
            value = await area.get_attribute("value")
            if value and value.strip():
                return value.strip()
            rendered = await area.text_content()
            if rendered and rendered.strip():
                return rendered.strip()

        # Last resort: a paragraph nested inside the target pane.
        paragraph = await page.query_selector(
            '[data-testid="translator-target-input"] p'
        )
        if paragraph is not None:
            rendered = await paragraph.text_content()
            if rendered and rendered.strip():
                return rendered.strip()

        return ""
@@ -0,0 +1,97 @@
1
+ name: "Hacker News"
2
+ slug: "hackernews"
3
+ base_url: "https://news.ycombinator.com"
4
+ description: "Hacker News front page stories and search results"
5
+ endpoints:
6
+ read:
7
+ description: "Front page stories"
8
+ url: "https://news.ycombinator.com/news?p={page}"
9
+ actions:
10
+ - type: wait
11
+ selector: "tr.athing"
12
+ timeout: 10000
13
+ items:
14
+ container: "tr.athing"
15
+ fields:
16
+ title:
17
+ selector: ".titleline > a"
18
+ attribute: "text"
19
+ url:
20
+ selector: ".titleline > a"
21
+ attribute: "href"
22
+ transform: "absolute_url"
23
+ score:
24
+ selector: ".score"
25
+ context: "next_sibling"
26
+ attribute: "text"
27
+ transform: "regex_int"
28
+ optional: true
29
+ author:
30
+ selector: ".hnuser"
31
+ context: "next_sibling"
32
+ attribute: "text"
33
+ optional: true
34
+ comment_count:
35
+ selector: "a[href^='item?id=']:last-child"
36
+ context: "next_sibling"
37
+ attribute: "text"
38
+ transform: "regex_int"
39
+ optional: true
40
+ time_ago:
41
+ selector: ".age"
42
+ context: "next_sibling"
43
+ attribute: "text"
44
+ optional: true
45
+ id:
46
+ selector: ""
47
+ attribute: "id"
48
+ pagination:
49
+ type: "page_param"
50
+ param: "p"
51
+ start: 1
52
+
53
+ search:
54
+ description: "Search stories via Algolia"
55
+ requires_query: true
56
+ url: "https://hn.algolia.com/?q={query}&page={page_zero}"
57
+ actions:
58
+ - type: wait
59
+ selector: ".Story"
60
+ timeout: 15000
61
+ items:
62
+ container: ".Story"
63
+ fields:
64
+ title:
65
+ selector: ".Story_title a:first-child"
66
+ attribute: "text"
67
+ url:
68
+ selector: ".Story_title a:first-child"
69
+ attribute: "href"
70
+ transform: "absolute_url"
71
+ score:
72
+ selector: ".Story_meta span:first-child"
73
+ attribute: "text"
74
+ transform: "regex_int"
75
+ optional: true
76
+ author:
77
+ selector: ".Story_meta a[href^='https://news.ycombinator.com/user']"
78
+ attribute: "text"
79
+ optional: true
80
+ comment_count:
81
+ selector: ".Story_meta a[href*='item?id=']"
82
+ attribute: "text"
83
+ transform: "regex_int"
84
+ optional: true
85
+ time_ago:
86
+ selector: ".Story_meta span[title]"
87
+ attribute: "text"
88
+ optional: true
89
+ id:
90
+ selector: ".Story_meta a[href*='item?id=']"
91
+ attribute: "href"
92
+ transform: "regex_int"
93
+ optional: true
94
+ pagination:
95
+ type: "page_param"
96
+ param: "page"
97
+ start: 0
@@ -0,0 +1,17 @@
1
+ version: "1.0.0"
2
+ web2api:
3
+ min: "0.1.0"
4
+ requires_env:
5
+ - BIRD_AUTH_TOKEN
6
+ - BIRD_CT0
7
+ dependencies:
8
+ commands:
9
+ - bird
10
+ apt:
11
+ - nodejs
12
+ npm:
13
+ - "@steipete/bird"
14
+ healthcheck:
15
+ command:
16
+ - bird
17
+ - --version
@@ -0,0 +1,19 @@
1
+ name: "X (Twitter)"
2
+ slug: "x"
3
+ base_url: "https://x.com"
4
+ description: "Retrieve recent posts from an X/Twitter user profile"
5
+ endpoints:
6
+ posts:
7
+ description: "Get recent posts by username (q=username, count=N)"
8
+ requires_query: true
9
+ url: "https://x.com/{query}"
10
+ items:
11
+ container: "[data-testid='tweet']"
12
+ fields:
13
+ text:
14
+ selector: "[data-testid='tweetText']"
15
+ attribute: "text"
16
+ pagination:
17
+ type: "page_param"
18
+ param: "p"
19
+ start: 1
@@ -0,0 +1,110 @@
1
+ """X (Twitter) scraper — uses bird CLI for authenticated API access."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import os
8
+ from typing import Any
9
+
10
+ from playwright.async_api import Page
11
+
12
+ from web2api.scraper import BaseScraper, ScrapeResult
13
+
14
# Auth cookie values captured from the environment once at import time;
# empty strings when the variables are unset. ~/.bird_auth is consulted
# later as a file-based fallback.
_AUTH_TOKEN = os.environ.get("BIRD_AUTH_TOKEN", "")
_CT0 = os.environ.get("BIRD_CT0", "")
17
+
18
+
19
def _load_auth() -> tuple[str, str]:
    """Load bird auth tokens from env or ~/.bird_auth file.

    The environment is read at call time (the previous version captured it
    once at import time, silently ignoring variables set afterwards, e.g.
    by test fixtures or late dotenv loading). When either variable is
    missing, ``~/.bird_auth`` is parsed for ``AUTH_TOKEN=``/``CT0=`` lines,
    which then take precedence over partially-set env values.

    Returns:
        ``(auth_token, ct0)`` credential pair.

    Raises:
        RuntimeError: when either credential cannot be found anywhere.
    """
    auth_token = os.environ.get("BIRD_AUTH_TOKEN", "")
    ct0 = os.environ.get("BIRD_CT0", "")
    if auth_token and ct0:
        return auth_token, ct0

    bird_auth_path = os.path.expanduser("~/.bird_auth")
    if os.path.exists(bird_auth_path):
        with open(bird_auth_path) as f:
            for line in f:
                line = line.strip()
                if line.startswith("AUTH_TOKEN="):
                    auth_token = line.split("=", 1)[1]
                elif line.startswith("CT0="):
                    ct0 = line.split("=", 1)[1]

    if not auth_token or not ct0:
        raise RuntimeError(
            "Missing X/Twitter credentials. "
            "Set BIRD_AUTH_TOKEN + BIRD_CT0 env vars or create ~/.bird_auth"
        )
    return auth_token, ct0
42
+
43
+
44
class Scraper(BaseScraper):
    """Fetch user tweets via the bird CLI.

    The ``page`` argument is unused: X requires authenticated API access,
    so this recipe shells out to the ``bird`` command instead of driving
    the browser.
    """

    def supports(self, endpoint: str) -> bool:
        """Only the ``posts`` endpoint is implemented."""
        return endpoint == "posts"

    async def scrape(self, endpoint: str, page: Page, params: dict[str, Any]) -> ScrapeResult:
        """Return up to ``count`` recent posts for the user in ``q``.

        Raises:
            RuntimeError: on a missing username, missing credentials, bird
                CLI failure or timeout, or unparseable CLI output.
        """
        username = (params.get("query") or "").strip().lstrip("@")
        if not username:
            raise RuntimeError("Missing username — pass q=<username>")

        # Clamp the requested count to 1..50 and fall back to the default
        # on junk input instead of surfacing a bare ValueError.
        try:
            count = int(params.get("count", "10"))
        except (TypeError, ValueError):
            count = 10
        count = max(1, min(count, 50))

        auth_token, ct0 = _load_auth()

        # Shell out to bird CLI (argument list, no shell interpolation).
        cmd = [
            "bird", "user-tweets", username,
            "-n", str(count),
            "--json",
            "--auth-token", auth_token,
            "--ct0", ct0,
        ]

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=30)
        except asyncio.TimeoutError:
            # Previously the subprocess was leaked on timeout and a raw
            # TimeoutError escaped; kill it and raise the module's usual
            # error type instead.
            proc.kill()
            await proc.wait()
            raise RuntimeError(f"bird CLI timed out fetching @{username}") from None

        if proc.returncode != 0:
            error_msg = stderr.decode().strip()
            if "Could not find user" in error_msg or "not found" in error_msg.lower():
                raise RuntimeError(f"Account @{username} not found")
            raise RuntimeError(f"bird CLI failed: {error_msg}")

        # bird prints info lines to stderr and JSON to stdout; skip any
        # stray non-JSON prefix before the array.
        raw_output = stdout.decode().strip()
        json_start = raw_output.find("[")
        if json_start == -1:
            raise RuntimeError(f"No JSON output from bird CLI for @{username}")

        try:
            tweets_data = json.loads(raw_output[json_start:])
        except json.JSONDecodeError as exc:
            raise RuntimeError(f"bird CLI failed: {exc}") from exc

        items: list[dict[str, Any]] = []
        for tweet in tweets_data[:count]:
            author = tweet.get("author", {})
            author_username = author.get("username", username)
            items.append({
                "text": tweet.get("text", ""),
                "author": author_username,
                "author_name": author.get("name", ""),
                "timestamp": tweet.get("createdAt", ""),
                "url": f"https://x.com/{author_username}/status/{tweet.get('id', '')}",
                "replies": tweet.get("replyCount"),
                "reposts": tweet.get("retweetCount"),
                "likes": tweet.get("likeCount"),
                "views": tweet.get("viewCount"),
                # bird does not flag retweets explicitly; "RT @" prefix is
                # a heuristic.
                "is_retweet": tweet.get("text", "").startswith("RT @"),
            })

        return ScrapeResult(
            items=items,
            current_page=1,
            has_next=len(tweets_data) > count,
        )
web2api/cache.py ADDED
@@ -0,0 +1,150 @@
1
+ """In-memory API response cache with stale-while-revalidate semantics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections import OrderedDict
7
+ from dataclasses import dataclass
8
+ from time import monotonic
9
+ from typing import Awaitable, Callable, Literal, TypeAlias
10
+
11
+ from web2api.schemas import ApiResponse
12
+
13
# Composite cache key. NOTE(review): the shape suggests
# (slug, endpoint, page, query, sorted extra-param pairs) — confirm
# against the request handler that builds these keys.
CacheKey: TypeAlias = tuple[str, str, int, str | None, tuple[tuple[str, str], ...]]
# Classification of a cache lookup result (see ResponseCache.get).
LookupState = Literal["miss", "fresh", "stale"]
15
+
16
+
17
@dataclass(slots=True)
class CacheLookup:
    """Lookup result for a cache key."""

    # One of "miss", "fresh", "stale".
    state: LookupState
    # Deep copy of the cached response; None on a miss.
    response: ApiResponse | None = None
23
+
24
+
25
@dataclass(slots=True)
class _CacheEntry:
    """Internal cache record; deadlines use the monotonic clock."""

    response: ApiResponse
    # Deadline while the entry counts as fresh.
    expires_at: float
    # Deadline after which the entry is purged entirely.
    stale_until: float
    # True while a background refresh task is in flight for this key.
    refreshing: bool = False
31
+
32
+
33
class ResponseCache:
    """Store successful scrape responses keyed by request parameters.

    Entries are *fresh* until ``ttl_seconds`` after insertion, then *stale*
    (still returned, but eligible for background refresh) for a further
    ``stale_ttl_seconds``, after which they are purged. Public methods are
    coroutine-safe via an internal ``asyncio.Lock``; the monotonic clock is
    used so wall-clock jumps cannot mis-expire entries.
    """

    def __init__(
        self,
        *,
        ttl_seconds: float = 30.0,
        stale_ttl_seconds: float = 120.0,
        max_entries: int = 500,
    ) -> None:
        # Clamp configuration to sane values rather than rejecting it.
        self.ttl_seconds = max(0.0, ttl_seconds)
        self.stale_ttl_seconds = max(0.0, stale_ttl_seconds)
        self.max_entries = max(1, max_entries)
        # OrderedDict provides LRU ordering: move_to_end on access,
        # popitem(last=False) evicts the least recently used entry.
        self._entries: OrderedDict[CacheKey, _CacheEntry] = OrderedDict()
        self._lock = asyncio.Lock()
        # Diagnostic counters surfaced via stats().
        self._hits = 0
        self._stale_hits = 0
        self._misses = 0
        self._stores = 0
        self._evictions = 0
        # Strong references keep in-flight refresh tasks from being GC'd.
        self._refresh_tasks: set[asyncio.Task[None]] = set()

    async def get(self, key: CacheKey) -> CacheLookup:
        """Look up a cache entry and classify it as fresh/stale/miss.

        Returns a deep copy of the cached response so callers cannot
        mutate the cached object.
        """
        now = monotonic()
        async with self._lock:
            self._purge_expired_unlocked(now)
            entry = self._entries.get(key)
            if entry is None:
                self._misses += 1
                return CacheLookup(state="miss")

            # Any hit (fresh or stale) refreshes LRU position.
            self._entries.move_to_end(key)
            if entry.expires_at > now:
                self._hits += 1
                return CacheLookup(state="fresh", response=entry.response.model_copy(deep=True))

            self._stale_hits += 1
            return CacheLookup(state="stale", response=entry.response.model_copy(deep=True))

    async def set(self, key: CacheKey, response: ApiResponse) -> None:
        """Insert or replace a successful response in cache.

        No-op when caching is disabled (ttl <= 0) or the response carries
        an error — failures are never served from cache.
        """
        if self.ttl_seconds <= 0 or response.error is not None:
            return

        now = monotonic()
        entry = _CacheEntry(
            response=response.model_copy(deep=True),
            expires_at=now + self.ttl_seconds,
            stale_until=now + self.ttl_seconds + self.stale_ttl_seconds,
        )
        async with self._lock:
            self._purge_expired_unlocked(now)
            self._entries[key] = entry
            self._entries.move_to_end(key)
            self._stores += 1
            self._trim_to_capacity_unlocked()

    async def trigger_refresh(
        self,
        key: CacheKey,
        refresher: Callable[[], Awaitable[ApiResponse]],
    ) -> None:
        """Refresh a stale key in the background if not already refreshing."""
        async with self._lock:
            entry = self._entries.get(key)
            # Skip missing keys and keys already being refreshed
            # (the `refreshing` flag acts as a per-key dedupe).
            if entry is None or entry.refreshing:
                return
            entry.refreshing = True

        task = asyncio.create_task(self._run_refresh(key, refresher))
        self._refresh_tasks.add(task)
        task.add_done_callback(self._refresh_tasks.discard)

    async def stats(self) -> dict[str, int | float | bool]:
        """Return cache health and counters for diagnostics."""
        now = monotonic()
        async with self._lock:
            self._purge_expired_unlocked(now)
            return {
                "enabled": True,
                "ttl_seconds": self.ttl_seconds,
                "stale_ttl_seconds": self.stale_ttl_seconds,
                "max_entries": self.max_entries,
                "entries": len(self._entries),
                "hits": self._hits,
                "stale_hits": self._stale_hits,
                "misses": self._misses,
                "stores": self._stores,
                "evictions": self._evictions,
                "refresh_tasks": len(self._refresh_tasks),
            }

    async def _run_refresh(
        self,
        key: CacheKey,
        refresher: Callable[[], Awaitable[ApiResponse]],
    ) -> None:
        """Run one background refresh; never let an exception escape the task."""
        try:
            try:
                refreshed = await refresher()
            except Exception:
                # Best-effort refresh: a failure simply keeps serving the
                # stale entry. Previously the exception escaped the task and
                # surfaced as "Task exception was never retrieved" noise.
                # (CancelledError is BaseException and still propagates.)
                refreshed = None
            if refreshed is not None and refreshed.error is None:
                await self.set(key, refreshed)
        finally:
            # Always clear the dedupe flag, even on failure/cancellation.
            async with self._lock:
                entry = self._entries.get(key)
                if entry is not None:
                    entry.refreshing = False

    def _purge_expired_unlocked(self, now: float) -> None:
        """Drop entries past their stale window. Caller must hold the lock."""
        expired_keys = [key for key, entry in self._entries.items() if entry.stale_until <= now]
        for key in expired_keys:
            self._entries.pop(key, None)
            self._evictions += 1

    def _trim_to_capacity_unlocked(self) -> None:
        """Evict LRU entries above max_entries. Caller must hold the lock."""
        while len(self._entries) > self.max_entries:
            self._entries.popitem(last=False)
            self._evictions += 1