web2api 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
web2api/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """Web2API package."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ __all__ = ["__version__"]
6
+
7
+ try:
8
+ __version__ = version("web2api")
9
+ except PackageNotFoundError:
10
+ __version__ = "0.1.0"
@@ -0,0 +1,15 @@
1
# Registry of built-in plugins bundled with web2api. `source` paths are
# relative to this file. NOTE(review): `trusted: true` appears to mark
# first-party recipes — confirm semantics against the plugin loader.
plugins:
  hackernews:
    description: "Built-in Hacker News recipe."
    source: "../recipes/hackernews"
    trusted: true

  deepl:
    description: "Built-in DeepL translation recipe."
    source: "../recipes/deepl"
    trusted: true

  x:
    description: "Built-in X/Twitter recipe (requires bird CLI and auth env vars)."
    source: "../recipes/x"
    trusted: true
@@ -0,0 +1,33 @@
1
+ name: "DeepL Translator"
2
+ slug: "deepl"
3
+ base_url: "https://www.deepl.com"
4
+ description: "Translate text between German and English using DeepL"
5
+ endpoints:
6
+ de-en:
7
+ description: "German to English"
8
+ requires_query: true
9
+ url: "https://www.deepl.com/en/translator#de/en/"
10
+ items:
11
+ container: "d-textarea"
12
+ fields:
13
+ text:
14
+ selector: ""
15
+ attribute: "text"
16
+ pagination:
17
+ type: "page_param"
18
+ param: "p"
19
+ start: 1
20
+ en-de:
21
+ description: "English to German"
22
+ requires_query: true
23
+ url: "https://www.deepl.com/en/translator#en/de/"
24
+ items:
25
+ container: "d-textarea"
26
+ fields:
27
+ text:
28
+ selector: ""
29
+ attribute: "text"
30
+ pagination:
31
+ type: "page_param"
32
+ param: "p"
33
+ start: 1
@@ -0,0 +1,112 @@
1
+ """DeepL Translator scraper — supports multiple language pairs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any
7
+
8
+ from playwright.async_api import Page
9
+
10
+ from web2api.scraper import BaseScraper, ScrapeResult
11
+
12
# Map endpoint names to (source_lang, target_lang) pairs.
# Keys must match the endpoint slugs declared in the recipe manifest.
_LANG_PAIRS: dict[str, tuple[str, str]] = {
    "de-en": ("de", "en"),
    "en-de": ("en", "de"),
}
17
+
18
+
19
class Scraper(BaseScraper):
    """Drive DeepL's web translator to translate a query string."""

    def supports(self, endpoint: str) -> bool:
        """An endpoint is supported iff it has a configured language pair."""
        return endpoint in _LANG_PAIRS

    async def scrape(self, endpoint: str, page: Page, params: dict[str, Any]) -> ScrapeResult:
        """Type the query into DeepL and poll until the output settles.

        Raises:
            RuntimeError: when the input pane cannot be located or no
                translation appears within the polling window.
        """
        src, dst = _LANG_PAIRS[endpoint]
        text = params.get("query") or ""

        # Empty query: return a single well-formed but empty item.
        if not text.strip():
            return ScrapeResult(
                items=[{
                    "source_text": "",
                    "translated_text": "",
                    "source_lang": src,
                    "target_lang": dst,
                }]
            )

        await page.goto(f"https://www.deepl.com/en/translator#{src}/{dst}/")

        source_area = await page.wait_for_selector(
            'd-textarea[data-testid="translator-source-input"]',
            timeout=15000,
        )
        if source_area is None:
            raise RuntimeError("Could not find DeepL source input")

        # Clear any pre-filled content, then type the query text.
        await source_area.click()
        await page.keyboard.press("Control+a")
        await page.keyboard.press("Backspace")
        await page.keyboard.type(text, delay=10)

        # DeepL streams the translation progressively; poll twice a second
        # and accept the text only once it has been identical for six
        # consecutive reads (~3 seconds of stability).
        last_seen = ""
        streak = 0
        needed_streak = 6

        for _ in range(80):  # hard cap: ~40 seconds of polling
            await asyncio.sleep(0.5)
            snapshot = await self._read_target(page)

            if not snapshot or snapshot == text.strip():
                # Nothing rendered yet, or the target is echoing the input.
                streak = 0
            elif snapshot == last_seen:
                streak += 1
                if streak >= needed_streak:
                    break
            else:
                last_seen = snapshot
                streak = 0

        if not last_seen:
            raise RuntimeError("Translation did not appear within timeout")

        return ScrapeResult(
            items=[{
                "source_text": text,
                "translated_text": last_seen,
                "source_lang": src,
                "target_lang": dst,
            }],
        )

    @staticmethod
    async def _read_target(page: Page) -> str:
        """Return the current translation text, or '' when none is visible."""
        area = await page.query_selector(
            'd-textarea[data-testid="translator-target-input"]'
        )
        if area is not None:
            # The custom element exposes its text via `value`; fall back to
            # the rendered text content when the attribute is empty.
            value = await area.get_attribute("value")
            if value and value.strip():
                return value.strip()
            rendered = await area.text_content()
            if rendered and rendered.strip():
                return rendered.strip()

        # Last resort: a paragraph nested inside the target pane.
        paragraph = await page.query_selector(
            '[data-testid="translator-target-input"] p'
        )
        if paragraph is not None:
            rendered = await paragraph.text_content()
            if rendered and rendered.strip():
                return rendered.strip()

        return ""
@@ -0,0 +1,97 @@
1
+ name: "Hacker News"
2
+ slug: "hackernews"
3
+ base_url: "https://news.ycombinator.com"
4
+ description: "Hacker News front page stories and search results"
5
+ endpoints:
6
+ read:
7
+ description: "Front page stories"
8
+ url: "https://news.ycombinator.com/news?p={page}"
9
+ actions:
10
+ - type: wait
11
+ selector: "tr.athing"
12
+ timeout: 10000
13
+ items:
14
+ container: "tr.athing"
15
+ fields:
16
+ title:
17
+ selector: ".titleline > a"
18
+ attribute: "text"
19
+ url:
20
+ selector: ".titleline > a"
21
+ attribute: "href"
22
+ transform: "absolute_url"
23
+ score:
24
+ selector: ".score"
25
+ context: "next_sibling"
26
+ attribute: "text"
27
+ transform: "regex_int"
28
+ optional: true
29
+ author:
30
+ selector: ".hnuser"
31
+ context: "next_sibling"
32
+ attribute: "text"
33
+ optional: true
34
+ comment_count:
35
+ selector: "a[href^='item?id=']:last-child"
36
+ context: "next_sibling"
37
+ attribute: "text"
38
+ transform: "regex_int"
39
+ optional: true
40
+ time_ago:
41
+ selector: ".age"
42
+ context: "next_sibling"
43
+ attribute: "text"
44
+ optional: true
45
+ id:
46
+ selector: ""
47
+ attribute: "id"
48
+ pagination:
49
+ type: "page_param"
50
+ param: "p"
51
+ start: 1
52
+
53
+ search:
54
+ description: "Search stories via Algolia"
55
+ requires_query: true
56
+ url: "https://hn.algolia.com/?q={query}&page={page_zero}"
57
+ actions:
58
+ - type: wait
59
+ selector: ".Story"
60
+ timeout: 15000
61
+ items:
62
+ container: ".Story"
63
+ fields:
64
+ title:
65
+ selector: ".Story_title a:first-child"
66
+ attribute: "text"
67
+ url:
68
+ selector: ".Story_title a:first-child"
69
+ attribute: "href"
70
+ transform: "absolute_url"
71
+ score:
72
+ selector: ".Story_meta span:first-child"
73
+ attribute: "text"
74
+ transform: "regex_int"
75
+ optional: true
76
+ author:
77
+ selector: ".Story_meta a[href^='https://news.ycombinator.com/user']"
78
+ attribute: "text"
79
+ optional: true
80
+ comment_count:
81
+ selector: ".Story_meta a[href*='item?id=']"
82
+ attribute: "text"
83
+ transform: "regex_int"
84
+ optional: true
85
+ time_ago:
86
+ selector: ".Story_meta span[title]"
87
+ attribute: "text"
88
+ optional: true
89
+ id:
90
+ selector: ".Story_meta a[href*='item?id=']"
91
+ attribute: "href"
92
+ transform: "regex_int"
93
+ optional: true
94
+ pagination:
95
+ type: "page_param"
96
+ param: "page"
97
+ start: 0
@@ -0,0 +1,17 @@
1
+ version: "1.0.0"
2
+ web2api:
3
+ min: "0.1.0"
4
+ requires_env:
5
+ - BIRD_AUTH_TOKEN
6
+ - BIRD_CT0
7
+ dependencies:
8
+ commands:
9
+ - bird
10
+ apt:
11
+ - nodejs
12
+ npm:
13
+ - "@steipete/bird"
14
+ healthcheck:
15
+ command:
16
+ - bird
17
+ - --version
@@ -0,0 +1,19 @@
1
+ name: "X (Twitter)"
2
+ slug: "x"
3
+ base_url: "https://x.com"
4
+ description: "Retrieve recent posts from an X/Twitter user profile"
5
+ endpoints:
6
+ posts:
7
+ description: "Get recent posts by username (q=username, count=N)"
8
+ requires_query: true
9
+ url: "https://x.com/{query}"
10
+ items:
11
+ container: "[data-testid='tweet']"
12
+ fields:
13
+ text:
14
+ selector: "[data-testid='tweetText']"
15
+ attribute: "text"
16
+ pagination:
17
+ type: "page_param"
18
+ param: "p"
19
+ start: 1
@@ -0,0 +1,110 @@
1
+ """X (Twitter) scraper — uses bird CLI for authenticated API access."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import os
8
+ from typing import Any
9
+
10
+ from playwright.async_api import Page
11
+
12
+ from web2api.scraper import BaseScraper, ScrapeResult
13
+
14
# Auth cookie values captured from the environment once at import time;
# empty strings when the variables are unset. ~/.bird_auth is consulted
# later as a file-based fallback.
_AUTH_TOKEN = os.environ.get("BIRD_AUTH_TOKEN", "")
_CT0 = os.environ.get("BIRD_CT0", "")
17
+
18
+
19
def _load_auth() -> tuple[str, str]:
    """Load bird auth tokens from env or ~/.bird_auth file.

    The environment is read at call time (the previous version captured it
    once at import time, silently ignoring variables set afterwards, e.g.
    by test fixtures or late dotenv loading). When either variable is
    missing, ``~/.bird_auth`` is parsed for ``AUTH_TOKEN=``/``CT0=`` lines,
    which then take precedence over partially-set env values.

    Returns:
        ``(auth_token, ct0)`` credential pair.

    Raises:
        RuntimeError: when either credential cannot be found anywhere.
    """
    auth_token = os.environ.get("BIRD_AUTH_TOKEN", "")
    ct0 = os.environ.get("BIRD_CT0", "")
    if auth_token and ct0:
        return auth_token, ct0

    bird_auth_path = os.path.expanduser("~/.bird_auth")
    if os.path.exists(bird_auth_path):
        with open(bird_auth_path) as f:
            for line in f:
                line = line.strip()
                if line.startswith("AUTH_TOKEN="):
                    auth_token = line.split("=", 1)[1]
                elif line.startswith("CT0="):
                    ct0 = line.split("=", 1)[1]

    if not auth_token or not ct0:
        raise RuntimeError(
            "Missing X/Twitter credentials. "
            "Set BIRD_AUTH_TOKEN + BIRD_CT0 env vars or create ~/.bird_auth"
        )
    return auth_token, ct0
42
+
43
+
44
class Scraper(BaseScraper):
    """Fetch user tweets via the bird CLI.

    The ``page`` argument is unused: X requires authenticated API access,
    so this recipe shells out to the ``bird`` command instead of driving
    the browser.
    """

    def supports(self, endpoint: str) -> bool:
        """Only the ``posts`` endpoint is implemented."""
        return endpoint == "posts"

    async def scrape(self, endpoint: str, page: Page, params: dict[str, Any]) -> ScrapeResult:
        """Return up to ``count`` recent posts for the user in ``q``.

        Raises:
            RuntimeError: on a missing username, missing credentials, bird
                CLI failure or timeout, or unparseable CLI output.
        """
        username = (params.get("query") or "").strip().lstrip("@")
        if not username:
            raise RuntimeError("Missing username — pass q=<username>")

        # Clamp the requested count to 1..50 and fall back to the default
        # on junk input instead of surfacing a bare ValueError.
        try:
            count = int(params.get("count", "10"))
        except (TypeError, ValueError):
            count = 10
        count = max(1, min(count, 50))

        auth_token, ct0 = _load_auth()

        # Shell out to bird CLI (argument list, no shell interpolation).
        cmd = [
            "bird", "user-tweets", username,
            "-n", str(count),
            "--json",
            "--auth-token", auth_token,
            "--ct0", ct0,
        ]

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=30)
        except asyncio.TimeoutError:
            # Previously the subprocess was leaked on timeout and a raw
            # TimeoutError escaped; kill it and raise the module's usual
            # error type instead.
            proc.kill()
            await proc.wait()
            raise RuntimeError(f"bird CLI timed out fetching @{username}") from None

        if proc.returncode != 0:
            error_msg = stderr.decode().strip()
            if "Could not find user" in error_msg or "not found" in error_msg.lower():
                raise RuntimeError(f"Account @{username} not found")
            raise RuntimeError(f"bird CLI failed: {error_msg}")

        # bird prints info lines to stderr and JSON to stdout; skip any
        # stray non-JSON prefix before the array.
        raw_output = stdout.decode().strip()
        json_start = raw_output.find("[")
        if json_start == -1:
            raise RuntimeError(f"No JSON output from bird CLI for @{username}")

        try:
            tweets_data = json.loads(raw_output[json_start:])
        except json.JSONDecodeError as exc:
            raise RuntimeError(f"bird CLI failed: {exc}") from exc

        items: list[dict[str, Any]] = []
        for tweet in tweets_data[:count]:
            author = tweet.get("author", {})
            author_username = author.get("username", username)
            items.append({
                "text": tweet.get("text", ""),
                "author": author_username,
                "author_name": author.get("name", ""),
                "timestamp": tweet.get("createdAt", ""),
                "url": f"https://x.com/{author_username}/status/{tweet.get('id', '')}",
                "replies": tweet.get("replyCount"),
                "reposts": tweet.get("retweetCount"),
                "likes": tweet.get("likeCount"),
                "views": tweet.get("viewCount"),
                # bird does not flag retweets explicitly; "RT @" prefix is
                # a heuristic.
                "is_retweet": tweet.get("text", "").startswith("RT @"),
            })

        return ScrapeResult(
            items=items,
            current_page=1,
            has_next=len(tweets_data) > count,
        )
web2api/cache.py ADDED
@@ -0,0 +1,150 @@
1
+ """In-memory API response cache with stale-while-revalidate semantics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections import OrderedDict
7
+ from dataclasses import dataclass
8
+ from time import monotonic
9
+ from typing import Awaitable, Callable, Literal, TypeAlias
10
+
11
+ from web2api.schemas import ApiResponse
12
+
13
# Composite cache key. NOTE(review): the shape suggests
# (slug, endpoint, page, query, sorted extra-param pairs) — confirm
# against the request handler that builds these keys.
CacheKey: TypeAlias = tuple[str, str, int, str | None, tuple[tuple[str, str], ...]]
# Classification of a cache lookup result (see ResponseCache.get).
LookupState = Literal["miss", "fresh", "stale"]
15
+
16
+
17
@dataclass(slots=True)
class CacheLookup:
    """Lookup result for a cache key."""

    # One of "miss", "fresh", "stale".
    state: LookupState
    # Deep copy of the cached response; None on a miss.
    response: ApiResponse | None = None
23
+
24
+
25
@dataclass(slots=True)
class _CacheEntry:
    """Internal cache record; deadlines use the monotonic clock."""

    response: ApiResponse
    # Deadline while the entry counts as fresh.
    expires_at: float
    # Deadline after which the entry is purged entirely.
    stale_until: float
    # True while a background refresh task is in flight for this key.
    refreshing: bool = False
31
+
32
+
33
class ResponseCache:
    """Store successful scrape responses keyed by request parameters.

    Entries are *fresh* until ``ttl_seconds`` after insertion, then *stale*
    (still returned, but eligible for background refresh) for a further
    ``stale_ttl_seconds``, after which they are purged. Public methods are
    coroutine-safe via an internal ``asyncio.Lock``; the monotonic clock is
    used so wall-clock jumps cannot mis-expire entries.
    """

    def __init__(
        self,
        *,
        ttl_seconds: float = 30.0,
        stale_ttl_seconds: float = 120.0,
        max_entries: int = 500,
    ) -> None:
        # Clamp configuration to sane values rather than rejecting it.
        self.ttl_seconds = max(0.0, ttl_seconds)
        self.stale_ttl_seconds = max(0.0, stale_ttl_seconds)
        self.max_entries = max(1, max_entries)
        # OrderedDict provides LRU ordering: move_to_end on access,
        # popitem(last=False) evicts the least recently used entry.
        self._entries: OrderedDict[CacheKey, _CacheEntry] = OrderedDict()
        self._lock = asyncio.Lock()
        # Diagnostic counters surfaced via stats().
        self._hits = 0
        self._stale_hits = 0
        self._misses = 0
        self._stores = 0
        self._evictions = 0
        # Strong references keep in-flight refresh tasks from being GC'd.
        self._refresh_tasks: set[asyncio.Task[None]] = set()

    async def get(self, key: CacheKey) -> CacheLookup:
        """Look up a cache entry and classify it as fresh/stale/miss.

        Returns a deep copy of the cached response so callers cannot
        mutate the cached object.
        """
        now = monotonic()
        async with self._lock:
            self._purge_expired_unlocked(now)
            entry = self._entries.get(key)
            if entry is None:
                self._misses += 1
                return CacheLookup(state="miss")

            # Any hit (fresh or stale) refreshes LRU position.
            self._entries.move_to_end(key)
            if entry.expires_at > now:
                self._hits += 1
                return CacheLookup(state="fresh", response=entry.response.model_copy(deep=True))

            self._stale_hits += 1
            return CacheLookup(state="stale", response=entry.response.model_copy(deep=True))

    async def set(self, key: CacheKey, response: ApiResponse) -> None:
        """Insert or replace a successful response in cache.

        No-op when caching is disabled (ttl <= 0) or the response carries
        an error — failures are never served from cache.
        """
        if self.ttl_seconds <= 0 or response.error is not None:
            return

        now = monotonic()
        entry = _CacheEntry(
            response=response.model_copy(deep=True),
            expires_at=now + self.ttl_seconds,
            stale_until=now + self.ttl_seconds + self.stale_ttl_seconds,
        )
        async with self._lock:
            self._purge_expired_unlocked(now)
            self._entries[key] = entry
            self._entries.move_to_end(key)
            self._stores += 1
            self._trim_to_capacity_unlocked()

    async def trigger_refresh(
        self,
        key: CacheKey,
        refresher: Callable[[], Awaitable[ApiResponse]],
    ) -> None:
        """Refresh a stale key in the background if not already refreshing."""
        async with self._lock:
            entry = self._entries.get(key)
            # Skip missing keys and keys already being refreshed
            # (the `refreshing` flag acts as a per-key dedupe).
            if entry is None or entry.refreshing:
                return
            entry.refreshing = True

        task = asyncio.create_task(self._run_refresh(key, refresher))
        self._refresh_tasks.add(task)
        task.add_done_callback(self._refresh_tasks.discard)

    async def stats(self) -> dict[str, int | float | bool]:
        """Return cache health and counters for diagnostics."""
        now = monotonic()
        async with self._lock:
            self._purge_expired_unlocked(now)
            return {
                "enabled": True,
                "ttl_seconds": self.ttl_seconds,
                "stale_ttl_seconds": self.stale_ttl_seconds,
                "max_entries": self.max_entries,
                "entries": len(self._entries),
                "hits": self._hits,
                "stale_hits": self._stale_hits,
                "misses": self._misses,
                "stores": self._stores,
                "evictions": self._evictions,
                "refresh_tasks": len(self._refresh_tasks),
            }

    async def _run_refresh(
        self,
        key: CacheKey,
        refresher: Callable[[], Awaitable[ApiResponse]],
    ) -> None:
        """Run one background refresh; never let an exception escape the task."""
        try:
            try:
                refreshed = await refresher()
            except Exception:
                # Best-effort refresh: a failure simply keeps serving the
                # stale entry. Previously the exception escaped the task and
                # surfaced as "Task exception was never retrieved" noise.
                # (CancelledError is BaseException and still propagates.)
                refreshed = None
            if refreshed is not None and refreshed.error is None:
                await self.set(key, refreshed)
        finally:
            # Always clear the dedupe flag, even on failure/cancellation.
            async with self._lock:
                entry = self._entries.get(key)
                if entry is not None:
                    entry.refreshing = False

    def _purge_expired_unlocked(self, now: float) -> None:
        """Drop entries past their stale window. Caller must hold the lock."""
        expired_keys = [key for key, entry in self._entries.items() if entry.stale_until <= now]
        for key in expired_keys:
            self._entries.pop(key, None)
            self._evictions += 1

    def _trim_to_capacity_unlocked(self) -> None:
        """Evict LRU entries above max_entries. Caller must hold the lock."""
        while len(self._entries) > self.max_entries:
            self._entries.popitem(last=False)
            self._evictions += 1