PyPI - scrape-forvo - Versions diffs - 0.1.0__tar.gz - Mend

scrape-forvo 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

scrape_forvo-0.1.0/PKG-INFO +77 -0
scrape_forvo-0.1.0/README.md +65 -0
scrape_forvo-0.1.0/pyproject.toml +30 -0
scrape_forvo-0.1.0/setup.cfg +4 -0
scrape_forvo-0.1.0/src/scrape_forvo/__init__.py +6 -0
scrape_forvo-0.1.0/src/scrape_forvo/__main__.py +7 -0
scrape_forvo-0.1.0/src/scrape_forvo/api.py +127 -0
scrape_forvo-0.1.0/src/scrape_forvo/cli.py +60 -0
scrape_forvo-0.1.0/src/scrape_forvo/download.py +43 -0
scrape_forvo-0.1.0/src/scrape_forvo/errors.py +14 -0
scrape_forvo-0.1.0/src/scrape_forvo/fetch.py +115 -0
scrape_forvo-0.1.0/src/scrape_forvo/parse.py +83 -0
scrape_forvo-0.1.0/src/scrape_forvo/types.py +11 -0
scrape_forvo-0.1.0/src/scrape_forvo.egg-info/PKG-INFO +77 -0
scrape_forvo-0.1.0/src/scrape_forvo.egg-info/SOURCES.txt +22 -0
scrape_forvo-0.1.0/src/scrape_forvo.egg-info/dependency_links.txt +1 -0
scrape_forvo-0.1.0/src/scrape_forvo.egg-info/entry_points.txt +2 -0
scrape_forvo-0.1.0/src/scrape_forvo.egg-info/requires.txt +6 -0
scrape_forvo-0.1.0/src/scrape_forvo.egg-info/top_level.txt +1 -0
scrape_forvo-0.1.0/tests/test_api.py +38 -0
scrape_forvo-0.1.0/tests/test_cli.py +23 -0
scrape_forvo-0.1.0/tests/test_download.py +60 -0
scrape_forvo-0.1.0/tests/test_fetch_requests.py +44 -0
scrape_forvo-0.1.0/tests/test_parse.py +38 -0

scrape_forvo-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,77 @@
+Metadata-Version: 2.4
+Name: scrape-forvo
+Version: 0.1.0
+Summary: Add your description here
+Requires-Python: >=3.13
+Description-Content-Type: text/markdown
+Requires-Dist: playwright>=1.58.0
+Requires-Dist: requests>=2.32.5
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: responses>=0.25.0; extra == "dev"
+# scrape-forvo
+Download pronunciation MP3s from Forvo search pages.
+## Installation
+```bash
+python -m pip install -e .
+```
+## Usage
+Only this command is confirmed to work reliably:
+```bash
+scrape-forvo https://forvo.com/search/egg/no/ --use-playwright --headed
+```
+## Scriptable Usage
+You can also import `scrape_forvo` and use it from Python:
+```python
+from scrape_forvo import scrape
+result = scrape(
+    "https://forvo.com/search/egg/no/",
+    outdir="forvo_mp3",
+    use_playwright=True,
+    headed=True,
+)
+print(result.downloaded_count)
+for candidate in result.candidates:
+    print(candidate.url, "->", candidate.out_path)
+```
+The `scrape()` arguments map directly to CLI flags, so both interfaces share the same behavior without duplicated logic.
+## Development
+Install dev dependencies:
+```bash
+python -m pip install -e .[dev]
+```
+Run tests:
+```bash
+pytest
+```
+### Optional live test
+Set `FORVO_LIVE_TEST=1` to enable the live integration test.
+## TODO
+Edge cases:
+- [ ] Decide which file to pick when a word has multiple pronunciation files.
+- [ ] Handle the case where no pronunciation is available.
+Integration:
+- [ ] Integrate with the vocab repo.

scrape_forvo-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,65 @@
+# scrape-forvo
+Download pronunciation MP3s from Forvo search pages.
+## Installation
+```bash
+python -m pip install -e .
+```
+## Usage
+Only this command is confirmed to work reliably:
+```bash
+scrape-forvo https://forvo.com/search/egg/no/ --use-playwright --headed
+```
+## Scriptable Usage
+You can also import `scrape_forvo` and use it from Python:
+```python
+from scrape_forvo import scrape
+result = scrape(
+    "https://forvo.com/search/egg/no/",
+    outdir="forvo_mp3",
+    use_playwright=True,
+    headed=True,
+)
+print(result.downloaded_count)
+for candidate in result.candidates:
+    print(candidate.url, "->", candidate.out_path)
+```
+The `scrape()` arguments map directly to CLI flags, so both interfaces share the same behavior without duplicated logic.
+## Development
+Install dev dependencies:
+```bash
+python -m pip install -e .[dev]
+```
+Run tests:
+```bash
+pytest
+```
+### Optional live test
+Set `FORVO_LIVE_TEST=1` to enable the live integration test.
+## TODO
+Edge cases:
+- [ ] Decide which file to pick when a word has multiple pronunciation files.
+- [ ] Handle the case where no pronunciation is available.
+Integration:
+- [ ] Integrate with the vocab repo.

scrape_forvo-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,30 @@
+[project]
+name = "scrape-forvo"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "playwright>=1.58.0",
+    "requests>=2.32.5",
+]
+[project.scripts]
+scrape-forvo = "scrape_forvo.cli:main"
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "responses>=0.25.0",
+]
+[build-system]
+requires = ["setuptools>=69.0.0"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools]
+package-dir = {"" = "src"}
+packages = ["scrape_forvo"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]

scrape_forvo-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

scrape_forvo-0.1.0/src/scrape_forvo/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from __future__ import annotations
+from .api import AudioCandidate, ScrapeResult, scrape
+__all__ = ["__version__", "AudioCandidate", "ScrapeResult", "scrape"]
+__version__ = "0.1.0"

scrape_forvo-0.1.0/src/scrape_forvo/__main__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from __future__ import annotations
+from .cli import main
+if __name__ == "__main__":
+    raise SystemExit(main())

scrape_forvo-0.1.0/src/scrape_forvo/api.py ADDED Viewed

@@ -0,0 +1,127 @@
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from typing import Callable, Optional
+import requests
+from .download import download_audio, select_audio_url
+from .fetch import fetch_html_playwright, fetch_html_requests, make_session
+from .parse import extract_audio_host, iter_play_items, page_slug, safe_filename
+@dataclass(frozen=True)
+class AudioCandidate:
+    """An audio URL selected for one play item, with its target file path."""
+    # Identifier of the source play item (copied from ``PlayItem.play_id``).
+    play_id: str
+    # Label of the source play item (copied from ``PlayItem.label``).
+    label: str
+    # URL chosen for this item by ``select_audio_url``.
+    url: str
+    # Local path the MP3 is written to (or would be, in a dry run).
+    out_path: str
+@dataclass(frozen=True)
+class ScrapeResult:
+    """Outcome of a ``scrape()`` call."""
+    # Number of files downloaded; in a dry run, the number of selected URLs.
+    downloaded_count: int
+    # Every item for which a URL was selected, including failed downloads.
+    candidates: tuple[AudioCandidate, ...]
+def scrape(
+    url: str,
+    *,
+    outdir: str = "forvo_mp3",
+    limit: int = 1000,
+    dry_run: bool = False,
+    no_head: bool = False,
+    prefix: str | None = None,
+    use_playwright: bool = True,
+    headed: bool = False,
+    emit: Optional[Callable[[str], None]] = None,
+) -> ScrapeResult:
+    """Scrape pronunciation audio (MP3) from a Forvo page and optionally download.
+    Fetches the page, parses play items, resolves MP3 URLs, and downloads files
+    into ``outdir``. Skips duplicates and failed downloads; continues until
+    ``limit`` items are downloaded or the page is exhausted.
+    Args:
+        url: Forvo pronunciation page URL to scrape.
+        outdir: Directory to save MP3 files. Created if missing.
+        limit: Maximum number of audio files to download (0 = no limit).
+        dry_run: If True, do not download; only collect candidates and count.
+        no_head: If True, skip HEAD checks when resolving audio URLs.
+        prefix: Filename prefix; if None, derived from the page URL slug.
+        use_playwright: If True, fetch HTML with Playwright instead of requests.
+        headed: If True, run Playwright browser in headed (visible) mode.
+        emit: Optional callback called with progress strings (URLs, paths, skips).
+    Returns:
+        ScrapeResult with downloaded_count and candidates (all items for which
+        a URL was selected, whether or not the download succeeded).
+    Raises:
+        ValueError: If limit is negative.
+    """
+    if limit < 0:
+        raise ValueError("limit must be >= 0")
+    name_prefix = safe_filename(prefix) if prefix else page_slug(url)
+    session = make_session()
+    if use_playwright:
+        html = fetch_html_playwright(url, headed=headed)
+    else:
+        html = fetch_html_requests(url, session)
+    audio_host = extract_audio_host(html)
+    base_url = f"https://{audio_host}/audios/mp3"
+    downloaded = 0
+    seen_urls: set[str] = set()
+    candidates: list[AudioCandidate] = []
+    for item in iter_play_items(html):
+        # limit == 0 is documented as "no limit", so only a positive cap may
+        # stop the loop; a bare ``downloaded >= limit`` would stop immediately.
+        if limit and downloaded >= limit:
+            break
+        chosen_url = select_audio_url(
+            item,
+            base_url,
+            session,
+            no_head=no_head,
+            seen_urls=seen_urls,
+        )
+        if not chosen_url:
+            if emit:
+                emit(f"[skip] play_id={item.play_id} label={item.label} (no working mp3 URL)")
+            continue
+        # Remember the URL so a later item cannot select the same file again.
+        seen_urls.add(chosen_url)
+        filename = f"{name_prefix}_{safe_filename(item.label)}_{item.play_id}_{downloaded+1:03d}.mp3"
+        out_path = os.path.join(outdir, filename)
+        candidates.append(
+            AudioCandidate(
+                play_id=item.play_id,
+                label=item.label,
+                url=chosen_url,
+                out_path=out_path,
+            )
+        )
+        if emit:
+            emit(chosen_url)
+        if dry_run:
+            # A dry run counts the candidate as if it had been downloaded.
+            downloaded += 1
+            continue
+        try:
+            download_audio(chosen_url, out_path, session)
+            if emit:
+                emit(f"  -> {out_path}")
+            downloaded += 1
+        except requests.HTTPError as exc:
+            if emit:
+                emit(f"[skip] download failed for {chosen_url}: {exc}")
+        except requests.RequestException as exc:
+            if emit:
+                emit(f"[skip] network error for {chosen_url}: {exc}")
+    return ScrapeResult(downloaded_count=downloaded, candidates=tuple(candidates))

scrape_forvo-0.1.0/src/scrape_forvo/cli.py ADDED Viewed

@@ -0,0 +1,60 @@
+from __future__ import annotations
+import argparse
+import logging
+import sys
+from typing import Iterable
+from .api import scrape
+def _configure_logging() -> logging.Logger:
+    """Route INFO-level, message-only log output to stdout and return the package logger."""
+    logging.basicConfig(
+        stream=sys.stdout,
+        format="%(message)s",
+        level=logging.INFO,
+        force=True,
+    )
+    package_logger = logging.getLogger("scrape_forvo")
+    return package_logger
+def _parse_args(argv: Iterable[str] | None) -> argparse.Namespace:
+    """Parse command-line arguments; ``argv=None`` makes argparse read ``sys.argv``."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "url",
+        help="Forvo URL, e.g. https://forvo.com/search/egg/no/",
+    )
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        default="forvo_mp3",
+        help="Output directory",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=1000,
+        help="Max downloads",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print only; do not download",
+    )
+    parser.add_argument(
+        "--no-head",
+        action="store_true",
+        help="Skip HEAD probe; try GET directly",
+    )
+    parser.add_argument(
+        "--prefix",
+        default=None,
+        help="Filename prefix (default: derived from URL)",
+    )
+    parser.add_argument(
+        "--use-playwright",
+        action="store_true",
+        help="Use Playwright to fetch HTML (bypasses many 403 blocks).",
+    )
+    parser.add_argument(
+        "--headed",
+        action="store_true",
+        help="Show browser window (use if Cloudflare blocks headless; challenge often passes when visible).",
+    )
+    namespace = parser.parse_args(argv)
+    return namespace
+def main(argv: Iterable[str] | None = None) -> int:
+    """Run the CLI: parse arguments, scrape, and return a process exit code.
+    Returns 0 when at least one MP3 was downloaded (or counted in a dry run),
+    otherwise logs a notice and returns 2.
+    """
+    options = _parse_args(argv)
+    logger = _configure_logging()
+    outcome = scrape(
+        options.url,
+        outdir=options.outdir,
+        limit=options.limit,
+        dry_run=options.dry_run,
+        no_head=options.no_head,
+        prefix=options.prefix,
+        use_playwright=options.use_playwright,
+        headed=options.headed,
+        emit=logger.info,
+    )
+    if outcome.downloaded_count != 0:
+        return 0
+    logger.info("No MP3s downloaded (no valid Play(...) mp3 URLs found).")
+    return 2
+if __name__ == "__main__":
+    raise SystemExit(main())

scrape_forvo-0.1.0/src/scrape_forvo/download.py ADDED Viewed

@@ -0,0 +1,43 @@
+from __future__ import annotations
+import os
+from typing import Optional
+import requests
+from .types import PlayItem
+def head_ok(url: str, session: requests.Session) -> bool:
+    """Return True when a HEAD request to ``url`` answers 200 or 206.
+    Redirects are followed; any ``requests`` error counts as "not ok".
+    """
+    try:
+        response = session.head(url, allow_redirects=True, timeout=15)
+        status = response.status_code
+    except requests.RequestException:
+        return False
+    return status == 200 or status == 206
+def select_audio_url(
+    item: PlayItem,
+    base_url: str,
+    session: requests.Session,
+    *,
+    no_head: bool,
+    seen_urls: set[str],
+) -> Optional[str]:
+    """Pick the first usable MP3 URL for ``item``, or None if there is none.
+    Candidate URLs are ``base_url`` joined with each of ``item.mp3_paths`` in
+    order. URLs already in ``seen_urls`` are skipped. With ``no_head`` the
+    first unseen URL is returned unchecked; otherwise it must pass ``head_ok``.
+    """
+    candidate_urls = (f"{base_url}/{path}" for path in item.mp3_paths)
+    for candidate in candidate_urls:
+        if candidate in seen_urls:
+            continue
+        if no_head:
+            return candidate
+        if head_ok(candidate, session):
+            return candidate
+    return None
+def download_audio(url: str, out_path: str, session: requests.Session) -> None:
+    """Stream ``url`` to ``out_path``, creating the parent directory if needed.
+    Args:
+        url: Audio file URL to fetch with a streaming GET.
+        out_path: Destination file path; may be a bare filename.
+        session: Session used for the request.
+    Raises:
+        requests.HTTPError: If the response has an error status.
+    """
+    parent = os.path.dirname(out_path)
+    # dirname() is "" for a bare filename, and os.makedirs("") raises
+    # FileNotFoundError, so only create a directory when there is one to create.
+    if parent:
+        os.makedirs(parent, exist_ok=True)
+    with session.get(url, stream=True, timeout=30) as r:
+        r.raise_for_status()
+        with open(out_path, "wb") as f:
+            for chunk in r.iter_content(chunk_size=1024 * 128):
+                # Skip empty chunks rather than issuing zero-length writes.
+                if chunk:
+                    f.write(chunk)

scrape_forvo-0.1.0/src/scrape_forvo/errors.py ADDED Viewed

@@ -0,0 +1,14 @@
+from __future__ import annotations
+class ForvoError(RuntimeError):
+    """Base error for scrape_forvo.
+    Derives from ``RuntimeError``, so handlers catching ``RuntimeError`` also
+    catch it.
+    """
+class ContentNotFoundError(ForvoError):
+    """Expected content was not found in page HTML.
+    Raised, for example, when the ``_AUDIO_HTTP_HOST`` assignment is missing
+    from the fetched page.
+    """
+class ForvoBlockedError(ForvoError):
+    """Raised when Forvo blocks non-browser requests (e.g., 403).
+    The requests-based fetcher raises it on an HTTP 403 response.
+    """

scrape_forvo-0.1.0/src/scrape_forvo/fetch.py ADDED Viewed

@@ -0,0 +1,115 @@
+from __future__ import annotations
+import time
+import requests
+from .errors import ForvoBlockedError
+_REAL_CONTENT_MARKER = "_AUDIO_HTTP_HOST"
+def make_session() -> requests.Session:
+    """Create a ``requests.Session`` preloaded with browser-like headers.
+    The headers (Chrome-style User-Agent, Accept, language, encoding) are
+    meant to make plain HTTP requests less likely to be answered with 403.
+    """
+    user_agent = (
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/122.0.0.0 Safari/537.36"
+    )
+    accept = (
+        "text/html,application/xhtml+xml,application/xml;"
+        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
+    )
+    browser_headers = {
+        "User-Agent": user_agent,
+        "Accept": accept,
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept-Encoding": "gzip, deflate, br",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }
+    session = requests.Session()
+    session.headers.update(browser_headers)
+    return session
+def fetch_html_requests(url: str, session: requests.Session) -> str:
+    """Fetch ``url`` with plain ``requests`` and return the response text.
+    The Forvo homepage is requested first as a best-effort warm-up (cookies),
+    then ``url`` is requested with the homepage as ``Referer``.
+    Raises:
+        ForvoBlockedError: If the page request returns HTTP 403.
+        requests.HTTPError: For any other error status (via ``raise_for_status``).
+    """
+    homepage = "https://forvo.com/"
+    try:
+        session.get(homepage, timeout=20)
+    except requests.RequestException:
+        # The warm-up is optional; its failure must not block the real request.
+        pass
+    response = session.get(url, headers={"Referer": homepage}, timeout=30)
+    if response.status_code != 403:
+        response.raise_for_status()
+        return response.text
+    raise ForvoBlockedError(
+        "HTTP 403 Forbidden. Forvo is blocking non-browser requests.\n"
+        "Try:\n"
+        "  1) --use-playwright (recommended)\n"
+        "  2) run from a different network/IP\n"
+        "  3) ensure you can open the URL in a normal browser\n"
+    )
+def _wait_for_real_content(page, timeout_ms: int = 60_000, poll_interval_ms: int = 2000) -> str:
+    """
+    Wait until the page shows real Forvo content (past Cloudflare/security interstitial).
+    Polls ``page.content()`` until it contains the ``_AUDIO_HTTP_HOST`` marker.
+    Args:
+        page: Playwright page object to poll.
+        timeout_ms: Overall time budget in milliseconds.
+        poll_interval_ms: Requested pause between polls; capped at 2000 ms.
+    Returns:
+        The page HTML once the marker is present.
+    Raises:
+        RuntimeError: If the timeout is reached before real content appears.
+    """
+    # Use the monotonic clock: a wall-clock deadline (time.time) would shrink or
+    # stretch if the system clock were adjusted while waiting.
+    deadline = time.monotonic() + (timeout_ms / 1000.0)
+    while time.monotonic() < deadline:
+        html = page.content()
+        if _REAL_CONTENT_MARKER in html:
+            return html
+        # Never pause longer than 2 s between checks, whatever was requested.
+        page.wait_for_timeout(min(poll_interval_ms, 2000))
+    raise RuntimeError(
+        "Timed out waiting for Forvo to finish security verification. "
+        "Try: python3 -m scrape_forvo ... --use-playwright --headed  (visible browser often passes Cloudflare)."
+    )
+def fetch_html_playwright(
+    url: str,
+    *,
+    headed: bool = False,
+) -> str:
+    """
+    Render ``url`` in Chromium via Playwright and return the final HTML.
+    Requires the ``playwright`` package and its browsers
+    (``pip install playwright`` then ``playwright install``).
+    After navigation the function waits for the security interstitial to clear
+    before reading the content. Pass ``headed=True`` to show the browser window;
+    Cloudflare often passes with a visible browser.
+    Raises:
+        RuntimeError: If real Forvo content never appears.
+    """
+    # Imported lazily so the module can be imported without Playwright loaded.
+    from playwright.sync_api import sync_playwright
+    with sync_playwright() as pw:
+        browser = pw.chromium.launch(headless=not headed)
+        try:
+            context = browser.new_context(locale="en-US")
+            tab = context.new_page()
+            tab.goto(url, wait_until="domcontentloaded", timeout=60_000)
+            tab.wait_for_timeout(2000)
+            try:
+                return _wait_for_real_content(tab, timeout_ms=55_000, poll_interval_ms=2000)
+            except RuntimeError:
+                # One last look: the content may have arrived right at the deadline.
+                snapshot = tab.content()
+                if _REAL_CONTENT_MARKER in snapshot:
+                    return snapshot
+                raise RuntimeError(
+                    "Page did not load real Forvo content (still on security verification or captcha). "
+                    "Try: --use-playwright --headed  (visible browser), or run from a different network."
+                )
+        finally:
+            browser.close()

scrape_forvo-0.1.0/src/scrape_forvo/parse.py ADDED Viewed

@@ -0,0 +1,83 @@
+from __future__ import annotations
+import base64
+import re
+from typing import Iterable, List, Optional
+from urllib.parse import urlparse
+from .errors import ContentNotFoundError
+from .types import PlayItem
+# Captures the single-quoted value assigned to ``_AUDIO_HTTP_HOST``.
+AUDIO_HOST_RE = re.compile(r"_AUDIO_HTTP_HOST\s*=\s*'([^']+)'")
+# Captures the argument list of an ``onclick="Play(...);return false;"`` attribute.
+ONCLICK_PLAY_RE = re.compile(r'onclick="Play\((.*?)\);return false;"', re.DOTALL)
+# Captures the contents of each single-quoted string.
+SINGLE_QUOTED_RE = re.compile(r"'([^']*)'")
+def b64_decode(s: str) -> Optional[str]:
+    """Decode a possibly unpadded Base64 string to UTF-8 text.
+    Missing ``=`` padding is restored before decoding.
+    Returns:
+        The decoded text, or None when ``s`` is not valid Base64 or the decoded
+        bytes are not valid UTF-8.
+    """
+    padded = s + "=" * (-len(s) % 4)
+    try:
+        return base64.b64decode(padded).decode("utf-8")
+    except ValueError:
+        # ValueError covers binascii.Error (bad Base64), the error raised for
+        # non-ASCII input, and UnicodeDecodeError; unrelated bugs still surface.
+        return None
+def extract_audio_host(html: str) -> str:
+    """Return the host assigned to ``_AUDIO_HTTP_HOST`` in ``html``.
+    Raises:
+        ContentNotFoundError: If the assignment is not present.
+    """
+    found = AUDIO_HOST_RE.search(html)
+    if found is None:
+        raise ContentNotFoundError("Could not find _AUDIO_HTTP_HOST in page HTML.")
+    host = found.group(1)
+    return host
+def safe_filename(name: str) -> str:
+    """Reduce ``name`` to characters safe for a filename.
+    Whitespace runs become ``_``, then every run of characters outside
+    ``[a-zA-Z0-9._-]`` becomes ``_``; leading/trailing underscores are removed.
+    Returns ``"forvo"`` when nothing is left.
+    """
+    collapsed = re.sub(r"\s+", "_", name.strip())
+    sanitized = re.sub(r"[^a-zA-Z0-9._-]+", "_", collapsed).strip("_")
+    if sanitized:
+        return sanitized
+    return "forvo"
+def page_slug(url: str) -> str:
+    """Return a filename-safe slug from the last non-empty path segment of ``url``.
+    Falls back to ``"forvo"`` when the URL path has no segments.
+    """
+    segments = [segment for segment in urlparse(url).path.split("/") if segment]
+    if not segments:
+        return safe_filename("forvo")
+    return safe_filename(segments[-1])
+def _dedupe_preserve_order(paths: Iterable[str]) -> List[str]:
+    """Return the items of ``paths`` with duplicates dropped, first occurrence kept."""
+    # dict keys are unique and keep insertion order, which is exactly the
+    # "first seen wins" behaviour needed here.
+    return list(dict.fromkeys(paths))
+def _prefer_canonical(paths: List[str]) -> List[str]:
+    """Sort ``paths`` in place and return the same list.
+    Paths containing a ``2/s/2s_`` segment (at the start or after a ``/``) come
+    first; within each group shorter paths come first.
+    """
+    def rank(path: str) -> tuple:
+        is_canonical = re.search(r"(^|/)2/s/2s_", path) is not None
+        return (0 if is_canonical else 1, len(path))
+    paths.sort(key=rank)
+    return paths
+def iter_play_items(html: str) -> Iterable[PlayItem]:
+    """Yield a ``PlayItem`` for each ``onclick="Play(...)"`` call that carries MP3 paths.
+    For every call: the leading integer argument is the play id (``"unknown"``
+    if absent); the label is taken from the last two single-quoted arguments;
+    each quoted argument that Base64-decodes to text containing ``.mp3`` is an
+    MP3 path. Calls without any MP3 path are skipped.
+    """
+    for call in ONCLICK_PLAY_RE.finditer(html):
+        args_text = call.group(1)
+        id_match = re.match(r"\s*(\d+)\s*,", args_text)
+        play_id = id_match.group(1) if id_match else "unknown"
+        tokens = SINGLE_QUOTED_RE.findall(args_text)
+        if len(tokens) >= 2:
+            # Prefer the second-to-last quoted argument, then the last one.
+            label = tokens[-2] or tokens[-1] or "forvo"
+        else:
+            label = "forvo"
+        decoded_tokens = (b64_decode(token) for token in tokens)
+        mp3_paths = [text.lstrip("/") for text in decoded_tokens if text and ".mp3" in text]
+        ordered = _prefer_canonical(_dedupe_preserve_order(mp3_paths))
+        if not ordered:
+            continue
+        yield PlayItem(play_id=play_id, label=label, mp3_paths=tuple(ordered))

scrape_forvo-0.1.0/src/scrape_forvo/types.py ADDED Viewed

@@ -0,0 +1,11 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Tuple
+@dataclass(frozen=True)
+class PlayItem:
+    """One playable pronunciation entry parsed from a Forvo page."""
+    # Identifier of the entry, stored as text.
+    play_id: str
+    # Human-readable label for the entry; also used when building filenames.
+    label: str
+    # Candidate MP3 paths (relative to the audio base URL), in preference order.
+    mp3_paths: Tuple[str, ...]

scrape_forvo-0.1.0/src/scrape_forvo.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,77 @@
+Metadata-Version: 2.4
+Name: scrape-forvo
+Version: 0.1.0
+Summary: Add your description here
+Requires-Python: >=3.13
+Description-Content-Type: text/markdown
+Requires-Dist: playwright>=1.58.0
+Requires-Dist: requests>=2.32.5
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: responses>=0.25.0; extra == "dev"
+# scrape-forvo
+Download pronunciation MP3s from Forvo search pages.
+## Installation
+```bash
+python -m pip install -e .
+```
+## Usage
+Only this command is confirmed to work reliably:
+```bash
+scrape-forvo https://forvo.com/search/egg/no/ --use-playwright --headed
+```
+## Scriptable Usage
+You can also import `scrape_forvo` and use it from Python:
+```python
+from scrape_forvo import scrape
+result = scrape(
+    "https://forvo.com/search/egg/no/",
+    outdir="forvo_mp3",
+    use_playwright=True,
+    headed=True,
+)
+print(result.downloaded_count)
+for candidate in result.candidates:
+    print(candidate.url, "->", candidate.out_path)
+```
+The `scrape()` arguments map directly to CLI flags, so both interfaces share the same behavior without duplicated logic.
+## Development
+Install dev dependencies:
+```bash
+python -m pip install -e .[dev]
+```
+Run tests:
+```bash
+pytest
+```
+### Optional live test
+Set `FORVO_LIVE_TEST=1` to enable the live integration test.
+## TODO
+Edge cases:
+- [ ] Decide which file to pick when a word has multiple pronunciation files.
+- [ ] Handle the case where no pronunciation is available.
+Integration:
+- [ ] Integrate with the vocab repo.

scrape_forvo-0.1.0/src/scrape_forvo.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,22 @@
+README.md
+pyproject.toml
+src/scrape_forvo/__init__.py
+src/scrape_forvo/__main__.py
+src/scrape_forvo/api.py
+src/scrape_forvo/cli.py
+src/scrape_forvo/download.py
+src/scrape_forvo/errors.py
+src/scrape_forvo/fetch.py
+src/scrape_forvo/parse.py
+src/scrape_forvo/types.py
+src/scrape_forvo.egg-info/PKG-INFO
+src/scrape_forvo.egg-info/SOURCES.txt
+src/scrape_forvo.egg-info/dependency_links.txt
+src/scrape_forvo.egg-info/entry_points.txt
+src/scrape_forvo.egg-info/requires.txt
+src/scrape_forvo.egg-info/top_level.txt
+tests/test_api.py
+tests/test_cli.py
+tests/test_download.py
+tests/test_fetch_requests.py
+tests/test_parse.py

scrape_forvo-0.1.0/src/scrape_forvo.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

scrape_forvo-0.1.0/src/scrape_forvo.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ scrape-forvo = scrape_forvo.cli:main

scrape_forvo-0.1.0/src/scrape_forvo.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,6 @@
+playwright>=1.58.0
+requests>=2.32.5
+[dev]
+pytest>=8.0.0
+responses>=0.25.0

scrape_forvo-0.1.0/src/scrape_forvo.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ scrape_forvo

scrape_forvo-0.1.0/tests/test_api.py ADDED Viewed

@@ -0,0 +1,38 @@
+from __future__ import annotations
+import base64
+import scrape_forvo.api as api
+from scrape_forvo.types import PlayItem
+def _b64(s: str) -> str:
+    """Return the standard Base64 encoding of ``s`` as text."""
+    encoded = base64.b64encode(s.encode())
+    return encoded.decode()
+def _sample_html() -> str:
+    """Build a minimal page: the audio-host assignment plus one Play(...) anchor."""
+    host_decl = "var _AUDIO_HTTP_HOST = 'audio.forvo.com';"
+    anchor = "<a onclick=\"Play(123,'x','%s','Label');return false;\">" % _b64("2/s/2s_egg_1.mp3")
+    return host_decl + anchor
+def test_scrape_dry_run_returns_candidate(monkeypatch) -> None:
+    """A dry run over stubbed fetch/parse helpers reports and counts one candidate."""
+    monkeypatch.setattr(api, "make_session", lambda: object())
+    monkeypatch.setattr(api, "fetch_html_requests", lambda url, session: _sample_html())
+    monkeypatch.setattr(api, "iter_play_items", lambda html: iter((PlayItem("123", "Label", ("2/s/2s_egg_1.mp3",)),)))
+    # scrape() defaults to use_playwright=True, which would bypass the stubbed
+    # requests fetcher and launch a real browser; select the stub explicitly.
+    result = api.scrape(
+        "https://forvo.com/search/egg/no/",
+        dry_run=True,
+        no_head=True,
+        use_playwright=False,
+    )
+    assert result.downloaded_count == 1
+    assert len(result.candidates) == 1
+    assert result.candidates[0].url == "https://audio.forvo.com/audios/mp3/2/s/2s_egg_1.mp3"
+def test_scrape_invalid_limit() -> None:
+    """A negative limit is rejected with a ValueError naming the constraint."""
+    caught = None
+    try:
+        api.scrape("https://forvo.com/search/egg/no/", limit=-1)
+    except ValueError as exc:
+        caught = exc
+    if caught is None:
+        raise AssertionError("Expected ValueError")
+    assert "limit must be >= 0" in str(caught)

scrape_forvo-0.1.0/tests/test_cli.py ADDED Viewed

@@ -0,0 +1,23 @@
+from __future__ import annotations
+import scrape_forvo.cli as cli
+from scrape_forvo.api import ScrapeResult
+def test_cli_dry_run(monkeypatch, capsys) -> None:
+    """The CLI exits with 0 when scrape() reports at least one download."""
+    def fake_scrape(*args, **kwargs):
+        return ScrapeResult(downloaded_count=1, candidates=tuple())
+    monkeypatch.setattr(cli, "scrape", fake_scrape)
+    exit_code = cli.main(["https://forvo.com/search/egg/no/", "--dry-run", "--no-head"])
+    assert exit_code == 0
+    # Drain captured output so it does not leak into later tests.
+    _ = capsys.readouterr()
+def test_cli_no_mp3s(monkeypatch, capsys) -> None:
+    """The CLI exits with 2 and prints a notice when nothing was downloaded."""
+    def fake_scrape(*args, **kwargs):
+        return ScrapeResult(downloaded_count=0, candidates=tuple())
+    monkeypatch.setattr(cli, "scrape", fake_scrape)
+    exit_code = cli.main(["https://forvo.com/search/egg/no/", "--dry-run", "--no-head"])
+    assert exit_code == 2
+    streams = capsys.readouterr()
+    combined = streams.out + streams.err
+    assert "No MP3s downloaded" in combined

scrape_forvo-0.1.0/tests/test_download.py ADDED Viewed

@@ -0,0 +1,60 @@
+from __future__ import annotations
+import io
+import os
+from pathlib import Path
+from scrape_forvo.download import download_audio, select_audio_url
+from scrape_forvo.types import PlayItem
+class _DummyResponse:
+    """Minimal stand-in for a response object: status code, body, context manager."""
+    def __init__(self, status_code: int, body: bytes = b"") -> None:
+        self.status_code = status_code
+        self._body = body
+    def raise_for_status(self) -> None:
+        # Any 4xx/5xx status is treated as a failure.
+        if self.status_code >= 400:
+            raise RuntimeError("bad status")
+    def iter_content(self, chunk_size: int = 1024) -> list[bytes]:
+        # The whole body is returned as one chunk; chunk_size is ignored.
+        return [self._body]
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc, tb):
+        # Returning False lets exceptions raised inside the ``with`` propagate.
+        return False
+class _DummySession:
+    """Minimal stand-in for a session with canned HEAD status and GET body."""
+    def __init__(self, head_status: int = 200, body: bytes = b"") -> None:
+        self._head_status = head_status
+        self._body = body
+    def head(self, url: str, allow_redirects: bool = True, timeout: int = 15):
+        # Every HEAD request answers with the configured status.
+        return _DummyResponse(self._head_status)
+    def get(self, url: str, stream: bool = True, timeout: int = 30):
+        # Every GET request succeeds (200) with the configured body.
+        return _DummyResponse(200, self._body)
+def test_select_audio_url_respects_no_head() -> None:
+    """With no_head=True the first path is chosen even though HEAD would fail."""
+    failing_head_session = _DummySession(head_status=404)
+    play_item = PlayItem(play_id="1", label="x", mp3_paths=("a.mp3", "b.mp3"))
+    chosen = select_audio_url(
+        play_item,
+        "https://host/audios/mp3",
+        failing_head_session,
+        no_head=True,
+        seen_urls=set(),
+    )
+    assert chosen == "https://host/audios/mp3/a.mp3"
+def test_select_audio_url_uses_head() -> None:
+    """With no_head=False a failing HEAD probe leaves no usable URL."""
+    failing_head_session = _DummySession(head_status=404)
+    play_item = PlayItem(play_id="1", label="x", mp3_paths=("a.mp3",))
+    chosen = select_audio_url(
+        play_item,
+        "https://host/audios/mp3",
+        failing_head_session,
+        no_head=False,
+        seen_urls=set(),
+    )
+    assert chosen is None
+def test_download_audio_writes_file(tmp_path: Path) -> None:
+    """download_audio writes the response body to the requested path."""
+    payload = b"abc123"
+    destination = tmp_path / "out.mp3"
+    download_audio(
+        "https://host/audios/mp3/a.mp3",
+        str(destination),
+        _DummySession(body=payload),
+    )
+    assert destination.read_bytes() == b"abc123"

scrape_forvo-0.1.0/tests/test_fetch_requests.py ADDED Viewed

@@ -0,0 +1,44 @@
+from __future__ import annotations
+import pytest
+responses = pytest.importorskip("responses")
+from scrape_forvo.errors import ForvoBlockedError
+from scrape_forvo.fetch import fetch_html_requests, make_session
+@responses.activate
+def test_fetch_html_requests_sets_referer_and_warmup() -> None:
+    """The fetcher hits the homepage first and sends it as Referer for the target."""
+    homepage = "https://forvo.com/"
+    target = "https://forvo.com/search/egg/no/"
+    responses.add(responses.GET, homepage, body="ok", status=200)
+    responses.add(responses.GET, target, body="<html>ok</html>", status=200)
+    html = fetch_html_requests(target, make_session())
+    assert html == "<html>ok</html>"
+    requested = [call.request.url for call in responses.calls]
+    assert homepage in requested
+    assert target in requested
+    target_requests = [call.request for call in responses.calls if call.request.url == target]
+    assert target_requests[0].headers.get("Referer") == homepage
+@responses.activate
+def test_fetch_html_requests_403_raises() -> None:
+    """An HTTP 403 on the target page surfaces as ForvoBlockedError."""
+    target = "https://forvo.com/search/egg/no/"
+    responses.add(responses.GET, "https://forvo.com/", body="ok", status=200)
+    responses.add(responses.GET, target, body="nope", status=403)
+    caught = None
+    try:
+        fetch_html_requests(target, make_session())
+    except ForvoBlockedError as exc:
+        caught = exc
+    if caught is None:
+        raise AssertionError("Expected ForvoBlockedError")
+    assert "HTTP 403" in str(caught)

scrape_forvo-0.1.0/tests/test_parse.py ADDED Viewed

@@ -0,0 +1,38 @@
+from __future__ import annotations
+import base64
+import pytest
+from scrape_forvo.parse import extract_audio_host, iter_play_items, page_slug, safe_filename
+def _b64(s: str) -> str:
+    """Return the standard Base64 encoding of ``s`` as text."""
+    raw = s.encode()
+    return base64.b64encode(raw).decode()
+def test_extract_audio_host() -> None:
+    """The host is read from the single-quoted _AUDIO_HTTP_HOST assignment."""
+    page = "var _AUDIO_HTTP_HOST = 'audio.forvo.com';"
+    host = extract_audio_host(page)
+    assert host == "audio.forvo.com"
+def test_iter_play_items_dedupe_and_order() -> None:
+    """One Play(...) call yields one item with the canonical MP3 path first."""
+    canonical = _b64("2/s/2s_egg_1.mp3")
+    fallback = _b64("other/egg_1.mp3")
+    # The non-canonical path is listed first so the ordering is actually tested.
+    html = (
+        "<a onclick=\"Play(123,'x','%s','%s','Label','end');return false;\">"
+        % (fallback, canonical)
+    )
+    parsed = list(iter_play_items(html))
+    assert len(parsed) == 1
+    (only,) = parsed
+    assert only.play_id == "123"
+    assert only.label == "Label"
+    assert only.mp3_paths[0].startswith("2/s/2s_")
+    assert len(only.mp3_paths) == 2
+def test_safe_filename_and_page_slug() -> None:
+    """Filenames are sanitized and the slug is the last URL path segment."""
+    sanitized_cases = {
+        "  hello world ": "hello_world",
+        "##": "forvo",
+    }
+    for raw, expected in sanitized_cases.items():
+        assert safe_filename(raw) == expected
+    assert page_slug("https://forvo.com/search/egg/no/") == "no"