PyPI - dvdcompare-scraper - Versions diffs - 0.1.0__tar.gz - Mend

dvdcompare-scraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

dvdcompare_scraper-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,10 @@
+Metadata-Version: 2.4
+Name: dvdcompare-scraper
+Version: 0.1.0
+Summary: Scrape disc extras metadata from dvdcompare.net
+Requires-Python: >=3.11
+Requires-Dist: httpx>=0.27
+Requires-Dist: beautifulsoup4>=4.12
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.23; extra == "dev"

dvdcompare_scraper-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,80 @@
+# dvdcompare-scraper
+Scrape disc extras metadata from [dvdcompare.net](https://www.dvdcompare.net).
+## Install
+```
+pip install -e ".[dev]"
+```
+## Usage
+Search by title:
+```
+dvdcompare "Oppenheimer"
+```
+Look up by dvdcompare film ID:
+```
+dvdcompare --id 66397
+```
+Look up by URL:
+```
+dvdcompare --url "https://www.dvdcompare.net/comparisons/film.php?fid=66397"
+```
+### Regional releases
+Each dvdcompare page lists multiple regional releases (e.g. America, United Kingdom, Japan), each with its own disc contents and runtimes. By default, the CLI shows only the first release listed.
+- `--release` selects a release by position (1-based) or by name keyword (case-insensitive substring match):
+  ```
+  dvdcompare --id 67210 --release 2
+  dvdcompare --id 67210 --release america
+  dvdcompare --id 67210 --release "united kingdom"
+  ```
+  If no release matches the keyword, the available release names are printed so you can retry.
+- `--all-releases` shows every release:
+  ```
+  dvdcompare --id 67210 --all-releases
+  ```
+- `--json` outputs the parsed data as JSON. It respects release filtering, so unless `--release` or `--all-releases` is given, only the first release is included:
+  ```
+  dvdcompare --id 67210 --json
+  dvdcompare --id 67210 --release america --json
+  ```
+### Filtering with external tools
+For more complex filtering, pipe the JSON output through jq or PowerShell:
+**jq:**
+```bash
+dvdcompare --id 67210 --all-releases --json | jq '.releases |= map(select(.name | test("america"; "i")))'
+```
+**PowerShell:**
+```powershell
+dvdcompare --id 67210 --all-releases --json | ConvertFrom-Json | ForEach-Object {
+    $_.releases = $_.releases | Where-Object { $_.name -match "america" }
+    $_ | ConvertTo-Json -Depth 10
+}
+```
+## Data model
+- `FilmComparison`: top-level object with title, year, format, director, IMDB info, and a list of `Release` objects.
+- `Release`: a regional release with name (e.g. "Blu-ray ALL America - BBC"), year, and a list of `Disc` objects.
+- `Disc`: a single disc with number, format (e.g. "Blu-ray 4K"), and a list of `Feature` objects.
+- `Feature`: a bonus feature with title, runtime, type, year, technical notes, play-all flag, and optional children (for grouped features like "Making Of" collections or episode groups).
+## Tests
+```
+py -m pytest tests/ -v
+```

dvdcompare_scraper-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,28 @@
+[build-system]
+requires = ["setuptools>=68.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "dvdcompare-scraper"
+version = "0.1.0"
+description = "Scrape disc extras metadata from dvdcompare.net"
+requires-python = ">=3.11"
+dependencies = [
+    "httpx>=0.27",
+    "beautifulsoup4>=4.12",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0",
+    "pytest-asyncio>=0.23",
+]
+[project.scripts]
+dvdcompare = "dvdcompare.cli:main"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.pytest.ini_options]
+asyncio_mode = "auto"

dvdcompare_scraper-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

dvdcompare_scraper-0.1.0/src/dvdcompare/__init__.py ADDED Viewed

File without changes

dvdcompare_scraper-0.1.0/src/dvdcompare/cli.py ADDED Viewed

@@ -0,0 +1,138 @@
+from __future__ import annotations
+import argparse
+import asyncio
+import json
+import sys
+from dataclasses import asdict
+from .models import Release
+from .parser import format_runtime
+from .scraper import get_film, get_film_by_url, search
def select_releases(releases: list[Release], selector: str) -> list[Release]:
    """Filter releases by 1-based index or case-insensitive name substring.

    Args:
        releases: Candidate releases, in page order.
        selector: Either a 1-based position (e.g. ``"2"``) or a keyword that
            is matched as a case-insensitive substring of each release name.

    Returns:
        A one-element list for a numeric selector, or every release whose
        name contains the keyword.

    Raises:
        LookupError: If ``releases`` is empty, the position is below 1, or
            no release name contains the keyword. The message lists the
            available releases.
    """

    def no_match() -> LookupError:
        names = "\n".join(f"  {i}. {r.name}" for i, r in enumerate(releases, 1))
        return LookupError(
            f"No release matching '{selector}'.\nAvailable releases:\n{names}"
        )

    try:
        position = int(selector)
    except ValueError:
        position = None

    if position is not None:
        # Positions are 1-based. Zero and negatives must be rejected here,
        # otherwise they silently wrap around through negative indexing.
        if position < 1 or not releases:
            raise no_match()
        # A position past the end deliberately falls back to the last release.
        return [releases[min(position, len(releases)) - 1]]

    keyword = selector.lower()
    matched = [r for r in releases if keyword in r.name.lower()]
    if matched:
        return matched
    raise no_match()
def main() -> None:
    """Command-line entry point: parse the arguments and run the lookup."""
    cli = argparse.ArgumentParser(
        description="Scrape disc extras metadata from dvdcompare.net",
    )

    # Exactly one way of identifying the film must be supplied.
    source = cli.add_mutually_exclusive_group(required=True)
    source.add_argument("query", nargs="?", help="Search query (film title)")
    source.add_argument("--id", type=int, dest="film_id", help="dvdcompare film ID")
    source.add_argument("--url", help="dvdcompare film page URL")

    # Output shaping options.
    cli.add_argument(
        "--release",
        default="1",
        help="Release number (1-based) or name keyword (default: 1)",
    )
    cli.add_argument(
        "--all-releases",
        action="store_true",
        help="Show all releases instead of just one",
    )
    cli.add_argument("--json", action="store_true", help="Output as JSON")

    asyncio.run(_run(cli.parse_args()))
async def _run(args: argparse.Namespace) -> None:
    """Fetch the requested film and print it as text or JSON.

    The film is located by ``args.film_id``, else ``args.url``, else by
    searching for ``args.query`` and taking the first hit. Unless
    ``args.all_releases`` is set, the releases are narrowed with
    ``args.release`` before printing.

    Exits with status 1 (message on stderr) when the search finds nothing
    or when the release selector matches no release.
    """
    # Compare against None: a film ID of 0 is falsy but was still given.
    if args.film_id is not None:
        film = await get_film(args.film_id)
    elif args.url:
        film = await get_film_by_url(args.url)
    else:
        results = await search(args.query)
        if not results:
            print("No results found.", file=sys.stderr)
            sys.exit(1)
        if len(results) > 1:
            print(f"Found {len(results)} results:", file=sys.stderr)
            for i, r in enumerate(results, 1):
                print(f"  {i}. {r.title} (fid={r.film_id})", file=sys.stderr)
            print(
                "Using first result. Use --id to select a specific one.",
                file=sys.stderr,
            )
        film = await get_film(results[0].film_id)

    if not args.all_releases and film.releases:
        try:
            film.releases = select_releases(film.releases, args.release)
        except LookupError as exc:
            # The message already lists the available releases; show it
            # without a traceback so the user can retry.
            print(exc, file=sys.stderr)
            sys.exit(1)

    if args.json:
        print(json.dumps(asdict(film), indent=2))
    else:
        _print_text(film, args)
+def _print_text(film, args: argparse.Namespace) -> None:
+    header = film.title
+    if film.format:
+        header += f" ({film.format})"
+    if film.year:
+        header += f" ({film.year})"
+    print(header)
+    if film.director:
+        print(f"Director: {film.director}")
+    if film.imdb_id:
+        print(f"IMDB: {film.imdb_id}")
+    print()
+    for release in film.releases:
+        line = f"--- {release.name}"
+        if release.year:
+            line += f" [{release.year}]"
+        line += " ---"
+        print(line)
+        for disc in release.discs:
+            print(f"\n  DISC {disc.number} ({disc.format})")
+            if disc.is_film:
+                print("    The Film")
+            for feature in disc.features:
+                _print_feature(feature, indent=4)
+        print()
+def _print_feature(feature, indent: int = 4) -> None:
+    prefix = " " * indent
+    parts = [f'{prefix}"{feature.title}"']
+    if feature.year:
+        parts.append(str(feature.year))
+    if feature.feature_type:
+        parts.append(feature.feature_type)
+    if feature.runtime_seconds is not None:
+        rt = format_runtime(feature.runtime_seconds)
+        if feature.is_play_all:
+            parts.append(f"(Play All - {rt})")
+        else:
+            parts.append(f"({rt})")
+    if feature.technical_notes:
+        parts.append(f"[{feature.technical_notes}]")
+    print(" ".join(parts))
+    for child in feature.children:
+        _print_feature(child, indent=indent + 2)

dvdcompare_scraper-0.1.0/src/dvdcompare/models.py ADDED Viewed

@@ -0,0 +1,58 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
@dataclass
class Feature:
    """A single bonus feature on a disc."""
    # Display title; the parser removes surrounding quotes and collapses whitespace.
    title: str
    # Duration in seconds; None when the listing gives no runtime.
    runtime_seconds: int | None = None
    # Free-text descriptor that follows a quoted title (after the year, if any).
    feature_type: str | None = None
    # Four-digit year that directly follows a quoted title.
    year: int | None = None
    # Contents of a "[...]" note on the listing line.
    technical_notes: str | None = None
    # True when the runtime was listed as a "Play All" total.
    is_play_all: bool = False
    # Sub-features nested under this one when it acts as a group header.
    children: list[Feature] = field(default_factory=list)
@dataclass
class Disc:
    """A single disc in a release."""
    # Disc number from the "DISC <word>" header; the parser stores 0 when
    # the word is not recognised.
    number: int
    # Text in parentheses after the disc header (e.g. "Blu-ray 4K"); "" if none.
    format: str
    # True when the disc listing contains the "* The Film" marker.
    is_film: bool = False
    # Top-level features on the disc, in listing order.
    features: list[Feature] = field(default_factory=list)
@dataclass
class Release:
    """A regional release of a film."""
    # Release heading text, e.g. "Blu-ray ALL America - BBC".
    name: str
    # Release year taken from the bracketed year in the heading, if present.
    year: int | None = None
    # Discs parsed from the release's "Extras:" section; empty if none found.
    discs: list[Disc] = field(default_factory=list)
@dataclass
class FilmComparison:
    """A complete film comparison page from dvdcompare.net."""
    # Film title from the page heading, without the format/year suffixes.
    title: str
    # Film year from the trailing "(YYYY)" of the heading.
    year: int | None = None
    # Disc format from the parenthesised group before the year (e.g. "Blu-ray 4K").
    format: str | None = None
    # Text following "Director:" on the page.
    director: str | None = None
    # href of the first link pointing at imdb.com/title/.
    imdb_url: str | None = None
    # The "tt<digits>" identifier extracted from imdb_url.
    imdb_id: str | None = None
    # dvdcompare film ID ("fid") taken from the page's form action.
    film_id: int | None = None
    # Regional releases in page order.
    releases: list[Release] = field(default_factory=list)
@dataclass
class SearchResult:
    """A single result from a dvdcompare.net search."""
    # Link text of the result (or the page heading for single-hit redirects).
    title: str
    # URL of the film comparison page.
    url: str
    # dvdcompare film ID ("fid") parsed from the URL.
    film_id: int | None = None

dvdcompare_scraper-0.1.0/src/dvdcompare/parser.py ADDED Viewed

@@ -0,0 +1,351 @@
+from __future__ import annotations
+import html as html_module
+import re
+from bs4 import BeautifulSoup
+from .models import Disc, Feature, FilmComparison, Release, SearchResult
+_DISC_WORDS = {
+    "ONE": 1,
+    "TWO": 2,
+    "THREE": 3,
+    "FOUR": 4,
+    "FIVE": 5,
+    "SIX": 6,
+    "SEVEN": 7,
+    "EIGHT": 8,
+    "NINE": 9,
+    "TEN": 10,
+}
def parse_runtime(s: str) -> int:
    """Parse ``MM:SS``, ``H:MM:SS``, or ``NNN mins`` into total seconds.

    The ``min``/``mins`` suffix is matched case-insensitively.

    Args:
        s: Runtime text; surrounding whitespace is ignored.

    Returns:
        The runtime in seconds, or 0 when the text is not in one of the
        recognised forms (including non-numeric colon-separated text).
    """
    s = s.strip()
    mins_match = re.match(r"^(\d+)\s*mins?$", s, re.IGNORECASE)
    if mins_match:
        return int(mins_match.group(1)) * 60

    # Non-numeric fields are treated like any other unparseable input
    # rather than leaking a ValueError to the caller.
    try:
        fields = [int(part) for part in s.split(":")]
    except ValueError:
        return 0

    if len(fields) == 2:
        minutes, seconds = fields
        return minutes * 60 + seconds
    if len(fields) == 3:
        hours, minutes, seconds = fields
        return hours * 3600 + minutes * 60 + seconds
    return 0
def format_runtime(seconds: int) -> str:
    """Format total seconds as ``M:SS`` or ``H:MM:SS``.

    Minutes are not zero-padded when there is no hour component
    (``65`` -> ``"1:05"``). A negative duration is formatted from its
    magnitude with a leading ``-``.
    """
    # divmod on a negative value floors towards -inf and would yield
    # nonsense such as "-1:59:55" for -5, so format the magnitude instead.
    sign = "-" if seconds < 0 else ""
    mins, secs = divmod(abs(seconds), 60)
    hours, mins = divmod(mins, 60)
    if hours:
        return f"{sign}{hours}:{mins:02d}:{secs:02d}"
    return f"{sign}{mins}:{secs:02d}"
def _disc_number(word: str) -> int:
    """Convert a disc label such as ``"two"`` or ``"2"`` to its number.

    Returns 0 when the label is neither a known number word nor an integer.
    """
    label = word.upper()
    known = _DISC_WORDS.get(label)
    if known is not None:
        return known
    try:
        return int(label)
    except ValueError:
        return 0
def parse_feature_line(line: str) -> Feature:
    """Parse a single feature text line into a :class:`Feature`.

    The line is reduced step by step: a ``[...]`` technical note is cut out
    first, then a parenthesised runtime, and what remains is split into a
    (possibly quoted) title followed by an optional year and type.
    """

    def cut(text: str, match: re.Match) -> str:
        # Remove the matched span, keeping the text on both sides of it.
        return text[: match.start()] + text[match.end() :]

    def drop_trailing_colon(text: str) -> str:
        # A trailing colon marks a group header; the caller detects that.
        return text[:-1].strip() if text.endswith(":") else text

    # 1. Technical notes: the first [...] on the line.
    technical_notes = None
    notes = re.search(r"\s*\[([^\]]+)\]", line)
    if notes:
        technical_notes = notes.group(1)
        line = cut(line, notes)

    # 2. Trailing colon (group indicator).
    line = drop_trailing_colon(line.strip())

    # 3. Runtime: (MM:SS) / (H:MM:SS) / (Play All - MM:SS) / (NNN mins),
    #    including the "with Play All option - MM:SS" wording.
    runtime_seconds = None
    is_play_all = False
    runtime = re.search(
        r"\((?:(?:with )?(Play All)(?: option)? - )?(\d{1,3}:\d{2}(?::\d{2})?|\d+\s*mins?)\)",
        line,
    )
    if runtime:
        is_play_all = bool(runtime.group(1))
        runtime_seconds = parse_runtime(runtime.group(2))
        line = cut(line, runtime).strip()

    # 3b. Removing the runtime may have exposed another trailing colon.
    line = drop_trailing_colon(line)

    # 4. A quoted title leaves a remainder; an unquoted line is all title.
    quoted = re.match(r'^["\u201c](.+?)["\u201d](.*)$', line)
    if quoted:
        title, trailing = quoted.group(1).strip(), quoted.group(2).strip()
    else:
        title, trailing = line.strip(), ""

    # 5. The remainder is an optional four-digit year followed by the type.
    year = None
    feature_type = None
    if trailing:
        dated = re.match(r"^(\d{4})\s*(.*)", trailing)
        if dated:
            year = int(dated.group(1))
            trailing = dated.group(2)
        feature_type = trailing.strip() or None

    return Feature(
        title=re.sub(r"\s+", " ", title).strip(),
        runtime_seconds=runtime_seconds,
        feature_type=feature_type,
        year=year,
        technical_notes=technical_notes,
        is_play_all=is_play_all,
    )
def parse_extras(extras_html: str) -> list[Disc]:
    """Parse the inner HTML of an extras description div into :class:`Disc` objects.

    The markup is flattened to text lines (``<br>`` becomes a newline, other
    tags are dropped, entities are decoded). Each line is then a disc
    header, a film marker, a ``- `` sub-feature, or a top-level feature.
    Lines that appear before the first disc header are ignored.
    """
    plain = html_module.unescape(
        re.sub(r"<[^>]+>", "", re.sub(r"<br\s*/?>", "\n", extras_html))
    )

    discs: list[Disc] = []
    disc: Disc | None = None
    group: Feature | None = None

    for raw in plain.split("\n"):
        entry = raw.strip()
        if not entry:
            continue

        # Disc header: "DISC ONE (Blu-ray 4K)" or just "DISC ONE".
        header = re.match(r"^DISC\s+(\w+)(?:\s+\((.+)\))?$", entry)
        if header:
            disc = Disc(
                number=_disc_number(header.group(1)),
                format=header.group(2) or "",
            )
            discs.append(disc)
            group = None
            continue

        # A leading asterisk flags the main feature ("* The Film"),
        # possibly followed by a variant title.
        if entry.startswith("*"):
            remainder = entry.lstrip("*").strip()
            lowered = remainder.lower()
            if disc is not None and (not remainder or lowered.startswith("the film")):
                disc.is_film = True
            # A bare marker carries no feature of its own.
            if not remainder or lowered == "the film":
                continue
            # Otherwise keep the text and parse it as a feature below.
            entry = remainder

        if disc is None:
            continue

        # Sub-feature: goes under the open group, else straight on the disc.
        if entry.startswith("- "):
            siblings = group.children if group is not None else disc.features
            siblings.append(parse_feature_line(entry[2:]))
            continue

        feature = parse_feature_line(entry)
        disc.features.append(feature)
        # A trailing colon or a play-all runtime opens a group.
        # NOTE(review): a header whose colon is followed by a runtime or a
        # [...] note does not end with ":" here, so it is only treated as a
        # group when it is play-all.
        opens_group = entry.rstrip().endswith(":") or feature.is_play_all
        group = feature if opens_group else None

    return discs
def _split_heading(heading: str) -> tuple[str, str | None, int | None]:
    """Split ``Title (Format) (YYYY)`` heading text into ``(title, format, year)``."""
    year = None
    # A trailing "(YYYY)" is the film year.
    year_match = re.search(r"\((\d{4})\)\s*$", heading)
    if year_match:
        year = int(year_match.group(1))
        heading = heading[: year_match.start()].strip()
    # Whatever parenthesised group now ends the text is the disc format.
    fmt_match = re.search(r"\(([^)]+)\)\s*$", heading)
    if fmt_match:
        return heading[: fmt_match.start()].strip(), fmt_match.group(1), year
    return heading, None, year


def _find_imdb(soup) -> tuple[str | None, str | None]:
    """Return ``(imdb_url, imdb_id)`` from the first IMDB title link, if any."""
    link = soup.find("a", href=re.compile(r"imdb\.com/title/"))
    if not link:
        return None, None
    url = link["href"]
    id_match = re.search(r"/(tt\d+)", url)
    return url, (id_match.group(1) if id_match else None)


def _find_director(soup) -> str | None:
    """Return the text following ``Director:`` inside the content div, if any."""
    content_div = soup.find("div", id="content")
    if not content_div:
        return None
    dir_match = re.search(r"Director:\s*(.+?)(?:\n|$)", content_div.get_text())
    return dir_match.group(1).strip() if dir_match else None


def _find_film_id(soup) -> int | None:
    """Return the ``fid`` from the form whose action targets ``film.php``, if any."""
    form = soup.find("form", action=re.compile(r"film\.php\?fid="))
    if not form:
        return None
    fid_match = re.search(r"fid=(\d+)", form["action"])
    return int(fid_match.group(1)) if fid_match else None


def _parse_release_row(tr) -> Release | None:
    """Build a :class:`Release` from one table row, or None if it holds no release."""
    ul = tr.find("ul", class_="dvd")
    if not ul:
        return None
    h3 = ul.find("h3")
    if not h3:
        return None

    # Release name and year: the year lives in its own span inside the
    # heading and is removed from the name.
    release_year = None
    release_name = h3.get_text(strip=True)
    year_span = h3.find("span", class_="disc-release-year")
    if year_span:
        ry_match = re.search(r"\[(\d{4})", year_span.get_text())
        if ry_match:
            release_year = int(ry_match.group(1))
        release_name = release_name.replace(
            year_span.get_text(strip=True), ""
        ).strip()

    # Extras: only the first item labelled "Extras:" is considered.
    discs: list[Disc] = []
    for li in ul.find_all("li", recursive=False):
        label_div = li.find("div", class_="label")
        if label_div and "Extras:" in label_div.get_text():
            desc_div = li.find("div", class_="description")
            if desc_div:
                discs = parse_extras(desc_div.decode_contents())
            break

    return Release(name=release_name, year=release_year, discs=discs)


def parse_film_page(html: str) -> FilmComparison:
    """Parse a dvdcompare.net film comparison page into a :class:`FilmComparison`.

    Args:
        html: Full HTML of the film page.

    Returns:
        The parsed comparison. Fields whose markup is missing are left at
        their defaults (empty title, ``None``, or an empty release list).
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title, format and year all come from the first <h2>.
    title, disc_format, year = "", None, None
    h2 = soup.find("h2")
    if h2:
        title, disc_format, year = _split_heading(h2.get_text(strip=True))

    imdb_url, imdb_id = _find_imdb(soup)

    # Releases are rows of the centred, borderless table.
    releases: list[Release] = []
    table = soup.find("table", attrs={"border": "0", "align": "center"})
    if table:
        for tr in table.find_all("tr"):
            release = _parse_release_row(tr)
            if release is not None:
                releases.append(release)

    return FilmComparison(
        title=title,
        year=year,
        format=disc_format,
        director=_find_director(soup),
        imdb_url=imdb_url,
        imdb_id=imdb_id,
        film_id=_find_film_id(soup),
        releases=releases,
    )
def parse_search_results(html: str) -> list[SearchResult]:
    """Parse a dvdcompare.net search results page.

    When the search returns exactly one hit, dvdcompare emits a JavaScript
    redirect (``location.href="film.php?fid=..."``) instead of a clickable
    ``<a>`` tag.  This function handles both cases.

    Args:
        html: Full HTML of the search results page.

    Returns:
        One :class:`SearchResult` per distinct film ID, in page order, each
        with an absolute URL. Empty when nothing is found.
    """
    # Local import keeps this block self-contained; urljoin resolves
    # relative, root-relative and absolute hrefs against the listing URL.
    from urllib.parse import urljoin

    base_url = "https://www.dvdcompare.net/comparisons/"
    soup = BeautifulSoup(html, "html.parser")
    results: list[SearchResult] = []
    seen: set[int] = set()

    for link in soup.find_all("a", href=re.compile(r"film\.php\?fid=\d+")):
        text = link.get_text(strip=True)
        if not text:
            continue
        href = link["href"]
        fid_match = re.search(r"fid=(\d+)", href)
        film_id = int(fid_match.group(1)) if fid_match else None
        # Skip repeated links to the same film (compare against None so an
        # ID of 0 is deduplicated too).
        if film_id is not None:
            if film_id in seen:
                continue
            seen.add(film_id)
        results.append(
            SearchResult(title=text, url=urljoin(base_url, href), film_id=film_id)
        )

    # Single-result pages use a JS redirect instead of <a> links.
    if not results:
        for script in soup.find_all("script"):
            content = script.string or ""
            # Accept either quote style around the redirect target.
            m = re.search(
                r'''location\.href\s*=\s*["'](film\.php\?fid=(\d+))["']''',
                content,
            )
            if not m:
                continue
            # Try to grab the title from the page's <h2>, preferring its
            # italic part when there is one.
            h2 = soup.find("h2")
            title = ""
            if h2:
                italic = h2.find("i")
                title = (
                    italic.get_text(strip=True)
                    if italic
                    else h2.get_text(strip=True)
                )
            results.append(
                SearchResult(
                    title=title,
                    url=urljoin(base_url, m.group(1)),
                    film_id=int(m.group(2)),
                )
            )
            break

    return results
+    return results