PyPI - megaloader - Versions diffs - 0.1.0__py3-none-any.whl - Mend

megaloader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

megaloader/__init__.py +87 -0
megaloader/exceptions.py +14 -0
megaloader/item.py +32 -0
megaloader/plugin.py +94 -0
megaloader/plugins/__init__.py +74 -0
megaloader/plugins/bunkr.py +147 -0
megaloader/plugins/cyberdrop.py +116 -0
megaloader/plugins/fanbox.py +165 -0
megaloader/plugins/fapello.py +84 -0
megaloader/plugins/gofile.py +105 -0
megaloader/plugins/pixeldrain.py +51 -0
megaloader/plugins/pixiv.py +135 -0
megaloader/plugins/rule34.py +174 -0
megaloader/plugins/thothub_to.py +164 -0
megaloader/plugins/thothub_vip.py +114 -0
megaloader/plugins/thotslife.py +66 -0
megaloader/py.typed +0 -0
megaloader-0.1.0.dist-info/METADATA +213 -0
megaloader-0.1.0.dist-info/RECORD +21 -0
megaloader-0.1.0.dist-info/WHEEL +5 -0
megaloader-0.1.0.dist-info/top_level.txt +1 -0

megaloader/__init__.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""
+Megaloader - Extract downloadable content metadata from file hosting platforms.
+Basic usage:
+    import megaloader as mgl
+    for item in mgl.extract(url):
+        print(item.url, item.filename)
+    # With plugin-specific options
+    items = mgl.extract(url, password="secret")
+    items = mgl.extract(url, session_id="cookie_value")
+"""
+import logging
+import urllib.parse
+from collections.abc import Generator
+from typing import Any
+from megaloader.exceptions import ExtractionError, UnsupportedDomainError
+from megaloader.item import DownloadItem
+from megaloader.plugins import get_plugin_class
+# Attach a no-op handler to the package logger.
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+logger = logging.getLogger(__name__)
+# NOTE(review): this wheel and its dist-info directory are named 0.1.0 while
+# the module reports 0.2.0 - confirm which version is intended.
+__version__ = "0.2.0"
+# Names exported as the package's public API.
+__all__ = ["DownloadItem", "ExtractionError", "UnsupportedDomainError", "extract"]
+def extract(url: str, **options: Any) -> Generator[DownloadItem, None, None]:
+    """
+    Extract downloadable items from a URL.
+    Returns a generator that yields items lazily as they're discovered.
+    Network requests happen during iteration, not at call time.
+    Args:
+        url: The source URL to extract from
+        **options: Plugin-specific options:
+            - password: str (Gofile)
+            - session_id: str (Fanbox, Pixiv)
+            - api_key: str (Rule34)
+            - user_id: str (Rule34)
+    Yields:
+        DownloadItem: Metadata for each downloadable file
+    Raises:
+        ValueError: Invalid URL format
+        UnsupportedDomainError: No plugin available for domain
+        ExtractionError: Network or parsing failure
+    Example:
+        >>> for item in extract("https://pixeldrain.com/l/abc123"):
+        ...     print(item.download_url, item.filename)
+    """
+    if not url or not url.strip():
+        msg = "URL cannot be empty"
+        raise ValueError(msg)
+    url = url.strip()
+    parsed = urllib.parse.urlparse(url)
+    if not parsed.netloc:
+        msg = f"Invalid URL: Could not parse domain from '{url}'"
+        raise ValueError(msg)
+    plugin_class = get_plugin_class(parsed.netloc)
+    if plugin_class is None:
+        raise UnsupportedDomainError(parsed.netloc)
+    logger.debug(
+        "Initializing %s for domain '%s'", plugin_class.__name__, parsed.netloc
+    )
+    try:
+        plugin = plugin_class(url, **options)
+        yield from plugin.extract()
+    except (UnsupportedDomainError, ValueError):
+        raise
+    except Exception as e:
+        logger.debug("Extraction failed for %s: %s", url, e, exc_info=True)
+        msg = f"Failed to extract from {url}: {e}"
+        raise ExtractionError(msg) from e

megaloader/exceptions.py ADDED Viewed

@@ -0,0 +1,14 @@
+class MegaloaderError(Exception):
+    """
+    Base exception for all megaloader errors.
+    ExtractionError and UnsupportedDomainError derive from this class, so
+    catching it handles both.
+    """
+class ExtractionError(MegaloaderError):
+    """
+    Failed to extract items from URL due to network or parsing error.
+    Carries no attributes beyond the message passed to the constructor.
+    """
+class UnsupportedDomainError(MegaloaderError):
+    """No plugin available for this domain."""
+    def __init__(self, domain: str) -> None:
+        super().__init__(f"No plugin found for domain: {domain}")
+        self.domain = domain

megaloader/item.py ADDED Viewed

@@ -0,0 +1,32 @@
+from dataclasses import dataclass, field
+@dataclass
+class DownloadItem:
+    """
+    Represents a single downloadable file with metadata.
+    Attributes:
+        download_url: Direct URL to download the file
+        filename: Original filename (may need sanitization for filesystem)
+        collection_name: Optional grouping (album/gallery/user)
+        source_id: Optional unique identifier from the source platform
+        headers: Optional HTTP headers required for download (e.g., Referer)
+        size_bytes: Optional file size in bytes
+    """
+    download_url: str
+    filename: str
+    collection_name: str | None = None
+    source_id: str | None = None
+    headers: dict[str, str] = field(default_factory=dict)
+    size_bytes: int | None = None
+    def __post_init__(self) -> None:
+        """Validate required fields."""
+        if not self.download_url:
+            msg = "download_url cannot be empty"
+            raise ValueError(msg)
+        if not self.filename:
+            msg = "filename cannot be empty"
+            raise ValueError(msg)

megaloader/plugin.py ADDED Viewed

@@ -0,0 +1,94 @@
+import logging
+from abc import ABC, abstractmethod
+from collections.abc import Generator
+from typing import Any, ClassVar
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from megaloader.item import DownloadItem
+logger = logging.getLogger(__name__)
+class BasePlugin(ABC):
+    """
+    Base class for site-specific extractors.
+    Credential handling convention:
+    1. Explicit **kwargs take precedence (e.g., password="secret")
+    2. Environment variables as fallback (PLUGIN_NAME_*)
+    3. Fail gracefully if required credentials missing
+    Subclasses should override _configure_session() to add:
+    - Authentication headers/cookies
+    - Site-specific headers (Referer, Origin)
+    """
+    DEFAULT_HEADERS: ClassVar[dict[str, str]] = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+    }
+    def __init__(self, url: str, **options: Any) -> None:
+        if not url.strip():
+            msg = "URL must be a non-empty string"
+            raise ValueError(msg)
+        self.url = url.strip()
+        self.options = options
+        self._session: requests.Session | None = None
+    @property
+    def session(self) -> requests.Session:
+        """Lazily create session with retry logic and default headers."""
+        if self._session is None:
+            self._session = self._create_session()
+            self._configure_session(self._session)
+        return self._session
+    def _create_session(self) -> requests.Session:
+        """Create session with retry strategy for transient failures."""
+        session = requests.Session()
+        session.headers.update(self.DEFAULT_HEADERS)
+        # Retry on common transient errors
+        retry_strategy = Retry(
+            total=3,
+            backoff_factor=1,
+            status_forcelist=[429, 500, 502, 503, 504],
+            allowed_methods=["GET", "POST"],
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("https://", adapter)
+        session.mount("http://", adapter)
+        return session
+    def _configure_session(self, session: requests.Session) -> None:  # noqa: B027
+        """
+        Override to add plugin-specific headers/cookies.
+        Example:
+            session.headers["Referer"] = f"https://{self.domain}/"
+            if api_key := os.getenv("PLUGIN_API_KEY"):
+                session.headers["Authorization"] = f"Bearer {api_key}"
+        """
+    @abstractmethod
+    def extract(self) -> Generator[DownloadItem, None, None]:
+        """
+        Extract downloadable items from the URL.
+        Yields items as they're discovered (lazy evaluation).
+        Should handle pagination, nested galleries, etc.
+        Yields:
+            DownloadItem: Each file found at the URL
+        Raises:
+            ExtractionError: On network/parsing failures
+        """

megaloader/plugins/__init__.py ADDED Viewed

@@ -0,0 +1,74 @@
+from megaloader.plugin import BasePlugin
+from megaloader.plugins.bunkr import Bunkr
+from megaloader.plugins.cyberdrop import Cyberdrop
+from megaloader.plugins.fanbox import Fanbox
+from megaloader.plugins.fapello import Fapello
+from megaloader.plugins.gofile import Gofile
+from megaloader.plugins.pixeldrain import PixelDrain
+from megaloader.plugins.pixiv import Pixiv
+from megaloader.plugins.rule34 import Rule34
+from megaloader.plugins.thothub_to import ThothubTO
+from megaloader.plugins.thothub_vip import ThothubVIP
+from megaloader.plugins.thotslife import Thotslife
+# Maps a registered domain to the plugin class that handles it; several
+# domains may share one class. Insertion order matters: the partial-match
+# fallback in get_plugin_class() walks this dict in order and returns the
+# first registered domain found inside the requested domain.
+PLUGIN_REGISTRY: dict[str, type[BasePlugin]] = {
+    "bunkr.si": Bunkr,
+    "bunkr.la": Bunkr,
+    "bunkr.is": Bunkr,
+    "bunkr.ru": Bunkr,
+    "bunkr.su": Bunkr,
+    "cyberdrop.cr": Cyberdrop,
+    "cyberdrop.me": Cyberdrop,
+    "cyberdrop.to": Cyberdrop,
+    "fanbox.cc": Fanbox,
+    "fapello.com": Fapello,
+    "gofile.io": Gofile,
+    "pixeldrain.com": PixelDrain,
+    "pixiv.net": Pixiv,
+    "rule34.xxx": Rule34,
+    "thothub.ch": ThothubTO,
+    "thothub.to": ThothubTO,
+    "thothub.vip": ThothubVIP,
+    "thotslife.com": Thotslife,
+}
+# Domains that support subdomains (e.g., creator.fanbox.cc)
+SUBDOMAIN_SUPPORTED: set[str] = {"fanbox.cc"}
+def get_plugin_class(domain: str) -> type[BasePlugin] | None:
+    """
+    Resolve domain to plugin class.
+    Resolution order:
+    1. Exact match in PLUGIN_REGISTRY
+    2. Subdomain match for supported domains
+    3. Partial match (fallback for domain variations)
+    Args:
+        domain: Normalized domain from URL (e.g., "pixiv.net")
+    Returns:
+        Plugin class or None if unsupported
+    """
+    domain = domain.lower().strip()
+    # Exact match
+    if domain in PLUGIN_REGISTRY:
+        return PLUGIN_REGISTRY[domain]
+    # Subdomain support (e.g., creator.fanbox.cc -> fanbox.cc)
+    for base_domain in SUBDOMAIN_SUPPORTED:
+        if domain.endswith(f".{base_domain}") and base_domain in PLUGIN_REGISTRY:
+            return PLUGIN_REGISTRY[base_domain]
+    # Partial match fallback (e.g., www.pixiv.net -> pixiv.net)
+    for registered_domain, plugin_class in PLUGIN_REGISTRY.items():
+        if registered_domain in domain:
+            return plugin_class
+    return None
+__all__ = ["PLUGIN_REGISTRY", "get_plugin_class"]

megaloader/plugins/bunkr.py ADDED Viewed

@@ -0,0 +1,147 @@
+import base64
+import html
+import logging
+import math
+import re
+from collections.abc import Generator
+from urllib.parse import quote, urljoin, urlparse
+import requests
+from megaloader.item import DownloadItem
+from megaloader.plugin import BasePlugin
+logger = logging.getLogger(__name__)
+class Bunkr(BasePlugin):
+    """Extract files from Bunkr albums and individual file pages."""
+    API_BASE = "https://apidl.bunkr.ru/api/_001_v2"
+    def extract(self) -> Generator[DownloadItem, None, None]:
+        path = urlparse(self.url).path
+        if path.startswith("/a/"):
+            logger.debug("Processing album")
+            yield from self._extract_album()
+        elif path.startswith("/f/"):
+            logger.debug("Processing single file")
+            yield from self._extract_file(self.url)
+        else:
+            logger.warning("Unrecognized Bunkr URL format")
+    def _extract_album(self) -> Generator[DownloadItem, None, None]:
+        """Extract all files from an album page."""
+        try:
+            response = self.session.get(self.url, allow_redirects=True, timeout=30)
+            response.raise_for_status()
+        except Exception:
+            logger.exception("Failed to fetch album page")
+            return
+        file_links = re.findall(r'href="(/f/[^"]+)"', response.text)
+        if not file_links:
+            logger.warning("No files found in album")
+            return
+        seen_urls = set()
+        for link in file_links:
+            # Skip template variables
+            if "file.slug" in link or "+" in link:
+                continue
+            file_url = urljoin(response.url, link)
+            if file_url in seen_urls:
+                continue
+            seen_urls.add(file_url)
+            yield from self._extract_file(file_url)
+    def _extract_file(self, file_url: str) -> Generator[DownloadItem, None, None]:
+        """Extract download URL from a file page."""
+        try:
+            response = self.session.get(file_url, timeout=30)
+            response.raise_for_status()
+        except requests.RequestException:
+            logger.debug("Failed to fetch file page %s", file_url, exc_info=True)
+            return
+        # Find download button
+        download_match = re.search(
+            r'<a[^>]+class="[^"]*btn-main[^"]*"[^>]+href="([^"]+)"[^>]*>Download</a>',
+            response.text,
+        )
+        if not download_match:
+            logger.debug("No download button found for %s", file_url)
+            return
+        download_page_url = urljoin(file_url, download_match.group(1))
+        # Extract file ID from download page URL
+        if match := re.search(r"/file/(\w+)", download_page_url):
+            file_id = match.group(1)
+        else:
+            logger.debug("Could not extract file ID from %s", download_page_url)
+            return
+        filename = self._extract_filename(response.text) or f"bunkr_file_{file_id}"
+        if direct_url := self._fetch_direct_url(file_id, filename):
+            yield DownloadItem(
+                download_url=direct_url,
+                filename=filename,
+                source_id=file_id,
+                headers={"Referer": "https://get.bunkrr.su/"},
+            )
+    def _extract_filename(self, content: str) -> str | None:
+        """Extract original filename from page metadata."""
+        # Try og:title meta tag
+        if match := re.search(r'<meta property="og:title" content="([^"]+)"', content):
+            return html.unescape(match.group(1)).strip()
+        # Try JavaScript variable
+        if match := re.search(r'var ogname\s*=\s*"([^"]+)"', content):
+            return html.unescape(match.group(1)).strip()
+        return None
+    def _fetch_direct_url(self, file_id: str, filename: str) -> str | None:
+        """Get direct CDN URL using Bunkr's API."""
+        try:
+            response = self.session.post(
+                self.API_BASE,
+                json={"id": file_id},
+                timeout=30,
+            )
+            response.raise_for_status()
+            data = response.json()
+            # Decrypt the URL
+            timestamp = data["timestamp"]
+            encrypted_b64 = data["url"]
+            # Generate time-based decryption key
+            key_str = f"SECRET_KEY_{math.floor(timestamp / 3600)}"
+            key_bytes = key_str.encode("utf-8")
+            # XOR decrypt
+            encrypted_bytes = base64.b64decode(encrypted_b64)
+            decrypted = bytearray(
+                encrypted_bytes[i] ^ key_bytes[i % len(key_bytes)]
+                for i in range(len(encrypted_bytes))
+            )
+            base_url = decrypted.decode("utf-8")
+            return f"{base_url}?n={quote(filename)}"
+        except Exception:  # noqa: BLE001
+            logger.debug(
+                "Failed to fetch direct URL for file ID %s", file_id, exc_info=True
+            )
+            return None

megaloader/plugins/cyberdrop.py ADDED Viewed

@@ -0,0 +1,116 @@
+import logging
+import re
+from collections.abc import Generator
+from typing import Any
+from urllib.parse import urljoin, urlparse
+import requests
+from bs4 import BeautifulSoup
+from megaloader.item import DownloadItem
+from megaloader.plugin import BasePlugin
+logger = logging.getLogger(__name__)
+class Cyberdrop(BasePlugin):
+    """Extract files from Cyberdrop albums and individual files."""
+    API_BASE = "https://api.cyberdrop.cr/api/file"
+    SITE_BASE = "https://cyberdrop.cr"
+    def extract(self) -> Generator[DownloadItem, None, None]:
+        path = urlparse(self.url).path
+        if path.startswith("/a/"):
+            logger.debug("Processing album")
+            yield from self._extract_album()
+        elif path.startswith("/f/"):
+            logger.debug("Processing single file")
+            yield from self._extract_file()
+        else:
+            logger.warning("Unrecognized Cyberdrop URL format")
+    def _extract_album(self) -> Generator[DownloadItem, None, None]:
+        """Extract all files from an album."""
+        try:
+            response = self.session.get(self.url, timeout=30)
+            response.raise_for_status()
+        except Exception:
+            logger.exception("Failed to fetch album page")
+            return
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Get album title
+        title_elem = soup.find("h1", id="title")
+        collection_name = title_elem.text.strip() if title_elem else None
+        # Find all file links
+        file_links = soup.select("a.file[href], a#file[href]")
+        for link in file_links:
+            file_url = urljoin(self.SITE_BASE, str(link["href"]))
+            if match := re.search(r"/f/(\w+)", file_url):
+                yield from self._process_file(match.group(1), collection_name)
+    def _extract_file(self) -> Generator[DownloadItem, None, None]:
+        """Extract a single file."""
+        if match := re.search(r"/f/(\w+)", self.url):
+            yield from self._process_file(match.group(1))
+    def _process_file(
+        self, file_id: str, collection_name: str | None = None
+    ) -> Generator[DownloadItem, None, None]:
+        """Process a single file ID and yield download item."""
+        item_data = self._fetch_file_info(file_id)
+        if not item_data:
+            return
+        direct_url = self._fetch_direct_url(item_data["auth_url"])
+        if not direct_url:
+            return
+        yield DownloadItem(
+            download_url=direct_url,
+            filename=item_data["name"],
+            collection_name=collection_name,
+            source_id=file_id,
+        )
+    def _fetch_file_info(self, file_id: str) -> dict[str, Any] | None:
+        """Get file metadata from API."""
+        api_url = f"{self.API_BASE}/info/{file_id}"
+        try:
+            response = self.session.get(api_url, timeout=30)
+            response.raise_for_status()
+            data = response.json()
+            if isinstance(data, dict) and data.get("name") and data.get("auth_url"):
+                return data
+        except (requests.RequestException, ValueError):
+            logger.debug("Failed to fetch file info for %s", file_id, exc_info=True)
+        return None
+    def _fetch_direct_url(self, auth_url: str) -> str | None:
+        """Get direct CDN URL from auth endpoint."""
+        try:
+            response = self.session.get(auth_url, timeout=30)
+            response.raise_for_status()
+            data = response.json()
+            if isinstance(data, dict):
+                url = data.get("url")
+                if isinstance(url, str):
+                    return url
+        except (requests.RequestException, ValueError, KeyError):
+            logger.debug("Failed to fetch direct URL from %s", auth_url, exc_info=True)
+        return None