megaloader 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megaloader/__init__.py ADDED
@@ -0,0 +1,87 @@
1
+ """
2
+ Megaloader - Extract downloadable content metadata from file hosting platforms.
3
+
4
+ Basic usage:
5
+ import megaloader as mgl
6
+
7
+ for item in mgl.extract(url):
8
+ print(item.download_url, item.filename)
9
+
10
+ # With plugin-specific options
11
+ items = mgl.extract(url, password="secret")
12
+ items = mgl.extract(url, session_id="cookie_value")
13
+ """
14
+
15
+ import logging
16
+ import urllib.parse
17
+
18
+ from collections.abc import Generator
19
+ from typing import Any
20
+
21
+ from megaloader.exceptions import ExtractionError, UnsupportedDomainError
22
+ from megaloader.item import DownloadItem
23
+ from megaloader.plugins import get_plugin_class
24
+
25
+
26
# Attach a NullHandler to the package logger; emitting log output is left to
# whatever logging configuration the importing application sets up.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

__version__ = "0.2.0"
__all__ = [
    "DownloadItem",
    "ExtractionError",
    "UnsupportedDomainError",
    "extract",
]
31
+
32
+
33
def extract(url: str, **options: Any) -> Generator[DownloadItem, None, None]:
    """
    Extract downloadable items from a URL.

    Returns a generator that yields items lazily as they're discovered.
    Because this is a generator function, nothing runs at call time: URL
    validation, plugin lookup and network requests all happen once iteration
    starts, so every exception listed below surfaces while iterating.

    Args:
        url: The source URL to extract from
        **options: Plugin-specific options, forwarded unchanged to the plugin
            constructor:
            - password: str (Gofile)
            - session_id: str (Fanbox, Pixiv)
            - api_key: str (Rule34)
            - user_id: str (Rule34)

    Yields:
        DownloadItem: Metadata for each downloadable file

    Raises:
        ValueError: Invalid URL format
        UnsupportedDomainError: No plugin available for domain
        ExtractionError: Network or parsing failure

    Example:
        >>> for item in extract("https://pixeldrain.com/l/abc123"):
        ...     print(item.download_url, item.filename)
    """
    if not url or not url.strip():
        msg = "URL cannot be empty"
        raise ValueError(msg)

    url = url.strip()
    parsed = urllib.parse.urlparse(url)

    if not parsed.netloc:
        msg = f"Invalid URL: Could not parse domain from '{url}'"
        raise ValueError(msg)

    plugin_class = get_plugin_class(parsed.netloc)
    if plugin_class is None:
        raise UnsupportedDomainError(parsed.netloc)

    logger.debug(
        "Initializing %s for domain '%s'", plugin_class.__name__, parsed.netloc
    )

    try:
        plugin = plugin_class(url, **options)
        yield from plugin.extract()
    except (ExtractionError, UnsupportedDomainError, ValueError):
        # These are already part of the documented contract. Re-raise them
        # untouched; in particular an ExtractionError raised by a plugin must
        # not be wrapped in a second ExtractionError with a nested message.
        raise
    except Exception as e:
        # Anything else is an unexpected plugin failure: log the traceback at
        # debug level and surface it as ExtractionError, keeping the cause.
        logger.debug("Extraction failed for %s: %s", url, e, exc_info=True)
        msg = f"Failed to extract from {url}: {e}"
        raise ExtractionError(msg) from e
@@ -0,0 +1,14 @@
1
class MegaloaderError(Exception):
    """Root of the megaloader exception hierarchy."""
3
+
4
+
5
class ExtractionError(MegaloaderError):
    """Items could not be extracted from a URL (network or parsing error)."""
7
+
8
+
9
class UnsupportedDomainError(MegaloaderError):
    """
    No plugin is available for the requested domain.

    Attributes:
        domain: The domain string passed to the constructor.
    """

    def __init__(self, domain: str) -> None:
        message = f"No plugin found for domain: {domain}"
        super().__init__(message)
        # Kept separately so callers can read the domain without parsing
        # the message text.
        self.domain = domain
megaloader/item.py ADDED
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass, field
2
+
3
+
4
+ @dataclass
5
+ class DownloadItem:
6
+ """
7
+ Represents a single downloadable file with metadata.
8
+
9
+ Attributes:
10
+ download_url: Direct URL to download the file
11
+ filename: Original filename (may need sanitization for filesystem)
12
+ collection_name: Optional grouping (album/gallery/user)
13
+ source_id: Optional unique identifier from the source platform
14
+ headers: Optional HTTP headers required for download (e.g., Referer)
15
+ size_bytes: Optional file size in bytes
16
+ """
17
+
18
+ download_url: str
19
+ filename: str
20
+ collection_name: str | None = None
21
+ source_id: str | None = None
22
+ headers: dict[str, str] = field(default_factory=dict)
23
+ size_bytes: int | None = None
24
+
25
+ def __post_init__(self) -> None:
26
+ """Validate required fields."""
27
+ if not self.download_url:
28
+ msg = "download_url cannot be empty"
29
+ raise ValueError(msg)
30
+ if not self.filename:
31
+ msg = "filename cannot be empty"
32
+ raise ValueError(msg)
megaloader/plugin.py ADDED
@@ -0,0 +1,94 @@
1
+ import logging
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Generator
5
+ from typing import Any, ClassVar
6
+
7
+ import requests
8
+
9
+ from requests.adapters import HTTPAdapter
10
+ from urllib3.util.retry import Retry
11
+
12
+ from megaloader.item import DownloadItem
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class BasePlugin(ABC):
    """
    Abstract base for site-specific extractors.

    Credential handling convention:
    1. Explicit **kwargs take precedence (e.g., password="secret")
    2. Environment variables as fallback (PLUGIN_NAME_*)
    3. Fail gracefully if required credentials missing

    Subclasses should override _configure_session() to add:
    - Authentication headers/cookies
    - Site-specific headers (Referer, Origin)
    """

    DEFAULT_HEADERS: ClassVar[dict[str, str]] = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    }

    def __init__(self, url: str, **options: Any) -> None:
        """
        Store the stripped URL and the plugin options.

        Args:
            url: Source URL; surrounding whitespace is removed.
            **options: Plugin-specific options, kept as-is on ``self.options``.

        Raises:
            ValueError: If ``url`` is empty after stripping.
        """
        cleaned = url.strip()
        if not cleaned:
            msg = "URL must be a non-empty string"
            raise ValueError(msg)

        self.url = cleaned
        self.options = options
        # Built on first access of the ``session`` property.
        self._session: requests.Session | None = None

    @property
    def session(self) -> requests.Session:
        """Return the HTTP session, creating and configuring it on first use."""
        if self._session is not None:
            return self._session

        # Store the session before configuring it, so it is retained even if
        # the subclass hook raises.
        self._session = self._create_session()
        self._configure_session(self._session)
        return self._session

    def _create_session(self) -> requests.Session:
        """Build a session with the default headers and a retry policy."""
        # Retry on common transient errors
        retry_policy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"],
        )
        adapter = HTTPAdapter(max_retries=retry_policy)

        session = requests.Session()
        session.headers.update(self.DEFAULT_HEADERS)
        # One adapter instance serves both schemes.
        for scheme in ("https://", "http://"):
            session.mount(scheme, adapter)

        return session

    def _configure_session(self, session: requests.Session) -> None:  # noqa: B027
        """
        Hook for plugin-specific headers/cookies; the base version does nothing.

        Example:
            session.headers["Referer"] = f"https://{self.domain}/"
            if api_key := os.getenv("PLUGIN_API_KEY"):
                session.headers["Authorization"] = f"Bearer {api_key}"
        """

    @abstractmethod
    def extract(self) -> Generator[DownloadItem, None, None]:
        """
        Extract downloadable items from the URL.

        Yields items as they're discovered (lazy evaluation).
        Should handle pagination, nested galleries, etc.

        Yields:
            DownloadItem: Each file found at the URL

        Raises:
            ExtractionError: On network/parsing failures
        """
@@ -0,0 +1,74 @@
1
+ from megaloader.plugin import BasePlugin
2
+ from megaloader.plugins.bunkr import Bunkr
3
+ from megaloader.plugins.cyberdrop import Cyberdrop
4
+ from megaloader.plugins.fanbox import Fanbox
5
+ from megaloader.plugins.fapello import Fapello
6
+ from megaloader.plugins.gofile import Gofile
7
+ from megaloader.plugins.pixeldrain import PixelDrain
8
+ from megaloader.plugins.pixiv import Pixiv
9
+ from megaloader.plugins.rule34 import Rule34
10
+ from megaloader.plugins.thothub_to import ThothubTO
11
+ from megaloader.plugins.thothub_vip import ThothubVIP
12
+ from megaloader.plugins.thotslife import Thotslife
13
+
14
+
15
# Maps a bare registered domain to the plugin class that handles it.
# Insertion order matters: the fallback lookup in get_plugin_class() iterates
# this dict and returns the first entry that matches.
PLUGIN_REGISTRY: dict[str, type[BasePlugin]] = {
    # Bunkr
    "bunkr.si": Bunkr,
    "bunkr.la": Bunkr,
    "bunkr.is": Bunkr,
    "bunkr.ru": Bunkr,
    "bunkr.su": Bunkr,
    # Cyberdrop
    "cyberdrop.cr": Cyberdrop,
    "cyberdrop.me": Cyberdrop,
    "cyberdrop.to": Cyberdrop,
    # Single-domain plugins
    "fanbox.cc": Fanbox,
    "fapello.com": Fapello,
    "gofile.io": Gofile,
    "pixeldrain.com": PixelDrain,
    "pixiv.net": Pixiv,
    "rule34.xxx": Rule34,
    # Thothub variants
    "thothub.ch": ThothubTO,
    "thothub.to": ThothubTO,
    "thothub.vip": ThothubVIP,
    "thotslife.com": Thotslife,
}

# Domains that support subdomains (e.g., creator.fanbox.cc)
SUBDOMAIN_SUPPORTED: set[str] = {"fanbox.cc"}
38
+
39
+
40
def get_plugin_class(domain: str) -> type[BasePlugin] | None:
    """
    Resolve domain to plugin class.

    Resolution order:
    1. Exact match in PLUGIN_REGISTRY
    2. Exact match after removing any "user:pass@" prefix, ":port" suffix
       and trailing dot
    3. Subdomain match for domains listed in SUBDOMAIN_SUPPORTED
    4. Subdomain match against every registered domain
       (e.g., www.pixiv.net -> pixiv.net)

    Matching is done on whole DNS labels, never on raw substrings, so a host
    such as "notgofile.io" or "pixiv.net.evil.example" does not resolve to a
    plugin.

    Args:
        domain: Domain or URL netloc (e.g., "pixiv.net", "www.pixiv.net:443")

    Returns:
        Plugin class or None if unsupported
    """
    domain = domain.lower().strip()

    # Exact match
    if domain in PLUGIN_REGISTRY:
        return PLUGIN_REGISTRY[domain]

    # A netloc may carry credentials and a port; reduce it to the bare host
    # so that suffix comparisons below see only the hostname.
    host = domain.rpartition("@")[2].partition(":")[0].rstrip(".")
    if not host:
        return None

    if host in PLUGIN_REGISTRY:
        return PLUGIN_REGISTRY[host]

    # Subdomain support (e.g., creator.fanbox.cc -> fanbox.cc)
    for base_domain in SUBDOMAIN_SUPPORTED:
        if host.endswith(f".{base_domain}") and base_domain in PLUGIN_REGISTRY:
            return PLUGIN_REGISTRY[base_domain]

    # Fallback for other subdomains (e.g., www.pixiv.net -> pixiv.net).
    # The leading dot anchors the comparison at a label boundary.
    for registered_domain, plugin_class in PLUGIN_REGISTRY.items():
        if host.endswith(f".{registered_domain}"):
            return plugin_class

    return None
72
+
73
+
74
+ __all__ = ["PLUGIN_REGISTRY", "get_plugin_class"]
@@ -0,0 +1,147 @@
1
+ import base64
2
+ import html
3
+ import logging
4
+ import math
5
+ import re
6
+
7
+ from collections.abc import Generator
8
+ from urllib.parse import quote, urljoin, urlparse
9
+
10
+ import requests
11
+
12
+ from megaloader.item import DownloadItem
13
+ from megaloader.plugin import BasePlugin
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class Bunkr(BasePlugin):
    """Extract files from Bunkr albums and individual file pages."""

    API_BASE = "https://apidl.bunkr.ru/api/_001_v2"

    def extract(self) -> Generator[DownloadItem, None, None]:
        """Dispatch on the URL path: ``/a/`` is an album, ``/f/`` a single file."""
        url_path = urlparse(self.url).path

        if url_path.startswith("/a/"):
            logger.debug("Processing album")
            yield from self._extract_album()
            return

        if url_path.startswith("/f/"):
            logger.debug("Processing single file")
            yield from self._extract_file(self.url)
            return

        # Any other path yields nothing.
        logger.warning("Unrecognized Bunkr URL format")

    def _extract_album(self) -> Generator[DownloadItem, None, None]:
        """Yield an item for every distinct file link found on the album page."""
        try:
            page = self.session.get(self.url, allow_redirects=True, timeout=30)
            page.raise_for_status()
        except Exception:
            logger.exception("Failed to fetch album page")
            return

        hrefs = re.findall(r'href="(/f/[^"]+)"', page.text)
        if not hrefs:
            logger.warning("No files found in album")
            return

        visited: set[str] = set()
        for href in hrefs:
            # Skip template variables
            if "file.slug" in href or "+" in href:
                continue

            # Resolve against the final (post-redirect) page URL.
            absolute_url = urljoin(page.url, href)
            if absolute_url not in visited:
                visited.add(absolute_url)
                yield from self._extract_file(absolute_url)

    def _extract_file(self, file_url: str) -> Generator[DownloadItem, None, None]:
        """
        Resolve one file page to a download item.

        Yields nothing when the page cannot be fetched, has no download
        button, exposes no file ID, or the direct URL lookup fails.
        """
        try:
            page = self.session.get(file_url, timeout=30)
            page.raise_for_status()
        except requests.RequestException:
            logger.debug("Failed to fetch file page %s", file_url, exc_info=True)
            return

        # Find download button
        button = re.search(
            r'<a[^>]+class="[^"]*btn-main[^"]*"[^>]+href="([^"]+)"[^>]*>Download</a>',
            page.text,
        )
        if button is None:
            logger.debug("No download button found for %s", file_url)
            return

        download_page_url = urljoin(file_url, button.group(1))

        # Extract file ID from download page URL
        id_match = re.search(r"/file/(\w+)", download_page_url)
        if id_match is None:
            logger.debug("Could not extract file ID from %s", download_page_url)
            return
        file_id = id_match.group(1)

        # Fall back to a synthetic name when the page exposes none.
        filename = self._extract_filename(page.text) or f"bunkr_file_{file_id}"

        direct_url = self._fetch_direct_url(file_id, filename)
        if not direct_url:
            return

        yield DownloadItem(
            download_url=direct_url,
            filename=filename,
            source_id=file_id,
            headers={"Referer": "https://get.bunkrr.su/"},
        )

    def _extract_filename(self, content: str) -> str | None:
        """Return the unescaped, stripped filename from page markup, or None."""
        # Tried in order: og:title meta tag, then the JavaScript variable.
        patterns = (
            r'<meta property="og:title" content="([^"]+)"',
            r'var ogname\s*=\s*"([^"]+)"',
        )
        for pattern in patterns:
            found = re.search(pattern, content)
            if found:
                return html.unescape(found.group(1)).strip()

        return None

    def _fetch_direct_url(self, file_id: str, filename: str) -> str | None:
        """
        Ask the API for the file's encrypted URL and decrypt it.

        Returns the decrypted URL with ``?n=<quoted filename>`` appended, or
        None when the request, the response shape or the decoding fails.
        """
        try:
            response = self.session.post(
                self.API_BASE,
                json={"id": file_id},
                timeout=30,
            )
            response.raise_for_status()
            payload = response.json()

            timestamp = payload["timestamp"]
            ciphertext = base64.b64decode(payload["url"])

            # Key is derived from the response timestamp in 3600-second buckets.
            key = f"SECRET_KEY_{math.floor(timestamp / 3600)}".encode("utf-8")

            # XOR each byte with the key, repeating the key as needed.
            plaintext = bytearray(
                byte ^ key[index % len(key)] for index, byte in enumerate(ciphertext)
            )

            base_url = plaintext.decode("utf-8")
            return f"{base_url}?n={quote(filename)}"

        except Exception:  # noqa: BLE001
            logger.debug(
                "Failed to fetch direct URL for file ID %s", file_id, exc_info=True
            )
            return None
@@ -0,0 +1,116 @@
1
+ import logging
2
+ import re
3
+
4
+ from collections.abc import Generator
5
+ from typing import Any
6
+ from urllib.parse import urljoin, urlparse
7
+
8
+ import requests
9
+
10
+ from bs4 import BeautifulSoup
11
+
12
+ from megaloader.item import DownloadItem
13
+ from megaloader.plugin import BasePlugin
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class Cyberdrop(BasePlugin):
    """Extract files from Cyberdrop albums and individual files."""

    API_BASE = "https://api.cyberdrop.cr/api/file"
    SITE_BASE = "https://cyberdrop.cr"

    def extract(self) -> Generator[DownloadItem, None, None]:
        """Dispatch on the URL path: ``/a/`` is an album, ``/f/`` a single file."""
        url_path = urlparse(self.url).path

        if url_path.startswith("/a/"):
            logger.debug("Processing album")
            yield from self._extract_album()
            return

        if url_path.startswith("/f/"):
            logger.debug("Processing single file")
            yield from self._extract_file()
            return

        # Any other path yields nothing.
        logger.warning("Unrecognized Cyberdrop URL format")

    def _extract_album(self) -> Generator[DownloadItem, None, None]:
        """Yield an item for each file link on the album page."""
        try:
            page = self.session.get(self.url, timeout=30)
            page.raise_for_status()
        except Exception:
            logger.exception("Failed to fetch album page")
            return

        soup = BeautifulSoup(page.text, "html.parser")

        # Get album title
        heading = soup.find("h1", id="title")
        if heading:
            collection_name = heading.text.strip()
        else:
            collection_name = None

        # Find all file links
        for anchor in soup.select("a.file[href], a#file[href]"):
            file_url = urljoin(self.SITE_BASE, str(anchor["href"]))

            id_match = re.search(r"/f/(\w+)", file_url)
            if id_match:
                yield from self._process_file(id_match.group(1), collection_name)

    def _extract_file(self) -> Generator[DownloadItem, None, None]:
        """Yield the item for the single file identified by ``self.url``."""
        id_match = re.search(r"/f/(\w+)", self.url)
        if id_match:
            yield from self._process_file(id_match.group(1))

    def _process_file(
        self, file_id: str, collection_name: str | None = None
    ) -> Generator[DownloadItem, None, None]:
        """
        Look up one file ID and yield its download item.

        Yields nothing when either the metadata lookup or the direct URL
        lookup returns no result.
        """
        info = self._fetch_file_info(file_id)
        if not info:
            return

        direct_url = self._fetch_direct_url(info["auth_url"])
        if not direct_url:
            return

        yield DownloadItem(
            download_url=direct_url,
            filename=info["name"],
            collection_name=collection_name,
            source_id=file_id,
        )

    def _fetch_file_info(self, file_id: str) -> dict[str, Any] | None:
        """
        Fetch file metadata from the info endpoint.

        Returns the decoded JSON object only when it is a dict with truthy
        ``name`` and ``auth_url`` entries; otherwise None.
        """
        info_url = f"{self.API_BASE}/info/{file_id}"

        try:
            response = self.session.get(info_url, timeout=30)
            response.raise_for_status()
            payload = response.json()

            has_required = (
                isinstance(payload, dict)
                and payload.get("name")
                and payload.get("auth_url")
            )
            if has_required:
                return payload
        except (requests.RequestException, ValueError):
            logger.debug("Failed to fetch file info for %s", file_id, exc_info=True)

        return None

    def _fetch_direct_url(self, auth_url: str) -> str | None:
        """Return the string ``url`` field of the auth endpoint's JSON, or None."""
        try:
            response = self.session.get(auth_url, timeout=30)
            response.raise_for_status()
            payload = response.json()

            candidate = payload.get("url") if isinstance(payload, dict) else None
            if isinstance(candidate, str):
                return candidate

        except (requests.RequestException, ValueError, KeyError):
            logger.debug("Failed to fetch direct URL from %s", auth_url, exc_info=True)

        return None