PyPI - rcdl - Versions diffs - 2.0.0__py3-none-any.whl - Mend

rcdl 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

rcdl/__init__.py +5 -0
rcdl/__main__.py +23 -0
rcdl/core/api.py +54 -0
rcdl/core/config.py +115 -0
rcdl/core/db.py +262 -0
rcdl/core/downloader.py +246 -0
rcdl/core/downloader_subprocess.py +29 -0
rcdl/core/file_io.py +14 -0
rcdl/core/models.py +52 -0
rcdl/core/parser.py +208 -0
rcdl/core/processor.py +290 -0
rcdl/interface/cli.py +97 -0
rcdl/interface/progress.py +25 -0
rcdl/scripts/upload_pypi.py +83 -0
rcdl/utils.py +40 -0
rcdl-2.0.0.dist-info/METADATA +81 -0
rcdl-2.0.0.dist-info/RECORD +19 -0
rcdl-2.0.0.dist-info/WHEEL +4 -0
rcdl-2.0.0.dist-info/entry_points.txt +3 -0

rcdl/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+# __init__.py
+from importlib.metadata import version
+__version__ = version("rcdl")

rcdl/__main__.py ADDED Viewed

@@ -0,0 +1,23 @@
+# __main__.py
+# Entry module: prepares the on-disk layout, logging and the database before
+# the CLI is imported. The order of the statements below is significant.
+import logging
+from rcdl.core.config import Config, setup_logging
+# setup file structure
+# (directories first: ensure_files() creates files inside Config.CACHE_DIR)
+Config.ensure_dirs()
+Config.ensure_files()
+# setup logging
+# level=0 is logging.NOTSET; set on the root logger, it lets every record through
+setup_logging(Config.LOG_FILE, level=0)
+logging.info("--- INIT ---")
+logging.info("Logger initialized")
+# init database
+# Constructing DB runs its table-creation step; the connection itself is not
+# needed here, so it is closed right away.
+from rcdl.core.db import DB  # noqa: E402
+d = DB()
+d.close()
+# Imported last, once the setup above has run. F401 is silenced because the
+# name is not used inside this module.
+# NOTE(review): presumably `cli` is the console-script target — confirm
+# against entry_points.txt.
+from rcdl.interface.cli import cli  # noqa: E402, F401

rcdl/core/api.py ADDED Viewed

@@ -0,0 +1,54 @@
+# core/api.py
+from .models import Creator
+class URL:
+    DOMAINS_BASE_URL = {
+        "coomer": "https://coomer.st/api/v1/",
+        "kemono": "https://kemono.cr/api/v1/",
+    }
+    @staticmethod
+    def get_base_url(domain: str) -> str:
+        if domain not in URL.DOMAINS_BASE_URL:
+            raise KeyError(f"{domain} not in known domains urls")
+        return URL.DOMAINS_BASE_URL[domain]
+    @staticmethod
+    def get_post_revision(creator: Creator, post_id) -> str:
+        return f"{URL.get_base_url(creator.domain)}{creator.service}/user/{creator.creator_id}/post/{post_id}/revisions"
+    @staticmethod
+    def get_headers() -> dict:
+        return {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0 Safari/537.36",
+            "Accept": "text/css",
+        }
+    @staticmethod
+    def get_url_from_file(domain: str, path_url: str):
+        if domain == "coomer":
+            return f"https://coomer.st{path_url}"
+        elif domain == "kemono":
+            return f"https://kemono.cr{path_url}"
+        else:
+            raise ValueError(
+                f"Domain {domain} is not an accepted value/does not exist. Please check your creators.json file"
+            )
+    @staticmethod
+    def add_params(url: str, params: dict):
+        url += "?"
+        for key in params:
+            url += f"{key}={params[key]}&"
+        return url[:-1]
+    @staticmethod
+    def get_creator_post_wo_param(creator: Creator) -> str:
+        return f"{URL.get_base_url(creator.domain)}{creator.service}/user/{creator.creator_id}/posts"
+    @staticmethod
+    def get_posts_page_url_wo_param():
+        domain = URL.DOMAINS_BASE_URL["coomer"]
+        return f"{domain}posts"

rcdl/core/config.py ADDED Viewed

@@ -0,0 +1,115 @@
+# core/config.py
+from pathlib import Path
+import json
+import logging
+import os
+from .file_io import write_json
+class Config:
+    # paths
+    APP_NAME = "cdl"
+    BASE_DIR = Path.home() / "Videos/rcdl"
+    BASE_DIR = Path(os.environ.get("RCDL_BASE_DIR", Path.home() / "Videos/rcdl"))
+    CACHE_DIR = BASE_DIR / ".cache"
+    DB_PATH = CACHE_DIR / "cdl.db"
+    LOG_FILE = CACHE_DIR / "cdl.log"
+    FUSE_CSV_FILE = CACHE_DIR / "cdl_fuse.csv"
+    SETTINGS_FILE = CACHE_DIR / "settings.json"
+    CREATORS_FILE = CACHE_DIR / "creators.json"
+    DISCOVER_DIR = CACHE_DIR / "discover"
+    # defaults
+    DEFAULT_SETTINGS = {
+        "work_folder": str(BASE_DIR),
+        "yt_dlp_args": "--external-downloader aria2c",
+        "preset": "veryfast",
+    }
+    # default creators
+    DEFAULT_CREATORS = [
+        {"creator_id": "boixd", "service": "onlyfans", "domain": "coomer"}
+    ]
+    DEBUG = False
+    DRY_RUN = False
+    # api settings
+    POST_PER_PAGE = 50
+    DEFAULT_MAX_PAGE = 10
+    MAX_FAIL_COUNT = 7
+    @classmethod
+    def ensure_dirs(cls):
+        cls.CACHE_DIR.mkdir(parents=True, exist_ok=True)
+        cls.DISCOVER_DIR.mkdir(exist_ok=True)
+    @classmethod
+    def ensure_files(cls):
+        files = [
+            cls.DB_PATH,
+            cls.FUSE_CSV_FILE,
+            cls.CREATORS_FILE,
+        ]
+        for file in files:
+            if not file.exists():
+                file.touch()
+                logging.info("Created file %s", file)
+                if file == cls.CREATORS_FILE:
+                    write_json(file, cls.DEFAULT_CREATORS)
+    @classmethod
+    def load_settings(cls):
+        if not cls.SETTINGS_FILE.exists():
+            with open(cls.SETTINGS_FILE, "w") as f:
+                json.dump(cls.DEFAULT_SETTINGS, f, indent=4)
+            return cls.DEFAULT_SETTINGS
+        with open(cls.SETTINGS_FILE, "r") as f:
+            return json.load(f)
+    @classmethod
+    def creator_folder(cls, creator_id: str) -> Path:
+        folder = cls.BASE_DIR / creator_id
+        folder.mkdir(exist_ok=True)
+        return folder
+    @classmethod
+    def cache_file(cls, filename: str, ext: str = ".json") -> Path:
+        file_name = filename + ext
+        file = cls.CACHE_DIR / file_name
+        return file
+    @classmethod
+    def set_debug(cls, debug: bool):
+        cls.DEBUG = debug
+    @classmethod
+    def set_dry_run(cls, dry_run: bool):
+        cls.DRY_RUN = dry_run
+def setup_logging(log_file: Path, level: int = 0):
+    logger = logging.getLogger()
+    logger.setLevel(level)
+    logger.handlers.clear()  # avoid double handlers if called multiple times
+    # file handler
+    file_handler = logging.FileHandler(log_file, encoding="utf-8", mode="a")
+    file_handler.setFormatter(
+        logging.Formatter(
+            "{asctime} - {levelname} - {message}",
+            style="{",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+    )
+    logger.addHandler(file_handler)
+    # console handler (stdout)
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(logging.Formatter("{levelname}: {message}", style="{"))
+    logger.addHandler(console_handler)

rcdl/core/db.py ADDED Viewed

@@ -0,0 +1,262 @@
+# core/db.py
+import sqlite3
+from typing import Optional
+import logging
+from .config import Config
+from .models import Video, VideoStatus
+class DB:
+    def __init__(self):
+        self.conn = sqlite3.connect(Config.DB_PATH)
+        self.conn.row_factory = sqlite3.Row
+        self._init_table()
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+    def _init_table(self):
+        # init table for videos to DL
+        q = """
+            CREATE TABLE IF NOT EXISTS videos (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                post_id TEXT,
+                creator_id TEXT,
+                service TEXT,
+                domain TEXT,
+                relative_path TEXT,
+                url TEXT,
+                part TEXT,
+                status TEXT DEFAULT 'not_downloaded',
+                fail_count INTEGER DEFAULT 0,
+                published TEXT,
+                title TEXT,
+                substring TEXT,
+                downloaded_at TEXT,
+                file_size REAL,
+                UNIQUE (service, url)
+            )
+        """
+        self.conn.execute(q)
+        self.conn.commit()
+    def get_video_status(self, video: Video) -> Optional[dict]:
+        cur = self.conn.cursor()
+        cur.execute(
+            "SELECT status, fail_count FROM videos WHERE (service, url) = (?, ?)",
+            (video.service, video.url),
+        )
+        row = cur.fetchone()
+        if row:
+            return {
+                "status": VideoStatus(row["status"]),
+                "fail_count": row["fail_count"],
+                "downloaded_at": row["downloaded_at"],
+                "relative_path": row["relative_path"],
+            }
+        return None
+    def get_videos_by_status(self, status: VideoStatus) -> list[Video]:
+        cur = self.conn.cursor()
+        cur.execute(
+            "SELECT * FROM videos WHERE status = ?",
+            (status.value,),
+        )
+        rows = cur.fetchall()
+        videos: list[Video] = []
+        for row in rows:
+            v = Video(
+                post_id=row["post_id"],
+                creator_id=row["creator_id"],
+                service=row["service"],
+                domain=row["domain"],
+                relative_path=row["relative_path"],
+                url=row["url"],
+                part=row["part"],
+                status=VideoStatus(row["status"]),
+                fail_count=row["fail_count"],
+                published=row["published"],
+                title=row["title"],
+                substring=row["substring"],
+                downloaded_at=row["downloaded_at"],
+                file_size=row["file_size"],
+            )
+            videos.append(v)
+        logging.info(f"DB request status {status.value} returned {len(videos)} results")
+        return videos
+    def get_videos_by_creator_id(self, creator_id: str):
+        # discover_videos
+        cur = self.conn.cursor()
+        cur.execute("SELECT * FROM videos WHERE creator_id = ?", (creator_id,))
+        rows = cur.fetchall()
+        videos: list[Video] = []
+        for row in rows:
+            v = Video(
+                post_id=row["post_id"],
+                creator_id=row["creator_id"],
+                service=row["service"],
+                domain=row["domain"],
+                relative_path=row["relative_path"],
+                url=row["url"],
+                part=row["part"],
+                status=VideoStatus(row["status"]),
+                fail_count=row["fail_count"],
+                published=row["published"],
+                title=row["title"],
+                substring=row["substring"],
+                downloaded_at=row["downloaded_at"],
+                file_size=row["file_size"],
+            )
+            videos.append(v)
+        return videos
+    def mark_not_downloaded(self, video: Video):
+        video.status = VideoStatus.NOT_DOWNLOADED
+        self._upsert_video(video)
+    def mark_downloaded(self, video: Video):
+        video.status = VideoStatus.DOWNLOADED
+        self._upsert_video(video)
+    def mark_failed(self, video: Video, fail_count: int):
+        video.status = VideoStatus.FAILED
+        video.fail_count = fail_count
+        self._upsert_video(video)
+    def mark_skipped(self, video: Video):
+        video.status = VideoStatus.SKIPPED
+        self._upsert_video(video)
+    def mark_ignored(self, video: Video):
+        video.status = VideoStatus.IGNORED
+        self._upsert_video(video)
+    def _upsert_video(self, video: Video):
+        q = """
+            INSERT INTO videos (
+                post_id, creator_id, service, domain, relative_path, url, part,
+                status, fail_count, published, title, substring,
+                downloaded_at, file_size
+            )
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            ON CONFLICT(service, url) DO UPDATE SET
+                status=excluded.status,
+                fail_count=excluded.fail_count,
+                relative_path=excluded.relative_path,
+                downloaded_at=excluded.downloaded_at,
+                file_size=excluded.file_size
+        """
+        if video.status is None:
+            video.status = VideoStatus.NOT_DOWNLOADED
+        self.conn.execute(
+            q,
+            (
+                video.post_id,
+                video.creator_id,
+                video.service,
+                video.domain,
+                video.relative_path,
+                video.url,
+                video.part,
+                video.status.value,
+                video.fail_count,
+                video.published,
+                video.title,
+                video.substring,
+                video.downloaded_at,
+                video.file_size,
+            ),
+        )
+        self.conn.commit()
+    def get_pending_videos(
+        self, max_fail_count: int = Config.MAX_FAIL_COUNT
+    ) -> list[Video]:
+        cur = self.conn.cursor()
+        cur.execute(
+            "SELECT * FROM videos WHERE status = ? OR (status = ? AND fail_count < ?)",
+            (
+                VideoStatus.NOT_DOWNLOADED,
+                VideoStatus.FAILED,
+                max_fail_count,
+            ),
+        )
+        rows = cur.fetchall()
+        videos: list[Video] = []
+        for row in rows:
+            videos.append(
+                Video(
+                    post_id=row["post_id"],
+                    creator_id=row["creator_id"],
+                    service=row["service"],
+                    domain=row["domain"],
+                    relative_path=row["relative_path"],
+                    url=row["url"],
+                    part=row["part"],
+                    status=VideoStatus(row["status"]),
+                    fail_count=row["fail_count"],
+                    published=row["published"],
+                    title=row["title"],
+                    substring=row["substring"],
+                    downloaded_at=row["downloaded_at"],
+                    file_size=row["file_size"],
+                )
+            )
+        logging.info(
+            "DB pending videos: %d (max_fail_count=%d)", len(videos), max_fail_count
+        )
+        return videos
+    def close(self):
+        self.conn.close()
+def update_db(videos: list[Video]):
+    if not videos:
+        return
+    with DB() as d:
+        q = """
+            INSERT OR IGNORE INTO videos (
+                post_id, creator_id, service, domain, relative_path, url, part,
+                status, fail_count, published, title, substring,
+                downloaded_at, file_size
+            )
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        """
+        rows = []
+        for video in videos:
+            rows.append(
+                (
+                    video.post_id,
+                    video.creator_id,
+                    video.service,
+                    video.domain,
+                    video.relative_path,
+                    video.url,
+                    video.part,
+                    VideoStatus.NOT_DOWNLOADED.value,
+                    0,
+                    video.published,
+                    video.title,
+                    video.substring,
+                    None,
+                    None,
+                )
+            )
+        d.conn.executemany(q, rows)
+        d.conn.commit()

rcdl/core/downloader.py ADDED Viewed

@@ -0,0 +1,246 @@
+# core/downloader.py
+import logging
+import os
+import requests
+import rcdl.core.parser as parser
+from .api import URL
+from .config import Config
+from .models import Creator, Video
+from .db import DB, update_db
+from .downloader_subprocess import ytdlp_subprocess
+from .file_io import write_json, load_json
+from rcdl.interface.progress import ProgressPrinter
+class PostsFetcher:
+    def __init__(
+        self, url: str, json_path: str, max_page: int = Config.DEFAULT_MAX_PAGE
+    ):
+        self.url = url
+        self.json_path = json_path
+        self.page = 0
+        self.max_page = max_page
+        self.status = 200
+    def _request_page(self, url: str) -> requests.Response:
+        logging.info(f"RequestEngine url {url}")
+        headers = URL.get_headers()
+        return requests.get(url, headers=headers)
+    def request(self, continue_on_error: bool = False, params: dict = {}):
+        while self.status == 200 and self.page < self.max_page:
+            o = self.page * Config.POST_PER_PAGE
+            params["o"] = o
+            url = URL.add_params(self.url, params)
+            if Config.DRY_RUN:
+                logging.debug(f"DRY-RUN posts fetcher {url} -> {self.json_path}")
+                self.page += 1
+                continue
+            response = self._request_page(url)
+            self.status = response.status_code
+            if self.status != 200:
+                if self.page == 0:
+                    logging.error(f"Failed to get {url}")
+                else:
+                    logging.warning(
+                        f"Status code {self.status}. This behavior is expected."
+                    )
+                if not continue_on_error:
+                    break
+            else:
+                logging.info(f"Response Status Code: {self.status}")
+                if self.page > 0:
+                    json_data = list(load_json(self.json_path))
+                else:
+                    json_data = []
+                if "posts" in response.json():
+                    json_data.extend(response.json()["posts"])
+                else:
+                    json_data.extend(response.json())
+                write_json(self.json_path, json_data, mode="w")
+            self.page += 1
+class VideoDownloader:
+    def __init__(self, total_dl: int = -1):
+        self.progress = ProgressPrinter(total_dl)
+    def _build_url(self, domain: str, video: Video):
+        return URL.get_url_from_file(domain, video.url)
+    def _build_output_path(self, video: Video, discover: bool = False):
+        if not discover:
+            return os.path.join(
+                Config.creator_folder(video.creator_id), video.relative_path
+            )
+        else:
+            return os.path.join(Config.DISCOVER_DIR, video.relative_path)
+    def _update_db_status(self, result: int, video: Video):
+        with DB() as d:
+            if result == 0:
+                d.mark_downloaded(video)
+            else:
+                d.mark_failed(video, video.fail_count + 1)
+    def _exists(self, path: str) -> bool:
+        if os.path.exists(path):
+            return True
+        return False
+    def download(
+        self, domain: str, video: Video, write_db: bool = True, discover: bool = False
+    ) -> bool:
+        url = self._build_url(domain, video)
+        path = self._build_output_path(video, discover=discover)
+        if Config.DRY_RUN:
+            logging.debug(f"DRY-RUN dl {url} -> {path}")
+            return True
+        if self._exists(path):
+            logging.warning("Video was already dl ! This should not happen")
+            self.progress.update(ignore=True)  # update progress but not eta
+        result = ytdlp_subprocess(url, path)
+        if write_db:
+            self._update_db_status(result, video)
+        self.progress.update()
+        self.progress.display()
+        if result == 0:
+            return True
+        return False
+def fetch_posts_by_tag(tag: str, max_page: int = Config.DEFAULT_MAX_PAGE) -> dict:
+    url = URL.get_posts_page_url_wo_param()
+    path = Config.cache_file(tag)
+    pf = PostsFetcher(url, str(path), max_page=max_page)
+    pf.request(continue_on_error=True, params={"tag": tag})
+    return load_json(path)
+def fetch_posts_by_creator(creator: Creator) -> dict:
+    url = URL.get_creator_post_wo_param(creator)
+    path = Config.cache_file(f"{creator.creator_id}_{creator.service}")
+    pf = PostsFetcher(url, str(path))
+    pf.request()
+    return load_json(path)
+def refresh_creators_videos(creators: list[Creator]):
+    for creator in creators:
+        logging.info(
+            f"CREATOR {creator.creator_id} from {creator.service} on {creator.domain}"
+        )
+        posts = fetch_posts_by_creator(creator)
+        logging.info(
+            f"Found {len(posts)} posts from creator {creator.creator_id}({creator.service})"
+        )
+        posts_with_videos = parser.filter_posts_with_videos_from_json(
+            str(Config.cache_file(f"{creator.creator_id}_{creator.service}"))
+        )
+        logging.info(
+            f"Find {len(posts_with_videos)} posts with videos from creator {creator.creator_id}({creator.service})"
+        )
+        all_videos = parser.convert_posts_to_videos(posts_with_videos)
+        logging.info(f"Converted {len(posts_with_videos)} to {len(all_videos)} videos")
+        # put all videos in db
+        update_db(all_videos)
+def download_videos_to_be_dl():
+    """
+    Download videos of all creators in creators.json
+    """
+    with DB() as d:
+        videos = d.get_pending_videos()
+        # VideoDownloader Engine
+        vd = VideoDownloader(total_dl=len(videos))
+        for video in videos:
+            creator = Creator(
+                creator_id=video.creator_id,
+                service=video.service,
+                domain=video.domain,
+                status=None,
+            )
+            succesful = vd.download(creator.domain, video)
+            if not succesful:
+                logging.warning("Fail to dl vid")
+def discover(tag: str, max_page: int):
+    discover_creators(tag, max_page)
+    dl_video_from_discover_creators()
+def discover_creators(tag: str, max_page: int):
+    # download posts with searched tags
+    posts = fetch_posts_by_tag(tag, max_page)
+    logging.info(f"Find {len(posts)} post")
+    path = str(Config.cache_file(tag))
+    posts_with_videos = parser.filter_posts_with_videos_from_json(path)
+    logging.info(f"Find {len(posts_with_videos)} posts with videos")
+    creators = parser.get_creators_from_posts(posts_with_videos)
+    # save to csv
+    file = os.path.join(Config.DISCOVER_DIR, "discover.csv")
+    with open(file, "w") as f:
+        for c in creators:
+            line = f"{c.creator_id};{c.service};{c.domain};{'to_be_treated'}\n"
+            f.write(line)
+def dl_video_from_discover_creators():
+    # load csv
+    file = os.path.join(Config.DISCOVER_DIR, "discover.csv")
+    with open(file, "r") as f:
+        lines = f.readlines()
+    creators = []
+    for line in lines:
+        line = line.replace("\n", "").strip().split(";")
+        creators.append(
+            Creator(creator_id=line[0], service=line[1], domain=line[2], status=line[3])
+        )
+    # get posts
+    for creator in creators:
+        response = requests.get(
+            URL.get_creator_post_wo_param(creator), headers=URL.get_headers()
+        )
+        if response.status_code != 200:
+            print(f"ERROR - Request {URL.get_creator_post_wo_param(creator)}")
+        response_posts = response.json()
+        posts = parser.filter_posts_with_videos_from_list(response_posts)
+        print(f"{len(posts)} found")
+        if len(posts) > 5:
+            posts = posts[0:5]
+            print("Limited posts to 5")
+        for post in posts:
+            urls = parser.extract_video_urls(post)
+            url = URL.get_url_from_file(creator.domain, urls[0])
+            filename = f"{post['user']}_{post['id']}.mp4"
+            filepath = os.path.join(Config.DISCOVER_DIR, filename)
+            ytdlp_subprocess(url, filepath)