rcdl-2.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rcdl/core/downloader.py ADDED
@@ -0,0 +1,265 @@
+ # core/downloader.py
+
+ import logging
+ import os
+
+ import requests
+
+ import rcdl.core.parser as parser
+ from .api import URL
+ from .config import Config
+ from .models import Creator, Video, VideoStatus
+ from .db import DB
+ from .downloader_subprocess import ytdlp_subprocess
+ from .file_io import write_json, load_json
+ from rcdl.interface.ui import UI
+
+
+ class PostsFetcher:
+     """
+     Fetch posts from the API, save them as JSON, and handle multi-page requests.
+     """
+
+     def __init__(
+         self, url: str, json_path: str, max_page: int = Config.DEFAULT_MAX_PAGE
+     ):
+         self.url = url
+         self.json_path = json_path
+
+         self.page = 0
+         self.max_page = max_page
+
+         self.status = 200
+
+     def _request_page(self, url: str) -> requests.Response:
+         """Request a single page and return the raw response"""
+         logging.info(f"RequestEngine url {url}")
+         headers = URL.get_headers()
+         response = requests.get(url, headers=headers)
+         if response.status_code != 200:
+             logging.warning(f"Failed request {url}: {response.status_code}")
+         return response
+
+     def request(self, params: dict | None = None):
+         # avoid the mutable-default pitfall: copy before mutating below
+         params = dict(params or {})
+         with UI.progress_posts_fetcher(self.max_page) as progress:
+             task = progress.add_task("Fetching posts", total=self.max_page)
+
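+             # Offset-based pagination: each iteration advances the "o" query
+             # parameter by Config.POST_PER_PAGE posts.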
+             while self.status == 200 and self.page < self.max_page:
+                 o = self.page * Config.POST_PER_PAGE
+                 params["o"] = o
+                 url = URL.add_params(self.url, params)
+
+                 try:
+                     # Dry run: no request actually made
+                     if Config.DRY_RUN:
+                         logging.debug(
+                             f"DRY-RUN posts fetcher {url} -> {self.json_path}"
+                         )
+                         self.page += 1
+                         continue
+
+                     response = self._request_page(url)
+                     self.status = response.status_code
+
+                     # If the program crashes while doing requests, previously
+                     # fetched pages are still saved and not overwritten.
+                     if self.page > 0:
+                         json_data = list(load_json(self.json_path))
+                     else:
+                         json_data = []
+
+                     # For the discover command, the response JSON has a different
+                     # format and nests the posts under a 'posts' key.
+                     if self.status == 200:
+                         if "posts" in response.json():
+                             json_data.extend(response.json()["posts"])
+                         else:
+                             json_data.extend(response.json())
+
+                     write_json(self.json_path, json_data, mode="w")
+
+                     progress.update(
+                         task,
+                         advance=1,
+                         description=f"Fetched {len(json_data)} posts (page {self.page + 1}/{self.max_page})",
+                     )
+                 except Exception as e:
+                     logging.error(f"Error in request {url} p{self.page}: {e}")
+                 finally:
+                     self.page += 1
+
+
+ class VideoDownloader:
+     """Handle downloading a list of Videos and update DB status"""
+
+     def __init__(self):
+         pass
+
+     def _build_url(self, video: Video):
+         return URL.get_url_from_file(video.domain, video.url)
+
+     def _build_output_path(self, video: Video, discover: bool = False):
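+         """Build the video's output path; discover downloads go under Config.DISCOVER_DIR."""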
+         if discover:
+             return os.path.join(Config.DISCOVER_DIR, video.relative_path)
+
+         return os.path.join(
+             Config.creator_folder(video.creator_id), video.relative_path
+         )
+
+     def _update_db_status(self, result: int, video: Video):
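+         """Record a download result: success resets fail_count, failure increments it."""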
+         with DB() as d:
+             if result == 0:
+                 d.set_status(video, VideoStatus.DOWNLOADED, fail_count=0)
+             else:
+                 d.set_status(video, VideoStatus.FAILED, fail_count=video.fail_count + 1)
+
+     def downloads(
+         self, videos: list[Video], write_db: bool = True, discover: bool = False
+     ):
+         progress, task = UI.video_progress(total=len(videos))
+         try:
+             for video in videos:
+                 url = self._build_url(video)
+                 filepath = self._build_output_path(video, discover=discover)
+
+                 UI.set_current_video_progress(
+                     f"{video.creator_id}@({video.service})", video.relative_path
+                 )
+
+                 if Config.DRY_RUN:
+                     UI.debug(f"Dry run: dl {video.creator_id} @ {filepath}")
+                     progress.advance(task)
+                     continue
+
+                 if os.path.exists(filepath):
+                     UI.warning(
+                         f"Video {url} @ {filepath} already exists. Possible DB problem"
+                     )
+                     progress.advance(task)
+                     continue
+
+                 result = ytdlp_subprocess(url, filepath)
+                 if write_db:
+                     self._update_db_status(result, video)
+
+                 progress.advance(task)
+         finally:
+             UI.close_video_progress()
+
+
+ def fetch_posts_by_tag(tag: str, max_page: int = Config.DEFAULT_MAX_PAGE) -> list:
+     """Helper function to get all posts from a tag search"""
+     url = URL.get_posts_page_url_wo_param()
+     path = Config.cache_file(tag)
+     pf = PostsFetcher(url, str(path), max_page=max_page)
+     pf.request(params={"tag": tag})
+
+     return load_json(path)
+
+
+ def fetch_posts_by_creator(creator: Creator) -> list:
+     """Helper function to get all posts from a creator"""
+     url = URL.get_creator_post_wo_param(creator)
+     path = Config.cache_file(f"{creator.creator_id}_{creator.service}")
+     pf = PostsFetcher(url, str(path))
+     pf.request()
+
+     return load_json(path)
+
+
+ def refresh_creators_videos():
+     """
+     Command refresh
+     For each creator:
+         - fetch all posts into a .json
+         - from the .json, keep only the posts containing videos
+         - convert the post dicts to Videos
+         - update the DB
+     """
+     creators = parser.get_creators()
+     for creator in creators:
+         UI.info(f"Creator {creator.creator_id} from {creator.service}")
+
+         fetch_posts_by_creator(creator)
+         posts_with_videos = parser.filter_posts_with_videos_from_json(
+             str(Config.cache_file(f"{creator.creator_id}_{creator.service}"))
+         )
+         all_videos = parser.convert_posts_to_videos(posts_with_videos)
+
+         UI.info(
+             f"Found {len(all_videos)} videos in {len(posts_with_videos)} posts with video urls"
+         )
+
+         # put all videos in db
+         with DB() as db:
+             db.insert_videos(all_videos)
+
+
+ def download_videos_to_be_dl():
+     """
+     Command dlsf
+     Download videos in the DB with status TO_BE_DOWNLOADED or (FAILED & fail_count < Config.)
+     """
+     with DB() as db:
+         videos = db.query_videos(pending=True)
+
+     vd = VideoDownloader()
+     vd.downloads(videos, write_db=True, discover=False)
+
+
+ # --- --- --- --- --- DISCOVER --- --- --- --- ---
+ def discover(tag: str, max_page: int):
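+     """Discover creators publishing videos for a tag, then sample a few videos from each."""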
+     discover_creators(tag, max_page)
+     dl_video_from_discover_creators()
+
+
+ def discover_creators(tag: str, max_page: int):
+     # download posts matching the searched tag
+     posts = fetch_posts_by_tag(tag, max_page)
+     logging.info(f"Found {len(posts)} posts")
+
+     path = str(Config.cache_file(tag))
+     posts_with_videos = parser.filter_posts_with_videos_from_json(path)
+     logging.info(f"Found {len(posts_with_videos)} posts with videos")
+
+     creators = parser.get_creators_from_posts(posts_with_videos)
+
+     # save to csv
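+     # one line per creator: creator_id;service;domain;status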
+     file = os.path.join(Config.DISCOVER_DIR, "discover.csv")
+     with open(file, "w") as f:
+         for c in creators:
+             line = f"{c.creator_id};{c.service};{c.domain};to_be_treated\n"
+             f.write(line)
+
+
+ def dl_video_from_discover_creators():
+     # load csv
+     file = os.path.join(Config.DISCOVER_DIR, "discover.csv")
+     with open(file, "r") as f:
+         lines = f.readlines()
+
+     creators = []
+     for line in lines:
+         line = line.strip().split(";")
+         creators.append(
+             Creator(creator_id=line[0], service=line[1], domain=line[2], status=line[3])
+         )
+
+     # get posts
+     for creator in creators:
+         response = requests.get(
+             URL.get_creator_post_wo_param(creator), headers=URL.get_headers()
+         )
+         if response.status_code != 200:
+             print(f"ERROR - Request {URL.get_creator_post_wo_param(creator)}")
+             continue
+         response_posts = response.json()
+         posts = parser.filter_posts_with_videos_from_list(response_posts)
+         print(f"{len(posts)} posts found")
+         if len(posts) > 5:
+             posts = posts[:5]
+             print("Limited posts to 5")
+
+         for post in posts:
+             urls = parser.extract_video_urls(post)
+             url = URL.get_url_from_file(creator.domain, urls[0])
+             filename = f"{post['user']}_{post['id']}.mp4"
+             filepath = os.path.join(Config.DISCOVER_DIR, filename)
+             ytdlp_subprocess(url, filepath)
rcdl/core/downloader_subprocess.py ADDED
@@ -0,0 +1,190 @@
+ # core/downloader_subprocess.py
+
+ import subprocess
+ import logging
+ from pathlib import Path
+ import os
+
+ from rcdl.core.models import Video
+ from rcdl.core.config import Config
+ from rcdl.interface.ui import UI
+
+
+ def ytdlp_subprocess(
+     url: str,
+     filepath: Path | str,
+ ):
+     """Call yt-dlp in a subprocess"""
+     cmd = [
+         "yt-dlp",
+         "-q",
+         "--progress",
+         url,
+         "-o",
+         str(filepath),
+         "--external-downloader",
+         "aria2c",
+     ]
+
+     logging.info(f"CMD: {' '.join(cmd)}")
+
+     result = subprocess.run(cmd, capture_output=True, text=True)
+     if result.returncode != 0:
+         logging.error(f"yt-dlp failed to download video: {result.stderr}")
+
+     return result.returncode
+
+
+ def ffmpeg_concat_build_command(videos: list[Video]) -> dict:
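+     """Build a single ffmpeg command that scales/pads every part to a common
+     format and concatenates them into one tmp_*.mp4 in the creator's folder."""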
+     # parameters
+     width: int = 1920
+     height: int = 1080
+     fps: int = 30
+     preset: str = "veryfast"
+     threads: int = 0  # 0 for max
+
+     # output path
+     v = videos[0]
+     output_filename = f"tmp_{v.published}_{v.title}.mp4"
+     output_path = os.path.join(Config.creator_folder(v.creator_id), output_filename)
+
+     # build cmd
+     cmd = ["ffmpeg", "-y", "-progress", "pipe:2", "-nostats"]
+
+     # inputs
+     for v in videos:
+         input_path = os.path.join(Config.creator_folder(v.creator_id), v.relative_path)
+         cmd.extend(["-i", input_path])
+
+     # filter complex: scale each input to fit within width x height, pad to
+     # exactly width x height, then force a common fps and sample aspect ratio
+     # so the streams can be concatenated
+     filter_lines = []
+     for idx in range(len(videos)):
+         filter_lines.append(
+             f"[{idx}:v]"
+             f"scale={width}:{height}:force_original_aspect_ratio=decrease,"
+             f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2,"
+             f"fps={fps},setsar=1"
+             f"[v{idx}]"
+         )
+
+     # concat inputs
+     concat = []
+     for idx in range(len(videos)):
+         concat.append(f"[v{idx}][{idx}:a]")
+
+     filter_lines.append(f"{''.join(concat)}concat=n={len(videos)}:v=1:a=1[outv][outa]")
+     filter_complex = ";".join(filter_lines)
+
+     cmd.extend(
+         [
+             "-filter_complex",
+             filter_complex,
+             "-map",
+             "[outv]",
+             "-map",
+             "[outa]",
+             "-c:v",
+             "libx264",
+             "-preset",
+             preset,
+             "-threads",
+             str(threads),
+             "-c:a",
+             "aac",
+             "-f",
+             "mp4",
+             output_path,
+         ]
+     )
+
+     return {"cmd": cmd, "output_path": output_path}
+
+
+ def get_duration(path: str) -> int:
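+     """Return the container duration of the file at `path`, in milliseconds."""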
+     cmd = [
+         "ffprobe",
+         "-v",
+         "error",
+         "-select_streams",
+         "v:0",
+         "-show_entries",
+         "format=duration",
+         "-of",
+         "default=noprint_wrappers=1:nokey=1",
+         path,
+     ]
+
+     result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+     return int(float(result.stdout.strip()) * 1000)
+
+
+ def get_total_duration(videos: list[Video]) -> int:
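+     """Sum the durations (in ms) of all parts; used as the concat progress total."""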
+     duration = 0
+     for v in videos:
+         path = os.path.join(Config.creator_folder(v.creator_id), v.relative_path)
+         duration += get_duration(path)
+     return duration
+
+
+ def ffmpeg_concat(videos: list[Video]):
+     command_builder = ffmpeg_concat_build_command(videos)
+     cmd = command_builder["cmd"]
+     output_path = command_builder["output_path"]
+
+     logging.info(f"CMD: {' '.join(cmd)}")
+
+     ffmpeg_log = Config.CACHE_DIR / "ffmpeg.log"
+     with open(ffmpeg_log, "w", encoding="utf-8") as log_file:
+         print(cmd, file=log_file)
+         # run cmd
+         process = subprocess.Popen(
+             cmd,
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.PIPE,
+             text=True,
+             bufsize=1,
+         )
+
+         assert process.stderr is not None
+         total_duration = get_total_duration(videos)
+         progress, task = UI.concat_progress(total=total_duration)
+         last_progress = 0
+         UI.set_current_concat_progress(f"{videos[0].relative_path}", output_path)
+
+         for line in process.stderr:
+             line = line.strip()
+             if not line:
+                 continue
+
+             print(line, file=log_file)
+
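+             # ffmpeg's "-progress" reports out_time_ms in microseconds despite
+             # its name, hence the // 1000 conversion to milliseconds below.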
+             progress_key = "out_time_ms"
+             if line.startswith(progress_key):
+                 current_progress_str = line.replace(f"{progress_key}=", "").strip()
+                 try:
+                     current_progress_us = int(current_progress_str)
+                     current_progress_ms = current_progress_us // 1000
+                     delta = current_progress_ms - last_progress
+                     progress.advance(task, advance=delta)
+                     last_progress = current_progress_ms
+                 except Exception:
+                     pass
+
+     process.wait()
+     UI.close_concat_progress()
+
+     if process.returncode != 0:
+         UI.error(f"Failed to concat videos. See ffmpeg log file {ffmpeg_log}")
+         with open(ffmpeg_log, "r") as f:
+             lines = f.readlines()
+         logging.error("---FFMPEG LOG---")
+         for line in lines:
+             logging.error(line)
+         logging.error("---END FFMPEG LOG---")
+         return process.returncode
+
+     temp_output_path = output_path
+     new_output_path = temp_output_path.replace("tmp_", "")
+     os.replace(temp_output_path, new_output_path)
+     UI.info(f"Rename {temp_output_path} -> {new_output_path}")
+     return 0
rcdl/core/file_io.py ADDED
@@ -0,0 +1,34 @@
+ # core/file_io.py
+
+ import json
+
+
+ def write_json(path, data, mode="w"):
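+     # Note: only mode "w" produces a valid JSON file; appending would
+     # concatenate JSON documents.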
+     with open(path, mode) as f:
+         json.dump(data, f, indent=4)
+
+
+ def load_json(path) -> dict | list:
+     with open(path, "r") as f:
+         data = json.load(f)
+     return data
+
+
+ def load_txt(path) -> list[str]:
+     with open(path, "r") as f:
+         lines = f.readlines()
+     return [line.strip() for line in lines]
+
+
+ def write_txt(path, lines: list[str] | str, mode: str = "a"):
+     if isinstance(lines, str):
+         lines = [lines]
+
+     with open(path, mode) as f:
+         for line in lines:
+             if not line.endswith("\n"):
+                 f.write(line + "\n")
+             else:
+                 f.write(line)
rcdl/core/fuse.py ADDED
@@ -0,0 +1,118 @@
+ # core/fuse.py
+
+ import os
+
+ from rcdl.interface.ui import UI
+ from rcdl.core.db import DB
+ from rcdl.core.models import VideoStatus
+ from rcdl.core.config import Config
+ from rcdl.core.downloader_subprocess import ffmpeg_concat
+
+
+ def fuse_videos():
+     """Fuse multi-part videos: for each post whose parts are all downloaded,
+     concatenate them with ffmpeg and update the DB statuses."""
+
+     allowed_status = [
+         VideoStatus.DOWNLOADED,
+         VideoStatus.CONCAT_WIP,
+         VideoStatus.CONCAT_FAILED,
+     ]
+
+     if Config.DEBUG:
+         allowed_status.append(VideoStatus.DOWNLOADED)
+
+     # load db videos
+     with DB() as db:
+         videos = db.query_videos(status=allowed_status, min_part_number=1)
+
+     # get unique post ids
+     posts_ids = set()
+     for video in videos:
+         posts_ids.add(video.post_id)
+
+     with UI.progress_total_concat() as progress:
+         task = progress.add_task("Total concat", total=len(posts_ids))
+
+         for post_id in posts_ids:
+             UI.info(f"Looking at post_id: {post_id}")
+
+             # get all videos with the same post_id
+             with DB() as db:
+                 videos = db.query_videos(post_id=post_id)
+             if not videos:
+                 UI.error("SQL query failed.")
+                 progress.update(task, advance=1)
+                 continue
+
+             # check that every video of the post is fully downloaded
+             ok = True
+             for video in videos:
+                 if video.status not in allowed_status:
+                     ok = False
+                     break
+             if not ok:
+                 progress.update(task, advance=1)
+                 continue
+
+             # sort by part number
+             videos.sort(key=lambda v: int(v.part))
+
+             ok = True
+             for video in videos:
+                 # make sure the video file exists
+                 path = os.path.join(
+                     Config.creator_folder(video.creator_id), video.relative_path
+                 )
+                 if not os.path.exists(path):
+                     UI.error(f"Video @ {path} does not exist")
+                     ok = False
+
+                 # a lingering CONCAT_WIP status means a previous run likely crashed mid-concat
+                 if video.status == VideoStatus.CONCAT_WIP:
+                     UI.warning(
+                         f"Video '{video.relative_path}' has status 'CONCAT_WIP'. This is a bug and should not be possible."
+                     )
+
+             if not ok:
+                 progress.update(task, advance=1)
+                 continue
+
+             # update videos status in db to CONCAT_WIP so that,
+             # if the script crashes mid-concat, we will know
+             with DB() as db:
+                 for video in videos:
+                     db.set_status(video, VideoStatus.CONCAT_WIP)
+
+             result = 1
+             try:
+                 result = ffmpeg_concat(videos)
+             except Exception as e:
+                 UI.error(f"Failed concat due to: {e}")
+
+             # concat failed
+             if result != 0:
+                 with DB() as db:
+                     for video in videos:
+                         db.set_status(video, VideoStatus.CONCAT_FAILED)
+                 continue
+
+             # concat succeeded
+             with DB() as db:
+                 for video in videos:
+                     db.set_status(video, VideoStatus.CONCAT_DONE)
+
+             # update progress bar
+             progress.update(
+                 task,
+                 advance=1,
+                 description=f"Concatenated videos for post id {post_id}",
+             )
+
+             # remove part videos once the concat succeeded
+             with DB() as db:
+                 for video in videos:
+                     path = os.path.join(
+                         Config.creator_folder(video.creator_id), video.relative_path
+                     )
+                     try:
+                         os.remove(path)
+                         UI.info(f"Removed {path}")
+                         db.set_status(video, VideoStatus.REMOVED)
+                     except Exception as e:
+                         UI.error(f"Failed to remove {path} due to error: {e}")
rcdl/core/models.py ADDED
@@ -0,0 +1,56 @@
+ # core/models.py
+
+ from dataclasses import dataclass
+ from typing import Optional
+ from enum import Enum
+
+
+ class VideoStatus(Enum):
+     NOT_DOWNLOADED = "not_downloaded"
+     DOWNLOADED = "downloaded"
+     FAILED = "failed"
+     SKIPPED = "skipped"
+     IGNORED = "ignored"
+     REMOVED = "removed"
+     CONCAT_WIP = "concat_wip"  # concat in progress
+     CONCAT_DONE = "concat_done"
+     CONCAT_FAILED = "concat_failed"
+
+
+ class DiscoverStatus(Enum):
+     TO_BE_TREATED = "to_be_treated"
+     DOWNLOADED = "downloaded"
+     BLACKLISTED = "blacklisted"
+     WHITELISTED = "whitelisted"
+     DOWNLOAD_MORE = "download_more"
+
+
+ @dataclass
+ class Creator:
+     creator_id: str
+     service: str
+     domain: str
+     status: Optional[str]
+
+
+ @dataclass
+ class Video:
+     # important fields
+     post_id: str
+     creator_id: str
+     service: str
+     domain: str
+     relative_path: str
+     url: str
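+     # part index within the post: posts with several videos yield one
+     # Video per part, which the fuse command later concatenates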
+     part: int = 0
+
+     # metadata
+     published: Optional[str] = None
+     title: Optional[str] = None
+     substring: Optional[str] = None
+     downloaded_at: Optional[str] = None
+     file_size: Optional[float] = None
+
+     # status in rcdl
+     status: Optional[VideoStatus] = None
+     fail_count: int = 0