rcdl 2.2.2__py3-none-any.whl → 3.0.0b13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rcdl/core/downloader.py CHANGED
@@ -1,18 +1,32 @@
 # core/downloader.py
 
+"""
+Handle post and media downloads from the website.
+"""
+
 import logging
 import os
+import json
 
 import requests
 
-import rcdl.core.parser as parser
-from .api import URL
-from .config import Config
-from .models import Creator, Video, VideoStatus
-from .db import DB
-from .downloader_subprocess import ytdlp_subprocess
-from .file_io import write_json, load_json
-from rcdl.interface.ui import UI
+from rcdl.interface.ui import UI, NestedProgress
+from rcdl.core import parser
+from rcdl.core import adapters
+from rcdl.core.api import URL
+from rcdl.core.config import Config
+from rcdl.core.models import (
+    Creator,
+    Status,
+    Media,
+    Post,
+    FusedMedia,
+    FusedStatus,
+)
+from rcdl.core.db import DB
+from rcdl.core.downloader_subprocess import ytdlp_subprocess
+from rcdl.core.file_io import write_json, load_json
+from rcdl.utils import get_date_now, get_media_metadata
 
 
 class PostsFetcher:
@@ -33,14 +47,18 @@ class PostsFetcher:
 
     def _request_page(self, url: str) -> requests.Response:
         """Request a single page and return json dict"""
-        logging.info(f"RequestEngine url {url}")
+        logging.info("RequestEngine url %s", url)
         headers = URL.get_headers()
-        response = requests.get(url, headers=headers)
+        response = requests.get(url, headers=headers, timeout=Config.TIMEOUT)
         if response.status_code != 200:
-            logging.warning(f"Failed request {url}: {requests.status_codes}")
+            logging.warning("Failed request %s: %s", url, response.status_code)
         return response
 
-    def request(self, params: dict = {}):
+    def request(self, params: dict | None = None):
+        """Request multiple pages of a URL"""
+        if params is None:
+            params = {}
+
         with UI.progress_posts_fetcher(self.max_page) as progress:
             task = progress.add_task("Fetching posts", total=self.max_page)
 
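The new `request` signature also drops a mutable default argument: a `dict` default is created once at function definition time and shared by every call, so anything that mutates it leaks state into later calls. A minimal standalone sketch of the pitfall and the fix (hypothetical `buggy`/`fixed` names, not part of rcdl):

    def buggy(params: dict = {}):
        # the same dict object is reused on every call
        params["calls"] = params.get("calls", 0) + 1
        return params

    def fixed(params: dict | None = None):
        # a fresh dict is created per call when none is given
        if params is None:
            params = {}
        params["calls"] = params.get("calls", 0) + 1
        return params

    print(buggy())  # {'calls': 1}
    print(buggy())  # {'calls': 2}  <- state leaked from the first call
    print(fixed())  # {'calls': 1}
    print(fixed())  # {'calls': 1}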
@@ -53,7 +71,7 @@ class PostsFetcher:
                 # Dry run: no request actually made
                 if Config.DRY_RUN:
                     logging.debug(
-                        f"DRY-RUN posts fetcher {url} -> {self.json_path}"
+                        "DRY-RUN posts fetcher %s -> %s", url, self.json_path
                     )
                     self.page += 1
                     continue
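This hunk, like the earlier ones, swaps f-strings inside logging calls for %-style arguments: the template is only interpolated when a record is actually emitted, so suppressed levels cost almost nothing and the constant message template stays groupable by log tooling. A short sketch:

    import logging

    logging.basicConfig(level=logging.INFO)
    url = "https://example.com/posts"

    # f-string: formatted eagerly, even though DEBUG records are discarded here
    logging.debug(f"DRY-RUN posts fetcher {url}")

    # %-style: the template is stored as-is; interpolation is skipped
    # entirely for records that are never emitted
    logging.debug("DRY-RUN posts fetcher %s", url)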
@@ -61,13 +79,15 @@ class PostsFetcher:
                     response = self._request_page(url)
                     self.status = response.status_code
 
-                    # if the programm crash while doing requests, previous requests are still saved and not overwritten.
+                    # if the program crashes while doing requests,
+                    # previous requests are still saved and not overwritten.
                     if self.page > 0:
                         json_data = list(load_json(self.json_path))
                     else:
                         json_data = []
 
-                    # for discover command, response json is in a different format and contains 'posts'
+                    # for the discover command, the response json is in a
+                    # different format and contains 'posts'
                     if self.status == 200:
                         if "posts" in response.json():
                             json_data.extend(response.json()["posts"])
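The load-then-extend logic above persists every page to disk as it arrives, so a crash mid-run loses at most the current page rather than the whole fetch. A minimal sketch of the pattern outside the class (hypothetical `append_page` helper; the real code goes through the `load_json`/`write_json` helpers from `rcdl.core.file_io`):

    import json
    import os

    def append_page(path: str, page_items: list) -> None:
        """Persist each page as it arrives; a crash loses at most one page."""
        data = []
        if os.path.exists(path):
            # reload what previous pages already saved
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        data.extend(page_items)
        # rewrite the accumulated list so progress survives a crash
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    append_page("posts.json", [{"id": 1}])
    append_page("posts.json", [{"id": 2}])  # posts.json now holds both pages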
@@ -79,70 +99,116 @@ class PostsFetcher:
                     progress.update(
                         task,
                         advance=1,
-                        description=f"Fetched {len(json_data)} posts (page {self.page + 1}/{self.max_page})",
+                        description=(
+                            f"Fetched {len(json_data)}"
+                            f" posts (page {self.page + 1}/{self.max_page})"
+                        ),
+                    )
+                except requests.RequestException as e:
+                    logging.error(
+                        "Failed to request %s (page: %s) due to: %s", url, self.page, e
+                    )
+                except json.JSONDecodeError as e:
+                    logging.error(
+                        "Failed to decode JSON response of request %s due to: %s",
+                        url,
+                        e,
                     )
-                except Exception as e:
-                    logging.error(f"Error in request {url} p{self.page}: {e}")
                 finally:
                     self.page += 1
 
 
-class VideoDownloader:
-    """Handle downloading a list of Videos and update DB status"""
+class MediaDownloader:
+    """Handle downloading a list of media and update DB status"""
 
     def __init__(self):
         pass
 
-    def _build_url(self, video: Video):
-        return URL.get_url_from_file(video.domain, video.url)
+    def _build_url(self, domain: str, url: str):
+        """Return the full url"""
+        return URL.get_url_from_file(domain, url)
 
-    def _build_output_path(self, video: Video, discover: bool = False):
-        if discover:
-            return os.path.join(Config.DISCOVER_DIR, video.relative_path)
+    def _build_full_path(self, user: str, media_path: str):
+        """Return the full path"""
+        return os.path.join(Config.creator_folder(user), media_path)
 
-        return os.path.join(
-            Config.creator_folder(video.creator_id), video.relative_path
-        )
+    def _media_exist(self, full_path: str):
+        """Check whether a file exists"""
+        return os.path.exists(full_path)
 
-    def _update_db_status(self, result: int, video: Video):
-        with DB() as d:
-            if result == 0:
-                d.set_status(video, VideoStatus.DOWNLOADED, fail_count=0)
-            else:
-                d.set_status(video, VideoStatus.FAILED, fail_count=video.fail_count + 1)
+    def _update_db(self, result: int, media: Media, full_path: str):
+        """Update db information"""
 
-    def downloads(
-        self, videos: list[Video], write_db: bool = True, discover: bool = False
-    ):
-        progress, task = UI.video_progress(total=len(videos))
-        try:
-            for video in videos:
-                url = self._build_url(video)
-                filepath = self._build_output_path(video, discover=discover)
-
-                UI.set_current_video_progress(
-                    f"{video.creator_id}@({video.service})", video.relative_path
-                )
+        # media failed to download
+        if result != 0:
+            media.fail_count += 1
+        else:
+            duration, file_size, checksum = get_media_metadata(full_path)
+            media.duration = duration
+            media.status = Status.DOWNLOADED
+            media.checksum = checksum
+            media.created_at = get_date_now()
+            media.file_size = file_size
 
-                if Config.DRY_RUN:
-                    UI.debug(f"Dry run: dl {video.creator_id} @ {filepath}")
-                    progress.advance(task)
-                    continue
+        with DB() as db:
+            db.update_media(media)
+
+    def download(self, medias: list[Media], max_fail_count: int | None = None):
+        """Download all media with PENDING status"""
+        # init progress bar
+        progress = NestedProgress(UI.console)
+        progress.start(
+            total=len(medias),
+            total_label="Downloading videos",
+            current_label="Current video",
+        )
 
-                if os.path.exists(filepath):
-                    UI.warning(
-                        f"Video {url} @ {filepath} already exists. Possible DB problem"
-                    )
-                    progress.advance(task)
-                    continue
+        max_try = Config.MAX_FAIL_COUNT
+        if max_fail_count is not None:
+            max_try = max_fail_count
+        for media in medias:
+            progress.start_current("Downloading", total=2)
+            if media.fail_count > max_try:
+                UI.warning(
+                    f"Video skipped due to too many failed download attempts ({media.fail_count})"
+                )
+                progress.advance_total()
+                continue
+
+            # match post info from db with post_id to get user/creator_id
+            with DB() as db:
+                post = db.query_post_by_id(media.post_id)
+            if post is None:
+                UI.error(f"Could not match media post_id {media.post_id} with a post")
+                progress.advance_total()
+                continue
+
+            # build the full url and full path
+            url = self._build_url(post.domain, media.url)
+            full_path = self._build_full_path(post.user, media.file_path)
+
+            # update progress bar info (video currently downloading)
+            progress.set_status(f"{post.user}@({post.service}) -> ", media.file_path)
+
+            # check the video does not already exist
+            if self._media_exist(full_path):
+                UI.warning(
+                    f"Video {url} @ {full_path} already exists. Possible DB problem"
+                )
+                self._update_db(0, media, full_path)
+                progress.advance_total()
+                continue
 
-                result = ytdlp_subprocess(url, filepath)
-                if write_db:
-                    self._update_db_status(result, video)
+            # dry run: no actual download, skip the rest
+            if Config.DRY_RUN:
+                UI.debug(f"(dry-run) dl {post.user}@{full_path} from {url}")
+                progress.advance_total()
+                continue
 
-                progress.advance(task)
-        finally:
-            UI.close_video_progress()
+            result = ytdlp_subprocess(url, full_path)
+            self._update_db(result, media, full_path)
+            progress.advance_total()
+        progress.close()
 
 
 def fetch_posts_by_tag(tag: str, max_page: int = Config.DEFAULT_MAX_PAGE) -> dict:
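The new `download` loop resolves each `Media` back to its parent `Post` for the user and domain, enforces the retry cap, and records duration/size/checksum on success. A hedged usage sketch built only from names in this diff (assumes media rows were already inserted by `refresh_creators_videos`):

    from rcdl.core.db import DB
    from rcdl.core.models import Status
    from rcdl.core.downloader import MediaDownloader

    with DB() as db:
        pending = db.query_media_by_status(Status.PENDING)

    downloader = MediaDownloader()
    # override Config.MAX_FAIL_COUNT with a per-run retry cap of 3
    downloader.download(pending, max_fail_count=3)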
@@ -158,108 +224,84 @@ def fetch_posts_by_tag(tag: str, max_page: int = Config.DEFAULT_MAX_PAGE) -> dic
 def fetch_posts_by_creator(creator: Creator) -> dict:
     """Helper function to get all posts from a creator"""
     url = URL.get_creator_post_wo_param(creator)
-    path = Config.cache_file(f"{creator.creator_id}_{creator.service}")
+    path = Config.cache_file(f"{creator.id}_{creator.service}")
     pf = PostsFetcher(url, str(path))
     pf.request()
 
     return load_json(path)
 
 
+def get_fuses_from_post(posts: list[Post]) -> list[FusedMedia]:
+    """Build fuse entries for posts whose videos come in multiple parts"""
+    fuses: list[FusedMedia] = []
+    for post in posts:
+        json_post = json.loads(post.raw_json)
+        total_parts = len(parser.extract_video_urls(json_post))
+        if total_parts > 1:
+            fuses.append(
+                FusedMedia(
+                    id=post.id,
+                    duration=0,
+                    total_parts=total_parts,
+                    status=FusedStatus.PENDING,
+                    checksum="",
+                    file_path=parser.get_filename_fuse(post),
+                    created_at="",
+                    updated_at="",
+                    file_size=0,
+                    fail_count=0,
+                )
+            )
+    return fuses
+
+
 def refresh_creators_videos():
     """
-    Command refresh
     For each creator:
-    - get all posts to a .json
-    - from the .json filter to keep only the posts with videos in it
-    - convert posts dict to Videos
-    - update the DB
+    - get posts with videos & update the posts DB
+    - extract all medias & update the medias DB
+    - extract fuse groups & update the fuses DB
     """
     creators = parser.get_creators()
     for creator in creators:
-        UI.info(f"Creator {creator.creator_id} from {creator.service}")
+        UI.info(f"Creator {creator.id} from {creator.service}")
 
+        # request all posts by creator
         fetch_posts_by_creator(creator)
+
+        # only keep posts with video urls (mp4, m4v, ...)
         posts_with_videos = parser.filter_posts_with_videos_from_json(
-            str(Config.cache_file(f"{creator.creator_id}_{creator.service}"))
+            str(Config.cache_file(f"{creator.id}_{creator.service}"))
         )
-        all_videos = parser.convert_posts_to_videos(posts_with_videos)
 
-        UI.info(
-            f"Found {len(all_videos)} videos from {len(posts_with_videos)} posts with videos url"
-        )
+        # convert all json dicts into the Post model
+        posts = adapters.json_posts_to_posts(posts_with_videos)
 
-        # put all videos in db
+        # insert posts in db
         with DB() as db:
-            db.insert_videos(all_videos)
-
-
-def download_videos_to_be_dl():
-    """
-    Command dlsf
-    Download videos in db with status TO_BE_DOWNLOADED OR (FAILED & fail_count < Config.)
-    """
-    with DB() as db:
-        videos = db.query_videos(pending=True)
-
-    vd = VideoDownloader()
-    vd.downloads(videos, write_db=True, discover=False)
-
-
-# --- --- --- --- --- DISCOVER --- --- --- --- ---
-def discover(tag: str, max_page: int):
-    discover_creators(tag, max_page)
-    dl_video_from_discover_creators()
+            db.insert_posts(posts)
 
+        # find all multi-part videos and update db
+        fuses = get_fuses_from_post(posts)
+        with DB() as db:
+            db.insert_fused_media(fuses)
 
-def discover_creators(tag: str, max_page: int):
-    # download posts with searched tags
-    posts = fetch_posts_by_tag(tag, max_page)
-    logging.info(f"Find {len(posts)} post")
-
-    path = str(Config.cache_file(tag))
-    posts_with_videos = parser.filter_posts_with_videos_from_json(path)
-    logging.info(f"Find {len(posts_with_videos)} posts with videos")
-
-    creators = parser.get_creators_from_posts(posts_with_videos)
-
-    # save to csv
-    file = os.path.join(Config.DISCOVER_DIR, "discover.csv")
-    with open(file, "w") as f:
-        for c in creators:
-            line = f"{c.creator_id};{c.service};{c.domain};{'to_be_treated'}\n"
-            f.write(line)
-
+        # convert all posts into videos
+        medias = []
+        for post in posts:
+            medias.extend(adapters.post_to_videos(post))
 
-def dl_video_from_discover_creators():
-    # load csv
-    file = os.path.join(Config.DISCOVER_DIR, "discover.csv")
-    with open(file, "r") as f:
-        lines = f.readlines()
+        # insert videos in db
+        with DB() as db:
+            db.insert_medias(medias)
 
-    creators = []
-    for line in lines:
-        line = line.replace("\n", "").strip().split(";")
-        creators.append(
-            Creator(creator_id=line[0], service=line[1], domain=line[2], status=line[3])
-        )
 
-    # get posts
-    for creator in creators:
-        response = requests.get(
-            URL.get_creator_post_wo_param(creator), headers=URL.get_headers()
-        )
-        if response.status_code != 200:
-            print(f"ERROR - Request {URL.get_creator_post_wo_param(creator)}")
-        response_posts = response.json()
-        posts = parser.filter_posts_with_videos_from_list(response_posts)
-        print(f"{len(posts)} found")
-        if len(posts) > 5:
-            posts = posts[0:5]
-            print("Limited posts to 5")
+def download_videos_to_be_dl(max_fail_count: int | None):
+    """
+    Download all media with PENDING status in the DB
+    """
+    with DB() as db:
+        medias = db.query_media_by_status(Status.PENDING)
 
-        for post in posts:
-            urls = parser.extract_video_urls(post)
-            url = URL.get_url_from_file(creator.domain, urls[0])
-            filename = f"{post['user']}_{post['id']}.mp4"
-            filepath = os.path.join(Config.DISCOVER_DIR, filename)
-            ytdlp_subprocess(url, filepath)
+    media_downloader = MediaDownloader()
+    media_downloader.download(medias, max_fail_count=max_fail_count)
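Taken together, 3.0.0 replaces the 2.x discover/CSV flow with two passes over the database. A sketch of the end-to-end pipeline using only this module's public functions:

    from rcdl.core import downloader

    # pass 1: fetch posts per creator, populate the posts/medias/fuses tables
    downloader.refresh_creators_videos()

    # pass 2: download everything still PENDING
    # (None keeps the Config.MAX_FAIL_COUNT retry cap)
    downloader.download_videos_to_be_dl(max_fail_count=None)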