ytcollector 1.0.8__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ytcollector/downloader.py CHANGED
@@ -1,493 +1,338 @@
1
- """
2
- YouTube Video Downloader Module
3
- yt-dlp 기반 YouTube 영상 다운로드 및 특정 구간 추출
4
- """
5
- from pathlib import Path
6
- from typing import Optional, Tuple, List, Dict
7
- import logging
1
+ import os
2
+ import time
3
+ import random
4
+ import shutil
5
+ import threading
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
7
 
9
- import yt_dlp
10
- from yt_dlp.utils import download_range_func
8
+ from yt_dlp import YoutubeDL
11
9
 
12
- from .config import (
13
- VIDEO_FORMAT,
14
- DOWNLOAD_RETRIES,
15
- CLIP_DURATION_BEFORE,
16
- CLIP_DURATION_AFTER,
17
- )
18
- from .utils import extract_video_id, get_clip_path
10
+ from .config import USER_AGENTS, CATEGORY_QUERIES, CATEGORY_NAMES, SKIP_ERRORS, BLACKLIST_KEYWORDS
11
+ from .analyzer import VideoAnalyzer
12
+ from .utils import clip_video, append_to_url_list, get_video_duration, get_next_index
19
13
 
20
- logger = logging.getLogger(__name__)
21
14
 
15
class YouTubeDownloader:
    """Search YouTube per category, download candidates, and keep the matches.

    For every category the downloader picks a search query, filters the
    search results, downloads each remaining video with yt-dlp, runs the
    analyzer on the file and then either stores it under the category folder
    (optionally trimmed) or discards it.
    """

    # Shared by all instances: serializes "pick the next free index and create
    # that file" so parallel workers cannot claim the same output name.
    _file_lock = threading.Lock()

    # Source videos longer than this many seconds are trimmed, not moved as-is.
    _TRIM_THRESHOLD_SEC = 180

    # Search results at or above this many seconds are rejected by collect().
    _SEARCH_LIMIT_SEC = 1200

    def __init__(self, output_path, max_duration=180, proxy=None, fast_mode=False, workers=3):
        """Store the configuration and make sure the output directory exists.

        Args:
            output_path: Directory that receives downloads and category folders.
            max_duration: Stored on the instance (default 180 seconds, i.e.
                3 minutes); no method of this class reads it.
            proxy: Optional proxy URL handed to yt-dlp.
            fast_mode: When true, use a single download attempt, shorter
                socket timeouts, no pacing sleeps, and (with ``workers > 1``)
                parallel processing.
            workers: Thread-pool size used in fast mode.
        """
        self.output_path = output_path
        self.max_duration = max_duration
        self.proxy = proxy
        self.fast_mode = fast_mode
        self.workers = workers
        self.analyzer = VideoAnalyzer()
        # category -> position of the next query to use from CATEGORY_QUERIES
        self.query_index = {}

        os.makedirs(output_path, exist_ok=True)

    def _get_ua(self):
        """Return a randomly chosen User-Agent string from USER_AGENTS."""
        return random.choice(USER_AGENTS)

    def _get_query(self, category):
        """Return the next search query for ``category``, cycling round-robin.

        Raises:
            KeyError: If ``category`` has no entry in CATEGORY_QUERIES.
        """
        queries = CATEGORY_QUERIES[category]
        position = self.query_index.get(category, 0)
        query = queries[position]
        self.query_index[category] = (position + 1) % len(queries)
        return query

    def _format_duration(self, seconds):
        """Format a number of seconds as ``M:SS``; falsy input yields ``"?"``."""
        if not seconds:
            return "?"
        return f"{int(seconds // 60)}:{int(seconds % 60):02d}"

    def _download_one(self, url, quiet=False):
        """Download a single video with yt-dlp.

        Args:
            url: Video URL.
            quiet: Suppress the inline progress output when true.

        Returns:
            A ``(status, filepath, title)`` tuple. ``status`` is one of
            ``"ok"``, ``"skipped"``, ``"unavailable"`` or ``"failed"``;
            ``filepath`` is ``None`` when the downloaded file could not be
            located, and both ``filepath`` and ``title`` are ``None`` for
            every non-``"ok"`` status.
        """
        archive = os.path.join(self.output_path, '.archive.txt')
        last_file = None

        def hook(d):
            # Remember the finished file name; optionally echo progress.
            nonlocal last_file
            if d['status'] == 'finished':
                last_file = d.get('filename')
            elif d['status'] == 'downloading' and not quiet:
                pct = d.get('_percent_str', '0%').strip()
                spd = d.get('_speed_str', 'N/A').strip()
                print(f"\r 다운로드: {pct} | {spd}", end='', flush=True)

        max_retries = 1 if self.fast_mode else 3

        for attempt in range(max_retries):
            opts = {
                'outtmpl': os.path.join(self.output_path, '%(title)s.%(ext)s'),
                'format': 'best[ext=mp4]/best',
                'progress_hooks': [hook],
                'quiet': True,
                'no_warnings': True,
                'download_archive': archive,
                'http_headers': {'User-Agent': self._get_ua()},
                'socket_timeout': 10 if self.fast_mode else 30,
            }
            if self.proxy:
                opts['proxy'] = self.proxy

            if attempt > 0:
                # Exponential backoff before a retry, capped at 10 seconds.
                time.sleep(min(2 ** attempt, 10))

            try:
                with YoutubeDL(opts) as ydl:
                    info = ydl.extract_info(url, download=True)
            except Exception as e:
                err = str(e).lower()
                if "already" in err or "recorded" in err:
                    return "skipped", None, None
                # err is lower-cased, so compare against lower-cased markers.
                if any(s.lower() in err for s in SKIP_ERRORS):
                    return "unavailable", None, None
                # Any other error: fall through so the next attempt can run.
                continue

            if info is None:
                return "skipped", None, None

            title = info.get('title', 'Unknown')
            if last_file and os.path.exists(last_file):
                return "ok", last_file, title

            # The hook did not report a usable path; try the templated name.
            ext = info.get('ext', 'mp4')
            path = os.path.join(self.output_path, f"{title}.{ext}")
            if os.path.exists(path):
                return "ok", path, title
            return "ok", None, title

        return "failed", None, None

    def _search(self, query, count=10):
        """Run a ``ytsearch`` query and return the result entries as a list.

        Any error is printed and reported as an empty list.
        """
        opts = {
            'quiet': True,
            'no_warnings': True,
            'extract_flat': 'in_playlist',
            'http_headers': {'User-Agent': self._get_ua()},
            'socket_timeout': 10,
        }
        if self.proxy:
            opts['proxy'] = self.proxy

        try:
            with YoutubeDL(opts) as ydl:
                result = ydl.extract_info(f"ytsearch{count}:{query}", download=False)
            if not result:
                return []
            # list() stays inside the try: the entries may be produced lazily.
            return list(result.get('entries') or [])
        except Exception as e:
            print(f" 검색 에러: {e}")
            return []

    def _get_duration(self, video_id):
        """Return the video's duration from its metadata, or ``None`` on failure."""
        url = f"https://www.youtube.com/watch?v={video_id}"
        opts = {
            'quiet': True,
            'no_warnings': True,
            'http_headers': {'User-Agent': self._get_ua()},
            'socket_timeout': 5,
        }
        if self.proxy:
            opts['proxy'] = self.proxy

        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(url, download=False)
        except Exception:
            # Best effort: an unknown duration makes collect() drop the entry.
            return None
        return info.get('duration') if info else None

    @staticmethod
    def _detected_labels(analysis):
        """Return the display labels for every detector that fired."""
        labels = []
        if analysis.get('face'):
            labels.append(f"얼굴({analysis.get('face_count', '?')})")
        if analysis.get('text'):
            labels.append("텍스트")
        if analysis.get('license_plate'):
            labels.append("번호판")
        if analysis.get('tattoo'):
            labels.append("타투")
        return labels

    def _store_match(self, filepath, url, category, cat_name, analysis):
        """Record the URL and place a matching video under the category folder.

        Returns:
            The destination path chosen for the stored file.
        """
        # 1. Append to the per-category URL list (path is relative to the CWD).
        # NOTE(review): called without a lock, as in the original; confirm
        # append_to_url_list is safe under parallel workers.
        url_file_path = f"youtube_url_{category}.txt"
        ts = analysis.get('first_detection_ts', '00:00')
        append_to_url_list(url_file_path, url, ts, category)

        # 2. Move/trim into <output>/<cat_name>/<prefix>_NNNN.mp4.
        dest_dir = os.path.join(self.output_path, cat_name)
        os.makedirs(dest_dir, exist_ok=True)

        # File-name prefix: 'license_plate' is shortened to 'license'.
        prefix = category.replace('license_plate', 'license')

        # Hold the lock until the destination file exists; otherwise another
        # worker could be handed the same index before this file is written.
        with self._file_lock:
            idx = get_next_index(dest_dir, prefix)
            dest = os.path.join(dest_dir, f"{prefix}_{idx:04d}.mp4")

            duration = get_video_duration(filepath)
            if duration and duration > self._TRIM_THRESHOLD_SEC:
                print(f" ✂ 3분 초과 영상 자동 트리밍 ({self._format_duration(duration)} -> 3:00)")
                # NOTE(review): clip_video is defined elsewhere; whether it
                # removes `filepath` or can fail silently is not visible here.
                clip_video(filepath, dest, analysis.get('first_detection_sec', 0))
            elif not os.path.exists(dest):
                shutil.move(filepath, dest)
        return dest

    def _process_video(self, entry, category, cat_name):
        """Download one search entry, analyze it, and store or discard it.

        Args:
            entry: Search-result dict; ``id`` and ``title`` are read.
            category: Category key looked up in the analysis result.
            cat_name: Folder name for stored matches.

        Returns:
            A dict with ``title``, ``status`` and ``saved``; after a
            successful download also ``detected`` and, depending on the
            outcome, ``new_path`` or ``undetected_saved``.
        """
        vid = entry.get('id')
        url = f"https://www.youtube.com/watch?v={vid}"
        # Flat search entries may carry title=None, so `or` instead of a default.
        title = (entry.get('title') or '?')[:45]

        status, filepath, _ = self._download_one(url, quiet=True)
        result_info = {'title': title, 'status': status, 'saved': False}

        if status != "ok" or not filepath:
            return result_info

        print(" 🔍 분석 중...")
        analysis = self.analyzer.analyze(filepath)
        result_info['detected'] = self._detected_labels(analysis)

        if analysis.get(category):
            result_info['new_path'] = self._store_match(
                filepath, url, category, cat_name, analysis
            )
            result_info['saved'] = True
        elif category == 'license_plate':
            # Undetected license-plate videos are kept in a separate folder.
            dest_dir = os.path.join(self.output_path, "번호판_미감지")
            os.makedirs(dest_dir, exist_ok=True)
            dest = os.path.join(dest_dir, os.path.basename(filepath))
            if not os.path.exists(dest):
                shutil.move(filepath, dest)
            result_info['undetected_saved'] = True
        else:
            # Best-effort cleanup of a video that did not match.
            try:
                os.remove(filepath)
            except OSError:
                pass

        return result_info

    @staticmethod
    def _result_lines(result, cat_name):
        """Return the console lines that describe one processing result."""
        status = result['status']
        if status == "skipped":
            return [" ⏭ 이미 있음"]
        if status == "unavailable":
            return [" ⏭ 사용불가"]
        if status != "ok":
            return [" ✗ 실패"]

        lines = []
        if result.get('detected'):
            lines.append(f" 감지: {', '.join(result['detected'])}")
        if result['saved']:
            new_name = os.path.basename(result['new_path'])
            lines.append(f" ✅ 저장: {cat_name}/{new_name}")
        elif result.get('undetected_saved'):
            lines.append(" 📁 미감지 보관")
        else:
            lines.append(" ❌ 미감지 삭제")
        return lines

    def _report(self, result, cat_name):
        """Print one result and return 1 if a video was stored, else 0."""
        for line in self._result_lines(result, cat_name):
            print(line)
        return 1 if result['status'] == "ok" and result['saved'] else 0

    def collect(self, category, max_videos=5):
        """Collect up to ``max_videos`` videos for ``category``.

        Searches with the next query for the category, filters the results
        by blacklist keywords and duration, then downloads and analyzes the
        remaining entries (in parallel when ``fast_mode`` and ``workers > 1``).

        Returns:
            The number of videos stored under the category folder.

        Raises:
            KeyError: If ``category`` is missing from CATEGORY_NAMES or
                CATEGORY_QUERIES.
        """
        cat_name = CATEGORY_NAMES[category]
        query = self._get_query(category)

        print(f"\n{'='*60}")
        print(f"[{cat_name}] 검색: {query}")
        mode = "⚡ 고속" if self.fast_mode else "일반"
        # The search-time limit is relaxed to 20 minutes to widen the pool.
        search_limit = self._SEARCH_LIMIT_SEC
        print(f"목표: {max_videos}개 | 검색제한: {self._format_duration(search_limit)} | {mode}")
        print('='*60)

        # Search
        entries = self._search(query, max_videos * 3)
        if not entries:
            print("검색 결과 없음")
            return 0

        print(f"검색됨: {len(entries)}개")

        # Filter
        blacklist = BLACKLIST_KEYWORDS.get(category, [])
        filtered = []
        for entry in entries:
            if not entry:
                continue

            vid = entry.get('id')
            title = entry.get('title') or ''

            # Blacklist first: it is free, a duration lookup is a network call.
            if any(kw in title for kw in blacklist):
                print(f" ✗ [제외] {title[:40]}...")
                continue

            dur = entry.get('duration')
            if not dur and vid:
                dur = self._get_duration(vid)

            # Drop videos that are too long; unknown durations are dropped too.
            if dur and dur < search_limit:
                filtered.append(entry)
                print(f" ✓ [{self._format_duration(dur)}] {title}")
                if len(filtered) >= max_videos:
                    break
            elif dur:
                print(f" ✗ [{self._format_duration(dur)}] (너무 filter됨)")

            if not self.fast_mode:
                time.sleep(0.3)

        if not filtered:
            print("조건 맞는 영상 없음")
            return 0

        total = len(filtered)
        print(f"\n다운로드 분석: {total}개" + (" (병렬)" if self.fast_mode else ""))
        success = 0

        if self.fast_mode and self.workers > 1:
            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                futures = {
                    executor.submit(self._process_video, entry, category, cat_name): entry
                    for entry in filtered
                }
                for i, future in enumerate(as_completed(futures), start=1):
                    title = (futures[future].get('title') or '?')[:45]
                    print(f"\n[{i}/{total}] {title}")
                    try:
                        result = future.result()
                    except Exception as e:
                        print(f" ✗ 에러: {e}")
                        continue
                    success += self._report(result, cat_name)
        else:
            for i, entry in enumerate(filtered, start=1):
                title = (entry.get('title') or '?')[:45]
                print(f"\n[{i}/{total}] {title}")

                # One bad video must not abort the rest of the batch.
                try:
                    result = self._process_video(entry, category, cat_name)
                except Exception as e:
                    print(f" ✗ 에러: {e}")
                else:
                    success += self._report(result, cat_name)

                if not self.fast_mode:
                    # Randomized pause between sequential downloads.
                    time.sleep(random.uniform(0.5, 1.5))

        return success