ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytcollector/__init__.py +36 -11
- ytcollector/analyzer.py +205 -0
- ytcollector/cli.py +186 -218
- ytcollector/config.py +66 -62
- ytcollector/dataset_builder.py +136 -0
- ytcollector/downloader.py +328 -480
- ytcollector-1.0.9.dist-info/METADATA +207 -0
- ytcollector-1.0.9.dist-info/RECORD +11 -0
- ytcollector-1.0.9.dist-info/entry_points.txt +4 -0
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/top_level.txt +0 -1
- config/settings.py +0 -39
- ytcollector/utils.py +0 -144
- ytcollector/verifier.py +0 -187
- ytcollector-1.0.8.dist-info/METADATA +0 -105
- ytcollector-1.0.8.dist-info/RECORD +0 -12
- ytcollector-1.0.8.dist-info/entry_points.txt +0 -2
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/WHEEL +0 -0
ytcollector/downloader.py
CHANGED
@@ -1,493 +1,341 @@
-[… old lines 1–61 not rendered in the source view; only fragments survive: "from", "import", "import", "from", "self." …]
-        search_query = f"ytsearch{max_results}:{query}"
-        logger.info(f"Searching YouTube for: '{query}' (Max {max_results} results)")
-
-        results = []
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+import os
+import time
+import random
+import shutil
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from yt_dlp import YoutubeDL
+
+from .config import USER_AGENTS, CATEGORY_QUERIES, CATEGORY_NAMES, SKIP_ERRORS
+from .analyzer import VideoAnalyzer
+
+
+class YouTubeDownloader:
+    """YouTube downloader class"""
+
+    def __init__(self, output_path, max_duration=180, proxy=None, fast_mode=False, workers=3):
+        self.output_path = output_path
+        self.max_duration = max_duration
+        self.proxy = proxy
+        self.fast_mode = fast_mode
+        self.workers = workers
+        self.analyzer = VideoAnalyzer()
+        self.query_index = {}
+
+        os.makedirs(output_path, exist_ok=True)
+
+    def _get_ua(self):
+        return random.choice(USER_AGENTS)
+
+    def _get_query(self, category):
+        """Return search queries in rotation"""
+        if category not in self.query_index:
+            self.query_index[category] = 0
+
+        queries = CATEGORY_QUERIES[category]
+        query = queries[self.query_index[category]]
+        self.query_index[category] = (self.query_index[category] + 1) % len(queries)
+        return query
+
+    def _format_duration(self, seconds):
+        if not seconds:
+            return "?"
+        return f"{int(seconds // 60)}:{int(seconds % 60):02d}"
+
+    def _download_one(self, url, quiet=False):
+        """Download a single video"""
+        archive = os.path.join(self.output_path, '.archive.txt')
+        last_file = None
+
+        def hook(d):
+            nonlocal last_file
+            if d['status'] == 'finished':
+                last_file = d.get('filename')
+            elif d['status'] == 'downloading' and not quiet:
+                pct = d.get('_percent_str', '0%').strip()
+                spd = d.get('_speed_str', 'N/A').strip()
+                print(f"\r  Downloading: {pct} | {spd}", end='', flush=True)
+
+        max_retries = 1 if self.fast_mode else 3
+
+        for attempt in range(max_retries):
             try:
-[… old lines 68–76 not rendered in the source view …]
+                opts = {
+                    'outtmpl': os.path.join(self.output_path, '%(title)s.%(ext)s'),
+                    'format': 'best[ext=mp4]/best',
+                    'progress_hooks': [hook],
+                    'quiet': True,
+                    'no_warnings': True,
+                    'download_archive': archive,
+                    'http_headers': {'User-Agent': self._get_ua()},
+                    'socket_timeout': 10 if self.fast_mode else 30,
+                }
+
+                if self.proxy:
+                    opts['proxy'] = self.proxy
+
+                if attempt > 0:
+                    time.sleep(min(2 ** attempt, 10))
+
+                with YoutubeDL(opts) as ydl:
+                    info = ydl.extract_info(url, download=True)
+                    if info is None:
+                        return "skipped", None, None
+
+                    title = info.get('title', 'Unknown')
+
+                    if last_file and os.path.exists(last_file):
+                        return "ok", last_file, title
+
+                    ext = info.get('ext', 'mp4')
+                    path = os.path.join(self.output_path, f"{title}.{ext}")
+                    if os.path.exists(path):
+                        return "ok", path, title
+
+                    return "ok", None, title
+
             except Exception as e:
-[… old lines 78–89 not rendered in the source view …]
-        return start, end
-
-    def download_segment(
-        self,
-        url: str,
-        start_sec: int,
-        end_sec: int,
-        output_path: Optional[Path] = None
-    ) -> Path:
-        """Download only a specific segment"""
-        video_id = extract_video_id(url)
-
-        if output_path is None:
-            filename = f"{video_id}_{start_sec}-{end_sec}"
-            output_path = get_clip_path(self.base_dir, self.task_type, filename)
-
-        output_template = str(output_path).replace('.mp4', '')
-
-        ydl_opts = {
-            'format': VIDEO_FORMAT,
-            'outtmpl': f"{output_template}.%(ext)s",
-            'retries': DOWNLOAD_RETRIES,
-            'quiet': False,
-            'no_warnings': False,
-            'download_ranges': download_range_func(None, [(start_sec, end_sec)]),
-            'force_keyframes_at_cuts': True,
-        }
-
-        # Use ffmpeg from imageio-ffmpeg
-        try:
-            import imageio_ffmpeg
-            ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
-            ydl_opts['ffmpeg_location'] = ffmpeg_path
-            logger.debug(f"Using ffmpeg from: {ffmpeg_path}")
-        except ImportError:
-            logger.warning("imageio-ffmpeg not found, relying on system ffmpeg")
-
-        logger.info(f"Downloading segment [{start_sec}s - {end_sec}s] from: {url}")
-
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([url])
-
-        return output_path
-
-    def get_saved_video_count(self) -> int:
-        """Count the videos saved in the current task folder"""
-        from .utils import get_task_video_count
-        return get_task_video_count(self.base_dir, self.task_type)
-
-    def _get_history_key(self, url: str, timestamp_min: int, timestamp_sec: int) -> str:
-        """Build a history key"""
-        video_id = extract_video_id(url)
-        return f"{self.task_type}_{video_id}_{timestamp_min}_{timestamp_sec}"
-
-    def download_clip_at_timestamp(
-        self,
-        url: str,
-        timestamp_min: int,
-        timestamp_sec: int,
-        skip_verify: bool = False
-    ) -> Tuple[Optional[Path], Optional[dict]]:
-        """
-        Download a ±1:30 clip around the given timestamp
-        1. Download into a temp folder
-        2. YOLO verification (check whether the target object is present)
-        3. On pass, move to the final path (task_xxxx.mp4)
-        """
-        from .config import MAX_VIDEOS_PER_TASK
-        from .utils import load_history, save_history, get_clip_path, ensure_dir
-        from .verifier import verify_clip
-        import shutil
-
-        target_sec = timestamp_min * 60 + timestamp_sec
-        history_key = self._get_history_key(url, timestamp_min, timestamp_sec)
-
-        # 1. History-based dedup check
-        history = load_history(self.base_dir)
-        if history_key in history:
-            saved_path = history[history_key].get('output_path', 'unknown')
-            # Ideally also check that the file actually exists
-            if Path(saved_path).exists():
-                logger.info(f"Skipping download (already in history): {saved_path}")
-                return Path(saved_path), {'cached': True, 'output_path': saved_path}
-
-        # 2. Check the count limit
-        current_count = self.get_saved_video_count()
-        if current_count >= MAX_VIDEOS_PER_TASK:
-            msg = f"Task limit reached ({current_count}/{MAX_VIDEOS_PER_TASK}). Stopping download."
-            logger.warning(msg)
-            raise LimitReachedError(msg)
-
-        # 3. Fetch video info
-        try:
-            info = self.get_video_info(url)
-        except Exception as e:
-            logger.error(f"Failed to get video info: {e}")
-            raise
-
-        video_duration = info.get('duration', 0)
-
-        if video_duration == 0:
-            raise ValueError(f"Cannot get video duration for: {url}")
-
-        start_sec, end_sec = self.calculate_clip_range(target_sec, video_duration)
-        clip_duration = end_sec - start_sec
-
-        logger.info(
-            f"Target: {timestamp_min}:{timestamp_sec:02d}, "
-            f"Clip range: {start_sec}s - {end_sec}s (duration: {clip_duration}s)"
-        )
-
-        # 4. Download to a temp file
-        # Create the temp folder
-        temp_dir = ensure_dir(self.base_dir / "temp")
-        video_id = extract_video_id(url)
-        temp_filename = f"temp_{video_id}_{timestamp_min}_{timestamp_sec}.mp4"
-        temp_path = temp_dir / temp_filename
-
+                err = str(e).lower()
+
+                if "already" in err or "recorded" in err:
+                    return "skipped", None, None
+
+                if any(s in err for s in SKIP_ERRORS):
+                    return "unavailable", None, None
+
+                return "failed", None, None
+
+    def _search(self, query, count=10):
+        """Search for videos"""
         try:
-[… old lines 209–214 not rendered in the source view …]
-            verify_result = verify_clip(temp_path, self.task_type, self.base_dir)
-
-            logger.info(f"Verification Info - Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}, Is Valid: {verify_result.get('is_valid')}")
-
-            if not verify_result.get('is_valid', False):
-                logger.warning(f"Verification failed: No {self.task_type} detected (Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}). Deleting...")
-                if temp_path.exists():
-                    temp_path.unlink()
-                return None, None
-            else:
-                logger.info(f"Skipping verification for task: {self.task_type} (--skip-verify enabled)")
-                verify_result = {'is_valid': True, 'skipped': True}
-
-            # 6. Verification passed (or skipped) -> final save (sequential filename)
-            final_path = get_clip_path(self.base_dir, self.task_type, filename=None)
-
-            # Move (use shutil.move for network drives)
-            shutil.move(str(temp_path), str(final_path))
-            logger.info(f"Saved verified video to: {final_path}")
-
-            metadata = {
-                **info,
-                'target_timestamp_sec': target_sec,
-                'clip_start_sec': start_sec,
-                'clip_end_sec': end_sec,
-                'clip_duration': clip_duration,
-                'output_path': str(final_path),
-                'timestamp': timestamp_min * 60 + timestamp_sec,
-                'verification': verify_result
+            opts = {
+                'quiet': True,
+                'no_warnings': True,
+                'extract_flat': 'in_playlist',
+                'http_headers': {'User-Agent': self._get_ua()},
+                'socket_timeout': 10,
            }
-
-
-            history = load_history(self.base_dir)
-            history[history_key] = metadata
-            save_history(self.base_dir, history)
-
-            return final_path, metadata
-
-        except Exception as e:
-            logger.error(f"Error during processing: {e}")
-            # Clean up the temp file on error
-            if temp_path.exists():
-                temp_path.unlink()
-            raise
-
-
-def parse_txt_line(line: str) -> Optional[Dict]:
-    """
-    Parse one line of the text file
-    Format: task_type,url,timestamp_min,timestamp_sec,description
-    """
-    parts = [p.strip() for p in line.split(',')]
-    if len(parts) < 4:
-        return None
-
-    # Header check
-    if parts[0] == 'task_type' and parts[2] == 'timestamp_min':
-        return None
-
-    try:
-        return {
-            'task_type': parts[0],
-            'url': parts[1],
-            'timestamp_min': int(parts[2]),
-            'timestamp_sec': int(parts[3]),
-            'description': parts[4] if len(parts) > 4 else ''
-        }
-    except ValueError:
-        return None
-
-
-def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
-    """Run downloads from a TXT file (sequential)"""
-    return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count, skip_verify=skip_verify)
-
-
-def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
-    """Run parallel downloads from a TXT file (Fast Mode)"""
-    return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count, skip_verify=skip_verify)
-
-
-def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None, skip_verify: bool = False) -> list:
-    from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
-    import time
-    import random
-    from concurrent.futures import ThreadPoolExecutor, as_completed
-
-    # Fall back to the config default when max_count is not given
-    limit = max_count if max_count is not None else MAX_VIDEOS_PER_TASK
-
-    results = []
-    downloader = VideoDownloader(task_type, base_dir)
-
-    # Check the count before starting
-    initial_count = downloader.get_saved_video_count()
-    if initial_count >= limit:
-        logger.warning(f"Task '{task_type}' already has {initial_count} videos (Limit: {limit}). Skipping.")
-        return results
-
-    if not txt_path.exists():
-        logger.error(f"File not found: {txt_path}")
-        return results
-
-    lines = txt_path.read_text(encoding='utf-8').splitlines()
-    items = []
-
-    for line in lines:
-        if not line.strip() or line.startswith('#'):
-            continue
-        data = parse_txt_line(line)
-        if data and data['task_type'] == task_type:
-            items.append(data)
-
-    if not items:
-        return results
-
-    logger.info(f"Found {len(items)} URLs. Target: {limit} videos (Current: {initial_count}). Starting {'parallel' if parallel else 'sequential'} download...")
-
-    def process_item(data):
-        # Check the current count (to stop mid-loop once the target is reached)
-        # Note: exact count synchronization under parallelism would need a Lock, but a rough check is enough here
-        current = downloader.get_saved_video_count()
-        if current >= limit:
-            raise LimitReachedError("Target count reached")
-
-        # Random delay for firewall evasion (also applied in parallel mode to ease bursts of concurrent requests)
-        if parallel:
-            time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))
-
-        try:
-            # VideoDownloader's internal limit check uses the config value,
-            # so either the externally injected limit would need to be enforced here,
-            # or it is simply controlled at the loop level.
-            # download_clip_at_timestamp checks MAX_VIDEOS_PER_TASK internally,
-            # so bypass that or handle it with simple loop control.
-
-            output_path, metadata = downloader.download_clip_at_timestamp(
-                url=data['url'],
-                timestamp_min=data['timestamp_min'],
-                timestamp_sec=data['timestamp_sec'],
-                skip_verify=skip_verify
-            )
-
-            if output_path is None:
-                return {
-                    'success': False,
-                    'url': data['url'],
-                    'error': 'Verification failed',
-                    'status': 'skipped'
-                }
+            if self.proxy:
+                opts['proxy'] = self.proxy
 
-[… old lines 366–369 not rendered in the source view …]
-                    'metadata': metadata,
-                    'status': 'cached'
-                }
-
-            return {
-                'success': True,
-                'output_path': str(output_path),
-                'metadata': metadata,
-                'status': 'downloaded'
-            }
-
-        except LimitReachedError:
-            # Also handle a LimitReachedError raised internally
-            return {'success': False, 'error': 'Limit reached', 'status': 'limit_reached'}
-
+            with YoutubeDL(opts) as ydl:
+                result = ydl.extract_info(f"ytsearch{count}:{query}", download=False)
+
+            return list(result.get('entries', [])) if result else []
        except Exception as e:
-[… old lines 386–392 not rendered in the source view …]
+            print(f"  Search error: {e}")
+            return []
+
+    def _get_duration(self, video_id):
+        """Fetch the video duration"""
+        try:
+            url = f"https://www.youtube.com/watch?v={video_id}"
+            opts = {
+                'quiet': True,
+                'no_warnings': True,
+                'http_headers': {'User-Agent': self._get_ua()},
+                'socket_timeout': 5,
            }
+            if self.proxy:
+                opts['proxy'] = self.proxy
+
+            with YoutubeDL(opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+                return info.get('duration')
+        except:
+            return None
+
+    def _process_video(self, entry, category, cat_name):
+        """Process a single video (download + analyze)"""
+        vid = entry.get('id')
+        url = f"https://www.youtube.com/watch?v={vid}"
+        title = entry.get('title', '?')[:45]
+
+        status, filepath, _ = self._download_one(url, quiet=True)
 
-[… old lines 395–403 not rendered in the source view; only the fragment "if" survives …]
+        result_info = {'title': title, 'status': status, 'saved': False}
+
+        if status == "ok" and filepath:
+            analysis = self.analyzer.analyze(filepath)
+
+            detected = []
+            if analysis['face']:
+                detected.append(f"face({analysis['face_count']})")
+            if analysis['text']:
+                detected.append("text")
+            if analysis['license_plate']:
+                detected.append("license plate")
+            if analysis['tattoo']:
+                detected.append("tattoo")
+
+            result_info['detected'] = detected
+
+            if analysis.get(category):
+                dest_dir = os.path.join(self.output_path, cat_name)
+                os.makedirs(dest_dir, exist_ok=True)
+                dest = os.path.join(dest_dir, os.path.basename(filepath))
+                if not os.path.exists(dest):
+                    shutil.move(filepath, dest)
+                result_info['saved'] = True
+            else:
+                if category == 'license_plate':
+                    dest_dir = os.path.join(self.output_path, "license_plate_undetected")
+                    os.makedirs(dest_dir, exist_ok=True)
+                    dest = os.path.join(dest_dir, os.path.basename(filepath))
+                    if not os.path.exists(dest):
+                        shutil.move(filepath, dest)
+                    result_info['undetected_saved'] = True
+                else:
                    try:
-[… old lines 405–420 not rendered in the source view …]
+                        os.remove(filepath)
+                    except:
+                        pass
+
+        return result_info
+
+    def collect(self, category, max_videos=5):
+        """Collect videos for a category"""
+        cat_name = CATEGORY_NAMES[category]
+        query = self._get_query(category)
+
+        print(f"\n{'='*60}")
+        print(f"[{cat_name}] Search: {query}")
+        mode = "⚡ fast" if self.fast_mode else "normal"
+        print(f"Target: {max_videos} | Max length: {self._format_duration(self.max_duration)} | {mode}")
+        print('='*60)
+
+        # Search
+        entries = self._search(query, max_videos * 3)
+        if not entries:
+            print("No search results")
+            return 0
+
+        print(f"Found: {len(entries)}")
+
+        # Filter by duration
+        filtered = []
+        for entry in entries:
+            if not entry:
+                continue
+
+            vid = entry.get('id')
+            title = entry.get('title', '?')[:40]
+            dur = entry.get('duration') or self._get_duration(vid)
+
+            if dur and dur < self.max_duration:
+                filtered.append(entry)
+                print(f"  ✓ [{self._format_duration(dur)}] {title}")
+                if len(filtered) >= max_videos:
                    break
+            elif dur:
+                print(f"  ✗ [{self._format_duration(dur)}] {title}")
+
+            if not self.fast_mode:
+                time.sleep(0.3)
+
+        if not filtered:
+            print("No videos matched the criteria")
+            return 0
+
+        print(f"\nDownloading: {len(filtered)}" + (" (parallel)" if self.fast_mode else ""))
+        success = 0
+
+        if self.fast_mode and self.workers > 1:
+            # Parallel download
+            with ThreadPoolExecutor(max_workers=self.workers) as executor:
+                futures = {
+                    executor.submit(self._process_video, entry, category, cat_name): entry
+                    for entry in filtered
+                }
+
+                for i, future in enumerate(as_completed(futures)):
+                    entry = futures[future]
+                    title = entry.get('title', '?')[:45]
 
-    # --- Stage 2: YouTube search fallback when the target count is not met (repeated attempts) ---
-    max_search_attempts = 5  # maximum number of search attempts
-    search_attempt = 0
-    processed_urls = set(data['url'] for data in items)  # avoid re-searching URLs that were already processed
-
-    while downloader.get_saved_video_count() < limit and search_attempt < max_search_attempts:
-        current_count = downloader.get_saved_video_count()
-        remaining = limit - current_count
-
-        logger.info(f"\n[Search Attempt {search_attempt+1}] Target not reached ({current_count}/{limit}). Searching YouTube for '{task_type}'...")
-
-        # Search term: the task name
-        # The number of search results can be increased gradually or tuned
-        search_results = downloader.search_youtube(task_type, max_results=min(100, remaining * 5))
-
-        if not search_results:
-            logger.warning("No more search results found.")
-            break
-
-        # Keep only new URLs
-        new_entries = [e for e in search_results if e['url'] not in processed_urls]
-        if not new_entries:
-            logger.info("No new unique videos found in this search attempt.")
-            search_attempt += 1
-            continue
-
-        search_items = []
-        for entry in new_entries:
-            processed_urls.add(entry['url'])
-            # Search results carry no timestamp info, so several points could be tried (fixed at the 1-minute mark for now)
-            search_items.append({
-                'task_type': task_type,
-                'url': entry['url'],
-                'timestamp_min': 1,
-                'timestamp_sec': 0,
-                'description': f"Auto-searched: {entry['title']}"
-            })
-
-        logger.info(f"Processing {len(search_items)} new search results...")
-
-        if parallel:
-            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-                futures = [executor.submit(process_item, item) for item in search_items]
-                from tqdm import tqdm
-                for future in tqdm(as_completed(futures), total=len(search_items), desc=f"Search Fallback #{search_attempt+1}"):
                    try:
-[… old lines 469–475 not rendered in the source view …]
+                        result = future.result()
+                        print(f"\n[{i+1}/{len(filtered)}] {title}")
+
+                        if result['status'] == "ok":
+                            if result.get('detected'):
+                                print(f"  Detected: {', '.join(result['detected'])}")
+                            if result['saved']:
+                                print(f"  ✅ Saved: {cat_name}/")
+                                success += 1
+                            elif result.get('undetected_saved'):
+                                print("  📁 Kept (target not detected)")
+                            else:
+                                print("  ❌ Deleted (target not detected)")
+                        elif result['status'] == "skipped":
+                            print("  ⏭ Already downloaded")
+                        elif result['status'] == "unavailable":
+                            print("  ⏭ Unavailable")
+                        else:
+                            print("  ✗ Failed")
+                    except Exception as e:
+                        print(f"\n[{i+1}/{len(filtered)}] {title}")
+                        print(f"  ✗ Error: {e}")
        else:
-[… old lines 477–493 not rendered in the source view …]
+            # Sequential download
+            for i, entry in enumerate(filtered):
+                vid = entry.get('id')
+                url = f"https://www.youtube.com/watch?v={vid}"
+                title = entry.get('title', '?')[:45]
+
+                print(f"\n[{i+1}/{len(filtered)}] {title}")
+
+                status, filepath, _ = self._download_one(url)
+                if not self.fast_mode:
+                    print()
+
+                if status == "ok" and filepath:
+                    print("  🔍 Analyzing...")
+                    result = self.analyzer.analyze(filepath)
+
+                    detected = []
+                    if result['face']:
+                        detected.append(f"face({result['face_count']})")
+                    if result['text']:
+                        detected.append("text")
+                    if result['license_plate']:
+                        detected.append("license plate")
+                    if result['tattoo']:
+                        detected.append("tattoo")
+
+                    if detected:
+                        print(f"  Detected: {', '.join(detected)}")
+
+                    if result.get(category):
+                        dest_dir = os.path.join(self.output_path, cat_name)
+                        os.makedirs(dest_dir, exist_ok=True)
+                        dest = os.path.join(dest_dir, os.path.basename(filepath))
+                        if not os.path.exists(dest):
+                            shutil.move(filepath, dest)
+                        print(f"  ✅ Saved: {cat_name}/")
+                        success += 1
+                    else:
+                        if category == 'license_plate':
+                            dest_dir = os.path.join(self.output_path, "license_plate_undetected")
+                            os.makedirs(dest_dir, exist_ok=True)
+                            dest = os.path.join(dest_dir, os.path.basename(filepath))
+                            if not os.path.exists(dest):
+                                shutil.move(filepath, dest)
+                            print("  📁 Kept (target not detected)")
+                        else:
+                            try:
+                                os.remove(filepath)
+                            except:
+                                pass
+                            print("  ❌ Deleted (target not detected)")
+
+                elif status == "skipped":
+                    print("  ⏭ Already downloaded")
+                elif status == "unavailable":
+                    print("  ⏭ Unavailable")
+                else:
+                    print("  ✗ Failed")
+
+                if not self.fast_mode:
+                    time.sleep(random.uniform(0.5, 1.5))
+
+        return success
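
For orientation, the rewritten module reduces to a two-step pipeline: `_search` plus a duration filter picks candidates, then `_download_one` plus `VideoAnalyzer.analyze` keeps only clips where the target category is detected. A minimal driver sketch follows; the constructor and `collect` signatures are taken from the diff above, but the output path, category key, and import path are illustrative assumptions (the real entry point lives in `ytcollector/cli.py`, which is not shown here):

# Hypothetical driver; only the YouTubeDownloader API is taken from the diff.
from ytcollector.downloader import YouTubeDownloader

downloader = YouTubeDownloader(
    output_path="./collected",  # hypothetical path
    max_duration=180,           # diff default: skip videos of 3 minutes or longer
    fast_mode=True,             # parallel downloads via ThreadPoolExecutor
    workers=3,
)
saved = downloader.collect("license_plate", max_videos=5)  # category key used in the diff
print(f"{saved} videos saved")

Note that `_download_one` retries up to 3 times (1 in fast mode) with capped exponential backoff, `time.sleep(min(2 ** attempt, 10))`, and deduplicates via yt-dlp's `download_archive` file at `.archive.txt`.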
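
`ytcollector/analyzer.py` is a new file (+205 lines) whose body is not part of this diff, so the `VideoAnalyzer` interface can only be inferred from the call sites above: `analyze(filepath)` must return a dict with at least the truthy keys `face`, `text`, `license_plate`, `tattoo` plus a `face_count`, and `_process_video` also reads `analysis.get(category)` by category key. A stub capturing that inferred contract; the detection logic itself is hypothetical:

# Inferred-interface stub only; the real analyzer.py is not shown in this diff.
class VideoAnalyzer:
    def analyze(self, filepath):
        # The real implementation presumably runs face/text/plate/tattoo
        # detectors over sampled frames; here every key the downloader
        # reads is stubbed with a negative result.
        return {
            'face': False,
            'face_count': 0,
            'text': False,
            'license_plate': False,
            'tattoo': False,
        }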
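
Likewise, the shapes of the four `config` imports follow from how they are used: `random.choice(USER_AGENTS)` needs a sequence of UA strings, `CATEGORY_QUERIES[category]` a per-category list of rotating search queries, `CATEGORY_NAMES[category]` a display name that doubles as the destination folder, and `SKIP_ERRORS` an iterable of substrings matched against lowercased exception text. A sketch with placeholder values; the actual entries in `ytcollector/config.py` are not in this diff:

# Placeholder values; only the data shapes are implied by downloader.py.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",  # truncated example string
]

CATEGORY_QUERIES = {
    "license_plate": ["dashcam driving", "parking lot walkaround"],  # hypothetical queries
}

CATEGORY_NAMES = {
    "license_plate": "license_plate",  # folder name created under output_path
}

SKIP_ERRORS = ("private", "unavailable", "removed")  # substrings of str(e).lower()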