ytcollector 1.0.8__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ytcollector/cli.py CHANGED
@@ -1,234 +1,202 @@
  #!/usr/bin/env python3
- """
- SBS Dataset Collector CLI (Updated)
- """
- import argparse
- import logging
- from pathlib import Path
+ """YouTube content collector - CLI module"""

- # Package import modified to 'downloader'
- from .config import TASK_CLASSES, VALID_TASKS, get_paths
- from .utils import ensure_dir, get_url_file_path
+ import argparse
+ import os
+ import sys

- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- )
- logger = logging.getLogger(__name__)
+ from .config import CATEGORY_NAMES
+ from .downloader import YouTubeDownloader
+ from .analyzer import check_dependencies


- def init_project(args):
-     """Initialize the project directory."""
-     base_dir = Path(args.dir) if args.dir else Path.cwd()
-
-     # 1. Create youtube_url.txt in each task folder
-     for task in VALID_TASKS:
-         # get_url_file_path calls ensure_dir internally, which creates the folder
-         txt_path = get_url_file_path(base_dir, task)
-
-         if not txt_path.exists():
-             txt_path.write_text(
-                 "task_type,url,timestamp_min,timestamp_sec,description\n"
-                 f"{task},https://www.youtube.com/watch?v=EXAMPLE,2,30,샘플\n",
-                 encoding='utf-8'
-             )
-
-     # 2. Make sure the folders defined by the get_paths logic in config.py exist
-     paths = get_paths(base_dir)
-     ensure_dir(paths['outputs'])
-
-     print(f"✓ Project initialized at: {base_dir}")
-     print(f" - Add URLs to: urls/<task>/youtube_url.txt")
-     print(f" - Videos will be saved to configured OUTPUT_DIR (or video/ folder)")
+ def create_parser():
+     """Create the CLI argument parser."""
+     parser = argparse.ArgumentParser(
+         prog='ytcollector',
+         description='YouTube 콘텐츠 수집기 - 얼굴, 번호판, 타투, 텍스트 감지',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ 예시:
+   ytcollector -c face                # 얼굴 카테고리 5개
+   ytcollector -c face text -n 10     # 얼굴, 텍스트 각 10개
+   ytcollector -c face --fast         # 고속 모드 (병렬 다운로드)
+   ytcollector -c face --fast -w 5    # 5개 동시 다운로드
+   ytcollector -c license_plate -d 5  # 번호판, 최대 5분
+
+   # 짧은 명령어도 사용 가능
+   ytc -c face -n 3
+ """
+     )

+     parser.add_argument(
+         '-c', '--categories',
+         nargs='+',
+         choices=['face', 'license_plate', 'tattoo', 'text'],
+         default=['face'],
+         help='수집할 카테고리 (기본: face)'
+     )
+     parser.add_argument(
+         '-n', '--count',
+         type=int,
+         default=5,
+         help='카테고리당 다운로드 수 (기본: 5)'
+     )
+     parser.add_argument(
+         '-d', '--duration',
+         type=int,
+         default=3,
+         help='최대 영상 길이(분) (기본: 3)'
+     )
+     parser.add_argument(
+         '-o', '--output',
+         type=str,
+         default=".",
+         help='저장 경로 (기본: 현재 폴더)'
+     )
+     parser.add_argument(
+         '--fast',
+         action='store_true',
+         help='고속 모드 (병렬 다운로드, 딜레이 최소화)'
+     )
+     parser.add_argument(
+         '-w', '--workers',
+         type=int,
+         default=3,
+         help='병렬 다운로드 수 (기본: 3, --fast 필요)'
+     )
+     parser.add_argument(
+         '--proxy',
+         type=str,
+         default=None,
+         help='프록시 (예: http://proxy:8080)'
+     )
+     parser.add_argument(
+         '-v', '--version',
+         action='version',
+         version='%(prog)s 1.1.1'
+     )
+     parser.add_argument(
+         '--check-deps',
+         action='store_true',
+         help='의존성 확인 후 종료'
+     )

- def run_download(args):
-     """Download videos listed in the TXT file."""
-     from .downloader import download_from_txt  # Changed function name
-
-     base_dir = Path(args.dir) if args.dir else Path.cwd()
-
-     # With nargs='+' in argparse, args.task is always a list
-     tasks = args.task if isinstance(args.task, list) else [args.task]
-
-     total_success = 0
-     total_processed = 0
-
-     for task in tasks:
-         logger.info(f"=== Processing Task: {task} ===")
-
-         # File path: video/{task}/youtube_url.txt
-         txt_file = get_url_file_path(base_dir, task)
-
-         if not txt_file.exists():
-             logger.error(f"URL file not found: {txt_file}")
-             logger.info(f"Skipping {task}. Run 'ytcollector init' first.")
-             continue
-
-         logger.info(f"Starting{' fast' if args.fast else ''} download for task: {task}")
-
-         if args.fast:
-             from .downloader import download_from_txt_parallel
-             results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
-         else:
-             results = download_from_txt(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
-
-         success_count = sum(1 for r in results if r.get('success'))
-         total_success += success_count
-         total_processed += len(results)
-
-         print(f"✓ Task '{task}' complete: {success_count}/{len(results)} successful")
+     return parser
+
+
+ def run(
+     categories=None,
+     count=5,
+     duration=3,
+     output=None,
+     fast=False,
+     workers=3,
+     proxy=None,
+     quiet=False
+ ):
+     """
+     Run the collector programmatically.
+
+     Args:
+         categories: list of categories (e.g. ['face', 'text'])
+         count: number of downloads per category
+         duration: maximum video length in minutes
+         output: output path
+         fast: whether to use fast mode
+         workers: number of parallel downloads
+         proxy: proxy URL
+         quiet: quiet mode (minimal output)

-     print(f"\n✓ All tasks complete: {total_success}/{total_processed} successful total")
-
-
- def run_download_single(args):
-     """Download a single URL."""
-     from .downloader import VideoDownloader
-
-     base_dir = Path(args.dir) if args.dir else Path.cwd()
-     downloader = VideoDownloader(args.task, base_dir)
-
-     try:
-         output_path, metadata = downloader.download_clip_at_timestamp(
-             url=args.url,
-             timestamp_min=args.timestamp_min,
-             timestamp_sec=args.timestamp_sec
-         )
-         status = "Cached" if metadata.get('cached') else "Downloaded"
-         print(f"✓ {status}: {output_path}")
-         if not metadata.get('cached'):
-             print(f" Clip duration: {metadata['clip_duration']}s")
-
-     except Exception as e:
-         logger.error(f"Download failed: {e}")
-
-
- def run_verify(args):
-     """Verify clipped videos."""
-     from .verifier import verify_clip, batch_verify
+     Returns:
+         dict: number of successful downloads per category
+     """
+     if categories is None:
+         categories = ['face']
+
+     if output is None:
+         output = "."
+
+     # Dependency check
+     missing = check_dependencies()
+     if missing and not quiet:
+         print(f"⚠ 분석 기능을 위해 설치 필요: pip install {' '.join(missing)}")
+
+     # Create the downloader
+     downloader = YouTubeDownloader(
+         output_path=output,
+         max_duration=duration * 60,
+         proxy=proxy,
+         fast_mode=fast,
+         workers=workers
+     )

-     base_dir = Path(args.dir) if args.dir else Path.cwd()
+     results = {}
+     for category in categories:
+         count_success = downloader.collect(category, count)
+         results[category] = count_success

-     if args.video:
-         video_path = Path(args.video)
-         result = verify_clip(video_path, args.task, base_dir)
-         print_verification_result(result)
-     else:
-         # Folder path: video/{task}/
-         clips_dir = base_dir / "video" / args.task
-         if not clips_dir.exists():
-             logger.error(f"Video directory not found: {clips_dir}")
-             return
-
-         results = batch_verify(clips_dir, args.task, base_dir)
-         valid_count = sum(1 for r in results if r.get('is_valid'))
-         print(f"✓ Verification complete: {valid_count}/{len(results)} valid")
+     return results


- def run_pipeline(args):
-     """Full download + verification pipeline."""
-     print(f"=== Starting pipeline for task: {args.task} ===")
-
-     run_download(args)
-
-     if args.verify:
-         print("\n--- Running verification ---")
-         run_verify(args)
-
-     print("=== Pipeline complete ===")
-
+ def main(args=None):
+     """Main CLI entry point."""
+     parser = create_parser()
+     parsed_args = parser.parse_args(args)

- def print_verification_result(result: dict):
-     """Print a verification result."""
-     summary = result.get('summary', {})
-
-     print("\n" + "="*50)
-     print(f"Video: {Path(result['video_path']).name}")
-     print(f"Task: {result['task_type']}")
-     print(f"Classes: {result['classes']}")
-     print("-"*50)
-     print(f"Duration: {summary.get('duration_sec', 0):.1f}s")
-     print(f"Frames with detection: {summary.get('frames_with_detection', 0)}")
-     print(f"Detection rate: {summary.get('detection_rate', 0):.1%}")
-     print(f"Valid: {'✓ YES' if result.get('is_valid') else '✗ NO'}")
-     print("="*50)
-
-
- def list_tasks(args):
-     """Print the list of available tasks."""
-     print("\nAvailable Tasks and YOLO-World Classes:")
-     print("-" * 50)
-     for task, classes in TASK_CLASSES.items():
-         print(f"\n{task}:")
-         for cls in classes:
-             print(f" - {cls}")
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description='Downloader - SBS Dataset Collector',
-         formatter_class=argparse.RawDescriptionHelpFormatter,
-         epilog="""
- Examples:
-   downloader init                  # 프로젝트 초기화
-   downloader download --task face  # 텍스트 파일에서 다운로드
-   downloader verify --task face    # YOLO 검증
- """
+     # Dependency-check mode
+     if parsed_args.check_deps:
+         missing = check_dependencies()
+         if missing:
+             print(" 누락된 의존성:")
+             for dep in missing:
+                 print(f" - {dep}")
+             print(f"\n설치: pip install {' '.join(missing)}")
+             sys.exit(1)
+         else:
+             print(" 모든 의존성이 설치되어 있습니다.")
+             sys.exit(0)
+
+     # Start-up message
+     print("\n" + "=" * 60)
+     print("YouTube 콘텐츠 수집기")
+     print("=" * 60)
+     print(f"카테고리: {', '.join([CATEGORY_NAMES[c] for c in parsed_args.categories])}")
+     print(f"개수: 카테고리당 {parsed_args.count}개")
+     print(f"최대길이: {parsed_args.duration}분")
+     print(f"저장경로: {parsed_args.output}")
+     if parsed_args.fast:
+         print(f"모드: 고속 (병렬 {parsed_args.workers}개)")
+     if parsed_args.proxy:
+         print(f"프록시: {parsed_args.proxy}")
+
+     # Dependency check
+     missing = check_dependencies()
+     if missing:
+         print(f"\n⚠ 분석 기능을 위해 설치 필요:")
+         print(f" pip install {' '.join(missing)}")
+
+     # Run collection
+     results = run(
+         categories=parsed_args.categories,
+         count=parsed_args.count,
+         duration=parsed_args.duration,
+         output=parsed_args.output,
+         fast=parsed_args.fast,
+         workers=parsed_args.workers,
+         proxy=parsed_args.proxy
      )
-
-     parser.add_argument('--dir', '-d', help='Project directory (default: current)')
-
-     subparsers = parser.add_subparsers(dest='command', help='Commands')
-
-     # Init
-     init_parser = subparsers.add_parser('init', help='Initialize project directory')
-
-     # Download
-     download_parser = subparsers.add_parser('download', help='Download from youtube_url.txt')
-     download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
-     download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
-     download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
-     download_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip YOLO verification and save all clips')
-
-     # Download single
-     single_parser = subparsers.add_parser('download-single', help='Download single video')
-     single_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
-     single_parser.add_argument('--url', '-u', required=True, help='YouTube URL')
-     single_parser.add_argument('--timestamp-min', '-m', type=int, required=True)
-     single_parser.add_argument('--timestamp-sec', '-s', type=int, required=True)
-
-     # Verify
-     verify_parser = subparsers.add_parser('verify', help='Verify with YOLO-World')
-     verify_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
-     verify_parser.add_argument('--video', '-v', help='Specific video file')
-
-     # Pipeline
-     pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
-     pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
-     pipeline_parser.add_argument('--verify', action='store_true')
-     pipeline_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip verification in download stage')
-
-     # List tasks
-     subparsers.add_parser('list-tasks', help='List available tasks')
-
-     args = parser.parse_args()
-
-     if args.command is None:
-         parser.print_help()
-         return
-
-     commands = {
-         'init': init_project,
-         'download': run_download,
-         'download-single': run_download_single,
-         'verify': run_verify,
-         'pipeline': run_pipeline,
-         'list-tasks': list_tasks,
-     }
-
-     commands[args.command](args)
+
+     # Completion message
+     total = sum(results.values())
+     print(f"\n{'='*60}")
+     print(f"완료! 총 {total}개 저장")
+     for cat, cnt in results.items():
+         print(f" - {CATEGORY_NAMES[cat]}: {cnt}개")
+     print(f"{'='*60}\n")
+
+     return 0


- if __name__ == '__main__':
-     main()
+ if __name__ == "__main__":
+     sys.exit(main())
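Note on the new entry points: 1.1.1 drops the old subcommand CLI (init/download/verify/pipeline) in favour of a single flat command, and also exposes a programmatic helper, run(). A minimal usage sketch, based only on the run() signature and docstring shown above (the behaviour of YouTubeDownloader.collect is not part of this diff, so treat the result values as illustrative):

    # Hypothetical usage sketch for ytcollector 1.1.1, assuming the package is installed
    # and run() behaves as its docstring above describes.
    from ytcollector.cli import run

    # Download up to 3 videos each for the 'face' and 'text' categories,
    # capped at 2 minutes per video, into ./downloads, with 4 parallel workers.
    results = run(
        categories=['face', 'text'],
        count=3,
        duration=2,
        output='./downloads',
        fast=True,
        workers=4,
        quiet=True,
    )
    print(results)  # e.g. {'face': 3, 'text': 2} - successful downloads per category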
ytcollector/config.py CHANGED
@@ -1,67 +1,85 @@
- """
- SBS Dataset Collector - Configuration
- """
- from pathlib import Path
- import platform
+ # Configuration constants

- # Default paths (will use current working directory)
- def get_paths(base_dir: Path = None):
-     """Get all paths based on base directory"""
-     if base_dir is None:
-         base_dir = Path.cwd()
-
-     return {
-         'base': base_dir,
-         'data': base_dir / "data",
-         # 'urls' removed - now inside video/{task}/youtube_url.txt
-         'videos': base_dir / "data" / "videos",  # original full-length videos
-         'clips': base_dir / "video",  # clipped videos (requirement: video/<task name>)
-         'outputs': base_dir / "outputs",
-         'reports': base_dir / "outputs" / "reports",
-         'history': base_dir / "download_history.json",
-     }
+ USER_AGENTS = [
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
+     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
+ ]

- # Custom output path (network drive, etc.)
- # On macOS this must be a mounted path such as "/Volumes/Data/..."
- NAS_PATH_WINDOWS = r"\\NAS_SERVER_IP\Data\Private Dataset\SBS_De-Identification_YouTube"
- NAS_PATH_MAC = "/Volumes/Data/Private Dataset/SBS_De-Identification_YouTube"
-
- if platform.system() == 'Windows':
-     CUSTOM_OUTPUT_DIR = NAS_PATH_WINDOWS
- elif platform.system() == 'Darwin':  # macOS
-     CUSTOM_OUTPUT_DIR = NAS_PATH_MAC
- else:
-     CUSTOM_OUTPUT_DIR = None
-
- # Video settings
- CLIP_DURATION_BEFORE = 90  # 1 min 30 s (in seconds)
- CLIP_DURATION_AFTER = 90   # 1 min 30 s (in seconds)
- MAX_CLIP_DURATION = 180    # 3 minutes max
-
- # Download settings
- VIDEO_FORMAT = "best[ext=mp4]/best"
- DOWNLOAD_RETRIES = 3
- MAX_VIDEOS_PER_TASK = 1000  # Max videos saved per task (can be overridden with the CLI -n option)
-
- # Fast Mode Settings (Parallel)
- MAX_WORKERS = 4          # Number of parallel worker processes
- REQUEST_DELAY_MIN = 1.0  # Minimum delay (seconds)
- REQUEST_DELAY_MAX = 3.0  # Maximum delay (seconds)
- PROXY_URL = None         # Proxy (e.g. "http://user:pass@host:port")
+ # Search queries per category - focused on SBS content
+ CATEGORY_QUERIES = {
+     'face': [
+         "SBS 인터뷰 클립",
+         "런닝맨 멤버 인터뷰",
+         "SBS 뉴스 인터뷰",
+         "미운우리새끼 인터뷰",
+         "SBS 스페셜 인물",
+         "집사부일체 인터뷰",
+         "그것이알고싶다 인터뷰",
+         "SBS 연예대상 소감",
+     ],
+     'license_plate': [
+         "중고차 매물 소개",
+         "자동차 세차 영상",
+         "신차 출고 브이로그",
+         "자동차 튜닝 작업",
+         "엔카 허위매물",
+         "주차장 만차",
+     ],
+     'tattoo': [
+         "타투 시술 영상",
+         "tattoo timelapse",
+         "타투이스트 작업",
+         "tattoo artist work",
+         "문신 시술",
+         "tattoo session",
+     ],
+     'text': [
+         "SBS 런닝맨 레전드",
+         "SBS 미운우리새끼 명장면",
+         "SBS 동상이몽 클립",
+         "SBS 집사부일체 모음",
+         "SBS 골목식당 레전드",
+         "SBS 맛남의광장 클립",
+         "SBS 불타는청춘 명장면",
+         "SBS 정글의법칙 레전드",
+         "SBS 예능",
+     ],
+ }

- # YOLO-World settings
- YOLO_MODEL = "yolov8s-worldv2.pt"
- CONFIDENCE_THRESHOLD = 0.25
- FRAME_SAMPLE_RATE = 30  # Sample every 30 frames (about 1 second)
+ CATEGORY_NAMES = {
+     'face': '얼굴',
+     'license_plate': '번호판',
+     'tattoo': '타투',
+     'text': '텍스트'
+ }

- # Task-specific class prompts
- TASK_CLASSES = {
-     "face": ["face"],
-     "license_plate": ["license plate"],
-     "tattoo": ["tattoo"],
-     "text": ["text"],
-     "knife": ["knife"],
-     "cigarette": ["cigarette"]
+ # Exclusion keywords per category (skip a video if its title contains one)
+ BLACKLIST_KEYWORDS = {
+     'tattoo': [
+         "두피 문신", "두피문신",
+         "눈썹 문신", "눈썹문신",
+         "입술 문신", "입술문신",
+         "틴트 입술",
+         "반영구", "SMP"
+     ],
+     'face': [],
+     'license_plate': [],
+     'text': []
  }

- VALID_TASKS = list(TASK_CLASSES.keys())
+ # License plate regex patterns
+ LICENSE_PLATE_PATTERNS = [
+     r'\d{2,3}[가-힣]\d{4}',
+     r'[가-힣]{2}\d{2}[가-힣]\d{4}',
+     r'[A-Z]{2,3}[-\s]?\d{2,4}[-\s]?[A-Z]{0,3}',
+     r'\d{2,4}[-\s]?[A-Z]{2,3}[-\s]?\d{2,4}',
+ ]
+
+ # Error messages that cause a video to be skipped
+ SKIP_ERRORS = [
+     "not available", "unavailable", "private", "removed",
+     "deleted", "copyright", "blocked", "age", "sign in",
+     "members-only", "premiere", "live event"
+ ]
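The new config module is pure data; the downloader and analyzer that consume it are not shown in this diff. The following is an illustrative sketch (not code from the package) of how BLACKLIST_KEYWORDS and LICENSE_PLATE_PATTERNS could be applied to a video title, just to demonstrate the shape of the constants defined above:

    import re
    from ytcollector.config import BLACKLIST_KEYWORDS, LICENSE_PLATE_PATTERNS

    def title_is_blacklisted(title: str, category: str) -> bool:
        # Skip a video when its title contains any blacklist keyword for the category.
        return any(kw in title for kw in BLACKLIST_KEYWORDS.get(category, []))

    def looks_like_plate(text: str) -> bool:
        # True if the text matches any of the Korean or Western plate patterns.
        return any(re.search(p, text) for p in LICENSE_PLATE_PATTERNS)

    print(title_is_blacklisted("눈썹 문신 브이로그", "tattoo"))  # True
    print(looks_like_plate("12가3456"))                          # True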
@@ -0,0 +1,71 @@
+ import os
+ import subprocess
+ from yt_dlp import YoutubeDL
+ from .utils import clip_video, get_url_list, get_video_duration, timestamp_to_seconds
+
+ def download_videos(url_list, output_dir):
+     os.makedirs(output_dir, exist_ok=True)
+     for idx, item in enumerate(url_list, 1):
+         url = item['url']
+         task = item['task']
+         index_str = f"{idx:03d}"
+
+         existing_files = [f for f in os.listdir(output_dir) if f.startswith(f"{index_str}_")]
+         if existing_files:
+             print(f"[{index_str}] Skip: {existing_files[0]}")
+             continue
+
+         print(f"[{index_str}] Downloading: {url} ({task})")
+         try:
+             ydl_opts = {
+                 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+                 'outtmpl': os.path.join(output_dir, f"{index_str}_{task}_%(title)s.%(ext)s"),
+                 'quiet': True,
+                 'no_warnings': True,
+             }
+             with YoutubeDL(ydl_opts) as ydl:
+                 ydl.download([url])
+         except Exception as e:
+             print(f"[{index_str}] Failed: {e}")
+
+ def build_dataset(url_file, output_root="."):
+     video_dir = os.path.abspath(os.path.join(output_root, "video"))
+     clip_dir = os.path.abspath(os.path.join(output_root, "video_clips"))
+
+     urls = get_url_list(url_file)
+     if not urls:
+         print(f"Error: No valid data in {url_file}")
+         return
+
+     print(f"--- Step 1: Downloading {len(urls)} videos ---")
+     download_videos(urls, video_dir)
+
+     print(f"\n--- Step 2: Clipping videos ---")
+     os.makedirs(clip_dir, exist_ok=True)
+     for idx, item in enumerate(urls, 1):
+         index_str = f"{idx:03d}"
+         files = [f for f in os.listdir(video_dir) if f.startswith(f"{index_str}_")]
+         if not files: continue
+
+         input_file = os.path.join(video_dir, files[0])
+         output_file = os.path.join(clip_dir, files[0])
+
+         if os.path.exists(output_file): continue
+
+         print(f"[{index_str}] Clipping: {files[0]}")
+         center_sec = timestamp_to_seconds(item['timestamp'])
+         clip_video(input_file, output_file, center_sec)
+
+     print(f"\nDone! Clips saved in: {clip_dir}")
+
+ def main():
+     import argparse
+     parser = argparse.ArgumentParser(description='Build SBS Dataset from YouTube URL list')
+     parser.add_argument('file', help='Path to youtube_url.txt')
+     parser.add_argument('-o', '--output', default='.', help='Output root directory (default: .)')
+     args = parser.parse_args()
+
+     build_dataset(args.file, args.output)
+
+ if __name__ == "__main__":
+     main()