ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytcollector/__init__.py +36 -11
- ytcollector/analyzer.py +205 -0
- ytcollector/cli.py +186 -218
- ytcollector/config.py +66 -62
- ytcollector/dataset_builder.py +136 -0
- ytcollector/downloader.py +328 -480
- ytcollector-1.0.9.dist-info/METADATA +207 -0
- ytcollector-1.0.9.dist-info/RECORD +11 -0
- ytcollector-1.0.9.dist-info/entry_points.txt +4 -0
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/top_level.txt +0 -1
- config/settings.py +0 -39
- ytcollector/utils.py +0 -144
- ytcollector/verifier.py +0 -187
- ytcollector-1.0.8.dist-info/METADATA +0 -105
- ytcollector-1.0.8.dist-info/RECORD +0 -12
- ytcollector-1.0.8.dist-info/entry_points.txt +0 -2
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/WHEEL +0 -0
ytcollector/__init__.py
CHANGED
|
@@ -1,14 +1,39 @@
|
|
|
1
|
-
"""
|
|
2
|
-
SBS Dataset Collector - YouTube 영상 수집 및 YOLO-World 검증 파이프라인
|
|
3
|
-
"""
|
|
4
|
-
from pathlib import Path
|
|
1
|
+
"""YouTube 콘텐츠 수집기 라이브러리
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
|
|
3
|
+
외부에서 라이브러리로 사용하거나 CLI로 실행할 수 있습니다.
|
|
4
|
+
|
|
5
|
+
라이브러리 사용 예시:
|
|
6
|
+
from ytcollector import YouTubeDownloader, run
|
|
7
|
+
|
|
8
|
+
# 방법 1: YouTubeDownloader 직접 사용
|
|
9
|
+
downloader = YouTubeDownloader(output_path="./videos")
|
|
10
|
+
count = downloader.collect("face", max_videos=5)
|
|
11
|
+
|
|
12
|
+
# 방법 2: run() 함수 사용 (간단한 방법)
|
|
13
|
+
results = run(categories=["face", "text"], count=3)
|
|
14
|
+
|
|
15
|
+
CLI 사용 예시:
|
|
16
|
+
ytcollector -c face -n 5
|
|
17
|
+
ytc -c face text --fast
|
|
18
|
+
"""
|
|
8
19
|
|
|
9
|
-
|
|
10
|
-
|
|
20
|
+
from .config import CATEGORY_NAMES, CATEGORY_QUERIES, USER_AGENTS, LICENSE_PLATE_PATTERNS
|
|
21
|
+
from .analyzer import VideoAnalyzer, check_dependencies
|
|
22
|
+
from .downloader import YouTubeDownloader
|
|
23
|
+
from .cli import run, main as cli_main
|
|
11
24
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
25
|
+
# Keep in sync with the distribution version (wheel metadata and the CLI
# ``--version`` string both report 1.0.9).
__version__ = "1.0.9"
|
|
26
|
+
__all__ = [
|
|
27
|
+
# 주요 클래스
|
|
28
|
+
"VideoAnalyzer",
|
|
29
|
+
"YouTubeDownloader",
|
|
30
|
+
# 설정
|
|
31
|
+
"CATEGORY_NAMES",
|
|
32
|
+
"CATEGORY_QUERIES",
|
|
33
|
+
"USER_AGENTS",
|
|
34
|
+
"LICENSE_PLATE_PATTERNS",
|
|
35
|
+
# 유틸리티
|
|
36
|
+
"check_dependencies",
|
|
37
|
+
"run",
|
|
38
|
+
"cli_main",
|
|
39
|
+
]
|
ytcollector/analyzer.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from .config import LICENSE_PLATE_PATTERNS
|
|
3
|
+
|
|
4
|
+
# 선택적 import
|
|
5
|
+
try:
|
|
6
|
+
import cv2
|
|
7
|
+
CV2_AVAILABLE = True
|
|
8
|
+
except ImportError:
|
|
9
|
+
CV2_AVAILABLE = False
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import easyocr
|
|
13
|
+
EASYOCR_AVAILABLE = True
|
|
14
|
+
except ImportError:
|
|
15
|
+
EASYOCR_AVAILABLE = False
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
import numpy as np
|
|
19
|
+
NUMPY_AVAILABLE = True
|
|
20
|
+
except ImportError:
|
|
21
|
+
NUMPY_AVAILABLE = False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class VideoAnalyzer:
    """Heuristic video analyzer that flags faces, text, license plates and tattoos.

    Each detector degrades gracefully: when the optional dependency it needs
    (OpenCV, EasyOCR, NumPy) is not importable it reports "nothing found"
    instead of raising.
    """

    def __init__(self):
        """Load the frontal-face Haar cascade when OpenCV is available."""
        self.ocr_reader = None  # created lazily by _init_ocr()
        self.face_cascade = None

        if CV2_AVAILABLE:
            cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            cascade = cv2.CascadeClassifier(cascade_path)
            # A classifier whose XML failed to load is "empty"; keep None so
            # detect_faces() skips it instead of failing in detectMultiScale.
            if not cascade.empty():
                self.face_cascade = cascade

    def _init_ocr(self):
        """Create the EasyOCR reader on first use (construction is expensive)."""
        if EASYOCR_AVAILABLE and self.ocr_reader is None:
            print(" OCR 엔진 초기화 중...")
            self.ocr_reader = easyocr.Reader(['ko', 'en'], gpu=False, verbose=False)

    def extract_frames(self, video_path, num_frames=10):
        """Return up to ``num_frames`` frames sampled at even intervals.

        Args:
            video_path: Path handed to ``cv2.VideoCapture``.
            num_frames: Number of sample positions; frames that fail to
                decode are skipped, so fewer may be returned.

        Returns:
            A list of decoded frames, or ``[]`` when OpenCV is missing, the
            file cannot be opened, or it reports no frames.
        """
        if not CV2_AVAILABLE:
            return []

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                return []

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames <= 0:
                return []

            # Sample interior positions only (i / (n + 1)), skipping the
            # very first and very last frame.
            frame_indices = [
                int(i * total_frames / (num_frames + 1))
                for i in range(1, num_frames + 1)
            ]

            frames = []
            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    frames.append(frame)
            return frames
        finally:
            # Always free the capture handle, even if seeking/decoding raises.
            cap.release()

    def detect_faces(self, frame):
        """Return face detections from the Haar cascade (empty when unavailable)."""
        if not CV2_AVAILABLE or self.face_cascade is None:
            return []

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        return self.face_cascade.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
        )

    def detect_text(self, frame):
        """Return the strings EasyOCR recognizes with confidence above 0.3.

        Best-effort: any ordinary error during OCR yields ``[]``.
        """
        if not EASYOCR_AVAILABLE:
            return []

        self._init_ocr()
        try:
            h, w = frame.shape[:2]
            # Downscale wide frames to 640 px to keep OCR fast. Only possible
            # with OpenCV; without it, run OCR on the frame as-is.
            if CV2_AVAILABLE and w > 640:
                scale = 640 / w
                frame = cv2.resize(frame, (640, int(h * scale)))

            results = self.ocr_reader.readtext(frame)
            return [r[1] for r in results if r[2] > 0.3]
        except Exception:
            # Deliberately best-effort, but let KeyboardInterrupt/SystemExit
            # propagate so a long OCR run can still be cancelled.
            return []

    def detect_license_plate(self, texts):
        """Return True if any text matches a pattern in LICENSE_PLATE_PATTERNS.

        Spaces are removed and the text is upper-cased before matching.
        """
        for text in texts:
            text_clean = text.replace(' ', '').upper()
            for pattern in LICENSE_PLATE_PATTERNS:
                if re.search(pattern, text_clean):
                    return True
        return False

    def detect_tattoo(self, frame):
        """Heuristically detect tattoo-like dark patches inside skin-colored areas.

        Returns False when OpenCV or NumPy is missing, or on any ordinary
        processing error.
        """
        if not CV2_AVAILABLE or not NUMPY_AVAILABLE:
            return False

        try:
            hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)

            # HSV range treated as skin tone.
            lower_skin = np.array([0, 30, 80], dtype=np.uint8)
            upper_skin = np.array([17, 170, 255], dtype=np.uint8)
            skin_mask = cv2.inRange(hsv, lower_skin, upper_skin)

            # Remove speckle noise, then close small holes in the mask.
            kernel = np.ones((5, 5), np.uint8)
            skin_mask = cv2.morphologyEx(skin_mask, cv2.MORPH_OPEN, kernel)
            skin_mask = cv2.morphologyEx(skin_mask, cv2.MORPH_CLOSE, kernel)

            skin_pixels = cv2.countNonZero(skin_mask)
            total_pixels = frame.shape[0] * frame.shape[1]

            # Require at least 10% of the frame to be skin.
            if skin_pixels < total_pixels * 0.10:
                return False

            # Dark pixels (gray level 1..80) inside the skin mask; 0 is
            # excluded because masked-out pixels are 0 after bitwise_and.
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            skin_gray = cv2.bitwise_and(gray, gray, mask=skin_mask)
            dark_mask = cv2.inRange(skin_gray, 1, 80)

            dark_pixels = cv2.countNonZero(dark_mask)
            dark_ratio = dark_pixels / max(skin_pixels, 1)

            # Treat 3%..35% dark coverage as a tattoo candidate, and require
            # at least one contiguous dark region larger than 100 px.
            if 0.03 < dark_ratio < 0.35:
                contours, _ = cv2.findContours(
                    dark_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
                )
                significant = [c for c in contours if cv2.contourArea(c) > 100]
                return len(significant) >= 1

            return False
        except Exception:
            # Best-effort heuristic; do not swallow KeyboardInterrupt/SystemExit.
            return False

    def analyze(self, video_path):
        """Sample 8 frames from the video and run every detector on them.

        Returns:
            dict with boolean flags ``face``, ``text``, ``license_plate``,
            ``tattoo``, the summed ``face_count`` over sampled frames, and
            ``detected_texts`` (up to 10 unique strings in first-seen order).
        """
        results = {
            'face': False,
            'text': False,
            'license_plate': False,
            'tattoo': False,
            'face_count': 0,
            'detected_texts': []
        }

        if not CV2_AVAILABLE:
            print(" ⚠ OpenCV 미설치")
            return results

        frames = self.extract_frames(video_path, num_frames=8)
        if not frames:
            print(" ⚠ 프레임 추출 실패")
            return results

        all_texts = []
        total_faces = 0

        for i, frame in enumerate(frames):
            # Faces
            faces = self.detect_faces(frame)
            if len(faces) > 0:
                results['face'] = True
                total_faces += len(faces)

            # Text: OCR is slow, so only every other frame is scanned.
            if i % 2 == 0 and EASYOCR_AVAILABLE:
                texts = self.detect_text(frame)
                if texts:
                    results['text'] = True
                    all_texts.extend(texts)

            # Tattoo
            if self.detect_tattoo(frame):
                results['tattoo'] = True

        # License plates are inferred from the recognized text.
        if all_texts:
            results['license_plate'] = self.detect_license_plate(all_texts)
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the 10 reported strings are deterministic between runs.
            results['detected_texts'] = list(dict.fromkeys(all_texts))[:10]

        results['face_count'] = total_faces
        return results
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def check_dependencies():
    """Return the pip package names of optional dependencies that failed to import.

    The result is ordered opencv-python, easyocr, numpy and is empty when
    everything is available.
    """
    availability = (
        ("opencv-python", CV2_AVAILABLE),
        ("easyocr", EASYOCR_AVAILABLE),
        ("numpy", NUMPY_AVAILABLE),
    )
    return [package for package, present in availability if not present]
|
ytcollector/cli.py
CHANGED
|
@@ -1,234 +1,202 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
SBS Dataset Collector CLI (Updated)
|
|
4
|
-
"""
|
|
5
|
-
import argparse
|
|
6
|
-
import logging
|
|
7
|
-
from pathlib import Path
|
|
2
|
+
"""YouTube 콘텐츠 수집기 - CLI 모듈"""
|
|
8
3
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
4
|
+
import argparse
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
12
7
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
)
|
|
17
|
-
logger = logging.getLogger(__name__)
|
|
8
|
+
from .config import CATEGORY_NAMES
|
|
9
|
+
from .downloader import YouTubeDownloader
|
|
10
|
+
from .analyzer import check_dependencies
|
|
18
11
|
|
|
19
12
|
|
|
20
|
-
def
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
ensure_dir(paths['outputs'])
|
|
39
|
-
|
|
40
|
-
print(f"✓ Project initialized at: {base_dir}")
|
|
41
|
-
print(f" - Add URLs to: urls/<task>/youtube_url.txt")
|
|
42
|
-
print(f" - Videos will be saved to configured OUTPUT_DIR (or video/ folder)")
|
|
13
|
+
def create_parser():
    """Build and return the argparse parser for the ``ytcollector`` CLI."""
    usage_examples = """
예시:
ytcollector -c face # 얼굴 카테고리 5개
ytcollector -c face text -n 10 # 얼굴, 텍스트 각 10개
ytcollector -c face --fast # 고속 모드 (병렬 다운로드)
ytcollector -c face --fast -w 5 # 5개 동시 다운로드
ytcollector -c license_plate -d 5 # 번호판, 최대 5분

# 짧은 명령어도 사용 가능
ytc -c face -n 3
"""
    parser = argparse.ArgumentParser(
        prog='ytcollector',
        description='YouTube 콘텐츠 수집기 - 얼굴, 번호판, 타투, 텍스트 감지',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, add_argument keyword settings); registered in this order so
    # the generated --help output lists the options in the same sequence.
    option_table = [
        (('-c', '--categories'), {
            'nargs': '+',
            'choices': ['face', 'license_plate', 'tattoo', 'text'],
            'default': ['face'],
            'help': '수집할 카테고리 (기본: face)',
        }),
        (('-n', '--count'), {
            'type': int,
            'default': 5,
            'help': '카테고리당 다운로드 수 (기본: 5)',
        }),
        (('-d', '--duration'), {
            'type': int,
            'default': 3,
            'help': '최대 영상 길이(분) (기본: 3)',
        }),
        (('-o', '--output'), {
            'type': str,
            'default': os.path.expanduser("~/youtube"),
            'help': '저장 경로 (기본: ~/youtube)',
        }),
        (('--fast',), {
            'action': 'store_true',
            'help': '고속 모드 (병렬 다운로드, 딜레이 최소화)',
        }),
        (('-w', '--workers'), {
            'type': int,
            'default': 3,
            'help': '병렬 다운로드 수 (기본: 3, --fast 필요)',
        }),
        (('--proxy',), {
            'type': str,
            'default': None,
            'help': '프록시 (예: http://proxy:8080)',
        }),
        (('-v', '--version'), {
            'action': 'version',
            'version': '%(prog)s 1.0.9',
        }),
        (('--check-deps',), {
            'action': 'store_true',
            'help': '의존성 확인 후 종료',
        }),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return parser
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def run(
    categories=None,
    count=5,
    duration=3,
    output=None,
    fast=False,
    workers=3,
    proxy=None,
    quiet=False
):
    """Run the collector programmatically.

    Args:
        categories: List of category keys (e.g. ``['face', 'text']``);
            defaults to ``['face']``.
        count: Per-category download count handed to ``collect``.
        duration: Maximum video length in minutes; passed to the downloader
            as ``duration * 60``.
        output: Destination directory; defaults to ``~/youtube``.
        fast: Forwarded to the downloader as ``fast_mode``.
        workers: Forwarded to the downloader as ``workers``.
        proxy: Proxy URL forwarded to the downloader.
        quiet: When true, the missing-dependency notice is not printed.

    Returns:
        dict: Maps each category to the value returned by
        ``YouTubeDownloader.collect`` for it (the number of successful
        downloads, per the original documentation).
    """
    selected = ['face'] if categories is None else categories
    target_dir = os.path.expanduser("~/youtube") if output is None else output

    # Analysis dependencies are optional; only tell the user what is missing.
    missing_packages = check_dependencies()
    if not quiet and missing_packages:
        print(f"⚠ 분석 기능을 위해 설치 필요: pip install {' '.join(missing_packages)}")

    collector = YouTubeDownloader(
        output_path=target_dir,
        max_duration=duration * 60,
        proxy=proxy,
        fast_mode=fast,
        workers=workers,
    )

    return {name: collector.collect(name, count) for name in selected}
|
|
127
140
|
|
|
128
141
|
|
|
129
|
-
def
|
|
130
|
-
"""
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
run_download(args)
|
|
134
|
-
|
|
135
|
-
if args.verify:
|
|
136
|
-
print("\n--- Running verification ---")
|
|
137
|
-
run_verify(args)
|
|
138
|
-
|
|
139
|
-
print("=== Pipeline complete ===")
|
|
140
|
-
|
|
142
|
+
def main(args=None):
    """CLI entry point.

    Args:
        args: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        0 after a completed collection run.

    Raises:
        SystemExit: In ``--check-deps`` mode (exit code 1 when dependencies
            are missing, 0 otherwise), and from argparse itself on
            ``--help``/``--version`` or invalid arguments.
    """
    parser = create_parser()
    parsed_args = parser.parse_args(args)

    # Dependency-check mode: report and exit without collecting anything.
    if parsed_args.check_deps:
        missing = check_dependencies()
        if missing:
            print("⚠ 누락된 의존성:")
            for dep in missing:
                print(f" - {dep}")
            print(f"\n설치: pip install {' '.join(missing)}")
            sys.exit(1)
        else:
            print("✅ 모든 의존성이 설치되어 있습니다.")
            sys.exit(0)

    # Start banner summarizing the effective settings.
    print("\n" + "=" * 60)
    print("YouTube 콘텐츠 수집기")
    print("=" * 60)
    print(f"카테고리: {', '.join([CATEGORY_NAMES[c] for c in parsed_args.categories])}")
    print(f"개수: 카테고리당 {parsed_args.count}개")
    print(f"최대길이: {parsed_args.duration}분")
    print(f"저장경로: {parsed_args.output}")
    if parsed_args.fast:
        print(f"모드: ⚡ 고속 (병렬 {parsed_args.workers}개)")
    if parsed_args.proxy:
        print(f"프록시: {parsed_args.proxy}")

    # Warn once about missing optional analysis dependencies.
    missing = check_dependencies()
    if missing:
        print("\n⚠ 분석 기능을 위해 설치 필요:")
        print(f" pip install {' '.join(missing)}")

    # Run the collection. quiet=True only suppresses run()'s own
    # missing-dependency notice, which was just printed above; without it
    # the same warning appeared twice.
    results = run(
        categories=parsed_args.categories,
        count=parsed_args.count,
        duration=parsed_args.duration,
        output=parsed_args.output,
        fast=parsed_args.fast,
        workers=parsed_args.workers,
        proxy=parsed_args.proxy,
        quiet=True
    )

    # Completion summary.
    total = sum(results.values())
    print(f"\n{'='*60}")
    print(f"완료! 총 {total}개 저장")
    for cat, cnt in results.items():
        print(f" - {CATEGORY_NAMES[cat]}: {cnt}개")
    print(f"{'='*60}\n")

    return 0
|
|
231
199
|
|
|
232
200
|
|
|
233
|
-
if __name__ ==
|
|
234
|
-
main()
|
|
201
|
+
if __name__ == "__main__":
|
|
202
|
+
sys.exit(main())
|