ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytcollector/__init__.py +36 -11
- ytcollector/analyzer.py +205 -0
- ytcollector/cli.py +186 -218
- ytcollector/config.py +66 -62
- ytcollector/dataset_builder.py +136 -0
- ytcollector/downloader.py +328 -480
- ytcollector-1.0.9.dist-info/METADATA +207 -0
- ytcollector-1.0.9.dist-info/RECORD +11 -0
- ytcollector-1.0.9.dist-info/entry_points.txt +4 -0
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/top_level.txt +0 -1
- config/settings.py +0 -39
- ytcollector/utils.py +0 -144
- ytcollector/verifier.py +0 -187
- ytcollector-1.0.8.dist-info/METADATA +0 -105
- ytcollector-1.0.8.dist-info/RECORD +0 -12
- ytcollector-1.0.8.dist-info/entry_points.txt +0 -2
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/WHEEL +0 -0
ytcollector/config.py
CHANGED
|
@@ -1,67 +1,71 @@
|
|
|
1
|
-
|
|
2
|
-
SBS Dataset Collector - Configuration
|
|
3
|
-
"""
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
import platform
|
|
1
|
+
# 설정 상수
|
|
6
2
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
return {
|
|
14
|
-
'base': base_dir,
|
|
15
|
-
'data': base_dir / "data",
|
|
16
|
-
# 'urls' removed - now inside video/{task}/youtube_url.txt
|
|
17
|
-
'videos': base_dir / "data" / "videos", # 원본 전체 영상
|
|
18
|
-
'clips': base_dir / "video", # 클리핑된 영상 (요구사항: video/task_이름)
|
|
19
|
-
'outputs': base_dir / "outputs",
|
|
20
|
-
'reports': base_dir / "outputs" / "reports",
|
|
21
|
-
'history': base_dir / "download_history.json",
|
|
22
|
-
}
|
|
3
|
+
# Desktop browser User-Agent strings (Chrome, Firefox, Safari on Windows/macOS).
# NOTE(review): presumably rotated for HTTP requests to avoid bot detection —
# confirm against the downloader's usage.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
]
|
|
23
9
|
|
|
24
|
-
#
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
10
|
+
# Search queries per category — centered on SBS content.
# (Translated from: 카테고리별 검색어 - SBS 콘텐츠 중심)
CATEGORY_QUERIES = {
    # 'face' category: interview / award-speech style clips.
    'face': [
        "SBS 인터뷰 클립",
        "런닝맨 멤버 인터뷰",
        "SBS 뉴스 인터뷰",
        "미운우리새끼 인터뷰",
        "SBS 스페셜 인물",
        "집사부일체 인터뷰",
        "그것이알고싶다 인터뷰",
        "SBS 연예대상 소감",
    ],
    # 'license_plate' category: car-related footage (used cars, car washes, tuning).
    'license_plate': [
        "중고차 매물 소개",
        "자동차 세차 영상",
        "신차 출고 브이로그",
        "자동차 튜닝 작업",
        "엔카 허위매물",
        "주차장 만차",
    ],
    # 'tattoo' category: tattoo-session footage (mixed Korean/English queries).
    'tattoo': [
        "타투 시술 영상",
        "tattoo timelapse",
        "타투이스트 작업",
        "tattoo artist work",
        "문신 시술",
        "tattoo session",
    ],
    # 'text' category: SBS variety-show clip compilations.
    'text': [
        "SBS 런닝맨 레전드",
        "SBS 미운우리새끼 명장면",
        "SBS 동상이몽 클립",
        "SBS 집사부일체 모음",
        "SBS 골목식당 레전드",
        "SBS 맛남의광장 클립",
        "SBS 불타는청춘 명장면",
        "SBS 정글의법칙 레전드",
        "SBS 예능",
    ],
}
|
|
56
50
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
"text": ["text"],
|
|
63
|
-
"knife": ["knife"],
|
|
64
|
-
"cigarette": ["cigarette"]
|
|
51
|
+
# Korean labels for each category key (presumably for display/reporting —
# confirm against the CLI's usage).
CATEGORY_NAMES = {
    'face': '얼굴',
    'license_plate': '번호판',
    'tattoo': '타투',
    'text': '텍스트'
}
|
|
66
57
|
|
|
67
|
-
|
|
58
|
+
# License-plate regex patterns.
# (Translated from: 번호판 정규식 패턴)
LICENSE_PLATE_PATTERNS = [
    r'\d{2,3}[가-힣]\d{4}',                      # Korean plate, e.g. "12가3456" / "123가4567"
    r'[가-힣]{2}\d{2}[가-힣]\d{4}',              # Korean plate with region prefix, e.g. "서울12가3456"
    r'[A-Z]{2,3}[-\s]?\d{2,4}[-\s]?[A-Z]{0,3}',  # Latin letters-digits(-letters), optional dash/space
    r'\d{2,4}[-\s]?[A-Z]{2,3}[-\s]?\d{2,4}',     # digits-letters-digits, optional dash/space
]
|
|
65
|
+
|
|
66
|
+
# Error messages that mark a video as skippable.
# (Translated from: 스킵할 에러 메시지)
SKIP_ERRORS = [
    "not available", "unavailable", "private", "removed",
    "deleted", "copyright", "blocked", "age", "sign in",
    "members-only", "premiere", "live event"
]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
from yt_dlp import YoutubeDL
|
|
4
|
+
|
|
5
|
+
def get_url_list(file_path):
    """Parse a URL list file into records of url/timestamp/task.

    Each meaningful line has the form ``url, timestamp, task``. Blank
    lines, ``#`` comments, and lines with fewer than three comma-separated
    fields are ignored. Returns an empty list when the file is missing.
    """
    if not os.path.exists(file_path):
        return []

    entries = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('#'):
                continue
            fields = [field.strip() for field in stripped.split(',')]
            if len(fields) < 3:
                continue
            entries.append({
                'url': fields[0],
                'timestamp': fields[1],
                'task': fields[2],
            })
    return entries
|
|
23
|
+
|
|
24
|
+
def download_videos(url_list, output_dir):
    """Download each entry's video into *output_dir* via yt-dlp.

    Args:
        url_list: list of dicts with at least 'url' and 'task' keys
            (as produced by get_url_list).
        output_dir: directory for downloaded files; created if absent.

    Files are named ``{index:03d}_{task}_{title}.{ext}``. An entry whose
    index prefix already exists on disk is skipped, so the function is
    safe to re-run after an interruption.
    """
    os.makedirs(output_dir, exist_ok=True)
    for idx, item in enumerate(url_list, 1):
        url = item['url']
        task = item['task']
        index_str = f"{idx:03d}"

        # Resume support: any existing file with this index prefix counts as done.
        existing_files = [f for f in os.listdir(output_dir) if f.startswith(f"{index_str}_")]
        if existing_files:
            print(f"[{index_str}] Skip: {existing_files[0]}")
            continue

        print(f"[{index_str}] Downloading: {url} ({task})")
        try:
            ydl_opts = {
                # Prefer mp4 video + m4a audio, falling back to best single mp4, then best anything.
                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
                'outtmpl': os.path.join(output_dir, f"{index_str}_{task}_%(title)s.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next URL.
            print(f"[{index_str}] Failed: {e}")
|
|
48
|
+
|
|
49
|
+
def get_video_duration(file_path):
    """Return the duration of a media file in seconds via ffprobe.

    Returns 0.0 when ffprobe is missing, the probe fails, or the output
    cannot be parsed as a float.
    """
    cmd = [
        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1', file_path
    ]
    try:
        output = subprocess.check_output(cmd).decode('utf-8').strip()
        return float(output)
    except (OSError, subprocess.SubprocessError, ValueError):
        # Narrowed from a bare except: covers missing binary, non-zero
        # exit, and unparsable output without hiding unrelated bugs.
        return 0.0

def timestamp_to_seconds(timestamp):
    """Convert an 'MM:SS' or 'HH:MM:SS' string to seconds.

    Returns 0.0 for any other format or unparsable input. HH:MM:SS
    support is a backward-compatible generalization (the previous
    version silently returned 0.0 for hour-long timestamps).
    """
    try:
        parts = timestamp.split(':')
        if len(parts) == 2:
            return int(parts[0]) * 60 + int(parts[1])
        if len(parts) == 3:
            return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
        return 0.0
    except (ValueError, AttributeError):
        return 0.0

def clip_video(input_path, output_path, center_timestamp, window_seconds=90):
    """Cut a clip of up to 2*window_seconds centered on center_timestamp.

    Tries a fast ffmpeg stream copy first; if that fails, retries with a
    full re-encode (x264 video, AAC audio). Returns True on success,
    False when the source is unreadable or both ffmpeg attempts fail.

    BUGFIX: the previous fallback spliced the re-encode codec args over
    cmd[7:9], which overwrote the input path and the '-c' flag (leaving a
    stray 'copy' argument), so the re-encode path could never succeed.
    The command is now rebuilt per attempt instead of mutated in place.
    """
    duration = get_video_duration(input_path)
    if duration == 0:
        return False

    center_sec = timestamp_to_seconds(center_timestamp)
    start_sec = max(0, center_sec - window_seconds)
    end_sec = min(duration, start_sec + (window_seconds * 2))

    # If the window was truncated by the end of the video, shift the start
    # back so we still extract a full-length clip when possible.
    if (end_sec - start_sec) < (window_seconds * 2) and start_sec > 0:
        start_sec = max(0, end_sec - (window_seconds * 2))

    actual_duration = end_sec - start_sec

    def _run_ffmpeg(codec_args):
        # One ffmpeg attempt; -ss before -i performs fast input seeking.
        cmd = [
            'ffmpeg', '-y', '-ss', str(start_sec), '-t', str(actual_duration),
            '-i', input_path, *codec_args, output_path
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return True
        except (OSError, subprocess.SubprocessError):
            return False

    # Fast path: stream copy (no re-encoding).
    if _run_ffmpeg(['-c', 'copy']):
        return True
    # Fallback: full re-encode for sources where stream copy fails.
    return _run_ffmpeg(['-c:v', 'libx264', '-crf', '23', '-c:a', 'aac'])
|
|
96
|
+
|
|
97
|
+
def build_dataset(url_file, output_root="."):
    """Download every video listed in *url_file*, then clip each one.

    Full downloads land in ``<output_root>/video`` and clipped results in
    ``<output_root>/video_clips``. Both steps skip work that already
    exists on disk, so the build is restartable.
    """
    video_dir = os.path.join(output_root, "video")
    clip_dir = os.path.join(output_root, "video_clips")

    urls = get_url_list(url_file)
    if not urls:
        print(f"Error: No valid data in {url_file}")
        return

    print(f"--- Step 1: Downloading {len(urls)} videos ---")
    download_videos(urls, video_dir)

    print(f"\n--- Step 2: Clipping videos ---")
    os.makedirs(clip_dir, exist_ok=True)
    for idx, item in enumerate(urls, 1):
        prefix = f"{idx:03d}_"
        matches = [name for name in os.listdir(video_dir) if name.startswith(prefix)]
        if not matches:
            continue

        source_path = os.path.join(video_dir, matches[0])
        target_path = os.path.join(clip_dir, matches[0])
        # Already clipped on a previous run — leave it alone.
        if os.path.exists(target_path):
            continue

        print(f"[{idx:03d}] Clipping: {matches[0]}")
        clip_video(source_path, target_path, item['timestamp'])

    print(f"\nDone! Clips saved in: {os.path.abspath(clip_dir)}")
|
|
125
|
+
|
|
126
|
+
def main():
    """CLI entry point: parse command-line arguments and build the dataset."""
    import argparse

    parser = argparse.ArgumentParser(description='Build SBS Dataset from YouTube URL list')
    parser.add_argument('file', help='Path to youtube_url.txt')
    parser.add_argument('-o', '--output', default='.', help='Output root directory (default: .)')
    options = parser.parse_args()

    build_dataset(options.file, options.output)

if __name__ == "__main__":
    main()
|