ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytcollector/__init__.py +36 -11
- ytcollector/analyzer.py +205 -0
- ytcollector/cli.py +186 -218
- ytcollector/config.py +66 -62
- ytcollector/dataset_builder.py +136 -0
- ytcollector/downloader.py +328 -480
- ytcollector-1.0.9.dist-info/METADATA +207 -0
- ytcollector-1.0.9.dist-info/RECORD +11 -0
- ytcollector-1.0.9.dist-info/entry_points.txt +4 -0
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/top_level.txt +0 -1
- config/settings.py +0 -39
- ytcollector/utils.py +0 -144
- ytcollector/verifier.py +0 -187
- ytcollector-1.0.8.dist-info/METADATA +0 -105
- ytcollector-1.0.8.dist-info/RECORD +0 -12
- ytcollector-1.0.8.dist-info/entry_points.txt +0 -2
- {ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/WHEEL +0 -0
ytcollector/config.py
CHANGED
|
@@ -1,67 +1,71 @@
|
|
|
1
|
-
|
|
2
|
-
SBS Dataset Collector - Configuration
|
|
3
|
-
"""
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
import platform
|
|
1
|
+
# 설정 상수
|
|
6
2
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
return {
|
|
14
|
-
'base': base_dir,
|
|
15
|
-
'data': base_dir / "data",
|
|
16
|
-
# 'urls' removed - now inside video/{task}/youtube_url.txt
|
|
17
|
-
'videos': base_dir / "data" / "videos", # 원본 전체 영상
|
|
18
|
-
'clips': base_dir / "video", # 클리핑된 영상 (요구사항: video/task_이름)
|
|
19
|
-
'outputs': base_dir / "outputs",
|
|
20
|
-
'reports': base_dir / "outputs" / "reports",
|
|
21
|
-
'history': base_dir / "download_history.json",
|
|
22
|
-
}
|
|
3
|
+
# Desktop browser User-Agent strings (Chrome, Firefox, Safari on Windows/macOS).
# NOTE(review): presumably rotated for HTTP requests to avoid bot detection —
# confirm against the downloader's usage.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
]
|
|
23
9
|
|
|
24
|
-
#
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
10
|
+
# Search queries per category — centered on SBS content.
# (Translated from: 카테고리별 검색어 - SBS 콘텐츠 중심)
CATEGORY_QUERIES = {
    # 'face' category: interview / award-speech style clips.
    'face': [
        "SBS 인터뷰 클립",
        "런닝맨 멤버 인터뷰",
        "SBS 뉴스 인터뷰",
        "미운우리새끼 인터뷰",
        "SBS 스페셜 인물",
        "집사부일체 인터뷰",
        "그것이알고싶다 인터뷰",
        "SBS 연예대상 소감",
    ],
    # 'license_plate' category: car-related footage (used cars, car washes, tuning).
    'license_plate': [
        "중고차 매물 소개",
        "자동차 세차 영상",
        "신차 출고 브이로그",
        "자동차 튜닝 작업",
        "엔카 허위매물",
        "주차장 만차",
    ],
    # 'tattoo' category: tattoo-session footage (mixed Korean/English queries).
    'tattoo': [
        "타투 시술 영상",
        "tattoo timelapse",
        "타투이스트 작업",
        "tattoo artist work",
        "문신 시술",
        "tattoo session",
    ],
    # 'text' category: SBS variety-show clip compilations.
    'text': [
        "SBS 런닝맨 레전드",
        "SBS 미운우리새끼 명장면",
        "SBS 동상이몽 클립",
        "SBS 집사부일체 모음",
        "SBS 골목식당 레전드",
        "SBS 맛남의광장 클립",
        "SBS 불타는청춘 명장면",
        "SBS 정글의법칙 레전드",
        "SBS 예능",
    ],
}
|
|
56
50
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
"text": ["text"],
|
|
63
|
-
"knife": ["knife"],
|
|
64
|
-
"cigarette": ["cigarette"]
|
|
51
|
+
# Korean labels for each category key (presumably for display/reporting —
# confirm against the CLI's usage).
CATEGORY_NAMES = {
    'face': '얼굴',
    'license_plate': '번호판',
    'tattoo': '타투',
    'text': '텍스트'
}
|
|
66
57
|
|
|
67
|
-
|
|
58
|
+
# License-plate regex patterns.
# (Translated from: 번호판 정규식 패턴)
LICENSE_PLATE_PATTERNS = [
    r'\d{2,3}[가-힣]\d{4}',                      # Korean plate, e.g. "12가3456" / "123가4567"
    r'[가-힣]{2}\d{2}[가-힣]\d{4}',              # Korean plate with region prefix, e.g. "서울12가3456"
    r'[A-Z]{2,3}[-\s]?\d{2,4}[-\s]?[A-Z]{0,3}',  # Latin letters-digits(-letters), optional dash/space
    r'\d{2,4}[-\s]?[A-Z]{2,3}[-\s]?\d{2,4}',     # digits-letters-digits, optional dash/space
]
|
|
65
|
+
|
|
66
|
+
# Error messages that mark a video as skippable.
# (Translated from: 스킵할 에러 메시지)
SKIP_ERRORS = [
    "not available", "unavailable", "private", "removed",
    "deleted", "copyright", "blocked", "age", "sign in",
    "members-only", "premiere", "live event"
]
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
from yt_dlp import YoutubeDL
|
|
4
|
+
|
|
5
|
+
def get_url_list(file_path):
    """Parse a URL list file into records of url/timestamp/task.

    Each meaningful line has the form ``url, timestamp, task``. Blank
    lines, ``#`` comments, and lines with fewer than three comma-separated
    fields are ignored. Returns an empty list when the file is missing.
    """
    if not os.path.exists(file_path):
        return []

    entries = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('#'):
                continue
            fields = [field.strip() for field in stripped.split(',')]
            if len(fields) < 3:
                continue
            entries.append({
                'url': fields[0],
                'timestamp': fields[1],
                'task': fields[2],
            })
    return entries
|
|
23
|
+
|
|
24
|
+
def download_videos(url_list, output_dir):
    """Download each entry's video into *output_dir* via yt-dlp.

    Args:
        url_list: list of dicts with at least 'url' and 'task' keys
            (as produced by get_url_list).
        output_dir: directory for downloaded files; created if absent.

    Files are named ``{index:03d}_{task}_{title}.{ext}``. An entry whose
    index prefix already exists on disk is skipped, so the function is
    safe to re-run after an interruption.
    """
    os.makedirs(output_dir, exist_ok=True)
    for idx, item in enumerate(url_list, 1):
        url = item['url']
        task = item['task']
        index_str = f"{idx:03d}"

        # Resume support: any existing file with this index prefix counts as done.
        existing_files = [f for f in os.listdir(output_dir) if f.startswith(f"{index_str}_")]
        if existing_files:
            print(f"[{index_str}] Skip: {existing_files[0]}")
            continue

        print(f"[{index_str}] Downloading: {url} ({task})")
        try:
            ydl_opts = {
                # Prefer mp4 video + m4a audio, falling back to best single mp4, then best anything.
                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
                'outtmpl': os.path.join(output_dir, f"{index_str}_{task}_%(title)s.%(ext)s"),
                'quiet': True,
                'no_warnings': True,
            }
            with YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next URL.
            print(f"[{index_str}] Failed: {e}")
|
|
48
|
+
|
|
49
|
+
def get_video_duration(file_path):
    """Return the duration of a media file in seconds via ffprobe.

    Returns 0.0 when ffprobe is missing, the probe fails, or the output
    cannot be parsed as a float.
    """
    cmd = [
        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1', file_path
    ]
    try:
        output = subprocess.check_output(cmd).decode('utf-8').strip()
        return float(output)
    except (OSError, subprocess.SubprocessError, ValueError):
        # Narrowed from a bare except: covers missing binary, non-zero
        # exit, and unparsable output without hiding unrelated bugs.
        return 0.0

def timestamp_to_seconds(timestamp):
    """Convert an 'MM:SS' or 'HH:MM:SS' string to seconds.

    Returns 0.0 for any other format or unparsable input. HH:MM:SS
    support is a backward-compatible generalization (the previous
    version silently returned 0.0 for hour-long timestamps).
    """
    try:
        parts = timestamp.split(':')
        if len(parts) == 2:
            return int(parts[0]) * 60 + int(parts[1])
        if len(parts) == 3:
            return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
        return 0.0
    except (ValueError, AttributeError):
        return 0.0

def clip_video(input_path, output_path, center_timestamp, window_seconds=90):
    """Cut a clip of up to 2*window_seconds centered on center_timestamp.

    Tries a fast ffmpeg stream copy first; if that fails, retries with a
    full re-encode (x264 video, AAC audio). Returns True on success,
    False when the source is unreadable or both ffmpeg attempts fail.

    BUGFIX: the previous fallback spliced the re-encode codec args over
    cmd[7:9], which overwrote the input path and the '-c' flag (leaving a
    stray 'copy' argument), so the re-encode path could never succeed.
    The command is now rebuilt per attempt instead of mutated in place.
    """
    duration = get_video_duration(input_path)
    if duration == 0:
        return False

    center_sec = timestamp_to_seconds(center_timestamp)
    start_sec = max(0, center_sec - window_seconds)
    end_sec = min(duration, start_sec + (window_seconds * 2))

    # If the window was truncated by the end of the video, shift the start
    # back so we still extract a full-length clip when possible.
    if (end_sec - start_sec) < (window_seconds * 2) and start_sec > 0:
        start_sec = max(0, end_sec - (window_seconds * 2))

    actual_duration = end_sec - start_sec

    def _run_ffmpeg(codec_args):
        # One ffmpeg attempt; -ss before -i performs fast input seeking.
        cmd = [
            'ffmpeg', '-y', '-ss', str(start_sec), '-t', str(actual_duration),
            '-i', input_path, *codec_args, output_path
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return True
        except (OSError, subprocess.SubprocessError):
            return False

    # Fast path: stream copy (no re-encoding).
    if _run_ffmpeg(['-c', 'copy']):
        return True
    # Fallback: full re-encode for sources where stream copy fails.
    return _run_ffmpeg(['-c:v', 'libx264', '-crf', '23', '-c:a', 'aac'])
|
|
96
|
+
|
|
97
|
+
def build_dataset(url_file, output_root="."):
    """Download every video listed in *url_file*, then clip each one.

    Full downloads land in ``<output_root>/video`` and clipped results in
    ``<output_root>/video_clips``. Both steps skip work that already
    exists on disk, so the build is restartable.
    """
    video_dir = os.path.join(output_root, "video")
    clip_dir = os.path.join(output_root, "video_clips")

    urls = get_url_list(url_file)
    if not urls:
        print(f"Error: No valid data in {url_file}")
        return

    print(f"--- Step 1: Downloading {len(urls)} videos ---")
    download_videos(urls, video_dir)

    print(f"\n--- Step 2: Clipping videos ---")
    os.makedirs(clip_dir, exist_ok=True)
    for idx, item in enumerate(urls, 1):
        prefix = f"{idx:03d}_"
        matches = [name for name in os.listdir(video_dir) if name.startswith(prefix)]
        if not matches:
            continue

        source_path = os.path.join(video_dir, matches[0])
        target_path = os.path.join(clip_dir, matches[0])
        # Already clipped on a previous run — leave it alone.
        if os.path.exists(target_path):
            continue

        print(f"[{idx:03d}] Clipping: {matches[0]}")
        clip_video(source_path, target_path, item['timestamp'])

    print(f"\nDone! Clips saved in: {os.path.abspath(clip_dir)}")
|
|
125
|
+
|
|
126
|
+
def main():
    """CLI entry point: parse command-line arguments and build the dataset."""
    import argparse

    parser = argparse.ArgumentParser(description='Build SBS Dataset from YouTube URL list')
    parser.add_argument('file', help='Path to youtube_url.txt')
    parser.add_argument('-o', '--output', default='.', help='Output root directory (default: .)')
    options = parser.parse_args()

    build_dataset(options.file, options.output)

if __name__ == "__main__":
    main()
|