PyPI - ytcollector - Versions diffs - 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl - Mend

ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

ytcollector/__init__.py +36 -11
ytcollector/analyzer.py +205 -0
ytcollector/cli.py +186 -218
ytcollector/config.py +66 -62
ytcollector/dataset_builder.py +136 -0
ytcollector/downloader.py +328 -480
ytcollector-1.0.9.dist-info/METADATA +207 -0
ytcollector-1.0.9.dist-info/RECORD +11 -0
ytcollector-1.0.9.dist-info/entry_points.txt +4 -0
{ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/top_level.txt +0 -1
config/settings.py +0 -39
ytcollector/utils.py +0 -144
ytcollector/verifier.py +0 -187
ytcollector-1.0.8.dist-info/METADATA +0 -105
ytcollector-1.0.8.dist-info/RECORD +0 -12
ytcollector-1.0.8.dist-info/entry_points.txt +0 -2
{ytcollector-1.0.8.dist-info → ytcollector-1.0.9.dist-info}/WHEEL +0 -0

ytcollector/__init__.py CHANGED Viewed

@@ -1,14 +1,39 @@
-"""
-SBS Dataset Collector - YouTube 영상 수집 및 YOLO-World 검증 파이프라인
-"""
-from pathlib import Path
+"""YouTube 콘텐츠 수집기 라이브러리
-__version__ = "1.0.8"
-__author__ = "SBS Dataset Team"
+외부에서 라이브러리로 사용하거나 CLI로 실행할 수 있습니다.
+라이브러리 사용 예시:
+    from ytcollector import YouTubeDownloader, run
+    # 방법 1: YouTubeDownloader 직접 사용
+    downloader = YouTubeDownloader(output_path="./videos")
+    count = downloader.collect("face", max_videos=5)
+    # 방법 2: run() 함수 사용 (간단한 방법)
+    results = run(categories=["face", "text"], count=3)
+CLI 사용 예시:
+    ytcollector -c face -n 5
+    ytc -c face text --fast
+"""
-# Package root directory
-PACKAGE_DIR = Path(__file__).parent
+from .config import CATEGORY_NAMES, CATEGORY_QUERIES, USER_AGENTS, LICENSE_PLATE_PATTERNS
+from .analyzer import VideoAnalyzer, check_dependencies
+from .downloader import YouTubeDownloader
+from .cli import run, main as cli_main
-# Default data directories (can be overridden)
-DEFAULT_DATA_DIR = Path.cwd() / "data"
-DEFAULT_OUTPUT_DIR = Path.cwd() / "outputs"
+__version__ = "1.0.0"
+__all__ = [
+    # 주요 클래스
+    "VideoAnalyzer",
+    "YouTubeDownloader",
+    # 설정
+    "CATEGORY_NAMES",
+    "CATEGORY_QUERIES",
+    "USER_AGENTS",
+    "LICENSE_PLATE_PATTERNS",
+    # 유틸리티
+    "check_dependencies",
+    "run",
+    "cli_main",
+]

ytcollector/analyzer.py ADDED Viewed

@@ -0,0 +1,205 @@
+import re
+from .config import LICENSE_PLATE_PATTERNS
+# 선택적 import
+try:
+    import cv2
+    CV2_AVAILABLE = True
+except ImportError:
+    CV2_AVAILABLE = False
+try:
+    import easyocr
+    EASYOCR_AVAILABLE = True
+except ImportError:
+    EASYOCR_AVAILABLE = False
+try:
+    import numpy as np
+    NUMPY_AVAILABLE = True
+except ImportError:
+    NUMPY_AVAILABLE = False
+class VideoAnalyzer:
+    """영상 분석 클래스 - 얼굴, 텍스트, 번호판, 타투 감지"""
+    def __init__(self):
+        self.ocr_reader = None
+        self.face_cascade = None
+        if CV2_AVAILABLE:
+            cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
+            self.face_cascade = cv2.CascadeClassifier(cascade_path)
+    def _init_ocr(self):
+        """OCR 리더 초기화 (필요할 때만)"""
+        if EASYOCR_AVAILABLE and self.ocr_reader is None:
+            print("  OCR 엔진 초기화 중...")
+            self.ocr_reader = easyocr.Reader(['ko', 'en'], gpu=False, verbose=False)
+    def extract_frames(self, video_path, num_frames=10):
+        """영상에서 균등 간격으로 프레임 추출"""
+        if not CV2_AVAILABLE:
+            return []
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            return []
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if total_frames <= 0:
+            cap.release()
+            return []
+        frame_indices = [int(i * total_frames / (num_frames + 1)) for i in range(1, num_frames + 1)]
+        frames = []
+        for idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+            ret, frame = cap.read()
+            if ret:
+                frames.append(frame)
+        cap.release()
+        return frames
+    def detect_faces(self, frame):
+        """Haar Cascade로 얼굴 감지"""
+        if not CV2_AVAILABLE or self.face_cascade is None:
+            return []
+        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+        return self.face_cascade.detectMultiScale(
+            gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
+        )
+    def detect_text(self, frame):
+        """EasyOCR로 텍스트 감지"""
+        if not EASYOCR_AVAILABLE:
+            return []
+        self._init_ocr()
+        try:
+            h, w = frame.shape[:2]
+            if w > 640:
+                scale = 640 / w
+                frame = cv2.resize(frame, (640, int(h * scale)))
+            results = self.ocr_reader.readtext(frame)
+            return [r[1] for r in results if r[2] > 0.3]
+        except:
+            return []
+    def detect_license_plate(self, texts):
+        """텍스트에서 번호판 패턴 감지"""
+        for text in texts:
+            text_clean = text.replace(' ', '').upper()
+            for pattern in LICENSE_PLATE_PATTERNS:
+                if re.search(pattern, text_clean):
+                    return True
+        return False
+    def detect_tattoo(self, frame):
+        """피부 영역에서 타투(어두운 잉크 패턴) 감지"""
+        if not CV2_AVAILABLE or not NUMPY_AVAILABLE:
+            return False
+        try:
+            hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
+            # 피부색 범위
+            lower_skin = np.array([0, 30, 80], dtype=np.uint8)
+            upper_skin = np.array([17, 170, 255], dtype=np.uint8)
+            skin_mask = cv2.inRange(hsv, lower_skin, upper_skin)
+            # 노이즈 제거
+            kernel = np.ones((5, 5), np.uint8)
+            skin_mask = cv2.morphologyEx(skin_mask, cv2.MORPH_OPEN, kernel)
+            skin_mask = cv2.morphologyEx(skin_mask, cv2.MORPH_CLOSE, kernel)
+            skin_pixels = cv2.countNonZero(skin_mask)
+            total_pixels = frame.shape[0] * frame.shape[1]
+            # 피부 영역 최소 10% 필요
+            if skin_pixels < total_pixels * 0.10:
+                return False
+            # 피부 영역 내 어두운 픽셀(타투) 감지
+            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+            skin_gray = cv2.bitwise_and(gray, gray, mask=skin_mask)
+            dark_mask = cv2.inRange(skin_gray, 1, 80)
+            dark_pixels = cv2.countNonZero(dark_mask)
+            dark_ratio = dark_pixels / max(skin_pixels, 1)
+            # 어두운 영역이 3~35%일 때 타투로 판정
+            if 0.03 < dark_ratio < 0.35:
+                contours, _ = cv2.findContours(dark_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                significant = [c for c in contours if cv2.contourArea(c) > 100]
+                return len(significant) >= 1
+            return False
+        except:
+            return False
+    def analyze(self, video_path):
+        """영상 전체 분석"""
+        results = {
+            'face': False,
+            'text': False,
+            'license_plate': False,
+            'tattoo': False,
+            'face_count': 0,
+            'detected_texts': []
+        }
+        if not CV2_AVAILABLE:
+            print("  ⚠ OpenCV 미설치")
+            return results
+        frames = self.extract_frames(video_path, num_frames=8)
+        if not frames:
+            print("  ⚠ 프레임 추출 실패")
+            return results
+        all_texts = []
+        total_faces = 0
+        for i, frame in enumerate(frames):
+            # 얼굴
+            faces = self.detect_faces(frame)
+            if len(faces) > 0:
+                results['face'] = True
+                total_faces += len(faces)
+            # 텍스트 (일부 프레임만)
+            if i % 2 == 0 and EASYOCR_AVAILABLE:
+                texts = self.detect_text(frame)
+                if texts:
+                    results['text'] = True
+                    all_texts.extend(texts)
+            # 타투
+            if self.detect_tattoo(frame):
+                results['tattoo'] = True
+        # 번호판 (텍스트에서)
+        if all_texts:
+            results['license_plate'] = self.detect_license_plate(all_texts)
+            results['detected_texts'] = list(set(all_texts))[:10]
+        results['face_count'] = total_faces
+        return results
+def check_dependencies():
+    """의존성 체크"""
+    missing = []
+    if not CV2_AVAILABLE:
+        missing.append("opencv-python")
+    if not EASYOCR_AVAILABLE:
+        missing.append("easyocr")
+    if not NUMPY_AVAILABLE:
+        missing.append("numpy")
+    return missing

ytcollector/cli.py CHANGED Viewed

@@ -1,234 +1,202 @@
 #!/usr/bin/env python3
-"""
-SBS Dataset Collector CLI (Updated)
-"""
-import argparse
-import logging
-from pathlib import Path
+"""YouTube 콘텐츠 수집기 - CLI 모듈"""
-# Package import modified to 'downloader'
-from .config import TASK_CLASSES, VALID_TASKS, get_paths
-from .utils import ensure_dir, get_url_file_path
+import argparse
+import os
+import sys
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
+from .config import CATEGORY_NAMES
+from .downloader import YouTubeDownloader
+from .analyzer import check_dependencies
-def init_project(args):
-    """프로젝트 디렉토리 초기화"""
-    base_dir = Path(args.dir) if args.dir else Path.cwd()
-    # 1. 태스크별 폴더 및 youtube_url.txt 생성
-    for task in VALID_TASKS:
-        # get_url_file_path 내부에서 ensure_dir 호출로 폴더 생성됨
-        txt_path = get_url_file_path(base_dir, task)
-        if not txt_path.exists():
-            txt_path.write_text(
-                "task_type,url,timestamp_min,timestamp_sec,description\n"
-                f"{task},https://www.youtube.com/watch?v=EXAMPLE,2,30,샘플\n",
-                encoding='utf-8'
-            )
-    # 2. config.py의 get_paths 로직에 따른 폴더들 생성 확인
-    paths = get_paths(base_dir)
-    ensure_dir(paths['outputs'])
-    print(f"✓ Project initialized at: {base_dir}")
-    print(f"  - Add URLs to: urls/<task>/youtube_url.txt")
-    print(f"  - Videos will be saved to configured OUTPUT_DIR (or video/ folder)")
+def create_parser():
+    """CLI 인자 파서 생성"""
+    parser = argparse.ArgumentParser(
+        prog='ytcollector',
+        description='YouTube 콘텐츠 수집기 - 얼굴, 번호판, 타투, 텍스트 감지',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+예시:
+  ytcollector -c face                    # 얼굴 카테고리 5개
+  ytcollector -c face text -n 10         # 얼굴, 텍스트 각 10개
+  ytcollector -c face --fast             # 고속 모드 (병렬 다운로드)
+  ytcollector -c face --fast -w 5        # 5개 동시 다운로드
+  ytcollector -c license_plate -d 5      # 번호판, 최대 5분
+  # 짧은 명령어도 사용 가능
+  ytc -c face -n 3
+        """
+    )
+    parser.add_argument(
+        '-c', '--categories',
+        nargs='+',
+        choices=['face', 'license_plate', 'tattoo', 'text'],
+        default=['face'],
+        help='수집할 카테고리 (기본: face)'
+    )
+    parser.add_argument(
+        '-n', '--count',
+        type=int,
+        default=5,
+        help='카테고리당 다운로드 수 (기본: 5)'
+    )
+    parser.add_argument(
+        '-d', '--duration',
+        type=int,
+        default=3,
+        help='최대 영상 길이(분) (기본: 3)'
+    )
+    parser.add_argument(
+        '-o', '--output',
+        type=str,
+        default=os.path.expanduser("~/youtube"),
+        help='저장 경로 (기본: ~/youtube)'
+    )
+    parser.add_argument(
+        '--fast',
+        action='store_true',
+        help='고속 모드 (병렬 다운로드, 딜레이 최소화)'
+    )
+    parser.add_argument(
+        '-w', '--workers',
+        type=int,
+        default=3,
+        help='병렬 다운로드 수 (기본: 3, --fast 필요)'
+    )
+    parser.add_argument(
+        '--proxy',
+        type=str,
+        default=None,
+        help='프록시 (예: http://proxy:8080)'
+    )
+    parser.add_argument(
+        '-v', '--version',
+        action='version',
+        version='%(prog)s 1.0.9'
+    )
+    parser.add_argument(
+        '--check-deps',
+        action='store_true',
+        help='의존성 확인 후 종료'
+    )
-def run_download(args):
-    """TXT 파일에서 영상 다운로드"""
-    from .downloader import download_from_txt  # Changed function name
-    base_dir = Path(args.dir) if args.dir else Path.cwd()
-    # argparse에서 nargs='+'로 받아오면 args.task는 항상 리스트
-    tasks = args.task if isinstance(args.task, list) else [args.task]
-    total_success = 0
-    total_processed = 0
-    for task in tasks:
-        logger.info(f"=== Processing Task: {task} ===")
-        # 파일 경로: video/{task}/youtube_url.txt
-        txt_file = get_url_file_path(base_dir, task)
-        if not txt_file.exists():
-            logger.error(f"URL file not found: {txt_file}")
-            logger.info(f"Skipping {task}. Run 'ytcollector init' first.")
-            continue
-        logger.info(f"Starting{' fast' if args.fast else ''} download for task: {task}")
-        if args.fast:
-            from .downloader import download_from_txt_parallel
-            results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
-        else:
-            results = download_from_txt(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
-        success_count = sum(1 for r in results if r.get('success'))
-        total_success += success_count
-        total_processed += len(results)
-        print(f"✓ Task '{task}' complete: {success_count}/{len(results)} successful")
+    return parser
+def run(
+    categories=None,
+    count=5,
+    duration=3,
+    output=None,
+    fast=False,
+    workers=3,
+    proxy=None,
+    quiet=False
+):
+    """
+    프로그래밍 방식으로 수집기 실행
+    Args:
+        categories: 카테고리 리스트 (예: ['face', 'text'])
+        count: 카테고리당 다운로드 수
+        duration: 최대 영상 길이(분)
+        output: 저장 경로
+        fast: 고속 모드 여부
+        workers: 병렬 다운로드 수
+        proxy: 프록시 URL
+        quiet: 조용한 모드 (출력 최소화)
-    print(f"\n✓ All tasks complete: {total_success}/{total_processed} successful total")
-def run_download_single(args):
-    """단일 URL 다운로드"""
-    from .downloader import VideoDownloader
-    base_dir = Path(args.dir) if args.dir else Path.cwd()
-    downloader = VideoDownloader(args.task, base_dir)
-    try:
-        output_path, metadata = downloader.download_clip_at_timestamp(
-            url=args.url,
-            timestamp_min=args.timestamp_min,
-            timestamp_sec=args.timestamp_sec
-        )
-        status = "Cached" if metadata.get('cached') else "Downloaded"
-        print(f"✓ {status}: {output_path}")
-        if not metadata.get('cached'):
-            print(f"  Clip duration: {metadata['clip_duration']}s")
-    except Exception as e:
-        logger.error(f"Download failed: {e}")
-def run_verify(args):
-    """클립 영상 검증"""
-    from .verifier import verify_clip, batch_verify
+    Returns:
+        dict: 카테고리별 성공 다운로드 수
+    """
+    if categories is None:
+        categories = ['face']
+    if output is None:
+        output = os.path.expanduser("~/youtube")
+    # 의존성 체크
+    missing = check_dependencies()
+    if missing and not quiet:
+        print(f"⚠ 분석 기능을 위해 설치 필요: pip install {' '.join(missing)}")
+    # 다운로더 생성
+    downloader = YouTubeDownloader(
+        output_path=output,
+        max_duration=duration * 60,
+        proxy=proxy,
+        fast_mode=fast,
+        workers=workers
+    )
-    base_dir = Path(args.dir) if args.dir else Path.cwd()
+    results = {}
+    for category in categories:
+        count_success = downloader.collect(category, count)
+        results[category] = count_success
-    if args.video:
-        video_path = Path(args.video)
-        result = verify_clip(video_path, args.task, base_dir)
-        print_verification_result(result)
-    else:
-        # 폴더 경로: video/{task}/
-        clips_dir = base_dir / "video" / args.task
-        if not clips_dir.exists():
-            logger.error(f"Video directory not found: {clips_dir}")
-            return
-        results = batch_verify(clips_dir, args.task, base_dir)
-        valid_count = sum(1 for r in results if r.get('is_valid'))
-        print(f"✓ Verification complete: {valid_count}/{len(results)} valid")
+    return results
-def run_pipeline(args):
-    """다운로드 + 검증 전체 파이프라인"""
-    print(f"=== Starting pipeline for task: {args.task} ===")
-    run_download(args)
-    if args.verify:
-        print("\n--- Running verification ---")
-        run_verify(args)
-    print("=== Pipeline complete ===")
+def main(args=None):
+    """CLI 메인 엔트리포인트"""
+    parser = create_parser()
+    parsed_args = parser.parse_args(args)
-def print_verification_result(result: dict):
-    """검증 결과 출력"""
-    summary = result.get('summary', {})
-    print("\n" + "="*50)
-    print(f"Video: {Path(result['video_path']).name}")
-    print(f"Task: {result['task_type']}")
-    print(f"Classes: {result['classes']}")
-    print("-"*50)
-    print(f"Duration: {summary.get('duration_sec', 0):.1f}s")
-    print(f"Frames with detection: {summary.get('frames_with_detection', 0)}")
-    print(f"Detection rate: {summary.get('detection_rate', 0):.1%}")
-    print(f"Valid: {'✓ YES' if result.get('is_valid') else '✗ NO'}")
-    print("="*50)
-def list_tasks(args):
-    """태스크 목록 출력"""
-    print("\nAvailable Tasks and YOLO-World Classes:")
-    print("-" * 50)
-    for task, classes in TASK_CLASSES.items():
-        print(f"\n{task}:")
-        for cls in classes:
-            print(f"  - {cls}")
-def main():
-    parser = argparse.ArgumentParser(
-        description='Downloader - SBS Dataset Collector',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  downloader init                           # 프로젝트 초기화
-  downloader download --task face           # 텍스트 파일에서 다운로드
-  downloader verify --task face             # YOLO 검증
-        """
+    # 의존성 확인 모드
+    if parsed_args.check_deps:
+        missing = check_dependencies()
+        if missing:
+            print("⚠ 누락된 의존성:")
+            for dep in missing:
+                print(f"  - {dep}")
+            print(f"\n설치: pip install {' '.join(missing)}")
+            sys.exit(1)
+        else:
+            print("✅ 모든 의존성이 설치되어 있습니다.")
+            sys.exit(0)
+    # 시작 메시지
+    print("\n" + "=" * 60)
+    print("YouTube 콘텐츠 수집기")
+    print("=" * 60)
+    print(f"카테고리: {', '.join([CATEGORY_NAMES[c] for c in parsed_args.categories])}")
+    print(f"개수: 카테고리당 {parsed_args.count}개")
+    print(f"최대길이: {parsed_args.duration}분")
+    print(f"저장경로: {parsed_args.output}")
+    if parsed_args.fast:
+        print(f"모드: ⚡ 고속 (병렬 {parsed_args.workers}개)")
+    if parsed_args.proxy:
+        print(f"프록시: {parsed_args.proxy}")
+    # 의존성 체크
+    missing = check_dependencies()
+    if missing:
+        print(f"\n⚠ 분석 기능을 위해 설치 필요:")
+        print(f"  pip install {' '.join(missing)}")
+    # 수집 실행
+    results = run(
+        categories=parsed_args.categories,
+        count=parsed_args.count,
+        duration=parsed_args.duration,
+        output=parsed_args.output,
+        fast=parsed_args.fast,
+        workers=parsed_args.workers,
+        proxy=parsed_args.proxy
     )
-    parser.add_argument('--dir', '-d', help='Project directory (default: current)')
-    subparsers = parser.add_subparsers(dest='command', help='Commands')
-    # Init
-    init_parser = subparsers.add_parser('init', help='Initialize project directory')
-    # Download
-    download_parser = subparsers.add_parser('download', help='Download from youtube_url.txt')
-    download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
-    download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
-    download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
-    download_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip YOLO verification and save all clips')
-    # Download single
-    single_parser = subparsers.add_parser('download-single', help='Download single video')
-    single_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
-    single_parser.add_argument('--url', '-u', required=True, help='YouTube URL')
-    single_parser.add_argument('--timestamp-min', '-m', type=int, required=True)
-    single_parser.add_argument('--timestamp-sec', '-s', type=int, required=True)
-    # Verify
-    verify_parser = subparsers.add_parser('verify', help='Verify with YOLO-World')
-    verify_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
-    verify_parser.add_argument('--video', '-v', help='Specific video file')
-    # Pipeline
-    pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
-    pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
-    pipeline_parser.add_argument('--verify', action='store_true')
-    pipeline_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip verification in download stage')
-    # List tasks
-    subparsers.add_parser('list-tasks', help='List available tasks')
-    args = parser.parse_args()
-    if args.command is None:
-        parser.print_help()
-        return
-    commands = {
-        'init': init_project,
-        'download': run_download,
-        'download-single': run_download_single,
-        'verify': run_verify,
-        'pipeline': run_pipeline,
-        'list-tasks': list_tasks,
-    }
-    commands[args.command](args)
+    # 완료 메시지
+    total = sum(results.values())
+    print(f"\n{'='*60}")
+    print(f"완료! 총 {total}개 저장")
+    for cat, cnt in results.items():
+        print(f"  - {CATEGORY_NAMES[cat]}: {cnt}개")
+    print(f"{'='*60}\n")
+    return 0
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    sys.exit(main())

ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl

ytcollector 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl