PyPI - ytcollector - Versions diffs - 1.0.3__tar.gz → 1.0.5__tar.gz - Mend

ytcollector 1.0.3.tar.gz → 1.0.5.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

{ytcollector-1.0.3 → ytcollector-1.0.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ytcollector
-Version: 1.0.3
+Version: 1.0.5
 Summary: SBS 데이터셋 수집기
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown

{ytcollector-1.0.3 → ytcollector-1.0.5}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ytcollector"
-version = "1.0.3"
+version = "1.0.5"
 description = "SBS 데이터셋 수집기"
 readme = "README.md"
 requires-python = ">=3.9"

{ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/__init__.py RENAMED Viewed

@@ -3,7 +3,7 @@ SBS Dataset Collector - YouTube 영상 수집 및 YOLO-World 검증 파이프라
 """
 from pathlib import Path
-__version__ = "1.0.3"
+__version__ = "1.0.5"
 __author__ = "SBS Dataset Team"
 # Package root directory

{ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/cli.py RENAMED Viewed

@@ -48,24 +48,38 @@ def run_download(args):
     base_dir = Path(args.dir) if args.dir else Path.cwd()
-    # 파일 경로: video/{task}/youtube_url.txt
-    txt_file = get_url_file_path(base_dir, args.task)
+    # argparse에서 nargs='+'로 받아오면 args.task는 항상 리스트
+    tasks = args.task if isinstance(args.task, list) else [args.task]
-    if not txt_file.exists():
-        logger.error(f"URL file not found: {txt_file}")
-        logger.info("Run 'downloader init' first to create project structure")
-        return
-    logger.info(f"Starting{' fast' if args.fast else ''} download for task: {args.task}")
-    if args.fast:
-        from .downloader import download_from_txt_parallel
-        results = download_from_txt_parallel(txt_file, args.task, base_dir)
-    else:
-        results = download_from_txt(txt_file, args.task, base_dir)
+    total_success = 0
+    total_processed = 0
-    success_count = sum(1 for r in results if r.get('success'))
-    print(f"✓ Download complete: {success_count}/{len(results)} successful")
+    for task in tasks:
+        logger.info(f"=== Processing Task: {task} ===")
+        # 파일 경로: video/{task}/youtube_url.txt
+        txt_file = get_url_file_path(base_dir, task)
+        if not txt_file.exists():
+            logger.error(f"URL file not found: {txt_file}")
+            logger.info(f"Skipping {task}. Run 'ytcollector init' first.")
+            continue
+        logger.info(f"Starting{' fast' if args.fast else ''} download for task: {task}")
+        if args.fast:
+            from .downloader import download_from_txt_parallel
+            results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count)
+        else:
+            results = download_from_txt(txt_file, task, base_dir, max_count=args.count)
+        success_count = sum(1 for r in results if r.get('success'))
+        total_success += success_count
+        total_processed += len(results)
+        print(f"✓ Task '{task}' complete: {success_count}/{len(results)} successful")
+    print(f"\n✓ All tasks complete: {total_success}/{total_processed} successful total")
 def run_download_single(args):
@@ -172,7 +186,8 @@ Examples:
     # Download
     download_parser = subparsers.add_parser('download', help='Download from youtube_url.txt')
-    download_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
+    download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
+    download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
     download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
     # Download single
@@ -184,12 +199,12 @@ Examples:
     # Verify
     verify_parser = subparsers.add_parser('verify', help='Verify with YOLO-World')
-    verify_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
+    verify_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
     verify_parser.add_argument('--video', '-v', help='Specific video file')
     # Pipeline
     pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
-    pipeline_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
+    pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
     pipeline_parser.add_argument('--verify', action='store_true')
     # List tasks

{ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/config.py RENAMED Viewed

@@ -41,7 +41,7 @@ MAX_CLIP_DURATION = 180    # 최대 3분
 # Download settings
 VIDEO_FORMAT = "best[ext=mp4]/best"
 DOWNLOAD_RETRIES = 3
-MAX_VIDEOS_PER_TASK = 100  # 태스크별 최대 영상 저장 수
+MAX_VIDEOS_PER_TASK = 1000  # 태스크별 최대 영상 저장 수 (CLI -n 옵션으로 덮어쓰기 가능)
 # Fast Mode Settings (Parallel)
 MAX_WORKERS = 4              # 병렬 작업 프로세스 수

{ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/downloader.py RENAMED Viewed

@@ -247,31 +247,34 @@ def parse_txt_line(line: str) -> Optional[Dict]:
         return None
-def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None) -> list:
+def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
     """TXT 파일에서 다운로드 실행 (순차)"""
     # 기존 로직을 process_single_item 함수로 분리하여 재사용할 수 있으면 좋겠지만,
     # 코드 구조상 일단 순차 실행 유지하고 parallel 함수 별도 구현
-    return _process_download_loop(txt_path, task_type, base_dir, parallel=False)
+    return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count)
-def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None) -> list:
+def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
     """TXT 파일에서 병렬 다운로드 실행 (Fast Mode)"""
-    return _process_download_loop(txt_path, task_type, base_dir, parallel=True)
+    return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count)
-def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False) -> list:
+def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None) -> list:
     from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
     import time
     import random
     from concurrent.futures import ThreadPoolExecutor, as_completed
+    # max_count가 없으면 config의 기본값 사용
+    limit = max_count if max_count is not None else MAX_VIDEOS_PER_TASK
     results = []
     downloader = VideoDownloader(task_type, base_dir)
     # 시작 전 개수 확인
     initial_count = downloader.get_saved_video_count()
-    if initial_count >= MAX_VIDEOS_PER_TASK:
-        logger.warning(f"Task '{task_type}' already has {initial_count} videos. Skipping.")
+    if initial_count >= limit:
+        logger.warning(f"Task '{task_type}' already has {initial_count} videos (Limit: {limit}). Skipping.")
         return results
     if not txt_path.exists():
@@ -291,14 +294,26 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
     if not items:
         return results
-    logger.info(f"Found {len(items)} URLs. Starting {'parallel' if parallel else 'sequential'} download...")
+    logger.info(f"Found {len(items)} URLs. Target: {limit} videos (Current: {initial_count}). Starting {'parallel' if parallel else 'sequential'} download...")
     def process_item(data):
+        # 현재 개수 체크 (루프 도중 목표 달성 시 중단 위함)
+        # 주의: 병렬 처리 시 정확한 count 동기화는 Lock이 필요하지만, 여기선 대략적인 체크로 충분
+        current = downloader.get_saved_video_count()
+        if current >= limit:
+            raise LimitReachedError("Target count reached")
         # 방화벽 우회용 랜덤 딜레이 (병렬 모드에서도 적용하여 동시 요청 폭주 완화)
         if parallel:
             time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))
         try:
+            # VideoDownloader 내부의 limit 체크는 config 값을 쓰므로,
+            # 여기서는 외부에서 주입된 limit을 강제할 방법이 필요하거나,
+            # 단순히 루프 레벨에서 제어하면 됨.
+            # download_clip_at_timestamp 메서드는 내부적으로 MAX_VIDEOS_PER_TASK를 체크하므로,
+            # 이를 우회하거나 단순 루프 제어로 처리.
             output_path, metadata = downloader.download_clip_at_timestamp(
                 url=data['url'],
                 timestamp_min=data['timestamp_min'],
@@ -329,8 +344,7 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
             }
         except LimitReachedError:
-            # 병렬 실행 중에는 이 예외 처리 방식이 조금 다를 수 있음 (다른 스레드 멈추게 하려면 Event 사용 등)
-            # 여기서는 개별 스레드 종료로 처리
+            # 내부에서 발생한 LimitReachedError도 처리
             return {'success': False, 'error': 'Limit reached', 'status': 'limit_reached'}
         except Exception as e:
@@ -350,18 +364,27 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
             # 진행 상황 표시
             from tqdm import tqdm
             for future in tqdm(as_completed(futures), total=len(items), desc="Fast Download"):
-                res = future.result()
-                results.append(res)
-                if res.get('status') == 'limit_reached':
-                    logger.warning("Download limit reached. Stopping remaining tasks.")
-                    executor.shutdown(wait=False, cancel_futures=True)
-                    break
+                try:
+                    res = future.result()
+                    results.append(res)
+                    if res.get('status') == 'limit_reached' or downloader.get_saved_video_count() >= limit:
+                        logger.warning(f"Download limit ({limit}) reached. Stopping remaining tasks.")
+                        executor.shutdown(wait=False, cancel_futures=True)
+                        break
+                except Exception:
+                    continue
     else:
         # 순차 실행
         for item in items:
+            # 루프 시작 전 체크
+            if downloader.get_saved_video_count() >= limit:
+                logger.info(f"Target count ({limit}) reached. Stopping.")
+                break
             res = process_item(item)
             results.append(res)
             if res.get('status') == 'limit_reached':
+                logger.info(f"Target count ({limit}) reached. Stopping.")
                 break
     return results

{ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ytcollector
-Version: 1.0.3
+Version: 1.0.5
 Summary: SBS 데이터셋 수집기
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown