ytcollector 1.0.7__tar.gz → 1.0.8__tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ytcollector
- Version: 1.0.7
+ Version: 1.0.8
  Summary: SBS dataset collector
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "ytcollector"
- version = "1.0.7"
+ version = "1.0.8"
  description = "SBS dataset collector"
  readme = "README.md"
  requires-python = ">=3.9"
@@ -3,7 +3,7 @@ SBS Dataset Collector - YouTube video collection and YOLO-World verification pipeline
  """
  from pathlib import Path

- __version__ = "1.0.7"
+ __version__ = "1.0.8"
  __author__ = "SBS Dataset Team"

  # Package root directory
@@ -69,9 +69,9 @@ def run_download(args):

      if args.fast:
          from .downloader import download_from_txt_parallel
-         results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count)
+         results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
      else:
-         results = download_from_txt(txt_file, task, base_dir, max_count=args.count)
+         results = download_from_txt(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)

      success_count = sum(1 for r in results if r.get('success'))
      total_success += success_count
@@ -189,6 +189,7 @@ Examples:
      download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
      download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
      download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
+     download_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip YOLO verification and save all clips')

      # Download single
      single_parser = subparsers.add_parser('download-single', help='Download single video')
@@ -206,6 +207,7 @@ Examples:
      pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
      pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
      pipeline_parser.add_argument('--verify', action='store_true')
+     pipeline_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip verification in download stage')

      # List tasks
      subparsers.add_parser('list-tasks', help='List available tasks')
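For readers tracing how the new flag reaches the downloader: argparse derives the attribute name from the long option, so `--skip-verify` surfaces as `args.skip_verify`, which `run_download` forwards in the earlier hunk. A minimal, self-contained sketch of that wiring, not the package's actual cli module:

import argparse

# Simplified stand-in for the CLI above; only the options relevant to this diff.
parser = argparse.ArgumentParser(prog='ytcollector')
subparsers = parser.add_subparsers(dest='command')
download_parser = subparsers.add_parser('download')
download_parser.add_argument('--task', '-t', required=True, nargs='+')
download_parser.add_argument('--fast', action='store_true')
download_parser.add_argument('--skip-verify', '-S', action='store_true')

args = parser.parse_args(['download', '--task', 'face', '--skip-verify'])
assert args.skip_verify is True   # hyphens become underscores in the dest name
assert args.fast is False         # store_true flags default to False when omitted

Reusing the same `-S` short option on both subcommands keeps the flag consistent between `download` and `pipeline`.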
@@ -145,7 +145,8 @@ class VideoDownloader:
          self,
          url: str,
          timestamp_min: int,
-         timestamp_sec: int
+         timestamp_sec: int,
+         skip_verify: bool = False
      ) -> Tuple[Optional[Path], Optional[dict]]:
          """
          Download a ±1:30 clip around the given timestamp
@@ -207,18 +208,24 @@ class VideoDownloader:
          try:
              self.download_segment(url, start_sec, end_sec, temp_path)

-             # 5. YOLO verification
-             logger.info(f"Verifying content for task: {self.task_type}...")
-             # Verify using the verifier module
-             verify_result = verify_clip(temp_path, self.task_type, self.base_dir)
-
-             if not verify_result.get('is_valid', False):
-                 logger.warning(f"Verification failed: No {self.task_type} detected. Deleting...")
-                 if temp_path.exists():
-                     temp_path.unlink()
-                 return None, None
+             # 5. YOLO verification (runs only when skip_verify is False)
+             if not skip_verify:
+                 logger.info(f"Verifying content for task: {self.task_type}...")
+                 # Verify using the verifier module
+                 verify_result = verify_clip(temp_path, self.task_type, self.base_dir)
+
+                 logger.info(f"Verification Info - Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}, Is Valid: {verify_result.get('is_valid')}")
+
+                 if not verify_result.get('is_valid', False):
+                     logger.warning(f"Verification failed: No {self.task_type} detected (Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}). Deleting...")
+                     if temp_path.exists():
+                         temp_path.unlink()
+                     return None, None
+             else:
+                 logger.info(f"Skipping verification for task: {self.task_type} (--skip-verify enabled)")
+                 verify_result = {'is_valid': True, 'skipped': True}

-             # 6. Verification passed -> final save (sequential filename generation)
+             # 6. Verification passed (or skipped) -> final save (sequential filename generation)
              final_path = get_clip_path(self.base_dir, self.task_type, filename=None)

              # Move (use shutil.move for network drives)
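Callers should note that the skip path substitutes a sentinel dict for real detection stats. A hedged sketch of how downstream code might branch on it; the detection_rate_or_none helper is hypothetical, not part of the package. (The new Rate log line also formats the rate with :.2%, which presumes the verifier always returns a number there.)

# Hypothetical helper (not in ytcollector): distinguish skipped clips from
# verified ones before trusting detection statistics.
def detection_rate_or_none(verify_result: dict):
    if verify_result.get('skipped'):
        return None  # clip saved via --skip-verify; no YOLO stats exist
    return verify_result.get('summary', {}).get('detection_rate')

assert detection_rate_or_none({'is_valid': True, 'skipped': True}) is None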
@@ -276,19 +283,17 @@ def parse_txt_line(line: str) -> Optional[Dict]:
      return None


- def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
+ def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
      """Run downloads from a TXT file (sequential)"""
-     # Ideally the existing logic would be split into a reusable process_single_item function,
-     # but given the code structure, keep sequential execution for now and implement a separate parallel function
-     return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count)
+     return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count, skip_verify=skip_verify)


- def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
+ def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
      """Run parallel downloads from a TXT file (Fast Mode)"""
-     return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count)
+     return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count, skip_verify=skip_verify)


- def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None) -> list:
+ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None, skip_verify: bool = False) -> list:
      from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
      import time
      import random
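Because the new parameter defaults to False, existing call sites keep working unchanged; opting out of verification is one keyword argument away. A hypothetical invocation (the file path and task name below are illustrative, and the TXT lines must be in whatever format parse_txt_line accepts):

from pathlib import Path

from ytcollector.downloader import download_from_txt

# Illustrative only: 'tasks/face.txt' and the 'face' task are assumptions,
# not fixtures shipped with the package.
results = download_from_txt(
    Path('tasks/face.txt'),
    'face',
    max_count=50,
    skip_verify=True,   # new in 1.0.8: save every clip without the YOLO pass
)
print(sum(1 for r in results if r.get('success')), 'clips saved')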
@@ -346,7 +351,8 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
              output_path, metadata = downloader.download_clip_at_timestamp(
                  url=data['url'],
                  timestamp_min=data['timestamp_min'],
-                 timestamp_sec=data['timestamp_sec']
+                 timestamp_sec=data['timestamp_sec'],
+                 skip_verify=skip_verify
              )

              if output_path is None:
@@ -414,38 +420,51 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
              if res.get('status') == 'limit_reached':
                  break

-     # --- Stage 2: YouTube search fallback when the target count is not reached ---
-     current_count = downloader.get_saved_video_count()
-     if current_count < limit:
+     # --- Stage 2: YouTube search fallback when the target count is not reached (repeated attempts) ---
+     max_search_attempts = 5  # maximum number of search attempts
+     search_attempt = 0
+     processed_urls = set(data['url'] for data in items)  # avoid re-searching URLs already processed
+
+     while downloader.get_saved_video_count() < limit and search_attempt < max_search_attempts:
+         current_count = downloader.get_saved_video_count()
          remaining = limit - current_count
-         logger.info(f"\nTarget not reached ({current_count}/{limit}). Starting YouTube Search fallback for '{task_type}'...")

-         # Search query: the task name (per-task search terms could be configured in config if needed)
-         search_results = downloader.search_youtube(task_type, max_results=remaining * 2)
+         logger.info(f"\n[Search Attempt {search_attempt+1}] Target not reached ({current_count}/{limit}). Searching YouTube for '{task_type}'...")
+
+         # Search query: the task name
+         # The number of search results can be grown or tuned incrementally
+         search_results = downloader.search_youtube(task_type, max_results=min(100, remaining * 5))

          if not search_results:
-             logger.warning("No search results found.")
-             return results
+             logger.warning("No more search results found.")
+             break

-         # Search results carry no timestamp info, so by default try the 1:00 or 0:00 mark of the video
-         # Here we try roughly the 1-minute mark as the target (0 if the video is short)
+         # Keep only URLs not seen before
+         new_entries = [e for e in search_results if e['url'] not in processed_urls]
+         if not new_entries:
+             logger.info("No new unique videos found in this search attempt.")
+             search_attempt += 1
+             continue
+
          search_items = []
-         for entry in search_results:
+         for entry in new_entries:
+             processed_urls.add(entry['url'])
+             # Search results have no timestamp info, so several points could be tried (currently fixed at the 1-minute mark)
              search_items.append({
                  'task_type': task_type,
                  'url': entry['url'],
-                 'timestamp_min': 1,  # try sampling at the 1-minute mark
+                 'timestamp_min': 1,
                  'timestamp_sec': 0,
                  'description': f"Auto-searched: {entry['title']}"
              })

-         logger.info(f"Processing {len(search_items)} search results...")
+         logger.info(f"Processing {len(search_items)} new search results...")

          if parallel:
              with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                  futures = [executor.submit(process_item, item) for item in search_items]
                  from tqdm import tqdm
-                 for future in tqdm(as_completed(futures), total=len(search_items), desc="Search Fallback"):
+                 for future in tqdm(as_completed(futures), total=len(search_items), desc=f"Search Fallback #{search_attempt+1}"):
                      try:
                          res = future.result()
                          results.append(res)
@@ -462,5 +481,13 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
                  results.append(res)
                  if res.get('status') == 'limit_reached':
                      break
+
+         search_attempt += 1
+
+     final_count = downloader.get_saved_video_count()
+     if final_count < limit:
+         logger.warning(f"Finished search attempts. Final count: {final_count}/{limit}")
+     else:
+         logger.info(f"Successfully reached target count: {final_count}/{limit}")

      return results
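The fallback rewrite is the largest behavioral change in 1.0.8: a single search pass becomes a bounded retry loop that de-duplicates URLs against everything already processed. Stripped of logging and the parallel branch, the control flow reduces to roughly the sketch below (the function names are simplified stand-ins, not the package's API):

# Distilled shape of the new fallback loop; saved_count/search/process stand in
# for downloader.get_saved_video_count, downloader.search_youtube, and the
# per-item download step.
def search_fallback(saved_count, limit, search, process, seed_urls, max_attempts=5):
    seen = set(seed_urls)                 # URLs already handled in stage 1
    attempt = 0
    while saved_count() < limit and attempt < max_attempts:
        remaining = limit - saved_count()
        batch = search(max_results=min(100, remaining * 5))
        if not batch:
            break                         # no results at all: give up early
        fresh = [e for e in batch if e['url'] not in seen]
        if not fresh:
            attempt += 1                  # only repeats this round: burn an attempt
            continue
        for entry in fresh:
            seen.add(entry['url'])
            process(entry)                # tries the fixed 1-minute timestamp
        attempt += 1
    return saved_count() >= limit         # True when the target was reached

The attempt cap matters because repeated identical queries tend to return the same URLs; without it, rounds that yield nothing new would loop indefinitely.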
@@ -136,7 +136,7 @@ class YOLOWorldVerifier:
              'frame_results': frame_results,
              'verified_at': datetime.now().isoformat(),
              'model': self.model_name,
-             'is_valid': detection_rate > 0.1,
+             'is_valid': detection_rate > 0.01,  # valid when more than 1% of frames have detections (lowered from the old 10%)
          }

          logger.info(
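The verifier change is a single constant, but it shifts acceptance considerably. Assuming detection_rate is the fraction of sampled frames containing a detection (which the surrounding frame_results bookkeeping suggests; the real frame count is internal to YOLOWorldVerifier), one hit in thirty frames now passes:

# Illustrative arithmetic only; 30 sampled frames is an assumption, not the
# verifier's actual sampling count.
frames_sampled = 30
frames_with_detection = 1
detection_rate = frames_with_detection / frames_sampled   # ~0.033

old_is_valid = detection_rate > 0.1    # False under 1.0.7: the clip was deleted
new_is_valid = detection_rate > 0.01   # True under 1.0.8: the clip is kept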
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ytcollector
- Version: 1.0.7
+ Version: 1.0.8
  Summary: SBS dataset collector
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown