PyPI - ytcollector - Versions diffs - 1.0.9.tar.gz → 1.1.1.tar.gz - Mend

ytcollector 1.0.9.tar.gz → 1.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

{ytcollector-1.0.9 → ytcollector-1.1.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ytcollector
-Version: 1.0.9
+Version: 1.1.1
 Summary: YouTube 콘텐츠 수집기 - 얼굴, 번호판, 타투, 텍스트 감지
 Author: YTCollector Team
 License: MIT
@@ -54,7 +54,7 @@ pip install opencv-python easyocr numpy
 ### 기본 실행
 ```bash
-python main.py
+ytcollector
 ```
 기본값: 얼굴 카테고리 5개, 최대 3분 영상
@@ -66,7 +66,7 @@ python main.py
 | `-c`, `--categories` | 수집할 카테고리 | `face` |
 | `-n`, `--count` | 카테고리당 다운로드 수 | `5` |
 | `-d`, `--duration` | 최대 영상 길이(분) | `3` |
-| `-o`, `--output` | 저장 경로 | `~/Downloads/youtube_collection` |
+| `-o`, `--output` | 저장 경로 | `.` (현재 폴더) |
 | `--fast` | 고속 모드 (병렬 다운로드) | 비활성화 |
 | `-w`, `--workers` | 병렬 다운로드 수 | `3` |
 | `--proxy` | 프록시 주소 | 없음 |
@@ -86,45 +86,45 @@ python main.py
 ```bash
 # 얼굴 영상 10개 수집
-python main.py -c face -n 10
+ytcollector -c face -n 10
 # 번호판 영상 수집 (최대 5분)
-python main.py -c license_plate -d 5
+ytcollector -c license_plate -d 5
 # 타투 영상 수집
-python main.py -c tattoo -n 5
+ytcollector -c tattoo -n 5
 ```
 ### 여러 카테고리
 ```bash
 # 얼굴과 텍스트 각 10개씩
-python main.py -c face text -n 10
+ytcollector -c face text -n 10
 # 모든 카테고리 수집
-python main.py -c face license_plate tattoo text -n 5
+ytcollector -c face license_plate tattoo text -n 5
 ```
 ### 고속 모드
 ```bash
 # 병렬 다운로드 (기본 3개 동시)
-python main.py -c face -n 10 --fast
+ytcollector -c face -n 10 --fast
 # 5개 동시 다운로드
-python main.py -c face -n 10 --fast -w 5
+ytcollector -c face -n 10 --fast -w 5
 ```
 ### 저장 경로 지정
 ```bash
-python main.py -c face -o /path/to/save
+ytcollector -c face -o /path/to/save
 ```
 ### 프록시 사용
 ```bash
-python main.py -c face --proxy http://proxy.server:8080
+ytcollector -c face --proxy http://proxy.server:8080
 ```
 ## SBS Dataset 구축 (URL 리스트 기반)

{ytcollector-1.0.9 → ytcollector-1.1.1}/README.md RENAMED Viewed

@@ -21,7 +21,7 @@ pip install opencv-python easyocr numpy
 ### 기본 실행
 ```bash
-python main.py
+ytcollector
 ```
 기본값: 얼굴 카테고리 5개, 최대 3분 영상
@@ -33,7 +33,7 @@ python main.py
 | `-c`, `--categories` | 수집할 카테고리 | `face` |
 | `-n`, `--count` | 카테고리당 다운로드 수 | `5` |
 | `-d`, `--duration` | 최대 영상 길이(분) | `3` |
-| `-o`, `--output` | 저장 경로 | `~/Downloads/youtube_collection` |
+| `-o`, `--output` | 저장 경로 | `.` (현재 폴더) |
 | `--fast` | 고속 모드 (병렬 다운로드) | 비활성화 |
 | `-w`, `--workers` | 병렬 다운로드 수 | `3` |
 | `--proxy` | 프록시 주소 | 없음 |
@@ -53,45 +53,45 @@ python main.py
 ```bash
 # 얼굴 영상 10개 수집
-python main.py -c face -n 10
+ytcollector -c face -n 10
 # 번호판 영상 수집 (최대 5분)
-python main.py -c license_plate -d 5
+ytcollector -c license_plate -d 5
 # 타투 영상 수집
-python main.py -c tattoo -n 5
+ytcollector -c tattoo -n 5
 ```
 ### 여러 카테고리
 ```bash
 # 얼굴과 텍스트 각 10개씩
-python main.py -c face text -n 10
+ytcollector -c face text -n 10
 # 모든 카테고리 수집
-python main.py -c face license_plate tattoo text -n 5
+ytcollector -c face license_plate tattoo text -n 5
 ```
 ### 고속 모드
 ```bash
 # 병렬 다운로드 (기본 3개 동시)
-python main.py -c face -n 10 --fast
+ytcollector -c face -n 10 --fast
 # 5개 동시 다운로드
-python main.py -c face -n 10 --fast -w 5
+ytcollector -c face -n 10 --fast -w 5
 ```
 ### 저장 경로 지정
 ```bash
-python main.py -c face -o /path/to/save
+ytcollector -c face -o /path/to/save
 ```
 ### 프록시 사용
 ```bash
-python main.py -c face --proxy http://proxy.server:8080
+ytcollector -c face --proxy http://proxy.server:8080
 ```
 ## SBS Dataset 구축 (URL 리스트 기반)

{ytcollector-1.0.9 → ytcollector-1.1.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "ytcollector"
-version = "1.0.9"
+version = "1.1.1"
 description = "YouTube 콘텐츠 수집기 - 얼굴, 번호판, 타투, 텍스트 감지"
 readme = "README.md"
 requires-python = ">=3.8"

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector/analyzer.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import re
+import threading
 from .config import LICENSE_PLATE_PATTERNS
 # 선택적 import
@@ -24,6 +25,8 @@ except ImportError:
 class VideoAnalyzer:
     """영상 분석 클래스 - 얼굴, 텍스트, 번호판, 타투 감지"""
+    _ocr_lock = threading.Lock()
     def __init__(self):
         self.ocr_reader = None
         self.face_cascade = None
@@ -33,10 +36,12 @@ class VideoAnalyzer:
             self.face_cascade = cv2.CascadeClassifier(cascade_path)
     def _init_ocr(self):
-        """OCR 리더 초기화 (필요할 때만)"""
+        """OCR 리더 초기화 (필요할 때만, 스레드 안전)"""
         if EASYOCR_AVAILABLE and self.ocr_reader is None:
-            print("  OCR 엔진 초기화 중...")
-            self.ocr_reader = easyocr.Reader(['ko', 'en'], gpu=False, verbose=False)
+            with self._ocr_lock:
+                if self.ocr_reader is None:
+                    print("  OCR 엔진 초기화 중...")
+                    self.ocr_reader = easyocr.Reader(['ko', 'en'], gpu=False, verbose=False)
     def extract_frames(self, video_path, num_frames=10):
         """영상에서 균등 간격으로 프레임 추출"""
@@ -75,7 +80,7 @@ class VideoAnalyzer:
         )
     def detect_text(self, frame):
-        """EasyOCR로 텍스트 감지"""
+        """EasyOCR로 텍스트 감지 (스레드 안전)"""
         if not EASYOCR_AVAILABLE:
             return []
@@ -86,7 +91,8 @@ class VideoAnalyzer:
                 scale = 640 / w
                 frame = cv2.resize(frame, (640, int(h * scale)))
-            results = self.ocr_reader.readtext(frame)
+            with self._ocr_lock:
+                results = self.ocr_reader.readtext(frame)
             return [r[1] for r in results if r[2] > 0.3]
         except:
             return []
@@ -151,38 +157,66 @@ class VideoAnalyzer:
             'license_plate': False,
             'tattoo': False,
             'face_count': 0,
-            'detected_texts': []
+            'detected_texts': [],
+            'first_detection_sec': None,
+            'first_detection_ts': None
         }
         if not CV2_AVAILABLE:
             print("  ⚠ OpenCV 미설치")
             return results
-        frames = self.extract_frames(video_path, num_frames=8)
-        if not frames:
-            print("  ⚠ 프레임 추출 실패")
+        # 영상 정보 가져오기
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
             return results
+        fps = cap.get(cv2.CAP_PROP_FPS) or 30
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        cap.release()
+        num_analysis_frames = 10
+        frame_indices = [int(i * total_frames / (num_analysis_frames + 1)) for i in range(1, num_analysis_frames + 1)]
         all_texts = []
         total_faces = 0
-        for i, frame in enumerate(frames):
+        cap = cv2.VideoCapture(video_path)
+        for idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+            ret, frame = cap.read()
+            if not ret: continue
+            # 현재 프레임의 시간(초)
+            current_sec = idx / fps
+            detected_now = False
             # 얼굴
             faces = self.detect_faces(frame)
             if len(faces) > 0:
                 results['face'] = True
                 total_faces += len(faces)
+                detected_now = True
-            # 텍스트 (일부 프레임만)
-            if i % 2 == 0 and EASYOCR_AVAILABLE:
-                texts = self.detect_text(frame)
-                if texts:
-                    results['text'] = True
-                    all_texts.extend(texts)
+            # 텍스트
+            texts = self.detect_text(frame)
+            if texts:
+                results['text'] = True
+                all_texts.extend(texts)
+                detected_now = True
             # 타투
             if self.detect_tattoo(frame):
                 results['tattoo'] = True
+                detected_now = True
+            # 첫 감지 시점 기록
+            if detected_now and results['first_detection_sec'] is None:
+                results['first_detection_sec'] = current_sec
+                m, s = int(current_sec // 60), int(current_sec % 60)
+                results['first_detection_ts'] = f"{m:02d}:{s:02d}"
+        cap.release()
         # 번호판 (텍스트에서)
         if all_texts:

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector/cli.py RENAMED Viewed

@@ -51,8 +51,8 @@ def create_parser():
     parser.add_argument(
         '-o', '--output',
         type=str,
-        default=os.path.expanduser("~/youtube"),
-        help='저장 경로 (기본: ~/youtube)'
+        default=".",
+        help='저장 경로 (기본: 현재 폴더)'
     )
     parser.add_argument(
         '--fast',
@@ -74,7 +74,7 @@ def create_parser():
     parser.add_argument(
         '-v', '--version',
         action='version',
-        version='%(prog)s 1.0.9'
+        version='%(prog)s 1.1.1'
     )
     parser.add_argument(
         '--check-deps',
@@ -115,7 +115,7 @@ def run(
         categories = ['face']
     if output is None:
-        output = os.path.expanduser("~/youtube")
+        output = "."
     # 의존성 체크
     missing = check_dependencies()

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector/config.py RENAMED Viewed

@@ -55,6 +55,20 @@ CATEGORY_NAMES = {
     'text': '텍스트'
 }
+# 카테고리별 제외 키워드 (제목에 포함 시 스킵)
+BLACKLIST_KEYWORDS = {
+    'tattoo': [
+        "두피 문신", "두피문신",
+        "눈썹 문신", "눈썹문신",
+        "입술 문신", "입술문신",
+        "틴트 입술",
+        "반영구", "SMP"
+    ],
+    'face': [],
+    'license_plate': [],
+    'text': []
+}
 # 번호판 정규식 패턴
 LICENSE_PLATE_PATTERNS = [
     r'\d{2,3}[가-힣]\d{4}',

ytcollector-1.1.1/ytcollector/dataset_builder.py ADDED Viewed

@@ -0,0 +1,71 @@
+import os
+import subprocess
+from yt_dlp import YoutubeDL
+from .utils import clip_video, get_url_list, get_video_duration, timestamp_to_seconds
+def download_videos(url_list, output_dir):
+    os.makedirs(output_dir, exist_ok=True)
+    for idx, item in enumerate(url_list, 1):
+        url = item['url']
+        task = item['task']
+        index_str = f"{idx:03d}"
+        existing_files = [f for f in os.listdir(output_dir) if f.startswith(f"{index_str}_")]
+        if existing_files:
+            print(f"[{index_str}] Skip: {existing_files[0]}")
+            continue
+        print(f"[{index_str}] Downloading: {url} ({task})")
+        try:
+            ydl_opts = {
+                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+                'outtmpl': os.path.join(output_dir, f"{index_str}_{task}_%(title)s.%(ext)s"),
+                'quiet': True,
+                'no_warnings': True,
+            }
+            with YoutubeDL(ydl_opts) as ydl:
+                ydl.download([url])
+        except Exception as e:
+            print(f"[{index_str}] Failed: {e}")
+def build_dataset(url_file, output_root="."):
+    video_dir = os.path.abspath(os.path.join(output_root, "video"))
+    clip_dir = os.path.abspath(os.path.join(output_root, "video_clips"))
+    urls = get_url_list(url_file)
+    if not urls:
+        print(f"Error: No valid data in {url_file}")
+        return
+    print(f"--- Step 1: Downloading {len(urls)} videos ---")
+    download_videos(urls, video_dir)
+    print(f"\n--- Step 2: Clipping videos ---")
+    os.makedirs(clip_dir, exist_ok=True)
+    for idx, item in enumerate(urls, 1):
+        index_str = f"{idx:03d}"
+        files = [f for f in os.listdir(video_dir) if f.startswith(f"{index_str}_")]
+        if not files: continue
+        input_file = os.path.join(video_dir, files[0])
+        output_file = os.path.join(clip_dir, files[0])
+        if os.path.exists(output_file): continue
+        print(f"[{index_str}] Clipping: {files[0]}")
+        center_sec = timestamp_to_seconds(item['timestamp'])
+        clip_video(input_file, output_file, center_sec)
+    print(f"\nDone! Clips saved in: {clip_dir}")
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(description='Build SBS Dataset from YouTube URL list')
+    parser.add_argument('file', help='Path to youtube_url.txt')
+    parser.add_argument('-o', '--output', default='.', help='Output root directory (default: .)')
+    args = parser.parse_args()
+    build_dataset(args.file, args.output)
+if __name__ == "__main__":
+    main()

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector/downloader.py RENAMED Viewed

@@ -2,20 +2,24 @@ import os
 import time
 import random
 import shutil
+import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from yt_dlp import YoutubeDL
-from .config import USER_AGENTS, CATEGORY_QUERIES, CATEGORY_NAMES, SKIP_ERRORS
+from .config import USER_AGENTS, CATEGORY_QUERIES, CATEGORY_NAMES, SKIP_ERRORS, BLACKLIST_KEYWORDS
 from .analyzer import VideoAnalyzer
+from .utils import clip_video, append_to_url_list, get_video_duration, get_next_index
 class YouTubeDownloader:
     """YouTube 다운로더 클래스"""
+    _file_lock = threading.Lock()
     def __init__(self, output_path, max_duration=180, proxy=None, fast_mode=False, workers=3):
         self.output_path = output_path
-        self.max_duration = max_duration
+        self.max_duration = max_duration # 기본 180초(3분)
         self.proxy = proxy
         self.fast_mode = fast_mode
         self.workers = workers
@@ -146,7 +150,7 @@ class YouTubeDownloader:
             return None
     def _process_video(self, entry, category, cat_name):
-        """단일 영상 처리 (다운로드 + 분석)"""
+        """단일 영상 처리 (다운로드 + 분석 + 자동 트리밍 + URL 기록)"""
         vid = entry.get('id')
         url = f"https://www.youtube.com/watch?v={vid}"
         title = entry.get('title', '?')[:45]
@@ -156,6 +160,7 @@ class YouTubeDownloader:
         result_info = {'title': title, 'status': status, 'saved': False}
         if status == "ok" and filepath:
+            print(f"  🔍 분석 중...")
             analysis = self.analyzer.analyze(filepath)
             detected = []
@@ -171,12 +176,34 @@ class YouTubeDownloader:
             result_info['detected'] = detected
             if analysis.get(category):
+                # 1. 태스크별 전용 youtube_url_{category}.txt 업데이트
+                url_file_path = f"youtube_url_{category}.txt"
+                ts = analysis.get('first_detection_ts', '00:00')
+                append_to_url_list(url_file_path, url, ts, category)
+                # 2. 결과 폴더 이동 및 파일명 변경 (category_0001.mp4 형식)
                 dest_dir = os.path.join(self.output_path, cat_name)
                 os.makedirs(dest_dir, exist_ok=True)
-                dest = os.path.join(dest_dir, os.path.basename(filepath))
-                if not os.path.exists(dest):
-                    shutil.move(filepath, dest)
+                # 파일명 접두어 결정 (license_plate -> license)
+                prefix = category.replace('license_plate', 'license')
+                with self._file_lock:
+                    idx = get_next_index(dest_dir, prefix)
+                    new_filename = f"{prefix}_{idx:04d}.mp4"
+                    dest = os.path.join(dest_dir, new_filename)
+                # 원본 길이가 3분(180초) 초과면 감지 시점 기준 트리밍
+                duration = get_video_duration(filepath)
+                if duration > 180:
+                    print(f"  ✂ 3분 초과 영상 자동 트리밍 ({self._format_duration(duration)} -> 3:00)")
+                    clip_video(filepath, dest, analysis.get('first_detection_sec', 0))
+                else:
+                    if not os.path.exists(dest):
+                        shutil.move(filepath, dest)
                 result_info['saved'] = True
+                result_info['new_path'] = dest
             else:
                 if category == 'license_plate':
                     dest_dir = os.path.join(self.output_path, "번호판_미감지")
@@ -201,7 +228,9 @@ class YouTubeDownloader:
         print(f"\n{'='*60}")
         print(f"[{cat_name}] 검색: {query}")
         mode = "⚡ 고속" if self.fast_mode else "일반"
-        print(f"목표: {max_videos}개 | 최대길이: {self._format_duration(self.max_duration)} | {mode}")
+        # 검색 시에는 제한을 20분(1200초)으로 완화하여 더 많은 영상 확보
+        search_limit = 1200
+        print(f"목표: {max_videos}개 | 검색제한: {self._format_duration(search_limit)} | {mode}")
         print('='*60)
         # 검색
@@ -212,23 +241,29 @@ class YouTubeDownloader:
         print(f"검색됨: {len(entries)}개")
-        # 길이 필터링
+        # 필터링
         filtered = []
         for entry in entries:
-            if not entry:
-                continue
+            if not entry: continue
             vid = entry.get('id')
-            title = entry.get('title', '?')[:40]
+            title = entry.get('title', '')
             dur = entry.get('duration') or self._get_duration(vid)
-            if dur and dur < self.max_duration:
+            # 블랙리스트 키워드 체크
+            blacklist = BLACKLIST_KEYWORDS.get(category, [])
+            if any(kw in title for kw in blacklist):
+                print(f"  ✗ [제외] {title[:40]}...")
+                continue
+            # 너무 긴 영상(예: 20분 초과) 제외
+            if dur and dur < search_limit:
                 filtered.append(entry)
                 print(f"  ✓ [{self._format_duration(dur)}] {title}")
                 if len(filtered) >= max_videos:
                     break
             elif dur:
-                print(f"  ✗ [{self._format_duration(dur)}] {title}")
+                print(f"  ✗ [{self._format_duration(dur)}] (너무 filter됨)")
             if not self.fast_mode:
                 time.sleep(0.3)
@@ -237,30 +272,27 @@ class YouTubeDownloader:
             print("조건 맞는 영상 없음")
             return 0
-        print(f"\n다운로드: {len(filtered)}개" + (" (병렬)" if self.fast_mode else ""))
+        print(f"\n다운로드 및 분석: {len(filtered)}개" + (" (병렬)" if self.fast_mode else ""))
         success = 0
         if self.fast_mode and self.workers > 1:
-            # 병렬 다운로드
             with ThreadPoolExecutor(max_workers=self.workers) as executor:
                 futures = {
                     executor.submit(self._process_video, entry, category, cat_name): entry
                     for entry in filtered
                 }
                 for i, future in enumerate(as_completed(futures)):
                     entry = futures[future]
                     title = entry.get('title', '?')[:45]
                     try:
                         result = future.result()
                         print(f"\n[{i+1}/{len(filtered)}] {title}")
                         if result['status'] == "ok":
                             if result.get('detected'):
                                 print(f"  감지: {', '.join(result['detected'])}")
                             if result['saved']:
-                                print(f"  ✅ 저장: {cat_name}/")
+                                new_name = os.path.basename(result['new_path'])
+                                print(f"  ✅ 저장: {cat_name}/{new_name}")
                                 success += 1
                             elif result.get('undetected_saved'):
                                 print("  📁 미감지 보관")
@@ -276,61 +308,26 @@ class YouTubeDownloader:
                         print(f"\n[{i+1}/{len(filtered)}] {title}")
                         print(f"  ✗ 에러: {e}")
         else:
-            # 순차 다운로드
             for i, entry in enumerate(filtered):
                 vid = entry.get('id')
-                url = f"https://www.youtube.com/watch?v={vid}"
                 title = entry.get('title', '?')[:45]
                 print(f"\n[{i+1}/{len(filtered)}] {title}")
-                status, filepath, _ = self._download_one(url)
-                if not self.fast_mode:
-                    print()
-                if status == "ok" and filepath:
-                    print("  🔍 분석...")
-                    result = self.analyzer.analyze(filepath)
-                    detected = []
-                    if result['face']:
-                        detected.append(f"얼굴({result['face_count']})")
-                    if result['text']:
-                        detected.append("텍스트")
-                    if result['license_plate']:
-                        detected.append("번호판")
-                    if result['tattoo']:
-                        detected.append("타투")
-                    if detected:
-                        print(f"  감지: {', '.join(detected)}")
-                    if result.get(category):
-                        dest_dir = os.path.join(self.output_path, cat_name)
-                        os.makedirs(dest_dir, exist_ok=True)
-                        dest = os.path.join(dest_dir, os.path.basename(filepath))
-                        if not os.path.exists(dest):
-                            shutil.move(filepath, dest)
-                        print(f"  ✅ 저장: {cat_name}/")
+                result = self._process_video(entry, category, cat_name)
+                if result['status'] == "ok":
+                    if result.get('detected'):
+                        print(f"  감지: {', '.join(result['detected'])}")
+                    if result['saved']:
+                        new_name = os.path.basename(result['new_path'])
+                        print(f"  ✅ 저장: {cat_name}/{new_name}")
                         success += 1
+                    elif result.get('undetected_saved'):
+                        print("  📁 미감지 보관")
                     else:
-                        if category == 'license_plate':
-                            dest_dir = os.path.join(self.output_path, "번호판_미감지")
-                            os.makedirs(dest_dir, exist_ok=True)
-                            dest = os.path.join(dest_dir, os.path.basename(filepath))
-                            if not os.path.exists(dest):
-                                shutil.move(filepath, dest)
-                            print("  📁 미감지 보관")
-                        else:
-                            try:
-                                os.remove(filepath)
-                            except:
-                                pass
-                            print("  ❌ 미감지 삭제")
-                elif status == "skipped":
+                        print("  ❌ 미감지 삭제")
+                elif result['status'] == "skipped":
                     print("  ⏭ 이미 있음")
-                elif status == "unavailable":
+                elif result['status'] == "unavailable":
                     print("  ⏭ 사용불가")
                 else:
                     print("  ✗ 실패")

ytcollector-1.1.1/ytcollector/utils.py ADDED Viewed

@@ -0,0 +1,126 @@
+import os
+import subprocess
+def get_video_duration(file_path):
+    """영상 전체 길이를 초 단위로 반환"""
+    cmd = [
+        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
+        '-of', 'default=noprint_wrappers=1:nokey=1', file_path
+    ]
+    try:
+        output = subprocess.check_output(cmd).decode('utf-8').strip()
+        return float(output)
+    except:
+        return 0.0
+def timestamp_to_seconds(timestamp):
+    """MM:SS 또는 SS 형식을 초 단위로 변환"""
+    if isinstance(timestamp, (int, float)):
+        return float(timestamp)
+    try:
+        parts = str(timestamp).split(':')
+        if len(parts) == 2:
+            return int(parts[0]) * 60 + int(parts[1])
+        return float(parts[0])
+    except:
+        return 0.0
+def seconds_to_timestamp(seconds):
+    """초 단위를 MM:SS 형식으로 변환"""
+    m = int(seconds // 60)
+    s = int(seconds % 60)
+    return f"{m:02d}:{s:02d}"
+def clip_video(input_path, output_path, center_sec, window_seconds=90):
+    """center_sec를 기준으로 앞뒤 window_seconds만큼 자름"""
+    duration = get_video_duration(input_path)
+    if duration == 0:
+        return False
+    start_sec = max(0, center_sec - window_seconds)
+    end_sec = min(duration, start_sec + (window_seconds * 2))
+    if (end_sec - start_sec) < (window_seconds * 2) and start_sec > 0:
+        start_sec = max(0, end_sec - (window_seconds * 2))
+    actual_duration = end_sec - start_sec
+    # 임시 파일 경로
+    temp_output = output_path + ".tmp.mp4"
+    cmd = [
+        'ffmpeg', '-y', '-ss', f"{start_sec:.2f}", '-t', f"{actual_duration:.2f}",
+        '-i', input_path, '-c', 'copy', temp_output
+    ]
+    try:
+        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        if os.path.exists(output_path):
+            os.remove(output_path)
+        os.rename(temp_output, output_path)
+        return True
+    except:
+        # copy 실패 시 재인코딩
+        cmd[7:9] = ['-c:v', 'libx264', '-crf', '23', '-c:a', 'aac']
+        try:
+            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            if os.path.exists(temp_output):
+                if os.path.exists(output_path): os.remove(output_path)
+                os.rename(temp_output, output_path)
+            return True
+        except:
+            if os.path.exists(temp_output): os.remove(temp_output)
+            return False
+def append_to_url_list(file_path, url, timestamp, task):
+    """youtube_url.txt에 데이터 추가"""
+    line = f"{url}, {timestamp}, {task}\n"
+    # 파일이 없으면 헤더 추가
+    exists = os.path.exists(file_path)
+    with open(file_path, 'a', encoding='utf-8') as f:
+        if not exists:
+            f.write("# URL, MM:SS, TaskName\n")
+        f.write(line)
+def get_url_list(file_path):
+    """youtube_url.txt 파일을 읽어서 리스트로 반환"""
+    if not os.path.exists(file_path):
+        return []
+    urls = []
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            parts = [p.strip() for p in line.split(',')]
+            if len(parts) >= 3:
+                urls.append({
+                    'url': parts[0],
+                    'timestamp': parts[1],
+                    'task': parts[2]
+                })
+    return urls
+def get_next_index(directory, prefix):
+    """
+    directory 내에서 {prefix}_{index:04d}.mp4 형식의 파일들을 찾아
+    가장 높은 index + 1을 반환함. 파일이 없으면 1 반환.
+    """
+    if not os.path.exists(directory):
+        return 1
+    max_idx = 0
+    pattern = f"{prefix}_"
+    for filename in os.listdir(directory):
+        if filename.startswith(pattern) and filename.endswith(".mp4"):
+            try:
+                # {prefix}_0001.mp4 -> 0001 추출
+                idx_part = filename[len(pattern):].split('.')[0]
+                idx = int(idx_part)
+                if idx > max_idx:
+                    max_idx = idx
+            except (ValueError, IndexError):
+                continue
+    return max_idx + 1

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ytcollector
-Version: 1.0.9
+Version: 1.1.1
 Summary: YouTube 콘텐츠 수집기 - 얼굴, 번호판, 타투, 텍스트 감지
 Author: YTCollector Team
 License: MIT
@@ -54,7 +54,7 @@ pip install opencv-python easyocr numpy
 ### 기본 실행
 ```bash
-python main.py
+ytcollector
 ```
 기본값: 얼굴 카테고리 5개, 최대 3분 영상
@@ -66,7 +66,7 @@ python main.py
 | `-c`, `--categories` | 수집할 카테고리 | `face` |
 | `-n`, `--count` | 카테고리당 다운로드 수 | `5` |
 | `-d`, `--duration` | 최대 영상 길이(분) | `3` |
-| `-o`, `--output` | 저장 경로 | `~/Downloads/youtube_collection` |
+| `-o`, `--output` | 저장 경로 | `.` (현재 폴더) |
 | `--fast` | 고속 모드 (병렬 다운로드) | 비활성화 |
 | `-w`, `--workers` | 병렬 다운로드 수 | `3` |
 | `--proxy` | 프록시 주소 | 없음 |
@@ -86,45 +86,45 @@ python main.py
 ```bash
 # 얼굴 영상 10개 수집
-python main.py -c face -n 10
+ytcollector -c face -n 10
 # 번호판 영상 수집 (최대 5분)
-python main.py -c license_plate -d 5
+ytcollector -c license_plate -d 5
 # 타투 영상 수집
-python main.py -c tattoo -n 5
+ytcollector -c tattoo -n 5
 ```
 ### 여러 카테고리
 ```bash
 # 얼굴과 텍스트 각 10개씩
-python main.py -c face text -n 10
+ytcollector -c face text -n 10
 # 모든 카테고리 수집
-python main.py -c face license_plate tattoo text -n 5
+ytcollector -c face license_plate tattoo text -n 5
 ```
 ### 고속 모드
 ```bash
 # 병렬 다운로드 (기본 3개 동시)
-python main.py -c face -n 10 --fast
+ytcollector -c face -n 10 --fast
 # 5개 동시 다운로드
-python main.py -c face -n 10 --fast -w 5
+ytcollector -c face -n 10 --fast -w 5
 ```
 ### 저장 경로 지정
 ```bash
-python main.py -c face -o /path/to/save
+ytcollector -c face -o /path/to/save
 ```
 ### 프록시 사용
 ```bash
-python main.py -c face --proxy http://proxy.server:8080
+ytcollector -c face --proxy http://proxy.server:8080
 ```
 ## SBS Dataset 구축 (URL 리스트 기반)

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector.egg-info/SOURCES.txt RENAMED Viewed

@@ -6,6 +6,7 @@ ytcollector/cli.py
 ytcollector/config.py
 ytcollector/dataset_builder.py
 ytcollector/downloader.py
+ytcollector/utils.py
 ytcollector.egg-info/PKG-INFO
 ytcollector.egg-info/SOURCES.txt
 ytcollector.egg-info/dependency_links.txt

ytcollector-1.0.9/ytcollector/dataset_builder.py DELETED Viewed

@@ -1,136 +0,0 @@
-import os
-import subprocess
-from yt_dlp import YoutubeDL
-def get_url_list(file_path):
-    if not os.path.exists(file_path):
-        return []
-    urls = []
-    with open(file_path, 'r', encoding='utf-8') as f:
-        for line in f:
-            line = line.strip()
-            if not line or line.startswith('#'):
-                continue
-            parts = [p.strip() for p in line.split(',')]
-            if len(parts) >= 3:
-                urls.append({
-                    'url': parts[0],
-                    'timestamp': parts[1],
-                    'task': parts[2]
-                })
-    return urls
-def download_videos(url_list, output_dir):
-    os.makedirs(output_dir, exist_ok=True)
-    for idx, item in enumerate(url_list, 1):
-        url = item['url']
-        task = item['task']
-        index_str = f"{idx:03d}"
-        existing_files = [f for f in os.listdir(output_dir) if f.startswith(f"{index_str}_")]
-        if existing_files:
-            print(f"[{index_str}] Skip: {existing_files[0]}")
-            continue
-        print(f"[{index_str}] Downloading: {url} ({task})")
-        try:
-            ydl_opts = {
-                'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
-                'outtmpl': os.path.join(output_dir, f"{index_str}_{task}_%(title)s.%(ext)s"),
-                'quiet': True,
-                'no_warnings': True,
-            }
-            with YoutubeDL(ydl_opts) as ydl:
-                ydl.download([url])
-        except Exception as e:
-            print(f"[{index_str}] Failed: {e}")
-def get_video_duration(file_path):
-    cmd = [
-        'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
-        '-of', 'default=noprint_wrappers=1:nokey=1', file_path
-    ]
-    try:
-        output = subprocess.check_output(cmd).decode('utf-8').strip()
-        return float(output)
-    except:
-        return 0.0
-def timestamp_to_seconds(timestamp):
-    try:
-        parts = timestamp.split(':')
-        if len(parts) == 2:
-            return int(parts[0]) * 60 + int(parts[1])
-        return 0.0
-    except:
-        return 0.0
-def clip_video(input_path, output_path, center_timestamp, window_seconds=90):
-    duration = get_video_duration(input_path)
-    if duration == 0: return False
-    center_sec = timestamp_to_seconds(center_timestamp)
-    start_sec = max(0, center_sec - window_seconds)
-    end_sec = min(duration, start_sec + (window_seconds * 2))
-    if (end_sec - start_sec) < (window_seconds * 2) and start_sec > 0:
-        start_sec = max(0, end_sec - (window_seconds * 2))
-    actual_duration = end_sec - start_sec
-    cmd = [
-        'ffmpeg', '-y', '-ss', str(start_sec), '-t', str(actual_duration),
-        '-i', input_path, '-c', 'copy', output_path
-    ]
-    try:
-        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-        return True
-    except:
-        cmd[7:9] = ['-c:v', 'libx264', '-crf', '23', '-c:a', 'aac']
-        try:
-            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-            return True
-        except:
-            return False
-def build_dataset(url_file, output_root="."):
-    video_dir = os.path.join(output_root, "video")
-    clip_dir = os.path.join(output_root, "video_clips")
-    urls = get_url_list(url_file)
-    if not urls:
-        print(f"Error: No valid data in {url_file}")
-        return
-    print(f"--- Step 1: Downloading {len(urls)} videos ---")
-    download_videos(urls, video_dir)
-    print(f"\n--- Step 2: Clipping videos ---")
-    os.makedirs(clip_dir, exist_ok=True)
-    for idx, item in enumerate(urls, 1):
-        index_str = f"{idx:03d}"
-        files = [f for f in os.listdir(video_dir) if f.startswith(f"{index_str}_")]
-        if not files: continue
-        input_file = os.path.join(video_dir, files[0])
-        output_file = os.path.join(clip_dir, files[0])
-        if os.path.exists(output_file): continue
-        print(f"[{index_str}] Clipping: {files[0]}")
-        clip_video(input_file, output_file, item['timestamp'])
-    print(f"\nDone! Clips saved in: {os.path.abspath(clip_dir)}")
-def main():
-    import argparse
-    parser = argparse.ArgumentParser(description='Build SBS Dataset from YouTube URL list')
-    parser.add_argument('file', help='Path to youtube_url.txt')
-    parser.add_argument('-o', '--output', default='.', help='Output root directory (default: .)')
-    args = parser.parse_args()
-    build_dataset(args.file, args.output)
-if __name__ == "__main__":
-    main()

{ytcollector-1.0.9 → ytcollector-1.1.1}/setup.cfg RENAMED Viewed

File without changes

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector/__init__.py RENAMED Viewed

File without changes

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector.egg-info/entry_points.txt RENAMED Viewed

File without changes

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector.egg-info/requires.txt RENAMED Viewed

File without changes

{ytcollector-1.0.9 → ytcollector-1.1.1}/ytcollector.egg-info/top_level.txt RENAMED Viewed

File without changes

ytcollector 1.0.9.tar.gz → 1.1.1.tar.gz

ytcollector 1.0.9.tar.gz → 1.1.1.tar.gz