ytcollector 1.0.7__tar.gz → 1.0.8__tar.gz

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ytcollector
- Version: 1.0.7
+ Version: 1.0.8
  Summary: SBS dataset collector
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "ytcollector"
- version = "1.0.7"
+ version = "1.0.8"
  description = "SBS dataset collector"
  readme = "README.md"
  requires-python = ">=3.9"
@@ -3,7 +3,7 @@ SBS Dataset Collector - YouTube video collection and YOLO-World verification pipeline
  """
  from pathlib import Path

- __version__ = "1.0.7"
+ __version__ = "1.0.8"
  __author__ = "SBS Dataset Team"

  # Package root directory
@@ -69,9 +69,9 @@ def run_download(args):

      if args.fast:
          from .downloader import download_from_txt_parallel
-         results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count)
+         results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
      else:
-         results = download_from_txt(txt_file, task, base_dir, max_count=args.count)
+         results = download_from_txt(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)

      success_count = sum(1 for r in results if r.get('success'))
      total_success += success_count
@@ -189,6 +189,7 @@ Examples:
      download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
      download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
      download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
+     download_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip YOLO verification and save all clips')

      # Download single
      single_parser = subparsers.add_parser('download-single', help='Download single video')
@@ -206,6 +207,7 @@ Examples:
      pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
      pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
      pipeline_parser.add_argument('--verify', action='store_true')
+     pipeline_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip verification in download stage')

      # List tasks
      subparsers.add_parser('list-tasks', help='List available tasks')
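For readers tracing how the new flag reaches the downloader: argparse derives the attribute name from the long option, so `--skip-verify` surfaces as `args.skip_verify`, which `run_download` forwards in the earlier hunk. A minimal, self-contained sketch of that wiring, not the package's actual cli module:

import argparse

# Simplified stand-in for the CLI above; only the options relevant to this diff.
parser = argparse.ArgumentParser(prog='ytcollector')
subparsers = parser.add_subparsers(dest='command')
download_parser = subparsers.add_parser('download')
download_parser.add_argument('--task', '-t', required=True, nargs='+')
download_parser.add_argument('--fast', action='store_true')
download_parser.add_argument('--skip-verify', '-S', action='store_true')

args = parser.parse_args(['download', '--task', 'face', '--skip-verify'])
assert args.skip_verify is True   # hyphens become underscores in the dest name
assert args.fast is False         # store_true flags default to False when omitted

Reusing the same `-S` short option on both subcommands keeps the flag consistent between `download` and `pipeline`.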
@@ -145,7 +145,8 @@ class VideoDownloader:
          self,
          url: str,
          timestamp_min: int,
-         timestamp_sec: int
+         timestamp_sec: int,
+         skip_verify: bool = False
      ) -> Tuple[Optional[Path], Optional[dict]]:
          """
          Download a ±1:30 clip around the given timestamp
@@ -207,18 +208,24 @@ class VideoDownloader:
          try:
              self.download_segment(url, start_sec, end_sec, temp_path)

-             # 5. YOLO verification
-             logger.info(f"Verifying content for task: {self.task_type}...")
-             # Verify using the verifier module
-             verify_result = verify_clip(temp_path, self.task_type, self.base_dir)
-
-             if not verify_result.get('is_valid', False):
-                 logger.warning(f"Verification failed: No {self.task_type} detected. Deleting...")
-                 if temp_path.exists():
-                     temp_path.unlink()
-                 return None, None
+             # 5. YOLO verification (runs only when skip_verify is False)
+             if not skip_verify:
+                 logger.info(f"Verifying content for task: {self.task_type}...")
+                 # Verify using the verifier module
+                 verify_result = verify_clip(temp_path, self.task_type, self.base_dir)
+
+                 logger.info(f"Verification Info - Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}, Is Valid: {verify_result.get('is_valid')}")
+
+                 if not verify_result.get('is_valid', False):
+                     logger.warning(f"Verification failed: No {self.task_type} detected (Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}). Deleting...")
+                     if temp_path.exists():
+                         temp_path.unlink()
+                     return None, None
+             else:
+                 logger.info(f"Skipping verification for task: {self.task_type} (--skip-verify enabled)")
+                 verify_result = {'is_valid': True, 'skipped': True}

-             # 6. Verification passed -> final save (sequential filename generation)
+             # 6. Verification passed (or skipped) -> final save (sequential filename generation)
              final_path = get_clip_path(self.base_dir, self.task_type, filename=None)

              # Move (use shutil.move for network drives)
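Callers should note that the skip path substitutes a sentinel dict for real detection stats. A hedged sketch of how downstream code might branch on it; the detection_rate_or_none helper is hypothetical, not part of the package. (The new Rate log line also formats the rate with :.2%, which presumes the verifier always returns a number there.)

# Hypothetical helper (not in ytcollector): distinguish skipped clips from
# verified ones before trusting detection statistics.
def detection_rate_or_none(verify_result: dict):
    if verify_result.get('skipped'):
        return None  # clip saved via --skip-verify; no YOLO stats exist
    return verify_result.get('summary', {}).get('detection_rate')

assert detection_rate_or_none({'is_valid': True, 'skipped': True}) is None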
@@ -276,19 +283,17 @@ def parse_txt_line(line: str) -> Optional[Dict]:
      return None


- def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
+ def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
      """Run downloads from a TXT file (sequential)"""
-     # Ideally the existing logic would be split into a reusable process_single_item function,
-     # but given the code structure, keep sequential execution for now and implement a separate parallel function
-     return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count)
+     return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count, skip_verify=skip_verify)


- def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
+ def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
      """Run parallel downloads from a TXT file (Fast Mode)"""
-     return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count)
+     return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count, skip_verify=skip_verify)


- def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None) -> list:
+ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None, skip_verify: bool = False) -> list:
      from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
      import time
      import random
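Because the new parameter defaults to False, existing call sites keep working unchanged; opting out of verification is one keyword argument away. A hypothetical invocation (the file path and task name below are illustrative, and the TXT lines must be in whatever format parse_txt_line accepts):

from pathlib import Path

from ytcollector.downloader import download_from_txt

# Illustrative only: 'tasks/face.txt' and the 'face' task are assumptions,
# not fixtures shipped with the package.
results = download_from_txt(
    Path('tasks/face.txt'),
    'face',
    max_count=50,
    skip_verify=True,   # new in 1.0.8: save every clip without the YOLO pass
)
print(sum(1 for r in results if r.get('success')), 'clips saved')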
@@ -346,7 +351,8 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
              output_path, metadata = downloader.download_clip_at_timestamp(
                  url=data['url'],
                  timestamp_min=data['timestamp_min'],
-                 timestamp_sec=data['timestamp_sec']
+                 timestamp_sec=data['timestamp_sec'],
+                 skip_verify=skip_verify
              )

              if output_path is None:
@@ -414,38 +420,51 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
              if res.get('status') == 'limit_reached':
                  break

-     # --- Stage 2: YouTube search fallback when the target count is not reached ---
-     current_count = downloader.get_saved_video_count()
-     if current_count < limit:
+     # --- Stage 2: YouTube search fallback when the target count is not reached (repeated attempts) ---
+     max_search_attempts = 5  # maximum number of search attempts
+     search_attempt = 0
+     processed_urls = set(data['url'] for data in items)  # avoid re-searching URLs already processed
+
+     while downloader.get_saved_video_count() < limit and search_attempt < max_search_attempts:
+         current_count = downloader.get_saved_video_count()
          remaining = limit - current_count
-         logger.info(f"\nTarget not reached ({current_count}/{limit}). Starting YouTube Search fallback for '{task_type}'...")

-         # Search query: the task name (per-task search terms could be configured in config if needed)
-         search_results = downloader.search_youtube(task_type, max_results=remaining * 2)
+         logger.info(f"\n[Search Attempt {search_attempt+1}] Target not reached ({current_count}/{limit}). Searching YouTube for '{task_type}'...")
+
+         # Search query: the task name
+         # The number of search results can be grown or tuned incrementally
+         search_results = downloader.search_youtube(task_type, max_results=min(100, remaining * 5))

          if not search_results:
-             logger.warning("No search results found.")
-             return results
+             logger.warning("No more search results found.")
+             break

-         # Search results carry no timestamp info, so by default try the 1:00 or 0:00 mark of the video
-         # Here we try roughly the 1-minute mark as the target (0 if the video is short)
+         # Keep only URLs not seen before
+         new_entries = [e for e in search_results if e['url'] not in processed_urls]
+         if not new_entries:
+             logger.info("No new unique videos found in this search attempt.")
+             search_attempt += 1
+             continue
+
          search_items = []
-         for entry in search_results:
+         for entry in new_entries:
+             processed_urls.add(entry['url'])
+             # Search results have no timestamp info, so several points could be tried (currently fixed at the 1-minute mark)
              search_items.append({
                  'task_type': task_type,
                  'url': entry['url'],
-                 'timestamp_min': 1,  # try sampling at the 1-minute mark
+                 'timestamp_min': 1,
                  'timestamp_sec': 0,
                  'description': f"Auto-searched: {entry['title']}"
              })

-         logger.info(f"Processing {len(search_items)} search results...")
+         logger.info(f"Processing {len(search_items)} new search results...")

          if parallel:
              with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                  futures = [executor.submit(process_item, item) for item in search_items]
                  from tqdm import tqdm
-                 for future in tqdm(as_completed(futures), total=len(search_items), desc="Search Fallback"):
+                 for future in tqdm(as_completed(futures), total=len(search_items), desc=f"Search Fallback #{search_attempt+1}"):
                      try:
                          res = future.result()
                          results.append(res)
@@ -462,5 +481,13 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
                  results.append(res)
                  if res.get('status') == 'limit_reached':
                      break
+
+         search_attempt += 1
+
+     final_count = downloader.get_saved_video_count()
+     if final_count < limit:
+         logger.warning(f"Finished search attempts. Final count: {final_count}/{limit}")
+     else:
+         logger.info(f"Successfully reached target count: {final_count}/{limit}")

      return results
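The fallback rewrite is the largest behavioral change in 1.0.8: a single search pass becomes a bounded retry loop that de-duplicates URLs against everything already processed. Stripped of logging and the parallel branch, the control flow reduces to roughly the sketch below (the function names are simplified stand-ins, not the package's API):

# Distilled shape of the new fallback loop; saved_count/search/process stand in
# for downloader.get_saved_video_count, downloader.search_youtube, and the
# per-item download step.
def search_fallback(saved_count, limit, search, process, seed_urls, max_attempts=5):
    seen = set(seed_urls)                 # URLs already handled in stage 1
    attempt = 0
    while saved_count() < limit and attempt < max_attempts:
        remaining = limit - saved_count()
        batch = search(max_results=min(100, remaining * 5))
        if not batch:
            break                         # no results at all: give up early
        fresh = [e for e in batch if e['url'] not in seen]
        if not fresh:
            attempt += 1                  # only repeats this round: burn an attempt
            continue
        for entry in fresh:
            seen.add(entry['url'])
            process(entry)                # tries the fixed 1-minute timestamp
        attempt += 1
    return saved_count() >= limit         # True when the target was reached

The attempt cap matters because repeated identical queries tend to return the same URLs; without it, rounds that yield nothing new would loop indefinitely.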
@@ -136,7 +136,7 @@ class YOLOWorldVerifier:
              'frame_results': frame_results,
              'verified_at': datetime.now().isoformat(),
              'model': self.model_name,
-             'is_valid': detection_rate > 0.1,
+             'is_valid': detection_rate > 0.01,  # valid when more than 1% of frames have detections (lowered from the old 10%)
          }

          logger.info(
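The verifier change is a single constant, but it shifts acceptance considerably. Assuming detection_rate is the fraction of sampled frames containing a detection (which the surrounding frame_results bookkeeping suggests; the real frame count is internal to YOLOWorldVerifier), one hit in thirty frames now passes:

# Illustrative arithmetic only; 30 sampled frames is an assumption, not the
# verifier's actual sampling count.
frames_sampled = 30
frames_with_detection = 1
detection_rate = frames_with_detection / frames_sampled   # ~0.033

old_is_valid = detection_rate > 0.1    # False under 1.0.7: the clip was deleted
new_is_valid = detection_rate > 0.01   # True under 1.0.8: the clip is kept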
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ytcollector
- Version: 1.0.7
+ Version: 1.0.8
  Summary: SBS dataset collector
  Requires-Python: >=3.9
  Description-Content-Type: text/markdown