ytcollector 1.0.7__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytcollector/__init__.py +1 -1
- ytcollector/cli.py +4 -2
- ytcollector/downloader.py +61 -34
- ytcollector/verifier.py +1 -1
- {ytcollector-1.0.7.dist-info → ytcollector-1.0.8.dist-info}/METADATA +1 -1
- ytcollector-1.0.8.dist-info/RECORD +12 -0
- ytcollector-1.0.7.dist-info/RECORD +0 -12
- {ytcollector-1.0.7.dist-info → ytcollector-1.0.8.dist-info}/WHEEL +0 -0
- {ytcollector-1.0.7.dist-info → ytcollector-1.0.8.dist-info}/entry_points.txt +0 -0
- {ytcollector-1.0.7.dist-info → ytcollector-1.0.8.dist-info}/top_level.txt +0 -0
ytcollector/__init__.py
CHANGED
ytcollector/cli.py
CHANGED
|
@@ -69,9 +69,9 @@ def run_download(args):
|
|
|
69
69
|
|
|
70
70
|
if args.fast:
|
|
71
71
|
from .downloader import download_from_txt_parallel
|
|
72
|
-
results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count)
|
|
72
|
+
results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
|
|
73
73
|
else:
|
|
74
|
-
results = download_from_txt(txt_file, task, base_dir, max_count=args.count)
|
|
74
|
+
results = download_from_txt(txt_file, task, base_dir, max_count=args.count, skip_verify=args.skip_verify)
|
|
75
75
|
|
|
76
76
|
success_count = sum(1 for r in results if r.get('success'))
|
|
77
77
|
total_success += success_count
|
|
@@ -189,6 +189,7 @@ Examples:
|
|
|
189
189
|
download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
|
|
190
190
|
download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
|
|
191
191
|
download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
|
|
192
|
+
download_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip YOLO verification and save all clips')
|
|
192
193
|
|
|
193
194
|
# Download single
|
|
194
195
|
single_parser = subparsers.add_parser('download-single', help='Download single video')
|
|
@@ -206,6 +207,7 @@ Examples:
|
|
|
206
207
|
pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
|
|
207
208
|
pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
|
|
208
209
|
pipeline_parser.add_argument('--verify', action='store_true')
|
|
210
|
+
pipeline_parser.add_argument('--skip-verify', '-S', action='store_true', help='Skip verification in download stage')
|
|
209
211
|
|
|
210
212
|
# List tasks
|
|
211
213
|
subparsers.add_parser('list-tasks', help='List available tasks')
|
ytcollector/downloader.py
CHANGED
|
@@ -145,7 +145,8 @@ class VideoDownloader:
|
|
|
145
145
|
self,
|
|
146
146
|
url: str,
|
|
147
147
|
timestamp_min: int,
|
|
148
|
-
timestamp_sec: int
|
|
148
|
+
timestamp_sec: int,
|
|
149
|
+
skip_verify: bool = False
|
|
149
150
|
) -> Tuple[Optional[Path], Optional[dict]]:
|
|
150
151
|
"""
|
|
151
152
|
특정 타임스탬프 기준으로 ±1:30 클립 다운로드
|
|
@@ -207,18 +208,24 @@ class VideoDownloader:
|
|
|
207
208
|
try:
|
|
208
209
|
self.download_segment(url, start_sec, end_sec, temp_path)
|
|
209
210
|
|
|
210
|
-
# 5. YOLO 검증
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
logger.
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
211
|
+
# 5. YOLO 검증 (skip_verify가 False일 때만 수행)
|
|
212
|
+
if not skip_verify:
|
|
213
|
+
logger.info(f"Verifying content for task: {self.task_type}...")
|
|
214
|
+
# verifier 모듈 사용하여 검증
|
|
215
|
+
verify_result = verify_clip(temp_path, self.task_type, self.base_dir)
|
|
216
|
+
|
|
217
|
+
logger.info(f"Verification Info - Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}, Is Valid: {verify_result.get('is_valid')}")
|
|
218
|
+
|
|
219
|
+
if not verify_result.get('is_valid', False):
|
|
220
|
+
logger.warning(f"Verification failed: No {self.task_type} detected (Rate: {verify_result.get('summary', {}).get('detection_rate'):.2%}). Deleting...")
|
|
221
|
+
if temp_path.exists():
|
|
222
|
+
temp_path.unlink()
|
|
223
|
+
return None, None
|
|
224
|
+
else:
|
|
225
|
+
logger.info(f"Skipping verification for task: {self.task_type} (--skip-verify enabled)")
|
|
226
|
+
verify_result = {'is_valid': True, 'skipped': True}
|
|
220
227
|
|
|
221
|
-
# 6. 검증 통과 -> 최종 저장 (순차적 파일명 생성)
|
|
228
|
+
# 6. 검증 통과(혹은 건너뜀) -> 최종 저장 (순차적 파일명 생성)
|
|
222
229
|
final_path = get_clip_path(self.base_dir, self.task_type, filename=None)
|
|
223
230
|
|
|
224
231
|
# 이동 (네트워크 드라이브면 shutil.move 사용)
|
|
@@ -276,19 +283,17 @@ def parse_txt_line(line: str) -> Optional[Dict]:
|
|
|
276
283
|
return None
|
|
277
284
|
|
|
278
285
|
|
|
279
|
-
def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
|
|
286
|
+
def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
|
|
280
287
|
"""TXT 파일에서 다운로드 실행 (순차)"""
|
|
281
|
-
|
|
282
|
-
# 코드 구조상 일단 순차 실행 유지하고 parallel 함수 별도 구현
|
|
283
|
-
return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count)
|
|
288
|
+
return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count, skip_verify=skip_verify)
|
|
284
289
|
|
|
285
290
|
|
|
286
|
-
def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
|
|
291
|
+
def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None, skip_verify: bool = False) -> list:
|
|
287
292
|
"""TXT 파일에서 병렬 다운로드 실행 (Fast Mode)"""
|
|
288
|
-
return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count)
|
|
293
|
+
return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count, skip_verify=skip_verify)
|
|
289
294
|
|
|
290
295
|
|
|
291
|
-
def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None) -> list:
|
|
296
|
+
def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None, skip_verify: bool = False) -> list:
|
|
292
297
|
from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
|
|
293
298
|
import time
|
|
294
299
|
import random
|
|
@@ -346,7 +351,8 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
346
351
|
output_path, metadata = downloader.download_clip_at_timestamp(
|
|
347
352
|
url=data['url'],
|
|
348
353
|
timestamp_min=data['timestamp_min'],
|
|
349
|
-
timestamp_sec=data['timestamp_sec']
|
|
354
|
+
timestamp_sec=data['timestamp_sec'],
|
|
355
|
+
skip_verify=skip_verify
|
|
350
356
|
)
|
|
351
357
|
|
|
352
358
|
if output_path is None:
|
|
@@ -414,38 +420,51 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
414
420
|
if res.get('status') == 'limit_reached':
|
|
415
421
|
break
|
|
416
422
|
|
|
417
|
-
# --- 2단계: 목표 수량을 못 채웠을 경우 YouTube 검색Fallback ---
|
|
418
|
-
|
|
419
|
-
|
|
423
|
+
# --- 2단계: 목표 수량을 못 채웠을 경우 YouTube 검색Fallback (반복 시도) ---
|
|
424
|
+
max_search_attempts = 5 # 최대 검색 시도 횟수
|
|
425
|
+
search_attempt = 0
|
|
426
|
+
processed_urls = set(data['url'] for data in items) # 이미 처리한 URL 중복 검색 방지
|
|
427
|
+
|
|
428
|
+
while downloader.get_saved_video_count() < limit and search_attempt < max_search_attempts:
|
|
429
|
+
current_count = downloader.get_saved_video_count()
|
|
420
430
|
remaining = limit - current_count
|
|
421
|
-
logger.info(f"\nTarget not reached ({current_count}/{limit}). Starting YouTube Search fallback for '{task_type}'...")
|
|
422
431
|
|
|
423
|
-
|
|
424
|
-
|
|
432
|
+
logger.info(f"\n[Search Attempt {search_attempt+1}] Target not reached ({current_count}/{limit}). Searching YouTube for '{task_type}'...")
|
|
433
|
+
|
|
434
|
+
# 검색어: 태스크 이름
|
|
435
|
+
# 검색 결과 개수를 점진적으로 늘리거나 조절 가능
|
|
436
|
+
search_results = downloader.search_youtube(task_type, max_results=min(100, remaining * 5))
|
|
425
437
|
|
|
426
438
|
if not search_results:
|
|
427
|
-
logger.warning("No search results found.")
|
|
428
|
-
|
|
439
|
+
logger.warning("No more search results found.")
|
|
440
|
+
break
|
|
429
441
|
|
|
430
|
-
#
|
|
431
|
-
|
|
442
|
+
# 새로운 URL만 필터링
|
|
443
|
+
new_entries = [e for e in search_results if e['url'] not in processed_urls]
|
|
444
|
+
if not new_entries:
|
|
445
|
+
logger.info("No new unique videos found in this search attempt.")
|
|
446
|
+
search_attempt += 1
|
|
447
|
+
continue
|
|
448
|
+
|
|
432
449
|
search_items = []
|
|
433
|
-
for entry in
|
|
450
|
+
for entry in new_entries:
|
|
451
|
+
processed_urls.add(entry['url'])
|
|
452
|
+
# 검색 결과는 타임스탬프 정보가 없으므로 여러 지점 시도 가능 (현재는 1분 지점 고정)
|
|
434
453
|
search_items.append({
|
|
435
454
|
'task_type': task_type,
|
|
436
455
|
'url': entry['url'],
|
|
437
|
-
'timestamp_min': 1,
|
|
456
|
+
'timestamp_min': 1,
|
|
438
457
|
'timestamp_sec': 0,
|
|
439
458
|
'description': f"Auto-searched: {entry['title']}"
|
|
440
459
|
})
|
|
441
460
|
|
|
442
|
-
logger.info(f"Processing {len(search_items)} search results...")
|
|
461
|
+
logger.info(f"Processing {len(search_items)} new search results...")
|
|
443
462
|
|
|
444
463
|
if parallel:
|
|
445
464
|
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
|
446
465
|
futures = [executor.submit(process_item, item) for item in search_items]
|
|
447
466
|
from tqdm import tqdm
|
|
448
|
-
for future in tqdm(as_completed(futures), total=len(search_items), desc="Search Fallback"):
|
|
467
|
+
for future in tqdm(as_completed(futures), total=len(search_items), desc=f"Search Fallback #{search_attempt+1}"):
|
|
449
468
|
try:
|
|
450
469
|
res = future.result()
|
|
451
470
|
results.append(res)
|
|
@@ -462,5 +481,13 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
462
481
|
results.append(res)
|
|
463
482
|
if res.get('status') == 'limit_reached':
|
|
464
483
|
break
|
|
484
|
+
|
|
485
|
+
search_attempt += 1
|
|
486
|
+
|
|
487
|
+
final_count = downloader.get_saved_video_count()
|
|
488
|
+
if final_count < limit:
|
|
489
|
+
logger.warning(f"Finished search attempts. Final count: {final_count}/{limit}")
|
|
490
|
+
else:
|
|
491
|
+
logger.info(f"Successfully reached target count: {final_count}/{limit}")
|
|
465
492
|
|
|
466
493
|
return results
|
ytcollector/verifier.py
CHANGED
|
@@ -136,7 +136,7 @@ class YOLOWorldVerifier:
|
|
|
136
136
|
'frame_results': frame_results,
|
|
137
137
|
'verified_at': datetime.now().isoformat(),
|
|
138
138
|
'model': self.model_name,
|
|
139
|
-
'is_valid': detection_rate > 0.1
|
|
139
|
+
'is_valid': detection_rate > 0.01, # 1% 이상 탐지되면 유효한 것으로 간주 (기존 10%에서 하향)
|
|
140
140
|
}
|
|
141
141
|
|
|
142
142
|
logger.info(
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
config/settings.py,sha256=RcK41kaUC0zam5SsdXfb7u_qjM_TlJDa0a8hC_MGacQ,1197
|
|
2
|
+
ytcollector/__init__.py,sha256=Alq_JsvEi2RZusFB7-AAhzNnMFyBbHNU6NJR_AxDgs4,365
|
|
3
|
+
ytcollector/cli.py,sha256=Fg8GA7gQu3s26XgUg1NBEZuNKQpZLRrnp2s5bqhed8M,8636
|
|
4
|
+
ytcollector/config.py,sha256=ez9flxTbjmdiJB7_IYivWd9xaRfJ8CLBhPYRedLi8Mk,2323
|
|
5
|
+
ytcollector/downloader.py,sha256=QrqwG7PpoJZymKnOOEj1aqZudE6qY3AV4fiuo1anoGk,19901
|
|
6
|
+
ytcollector/utils.py,sha256=gInDx6adV-SfQ2SH5_i8w1gvYL-Nsmz1e1W__gCdVH8,4654
|
|
7
|
+
ytcollector/verifier.py,sha256=OHkAyUF3J6wFqKSa7RYT9Z6-W_lwTXarBEF9xqxNthA,6366
|
|
8
|
+
ytcollector-1.0.8.dist-info/METADATA,sha256=a0t8IXj-xlqbsy_D5EBNemVRi-Z52gJAmu2VJlep0Cc,3727
|
|
9
|
+
ytcollector-1.0.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
ytcollector-1.0.8.dist-info/entry_points.txt,sha256=PoanZbxogGnV4tLcZZkla0Yh7OvPtqcukDYr563w5RA,53
|
|
11
|
+
ytcollector-1.0.8.dist-info/top_level.txt,sha256=TVfBZHJgYRfSSTgLJELvOoMA55qR8kWuxtiIaItwzIQ,19
|
|
12
|
+
ytcollector-1.0.8.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
config/settings.py,sha256=RcK41kaUC0zam5SsdXfb7u_qjM_TlJDa0a8hC_MGacQ,1197
|
|
2
|
-
ytcollector/__init__.py,sha256=tMCSB_dqIAY-1jVYkIrI1PvRYTnL1EkofRYGKh1uN24,365
|
|
3
|
-
ytcollector/cli.py,sha256=meCnT3cMBF15AkHpPeUdCFjL6WHZI-UB_F1echeU6is,8328
|
|
4
|
-
ytcollector/config.py,sha256=ez9flxTbjmdiJB7_IYivWd9xaRfJ8CLBhPYRedLi8Mk,2323
|
|
5
|
-
ytcollector/downloader.py,sha256=ZYCz2oQyVmr9PQG3gVIH17KlZgqomCnNd06n1TyMSR8,18481
|
|
6
|
-
ytcollector/utils.py,sha256=gInDx6adV-SfQ2SH5_i8w1gvYL-Nsmz1e1W__gCdVH8,4654
|
|
7
|
-
ytcollector/verifier.py,sha256=8Nn3b6fTQYxCGPt01kJMDSZ2hy8gk54deSayOpBuY48,6286
|
|
8
|
-
ytcollector-1.0.7.dist-info/METADATA,sha256=79KVtg4U_lrLn-UHTJTugSdggrRdg3kKN5Usqc6kcTM,3727
|
|
9
|
-
ytcollector-1.0.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
-
ytcollector-1.0.7.dist-info/entry_points.txt,sha256=PoanZbxogGnV4tLcZZkla0Yh7OvPtqcukDYr563w5RA,53
|
|
11
|
-
ytcollector-1.0.7.dist-info/top_level.txt,sha256=TVfBZHJgYRfSSTgLJELvOoMA55qR8kWuxtiIaItwzIQ,19
|
|
12
|
-
ytcollector-1.0.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|