ytcollector 1.0.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytcollector/__init__.py +1 -1
- ytcollector/downloader.py +104 -28
- {ytcollector-1.0.6.dist-info → ytcollector-1.0.7.dist-info}/METADATA +12 -10
- ytcollector-1.0.7.dist-info/RECORD +12 -0
- ytcollector-1.0.6.dist-info/RECORD +0 -12
- {ytcollector-1.0.6.dist-info → ytcollector-1.0.7.dist-info}/WHEEL +0 -0
- {ytcollector-1.0.6.dist-info → ytcollector-1.0.7.dist-info}/entry_points.txt +0 -0
- {ytcollector-1.0.6.dist-info → ytcollector-1.0.7.dist-info}/top_level.txt +0 -0
ytcollector/__init__.py
CHANGED
ytcollector/downloader.py
CHANGED
|
@@ -49,6 +49,35 @@ class VideoDownloader:
|
|
|
49
49
|
'channel': info.get('channel'),
|
|
50
50
|
'upload_date': info.get('upload_date'),
|
|
51
51
|
}
|
|
52
|
+
|
|
53
|
+
def search_youtube(self, query: str, max_results: int = 50) -> List[Dict]:
|
|
54
|
+
"""YouTube 검색을 통해 상위 결과의 URL 목록 반환"""
|
|
55
|
+
ydl_opts = {
|
|
56
|
+
'quiet': True,
|
|
57
|
+
'no_warnings': True,
|
|
58
|
+
'extract_flat': True,
|
|
59
|
+
'force_generic_extractor': False,
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
search_query = f"ytsearch{max_results}:{query}"
|
|
63
|
+
logger.info(f"Searching YouTube for: '{query}' (Max {max_results} results)")
|
|
64
|
+
|
|
65
|
+
results = []
|
|
66
|
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
67
|
+
try:
|
|
68
|
+
info = ydl.extract_info(search_query, download=False)
|
|
69
|
+
if 'entries' in info:
|
|
70
|
+
for entry in info['entries']:
|
|
71
|
+
if entry:
|
|
72
|
+
results.append({
|
|
73
|
+
'url': f"https://www.youtube.com/watch?v={entry['id']}",
|
|
74
|
+
'title': entry.get('title'),
|
|
75
|
+
'id': entry['id']
|
|
76
|
+
})
|
|
77
|
+
except Exception as e:
|
|
78
|
+
logger.error(f"Search failed: {e}")
|
|
79
|
+
|
|
80
|
+
return results
|
|
52
81
|
|
|
53
82
|
def calculate_clip_range(
|
|
54
83
|
self,
|
|
@@ -357,34 +386,81 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
357
386
|
'status': 'error'
|
|
358
387
|
}
|
|
359
388
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
from tqdm import tqdm
|
|
366
|
-
for future in tqdm(as_completed(futures), total=len(items), desc="Fast Download"):
|
|
367
|
-
try:
|
|
368
|
-
res = future.result()
|
|
369
|
-
results.append(res)
|
|
370
|
-
if res.get('status') == 'limit_reached' or downloader.get_saved_video_count() >= limit:
|
|
371
|
-
logger.warning(f"Download limit ({limit}) reached. Stopping remaining tasks.")
|
|
372
|
-
executor.shutdown(wait=False, cancel_futures=True)
|
|
373
|
-
break
|
|
374
|
-
except Exception:
|
|
375
|
-
continue
|
|
376
|
-
else:
|
|
377
|
-
# 순차 실행
|
|
378
|
-
for item in items:
|
|
379
|
-
# 루프 시작 전 체크
|
|
380
|
-
if downloader.get_saved_video_count() >= limit:
|
|
381
|
-
logger.info(f"Target count ({limit}) reached. Stopping.")
|
|
382
|
-
break
|
|
389
|
+
# --- 1단계: youtube_url.txt 파일 목록 처리 ---
|
|
390
|
+
if items:
|
|
391
|
+
if parallel:
|
|
392
|
+
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
|
393
|
+
futures = [executor.submit(process_item, item) for item in items]
|
|
383
394
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
395
|
+
# 진행 상황 표시
|
|
396
|
+
from tqdm import tqdm
|
|
397
|
+
for future in tqdm(as_completed(futures), total=len(items), desc="Fast Download"):
|
|
398
|
+
try:
|
|
399
|
+
res = future.result()
|
|
400
|
+
results.append(res)
|
|
401
|
+
if res.get('status') == 'limit_reached' or downloader.get_saved_video_count() >= limit:
|
|
402
|
+
logger.info(f"Download limit ({limit}) reached. Stopping.")
|
|
403
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
404
|
+
break
|
|
405
|
+
except Exception:
|
|
406
|
+
continue
|
|
407
|
+
else:
|
|
408
|
+
# 순차 실행
|
|
409
|
+
for item in items:
|
|
410
|
+
if downloader.get_saved_video_count() >= limit:
|
|
411
|
+
break
|
|
412
|
+
res = process_item(item)
|
|
413
|
+
results.append(res)
|
|
414
|
+
if res.get('status') == 'limit_reached':
|
|
415
|
+
break
|
|
416
|
+
|
|
417
|
+
# --- 2단계: 목표 수량을 못 채웠을 경우 YouTube 검색Fallback ---
|
|
418
|
+
current_count = downloader.get_saved_video_count()
|
|
419
|
+
if current_count < limit:
|
|
420
|
+
remaining = limit - current_count
|
|
421
|
+
logger.info(f"\nTarget not reached ({current_count}/{limit}). Starting YouTube Search fallback for '{task_type}'...")
|
|
422
|
+
|
|
423
|
+
# 검색어: 태스크 이름 (필요시 config에서 태스크별 검색어 별도 지정 가능)
|
|
424
|
+
search_results = downloader.search_youtube(task_type, max_results=remaining * 2)
|
|
425
|
+
|
|
426
|
+
if not search_results:
|
|
427
|
+
logger.warning("No search results found.")
|
|
428
|
+
return results
|
|
429
|
+
|
|
430
|
+
# 검색 결과는 타임스탬프 정보가 없으므로, 기본적으로 영상의 1:00 지점 혹은 0:00 지점을 시도
|
|
431
|
+
# 여기서는 영상의 대략 1분 지점(영상이 짧으면 0)을 타겟으로 시도해봄
|
|
432
|
+
search_items = []
|
|
433
|
+
for entry in search_results:
|
|
434
|
+
search_items.append({
|
|
435
|
+
'task_type': task_type,
|
|
436
|
+
'url': entry['url'],
|
|
437
|
+
'timestamp_min': 1, # 1분 지점 샘플링 시도
|
|
438
|
+
'timestamp_sec': 0,
|
|
439
|
+
'description': f"Auto-searched: {entry['title']}"
|
|
440
|
+
})
|
|
441
|
+
|
|
442
|
+
logger.info(f"Processing {len(search_items)} search results...")
|
|
443
|
+
|
|
444
|
+
if parallel:
|
|
445
|
+
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
|
446
|
+
futures = [executor.submit(process_item, item) for item in search_items]
|
|
447
|
+
from tqdm import tqdm
|
|
448
|
+
for future in tqdm(as_completed(futures), total=len(search_items), desc="Search Fallback"):
|
|
449
|
+
try:
|
|
450
|
+
res = future.result()
|
|
451
|
+
results.append(res)
|
|
452
|
+
if res.get('status') == 'limit_reached' or downloader.get_saved_video_count() >= limit:
|
|
453
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
454
|
+
break
|
|
455
|
+
except Exception:
|
|
456
|
+
continue
|
|
457
|
+
else:
|
|
458
|
+
for item in search_items:
|
|
459
|
+
if downloader.get_saved_video_count() >= limit:
|
|
460
|
+
break
|
|
461
|
+
res = process_item(item)
|
|
462
|
+
results.append(res)
|
|
463
|
+
if res.get('status') == 'limit_reached':
|
|
464
|
+
break
|
|
389
465
|
|
|
390
466
|
return results
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ytcollector
|
|
3
|
-
Version: 1.0.6
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: SBS 데이터셋 수집기
|
|
5
5
|
Requires-Python: >=3.9
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -18,7 +18,7 @@ YouTube 영상에서 얼굴, 자동차 번호판, 타투, 텍스트 자막을
|
|
|
18
18
|
|
|
19
19
|
**필수 요구사항:**
|
|
20
20
|
- Python 3.8 이상
|
|
21
|
-
- FFmpeg (Mac: `brew install ffmpeg`
|
|
21
|
+
- FFmpeg (pip 설치 시 `imageio-ffmpeg`를 통해 자동으로 구성되나, 실패 시 Mac: `brew install ffmpeg` 설치 권장)
|
|
22
22
|
|
|
23
23
|
**설치:**
|
|
24
24
|
```bash
|
|
@@ -52,15 +52,17 @@ face,https://www.youtube.com/watch?v=VIDEO_ID,2,30,설명
|
|
|
52
52
|
|
|
53
53
|
이 프로그램은 **다운로드 → YOLO 검증 → (성공 시) 저장** 순서로 작동합니다. 타겟 객체가 없으면 자동으로 삭제됩니다.
|
|
54
54
|
|
|
55
|
-
###
|
|
56
|
-
안정적으로 하나씩
|
|
55
|
+
### 주요 명령어 예시
|
|
56
|
+
안정적으로 하나씩 다운로드하거나, 여러 태스크를 동시에 처리하고 목표 수량을 설정할 수 있습니다.
|
|
57
|
+
|
|
57
58
|
```bash
|
|
59
|
+
# 기본 다운로드 (태스크 하나)
|
|
58
60
|
ytcollector download --task face
|
|
59
|
-
```
|
|
60
61
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
62
|
+
# 여러 태스크 동시에 실행 및 목표 수량(-n) 설정
|
|
63
|
+
ytcollector download --task face tattoo text -n 100
|
|
64
|
+
|
|
65
|
+
# 🚀 Fast 모드 (병렬 다운로드)
|
|
64
66
|
ytcollector download --task face --fast
|
|
65
67
|
```
|
|
66
68
|
* **방화벽 우회**: 랜덤 딜레이(1~3초)가 적용되어 차단을 방지합니다.
|
|
@@ -93,8 +95,8 @@ NAS_PATH_MAC = "/Volumes/Data/Private Dataset/..."
|
|
|
93
95
|
| 명령어 | 설명 | 예시 |
|
|
94
96
|
|--------|------|------|
|
|
95
97
|
| `init` | 프로젝트 초기화 | `ytcollector init` |
|
|
96
|
-
| `download` |
|
|
97
|
-
| `download-single` | URL 1개만 테스트 다운로드 | `ytcollector download-single --task face ...` |
|
|
98
|
+
| `download` | 대량 다운로드 (여러 태스크, 개수 제한 가능) | `ytcollector download --task face tattoo -n 50` |
|
|
99
|
+
| `download-single` | URL 1개만 테스트 다운로드 | `ytcollector download-single --task face -u ...` |
|
|
98
100
|
| `verify` | 수동 YOLO 검증 (기존 파일) | `ytcollector verify --task face` |
|
|
99
101
|
| `list-tasks` | 지원하는 태스크 목록 확인 | `ytcollector list-tasks` |
|
|
100
102
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
config/settings.py,sha256=RcK41kaUC0zam5SsdXfb7u_qjM_TlJDa0a8hC_MGacQ,1197
|
|
2
|
+
ytcollector/__init__.py,sha256=tMCSB_dqIAY-1jVYkIrI1PvRYTnL1EkofRYGKh1uN24,365
|
|
3
|
+
ytcollector/cli.py,sha256=meCnT3cMBF15AkHpPeUdCFjL6WHZI-UB_F1echeU6is,8328
|
|
4
|
+
ytcollector/config.py,sha256=ez9flxTbjmdiJB7_IYivWd9xaRfJ8CLBhPYRedLi8Mk,2323
|
|
5
|
+
ytcollector/downloader.py,sha256=ZYCz2oQyVmr9PQG3gVIH17KlZgqomCnNd06n1TyMSR8,18481
|
|
6
|
+
ytcollector/utils.py,sha256=gInDx6adV-SfQ2SH5_i8w1gvYL-Nsmz1e1W__gCdVH8,4654
|
|
7
|
+
ytcollector/verifier.py,sha256=8Nn3b6fTQYxCGPt01kJMDSZ2hy8gk54deSayOpBuY48,6286
|
|
8
|
+
ytcollector-1.0.7.dist-info/METADATA,sha256=79KVtg4U_lrLn-UHTJTugSdggrRdg3kKN5Usqc6kcTM,3727
|
|
9
|
+
ytcollector-1.0.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
ytcollector-1.0.7.dist-info/entry_points.txt,sha256=PoanZbxogGnV4tLcZZkla0Yh7OvPtqcukDYr563w5RA,53
|
|
11
|
+
ytcollector-1.0.7.dist-info/top_level.txt,sha256=TVfBZHJgYRfSSTgLJELvOoMA55qR8kWuxtiIaItwzIQ,19
|
|
12
|
+
ytcollector-1.0.7.dist-info/RECORD,,
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
config/settings.py,sha256=RcK41kaUC0zam5SsdXfb7u_qjM_TlJDa0a8hC_MGacQ,1197
|
|
2
|
-
ytcollector/__init__.py,sha256=0xuhBoqfD_wEORF5eN2C8a1z7FV-mafh-f8W-zax2c0,365
|
|
3
|
-
ytcollector/cli.py,sha256=meCnT3cMBF15AkHpPeUdCFjL6WHZI-UB_F1echeU6is,8328
|
|
4
|
-
ytcollector/config.py,sha256=ez9flxTbjmdiJB7_IYivWd9xaRfJ8CLBhPYRedLi8Mk,2323
|
|
5
|
-
ytcollector/downloader.py,sha256=3Y54mudqQ1LqfFtKrU3uloNp2Oz-hcf28GwEI6eDUa0,14944
|
|
6
|
-
ytcollector/utils.py,sha256=gInDx6adV-SfQ2SH5_i8w1gvYL-Nsmz1e1W__gCdVH8,4654
|
|
7
|
-
ytcollector/verifier.py,sha256=8Nn3b6fTQYxCGPt01kJMDSZ2hy8gk54deSayOpBuY48,6286
|
|
8
|
-
ytcollector-1.0.6.dist-info/METADATA,sha256=IAnpxTaWUUaZULsCEbnWXiQgPCGYE2oBq0bPl9r8U8w,3543
|
|
9
|
-
ytcollector-1.0.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
-
ytcollector-1.0.6.dist-info/entry_points.txt,sha256=PoanZbxogGnV4tLcZZkla0Yh7OvPtqcukDYr563w5RA,53
|
|
11
|
-
ytcollector-1.0.6.dist-info/top_level.txt,sha256=TVfBZHJgYRfSSTgLJELvOoMA55qR8kWuxtiIaItwzIQ,19
|
|
12
|
-
ytcollector-1.0.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|