ytcollector 1.0.3__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ytcollector-1.0.3 → ytcollector-1.0.5}/PKG-INFO +1 -1
- {ytcollector-1.0.3 → ytcollector-1.0.5}/pyproject.toml +1 -1
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/__init__.py +1 -1
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/cli.py +34 -19
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/config.py +1 -1
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/downloader.py +39 -16
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/PKG-INFO +1 -1
- {ytcollector-1.0.3 → ytcollector-1.0.5}/README.md +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/config/settings.py +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/setup.cfg +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/utils.py +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector/verifier.py +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/SOURCES.txt +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/dependency_links.txt +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/entry_points.txt +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/requires.txt +0 -0
- {ytcollector-1.0.3 → ytcollector-1.0.5}/ytcollector.egg-info/top_level.txt +0 -0
|
@@ -48,24 +48,38 @@ def run_download(args):
|
|
|
48
48
|
|
|
49
49
|
base_dir = Path(args.dir) if args.dir else Path.cwd()
|
|
50
50
|
|
|
51
|
-
#
|
|
52
|
-
|
|
51
|
+
# argparse에서 nargs='+'로 받아오면 args.task는 항상 리스트
|
|
52
|
+
tasks = args.task if isinstance(args.task, list) else [args.task]
|
|
53
53
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
logger.info("Run 'downloader init' first to create project structure")
|
|
57
|
-
return
|
|
58
|
-
|
|
59
|
-
logger.info(f"Starting{' fast' if args.fast else ''} download for task: {args.task}")
|
|
60
|
-
|
|
61
|
-
if args.fast:
|
|
62
|
-
from .downloader import download_from_txt_parallel
|
|
63
|
-
results = download_from_txt_parallel(txt_file, args.task, base_dir)
|
|
64
|
-
else:
|
|
65
|
-
results = download_from_txt(txt_file, args.task, base_dir)
|
|
54
|
+
total_success = 0
|
|
55
|
+
total_processed = 0
|
|
66
56
|
|
|
67
|
-
|
|
68
|
-
|
|
57
|
+
for task in tasks:
|
|
58
|
+
logger.info(f"=== Processing Task: {task} ===")
|
|
59
|
+
|
|
60
|
+
# 파일 경로: video/{task}/youtube_url.txt
|
|
61
|
+
txt_file = get_url_file_path(base_dir, task)
|
|
62
|
+
|
|
63
|
+
if not txt_file.exists():
|
|
64
|
+
logger.error(f"URL file not found: {txt_file}")
|
|
65
|
+
logger.info(f"Skipping {task}. Run 'ytcollector init' first.")
|
|
66
|
+
continue
|
|
67
|
+
|
|
68
|
+
logger.info(f"Starting{' fast' if args.fast else ''} download for task: {task}")
|
|
69
|
+
|
|
70
|
+
if args.fast:
|
|
71
|
+
from .downloader import download_from_txt_parallel
|
|
72
|
+
results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count)
|
|
73
|
+
else:
|
|
74
|
+
results = download_from_txt(txt_file, task, base_dir, max_count=args.count)
|
|
75
|
+
|
|
76
|
+
success_count = sum(1 for r in results if r.get('success'))
|
|
77
|
+
total_success += success_count
|
|
78
|
+
total_processed += len(results)
|
|
79
|
+
|
|
80
|
+
print(f"✓ Task '{task}' complete: {success_count}/{len(results)} successful")
|
|
81
|
+
|
|
82
|
+
print(f"\n✓ All tasks complete: {total_success}/{total_processed} successful total")
|
|
69
83
|
|
|
70
84
|
|
|
71
85
|
def run_download_single(args):
|
|
@@ -172,7 +186,8 @@ Examples:
|
|
|
172
186
|
|
|
173
187
|
# Download
|
|
174
188
|
download_parser = subparsers.add_parser('download', help='Download from youtube_url.txt')
|
|
175
|
-
download_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
|
|
189
|
+
download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
|
|
190
|
+
download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
|
|
176
191
|
download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
|
|
177
192
|
|
|
178
193
|
# Download single
|
|
@@ -184,12 +199,12 @@ Examples:
|
|
|
184
199
|
|
|
185
200
|
# Verify
|
|
186
201
|
verify_parser = subparsers.add_parser('verify', help='Verify with YOLO-World')
|
|
187
|
-
verify_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
|
|
202
|
+
verify_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
|
|
188
203
|
verify_parser.add_argument('--video', '-v', help='Specific video file')
|
|
189
204
|
|
|
190
205
|
# Pipeline
|
|
191
206
|
pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
|
|
192
|
-
pipeline_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
|
|
207
|
+
pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
|
|
193
208
|
pipeline_parser.add_argument('--verify', action='store_true')
|
|
194
209
|
|
|
195
210
|
# List tasks
|
|
@@ -41,7 +41,7 @@ MAX_CLIP_DURATION = 180 # 최대 3분
|
|
|
41
41
|
# Download settings
|
|
42
42
|
VIDEO_FORMAT = "best[ext=mp4]/best"
|
|
43
43
|
DOWNLOAD_RETRIES = 3
|
|
44
|
-
MAX_VIDEOS_PER_TASK =
|
|
44
|
+
MAX_VIDEOS_PER_TASK = 1000 # 태스크별 최대 영상 저장 수 (CLI -n 옵션으로 덮어쓰기 가능)
|
|
45
45
|
|
|
46
46
|
# Fast Mode Settings (Parallel)
|
|
47
47
|
MAX_WORKERS = 4 # 병렬 작업 프로세스 수
|
|
@@ -247,31 +247,34 @@ def parse_txt_line(line: str) -> Optional[Dict]:
|
|
|
247
247
|
return None
|
|
248
248
|
|
|
249
249
|
|
|
250
|
-
def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None) -> list:
|
|
250
|
+
def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
|
|
251
251
|
"""TXT 파일에서 다운로드 실행 (순차)"""
|
|
252
252
|
# 기존 로직을 process_single_item 함수로 분리하여 재사용할 수 있으면 좋겠지만,
|
|
253
253
|
# 코드 구조상 일단 순차 실행 유지하고 parallel 함수 별도 구현
|
|
254
|
-
return _process_download_loop(txt_path, task_type, base_dir, parallel=False)
|
|
254
|
+
return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count)
|
|
255
255
|
|
|
256
256
|
|
|
257
|
-
def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None) -> list:
|
|
257
|
+
def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
|
|
258
258
|
"""TXT 파일에서 병렬 다운로드 실행 (Fast Mode)"""
|
|
259
|
-
return _process_download_loop(txt_path, task_type, base_dir, parallel=True)
|
|
259
|
+
return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count)
|
|
260
260
|
|
|
261
261
|
|
|
262
|
-
def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False) -> list:
|
|
262
|
+
def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None) -> list:
|
|
263
263
|
from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
|
|
264
264
|
import time
|
|
265
265
|
import random
|
|
266
266
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
267
267
|
|
|
268
|
+
# max_count가 없으면 config의 기본값 사용
|
|
269
|
+
limit = max_count if max_count is not None else MAX_VIDEOS_PER_TASK
|
|
270
|
+
|
|
268
271
|
results = []
|
|
269
272
|
downloader = VideoDownloader(task_type, base_dir)
|
|
270
273
|
|
|
271
274
|
# 시작 전 개수 확인
|
|
272
275
|
initial_count = downloader.get_saved_video_count()
|
|
273
|
-
if initial_count >=
|
|
274
|
-
logger.warning(f"Task '{task_type}' already has {initial_count} videos. Skipping.")
|
|
276
|
+
if initial_count >= limit:
|
|
277
|
+
logger.warning(f"Task '{task_type}' already has {initial_count} videos (Limit: {limit}). Skipping.")
|
|
275
278
|
return results
|
|
276
279
|
|
|
277
280
|
if not txt_path.exists():
|
|
@@ -291,14 +294,26 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
291
294
|
if not items:
|
|
292
295
|
return results
|
|
293
296
|
|
|
294
|
-
logger.info(f"Found {len(items)} URLs. Starting {'parallel' if parallel else 'sequential'} download...")
|
|
297
|
+
logger.info(f"Found {len(items)} URLs. Target: {limit} videos (Current: {initial_count}). Starting {'parallel' if parallel else 'sequential'} download...")
|
|
295
298
|
|
|
296
299
|
def process_item(data):
|
|
300
|
+
# 현재 개수 체크 (루프 도중 목표 달성 시 중단 위함)
|
|
301
|
+
# 주의: 병렬 처리 시 정확한 count 동기화는 Lock이 필요하지만, 여기선 대략적인 체크로 충분
|
|
302
|
+
current = downloader.get_saved_video_count()
|
|
303
|
+
if current >= limit:
|
|
304
|
+
raise LimitReachedError("Target count reached")
|
|
305
|
+
|
|
297
306
|
# 방화벽 우회용 랜덤 딜레이 (병렬 모드에서도 적용하여 동시 요청 폭주 완화)
|
|
298
307
|
if parallel:
|
|
299
308
|
time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))
|
|
300
309
|
|
|
301
310
|
try:
|
|
311
|
+
# VideoDownloader 내부의 limit 체크는 config 값을 쓰므로,
|
|
312
|
+
# 여기서는 외부에서 주입된 limit을 강제할 방법이 필요하거나,
|
|
313
|
+
# 단순히 루프 레벨에서 제어하면 됨.
|
|
314
|
+
# download_clip_at_timestamp 메서드는 내부적으로 MAX_VIDEOS_PER_TASK를 체크하므로,
|
|
315
|
+
# 이를 우회하거나 단순 루프 제어로 처리.
|
|
316
|
+
|
|
302
317
|
output_path, metadata = downloader.download_clip_at_timestamp(
|
|
303
318
|
url=data['url'],
|
|
304
319
|
timestamp_min=data['timestamp_min'],
|
|
@@ -329,8 +344,7 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
329
344
|
}
|
|
330
345
|
|
|
331
346
|
except LimitReachedError:
|
|
332
|
-
#
|
|
333
|
-
# 여기서는 개별 스레드 종료로 처리
|
|
347
|
+
# 내부에서 발생한 LimitReachedError도 처리
|
|
334
348
|
return {'success': False, 'error': 'Limit reached', 'status': 'limit_reached'}
|
|
335
349
|
|
|
336
350
|
except Exception as e:
|
|
@@ -350,18 +364,27 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
|
|
|
350
364
|
# 진행 상황 표시
|
|
351
365
|
from tqdm import tqdm
|
|
352
366
|
for future in tqdm(as_completed(futures), total=len(items), desc="Fast Download"):
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
367
|
+
try:
|
|
368
|
+
res = future.result()
|
|
369
|
+
results.append(res)
|
|
370
|
+
if res.get('status') == 'limit_reached' or downloader.get_saved_video_count() >= limit:
|
|
371
|
+
logger.warning(f"Download limit ({limit}) reached. Stopping remaining tasks.")
|
|
372
|
+
executor.shutdown(wait=False, cancel_futures=True)
|
|
373
|
+
break
|
|
374
|
+
except Exception:
|
|
375
|
+
continue
|
|
359
376
|
else:
|
|
360
377
|
# 순차 실행
|
|
361
378
|
for item in items:
|
|
379
|
+
# 루프 시작 전 체크
|
|
380
|
+
if downloader.get_saved_video_count() >= limit:
|
|
381
|
+
logger.info(f"Target count ({limit}) reached. Stopping.")
|
|
382
|
+
break
|
|
383
|
+
|
|
362
384
|
res = process_item(item)
|
|
363
385
|
results.append(res)
|
|
364
386
|
if res.get('status') == 'limit_reached':
|
|
387
|
+
logger.info(f"Target count ({limit}) reached. Stopping.")
|
|
365
388
|
break
|
|
366
389
|
|
|
367
390
|
return results
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|