ytcollector 1.0.3__tar.gz → 1.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ytcollector
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: SBS 데이터셋 수집기
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "ytcollector"
7
- version = "1.0.3"
7
+ version = "1.0.5"
8
8
  description = "SBS 데이터셋 수집기"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9"
@@ -3,7 +3,7 @@ SBS Dataset Collector - YouTube 영상 수집 및 YOLO-World 검증 파이프라
3
3
  """
4
4
  from pathlib import Path
5
5
 
6
- __version__ = "1.0.3"
6
+ __version__ = "1.0.5"
7
7
  __author__ = "SBS Dataset Team"
8
8
 
9
9
  # Package root directory
@@ -48,24 +48,38 @@ def run_download(args):
48
48
 
49
49
  base_dir = Path(args.dir) if args.dir else Path.cwd()
50
50
 
51
- # 파일 경로: video/{task}/youtube_url.txt
52
- txt_file = get_url_file_path(base_dir, args.task)
51
+ # argparse에서 nargs='+'로 받아오면 args.task는 항상 리스트
52
+ tasks = args.task if isinstance(args.task, list) else [args.task]
53
53
 
54
- if not txt_file.exists():
55
- logger.error(f"URL file not found: {txt_file}")
56
- logger.info("Run 'downloader init' first to create project structure")
57
- return
58
-
59
- logger.info(f"Starting{' fast' if args.fast else ''} download for task: {args.task}")
60
-
61
- if args.fast:
62
- from .downloader import download_from_txt_parallel
63
- results = download_from_txt_parallel(txt_file, args.task, base_dir)
64
- else:
65
- results = download_from_txt(txt_file, args.task, base_dir)
54
+ total_success = 0
55
+ total_processed = 0
66
56
 
67
- success_count = sum(1 for r in results if r.get('success'))
68
- print(f" Download complete: {success_count}/{len(results)} successful")
57
+ for task in tasks:
58
+ logger.info(f"=== Processing Task: {task} ===")
59
+
60
+ # 파일 경로: video/{task}/youtube_url.txt
61
+ txt_file = get_url_file_path(base_dir, task)
62
+
63
+ if not txt_file.exists():
64
+ logger.error(f"URL file not found: {txt_file}")
65
+ logger.info(f"Skipping {task}. Run 'ytcollector init' first.")
66
+ continue
67
+
68
+ logger.info(f"Starting{' fast' if args.fast else ''} download for task: {task}")
69
+
70
+ if args.fast:
71
+ from .downloader import download_from_txt_parallel
72
+ results = download_from_txt_parallel(txt_file, task, base_dir, max_count=args.count)
73
+ else:
74
+ results = download_from_txt(txt_file, task, base_dir, max_count=args.count)
75
+
76
+ success_count = sum(1 for r in results if r.get('success'))
77
+ total_success += success_count
78
+ total_processed += len(results)
79
+
80
+ print(f"✓ Task '{task}' complete: {success_count}/{len(results)} successful")
81
+
82
+ print(f"\n✓ All tasks complete: {total_success}/{total_processed} successful total")
69
83
 
70
84
 
71
85
  def run_download_single(args):
@@ -172,7 +186,8 @@ Examples:
172
186
 
173
187
  # Download
174
188
  download_parser = subparsers.add_parser('download', help='Download from youtube_url.txt')
175
- download_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
189
+ download_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS, help='One or more tasks (e.g. face tattoo)')
190
+ download_parser.add_argument('--count', '-n', type=int, help='Max videos to collect (default: 1000)')
176
191
  download_parser.add_argument('--fast', action='store_true', help='Enable fast parallel downloading')
177
192
 
178
193
  # Download single
@@ -184,12 +199,12 @@ Examples:
184
199
 
185
200
  # Verify
186
201
  verify_parser = subparsers.add_parser('verify', help='Verify with YOLO-World')
187
- verify_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
202
+ verify_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
188
203
  verify_parser.add_argument('--video', '-v', help='Specific video file')
189
204
 
190
205
  # Pipeline
191
206
  pipeline_parser = subparsers.add_parser('pipeline', help='Full pipeline')
192
- pipeline_parser.add_argument('--task', '-t', required=True, choices=VALID_TASKS)
207
+ pipeline_parser.add_argument('--task', '-t', required=True, nargs='+', choices=VALID_TASKS)
193
208
  pipeline_parser.add_argument('--verify', action='store_true')
194
209
 
195
210
  # List tasks
@@ -41,7 +41,7 @@ MAX_CLIP_DURATION = 180 # 최대 3분
41
41
  # Download settings
42
42
  VIDEO_FORMAT = "best[ext=mp4]/best"
43
43
  DOWNLOAD_RETRIES = 3
44
- MAX_VIDEOS_PER_TASK = 100 # 태스크별 최대 영상 저장 수
44
+ MAX_VIDEOS_PER_TASK = 1000 # 태스크별 최대 영상 저장 수 (CLI -n 옵션으로 덮어쓰기 가능)
45
45
 
46
46
  # Fast Mode Settings (Parallel)
47
47
  MAX_WORKERS = 4 # 병렬 작업 프로세스 수
@@ -247,31 +247,34 @@ def parse_txt_line(line: str) -> Optional[Dict]:
247
247
  return None
248
248
 
249
249
 
250
- def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None) -> list:
250
+ def download_from_txt(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
251
251
  """TXT 파일에서 다운로드 실행 (순차)"""
252
252
  # 기존 로직을 process_single_item 함수로 분리하여 재사용할 수 있으면 좋겠지만,
253
253
  # 코드 구조상 일단 순차 실행 유지하고 parallel 함수 별도 구현
254
- return _process_download_loop(txt_path, task_type, base_dir, parallel=False)
254
+ return _process_download_loop(txt_path, task_type, base_dir, parallel=False, max_count=max_count)
255
255
 
256
256
 
257
- def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None) -> list:
257
+ def download_from_txt_parallel(txt_path: Path, task_type: str, base_dir: Path = None, max_count: int = None) -> list:
258
258
  """TXT 파일에서 병렬 다운로드 실행 (Fast Mode)"""
259
- return _process_download_loop(txt_path, task_type, base_dir, parallel=True)
259
+ return _process_download_loop(txt_path, task_type, base_dir, parallel=True, max_count=max_count)
260
260
 
261
261
 
262
- def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False) -> list:
262
+ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None, parallel: bool = False, max_count: int = None) -> list:
263
263
  from .config import MAX_VIDEOS_PER_TASK, MAX_WORKERS, REQUEST_DELAY_MIN, REQUEST_DELAY_MAX
264
264
  import time
265
265
  import random
266
266
  from concurrent.futures import ThreadPoolExecutor, as_completed
267
267
 
268
+ # max_count가 없으면 config의 기본값 사용
269
+ limit = max_count if max_count is not None else MAX_VIDEOS_PER_TASK
270
+
268
271
  results = []
269
272
  downloader = VideoDownloader(task_type, base_dir)
270
273
 
271
274
  # 시작 전 개수 확인
272
275
  initial_count = downloader.get_saved_video_count()
273
- if initial_count >= MAX_VIDEOS_PER_TASK:
274
- logger.warning(f"Task '{task_type}' already has {initial_count} videos. Skipping.")
276
+ if initial_count >= limit:
277
+ logger.warning(f"Task '{task_type}' already has {initial_count} videos (Limit: {limit}). Skipping.")
275
278
  return results
276
279
 
277
280
  if not txt_path.exists():
@@ -291,14 +294,26 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
291
294
  if not items:
292
295
  return results
293
296
 
294
- logger.info(f"Found {len(items)} URLs. Starting {'parallel' if parallel else 'sequential'} download...")
297
+ logger.info(f"Found {len(items)} URLs. Target: {limit} videos (Current: {initial_count}). Starting {'parallel' if parallel else 'sequential'} download...")
295
298
 
296
299
  def process_item(data):
300
+ # 현재 개수 체크 (루프 도중 목표 달성 시 중단 위함)
301
+ # 주의: 병렬 처리 시 정확한 count 동기화는 Lock이 필요하지만, 여기선 대략적인 체크로 충분
302
+ current = downloader.get_saved_video_count()
303
+ if current >= limit:
304
+ raise LimitReachedError("Target count reached")
305
+
297
306
  # 방화벽 우회용 랜덤 딜레이 (병렬 모드에서도 적용하여 동시 요청 폭주 완화)
298
307
  if parallel:
299
308
  time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))
300
309
 
301
310
  try:
311
+ # VideoDownloader 내부의 limit 체크는 config 값을 쓰므로,
312
+ # 여기서는 외부에서 주입된 limit을 강제할 방법이 필요하거나,
313
+ # 단순히 루프 레벨에서 제어하면 됨.
314
+ # download_clip_at_timestamp 메서드는 내부적으로 MAX_VIDEOS_PER_TASK를 체크하므로,
315
+ # 이를 우회하거나 단순 루프 제어로 처리.
316
+
302
317
  output_path, metadata = downloader.download_clip_at_timestamp(
303
318
  url=data['url'],
304
319
  timestamp_min=data['timestamp_min'],
@@ -329,8 +344,7 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
329
344
  }
330
345
 
331
346
  except LimitReachedError:
332
- # 병렬 실행 중에는 이 예외 처리 방식이 조금 다를 수 있음 (다른 스레드 멈추게 하려면 Event 사용 등)
333
- # 여기서는 개별 스레드 종료로 처리
347
+ # 내부에서 발생한 LimitReachedError도 처리
334
348
  return {'success': False, 'error': 'Limit reached', 'status': 'limit_reached'}
335
349
 
336
350
  except Exception as e:
@@ -350,18 +364,27 @@ def _process_download_loop(txt_path: Path, task_type: str, base_dir: Path = None
350
364
  # 진행 상황 표시
351
365
  from tqdm import tqdm
352
366
  for future in tqdm(as_completed(futures), total=len(items), desc="Fast Download"):
353
- res = future.result()
354
- results.append(res)
355
- if res.get('status') == 'limit_reached':
356
- logger.warning("Download limit reached. Stopping remaining tasks.")
357
- executor.shutdown(wait=False, cancel_futures=True)
358
- break
367
+ try:
368
+ res = future.result()
369
+ results.append(res)
370
+ if res.get('status') == 'limit_reached' or downloader.get_saved_video_count() >= limit:
371
+ logger.warning(f"Download limit ({limit}) reached. Stopping remaining tasks.")
372
+ executor.shutdown(wait=False, cancel_futures=True)
373
+ break
374
+ except Exception:
375
+ continue
359
376
  else:
360
377
  # 순차 실행
361
378
  for item in items:
379
+ # 루프 시작 전 체크
380
+ if downloader.get_saved_video_count() >= limit:
381
+ logger.info(f"Target count ({limit}) reached. Stopping.")
382
+ break
383
+
362
384
  res = process_item(item)
363
385
  results.append(res)
364
386
  if res.get('status') == 'limit_reached':
387
+ logger.info(f"Target count ({limit}) reached. Stopping.")
365
388
  break
366
389
 
367
390
  return results
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ytcollector
3
- Version: 1.0.3
3
+ Version: 1.0.5
4
4
  Summary: SBS 데이터셋 수집기
5
5
  Requires-Python: >=3.9
6
6
  Description-Content-Type: text/markdown
File without changes
File without changes