ytminer-client 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ytminer_client-0.2.0/PKG-INFO +8 -0
- ytminer_client-0.2.0/pyproject.toml +20 -0
- ytminer_client-0.2.0/setup.cfg +4 -0
- ytminer_client-0.2.0/ytminer_client/__init__.py +1 -0
- ytminer_client-0.2.0/ytminer_client/cli.py +456 -0
- ytminer_client-0.2.0/ytminer_client/downloader.py +274 -0
- ytminer_client-0.2.0/ytminer_client.egg-info/PKG-INFO +8 -0
- ytminer_client-0.2.0/ytminer_client.egg-info/SOURCES.txt +10 -0
- ytminer_client-0.2.0/ytminer_client.egg-info/dependency_links.txt +1 -0
- ytminer_client-0.2.0/ytminer_client.egg-info/entry_points.txt +2 -0
- ytminer_client-0.2.0/ytminer_client.egg-info/requires.txt +3 -0
- ytminer_client-0.2.0/ytminer_client.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ytminer-client"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Distributed YouTube video downloader client"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"click>=8.0",
|
|
12
|
+
"httpx>=0.27.0",
|
|
13
|
+
"yt-dlp>=2025.0",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.scripts]
|
|
17
|
+
ytminer-download = "ytminer_client.cli:main"
|
|
18
|
+
|
|
19
|
+
[tool.setuptools.packages.find]
|
|
20
|
+
include = ["ytminer_client*"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
"""CLI for distributed YouTube video downloading."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import platform
|
|
8
|
+
import sys
|
|
9
|
+
import time
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import click
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
from ytminer_client import __version__
|
|
16
|
+
from ytminer_client.downloader import CookieManager, DownloadResult, RateLimiter, download_video
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("ytminer-client")
|
|
19
|
+
|
|
20
|
+
COOLDOWN_STEPS = [30 * 60, 60 * 60, 2 * 3600] # 30m, 1h, 2h (then 2h forever)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def setup_logging(verbose: bool):
    """Configure root logging: DEBUG when *verbose*, otherwise INFO."""
    if verbose:
        chosen_level = logging.DEBUG
    else:
        chosen_level = logging.INFO
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S",
        level=chosen_level,
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string, e.g. '1.5MB'."""
    value = float(size_bytes)
    units = ("B", "KB", "MB", "GB")
    index = 0
    while index < len(units):
        if value < 1024:
            return f"{value:.1f}{units[index]}"
        value /= 1024
        index += 1
    # Anything past GB is reported in terabytes.
    return f"{value:.1f}TB"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ─── Server Communication ──────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ServerClient:
    """Blocking HTTP client for the ytminer coordination server."""

    def __init__(self, server_url: str, worker_name: str):
        self.server_url = server_url.rstrip("/")  # normalize trailing slash
        self.worker_name = worker_name
        self.http = httpx.Client(timeout=30)

    def fetch_batch(self, channel: str | None = None, batch_size: int = 50) -> dict | None:
        """Ask the server for a batch of work; None when the queue is empty."""
        query = {"worker": self.worker_name, "batch_size": batch_size}
        if channel:
            query["channel"] = channel
        response = self.http.get(f"{self.server_url}/batch", params=query)
        response.raise_for_status()
        payload = response.json()
        if payload.get("video_ids"):
            return payload
        return None

    def report_batch(
        self,
        batch_id: str,
        results: dict[str, str],
        errors: dict[str, str],
        channel: str | None = None,
    ) -> dict:
        """POST per-video results back; the server's JSON reply is returned."""
        body = {
            "batch_id": batch_id,
            "worker": self.worker_name,
            "results": results,
            "errors": errors,
            "channel": channel,
        }
        response = self.http.post(f"{self.server_url}/report", json=body)
        response.raise_for_status()
        return response.json()

    def check_version(self) -> str | None:
        """Check if a newer client version is available. Returns new version or None."""
        try:
            response = self.http.get(f"{self.server_url}/version", timeout=5)
            response.raise_for_status()
            advertised = response.json().get("client_version", "")
        except Exception:
            # Version check is best-effort; never let it interrupt work.
            return None
        if advertised and advertised != __version__:
            return advertised
        return None

    def close(self):
        """Release the underlying HTTP connection pool."""
        self.http.close()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ─── Uploader ─────────────────────────────────────────────────
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class Uploader:
    """Background uploader that pushes finished files back to the server.

    Videos are queued with enqueue() and drained by a single asyncio task,
    so uploads overlap with subsequent downloads. When disabled, every
    method is a cheap no-op.
    """

    def __init__(self, server_url: str, worker_name: str, enabled: bool = False, cleanup: bool = True):
        self.server_url = server_url.rstrip("/")
        self.worker_name = worker_name
        self.enabled = enabled
        self.cleanup = cleanup  # delete local files after a successful upload
        self._http: httpx.AsyncClient | None = None
        self._queue: asyncio.Queue | None = None
        self._task: asyncio.Task | None = None
        self.total_uploaded = 0  # videos fully uploaded this session
        self.total_bytes = 0     # payload bytes successfully sent
        self.pending = 0         # items queued but not yet processed

    async def start(self):
        """Create the HTTP client and spawn the queue-draining task."""
        if self.enabled:
            self._http = httpx.AsyncClient(timeout=httpx.Timeout(300.0, connect=10.0))
            self._queue = asyncio.Queue()
            self._task = asyncio.create_task(self._worker())

    async def close(self):
        """Flush pending uploads, then shut down the worker and HTTP client."""
        if self._queue and self._task:
            await self._queue.join()  # wait for pending uploads to finish
            self._task.cancel()
            # BUGFIX: await the cancelled task so it is not destroyed while
            # still pending (which triggers asyncio warnings and can skip
            # cleanup inside the task).
            try:
                await self._task
            except asyncio.CancelledError:
                pass
        if self._http:
            await self._http.aclose()

    def enqueue(self, video_id: str, channel: str, channel_dir: Path):
        """Queue a video for background upload. Non-blocking."""
        if self.enabled and self._queue is not None:
            self._queue.put_nowait((video_id, channel, channel_dir))
            self.pending += 1

    async def _worker(self):
        """Background task that drains the upload queue."""
        while True:
            # get() sits outside the per-item error handling so a failure in
            # _upload_video can never produce a task_done() call that has no
            # matching successful get() (which would raise ValueError).
            try:
                video_id, channel, channel_dir = await self._queue.get()
            except asyncio.CancelledError:
                break
            try:
                await self._upload_video(video_id, channel, channel_dir)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Upload worker error: {e}")
            finally:
                # Always balance the get() so queue.join() can complete.
                self._queue.task_done()
                self.pending -= 1

    async def _upload_video(self, video_id: str, channel: str, channel_dir: Path):
        """Upload the video file and its metadata JSON, then optionally delete them."""
        video_path = channel_dir / f"{video_id}.mp4"
        info_path = channel_dir / f"{video_id}.info.json"

        success = True
        for path in [video_path, info_path]:
            if not path.exists():
                continue  # tolerate partially-written results
            ok = await self._upload_file(channel, video_id, path)
            if not ok:
                success = False

        # Only delete local copies once every existing file made it up.
        if success and self.cleanup:
            for path in [video_path, info_path]:
                if path.exists():
                    path.unlink()

        if success:
            self.total_uploaded += 1

    async def _upload_file(self, channel: str, video_id: str, file_path: Path) -> bool:
        """POST one file to the server with exponential-backoff retries.

        Returns True on success, False after 5 failed attempts.
        """
        url = f"{self.server_url}/upload/{channel}/{video_id}"
        wait = 30
        for attempt in range(5):
            try:
                # Whole file is read into memory; acceptable for typical
                # video sizes here, but a streaming body would scale better.
                data = file_path.read_bytes()
                file_size = len(data)
                resp = await self._http.post(
                    url,
                    content=data,
                    params={"worker": self.worker_name, "filename": file_path.name},
                    headers={"Content-Type": "application/octet-stream"},
                )
                resp.raise_for_status()
                self.total_bytes += file_size
                return True
            except Exception as e:
                logger.warning(f"Upload failed for {file_path.name} (attempt {attempt+1}): {e}")
                await asyncio.sleep(wait)
                wait = min(wait * 2, 300)
        return False
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ─── Cooldown ──────────────────────────────────────────────────
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
async def bot_cooldown(
    cookie_manager: CookieManager,
    output_dir: Path,
    cooldown_step: int,
) -> bool:
    """Progressive cooldown on bot detection. Returns True if recovered."""
    # Clamp to the last (longest) step once the ladder is exhausted.
    step_index = min(cooldown_step, len(COOLDOWN_STEPS) - 1)
    wait_secs = COOLDOWN_STEPS[step_index]

    click.echo(f" Bot detected! Cooling down for {wait_secs // 60}min...")
    await asyncio.sleep(wait_secs)

    # Probe with a well-known public video to see whether the block lifted.
    click.echo(" Testing if block lifted...")
    probe = await download_video(
        "dQw4w9WgXcQ", output_dir, cookie_manager, timeout=30,
    )

    if probe.error_category == "bot_blocked":
        click.echo(" Still blocked.")
        return False

    click.echo(" Block lifted! Resuming downloads.")
    return True
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ─── Network Retry Helpers ─────────────────────────────────────
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
async def fetch_with_retry(server: ServerClient, channel: str | None, batch_size: int) -> dict | None:
    """Fetch a batch, retrying forever on network errors."""
    backoff = 30
    while True:
        try:
            return server.fetch_batch(channel=channel, batch_size=batch_size)
        except Exception as e:
            click.echo(f" Network error fetching batch: {e}. Retrying in {backoff}s...")
            await asyncio.sleep(backoff)
            backoff = min(backoff * 2, 600)  # cap at 10min
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
async def report_with_retry(
    server: ServerClient, batch_id: str, results: dict, errors: dict, channel: str | None,
) -> dict:
    """Report batch results, retrying forever on network errors."""
    backoff = 30
    while True:
        try:
            return server.report_batch(batch_id, results, errors, channel=channel)
        except Exception as e:
            click.echo(f" Network error reporting batch: {e}. Retrying in {backoff}s...")
            await asyncio.sleep(backoff)
            backoff = min(backoff * 2, 600)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# ─── Main Download Loop ─────────────────────────────────────────
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
async def download_loop(
    server: ServerClient,
    output_dir: Path,
    cookie_manager: CookieManager,
    rate_limiter: RateLimiter,
    uploader: Uploader,
    channel: str | None,
    batch_size: int,
    update_check_interval: int = 5,
):
    """Main loop: fetch batch → download → upload → report → repeat.

    Args:
        server: Client used to fetch batches and report results.
        output_dir: Root directory; videos land in a per-channel subdirectory.
        cookie_manager: Shared cookie escalation state across downloads.
        rate_limiter: Delay/backoff applied between individual downloads.
        uploader: Background uploader; no-op when its ``enabled`` flag is off.
        channel: If set, only batches for this channel are requested.
        batch_size: Number of video ids requested per batch.
        update_check_interval: Check for a newer client every N batches.

    Runs until the server has no more batches to hand out.
    """
    batch_count = 0
    session_ok = 0
    session_failed = 0
    session_skipped = 0
    session_start = time.monotonic()
    # Consecutive "bot_blocked" failures; reaching 3 triggers a cooldown.
    consecutive_bot = 0

    # One-time async browser detection (non-blocking)
    await cookie_manager.warmup()

    # Start uploader if enabled
    await uploader.start()

    click.echo(f"Worker: {server.worker_name}")
    click.echo(f"Server: {server.server_url}")
    click.echo(f"Output: {output_dir}")
    click.echo(f"Cookie mode: {cookie_manager.mode}")
    if uploader.enabled:
        click.echo(f"Upload: enabled (cleanup: {uploader.cleanup})")
    click.echo()

    # First batch
    batch = await fetch_with_retry(server, channel, batch_size)

    while batch:
        batch_id = batch["batch_id"]
        video_ids = batch["video_ids"]
        batch_channel = batch["channel"]
        batch_count += 1

        # Ensure channel output dir exists
        channel_dir = output_dir / batch_channel
        channel_dir.mkdir(parents=True, exist_ok=True)

        click.echo(f"--- Batch {batch_count} ({batch_channel}) | {len(video_ids)} videos ---")

        results: dict[str, str] = {}
        errors: dict[str, str] = {}

        for i, video_id in enumerate(video_ids):
            # Rate limit (skip delay for skipped/permanent videos):
            # only sleep when the *previous* video actually hit YouTube.
            if i > 0 and not (results.get(video_ids[i - 1]) == "skipped"):
                await rate_limiter.wait()

            result = await download_video(video_id, channel_dir, cookie_manager)

            results[video_id] = result.status
            if result.error_category:
                errors[video_id] = result.error_category

            # Update session counters
            if result.status == "ok":
                session_ok += 1
                consecutive_bot = 0
                size_str = format_size(result.file_size)
                click.echo(f" [{i+1}/{len(video_ids)}] {video_id} OK {size_str} {result.elapsed:.1f}s")
                rate_limiter.report_success()

                # Queue for background upload
                if uploader.enabled:
                    uploader.enqueue(video_id, batch_channel, channel_dir)
                    click.echo(f" -> queued for upload ({uploader.pending} pending)")

            elif result.status == "skipped":
                session_skipped += 1
                consecutive_bot = 0
                # "skipped" with no category means the files already existed.
                reason = result.error_category or "exists"
                click.echo(f" [{i+1}/{len(video_ids)}] {video_id} SKIP ({reason})")
            else:
                session_failed += 1
                click.echo(f" [{i+1}/{len(video_ids)}] {video_id} FAIL ({result.error_category})")
                rate_limiter.report_error(result.error_category or "unknown")

                # Bot detection cooldown
                if result.error_category == "bot_blocked":
                    consecutive_bot += 1
                    if consecutive_bot >= 3:
                        # Sleep in progressively longer steps until a probe
                        # download succeeds again; blocks this worker entirely.
                        step = 0
                        while True:
                            recovered = await bot_cooldown(cookie_manager, channel_dir, step)
                            if recovered:
                                consecutive_bot = 0
                                break
                            step += 1
                else:
                    consecutive_bot = 0

        # Report and get next batch
        elapsed = time.monotonic() - session_start
        # Hourly rate is only meaningful after the first minute of work.
        rate = session_ok / (elapsed / 3600) if elapsed > 60 else 0
        click.echo(
            f" Batch done: {sum(1 for v in results.values() if v == 'ok')} ok, "
            f"{sum(1 for v in results.values() if v == 'failed')} failed, "
            f"{sum(1 for v in results.values() if v == 'skipped')} skipped"
        )
        click.echo(
            f" Session total: {session_ok} ok, {session_failed} failed, "
            f"{session_skipped} skipped | {rate:.0f}/hr"
        )

        resp = await report_with_retry(server, batch_id, results, errors, channel)
        # The server may piggyback the next batch on the report response.
        batch = resp.get("next_batch")
        if not batch:
            batch = await fetch_with_retry(server, channel, batch_size)

        # Periodic version check
        if batch_count % update_check_interval == 0:
            new_version = server.check_version()
            if new_version:
                click.echo(
                    f"\n Update available: v{__version__} -> v{new_version}\n"
                    f" Run: pip install --upgrade ytminer-client\n"
                )

    elapsed = time.monotonic() - session_start
    click.echo()
    click.echo(f"All done! {session_ok} downloaded, {session_failed} failed, {session_skipped} skipped")
    click.echo(f"Total time: {elapsed/3600:.1f}h")

    # Drains any still-pending uploads before returning.
    await uploader.close()
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# ─── CLI ─────────────────────────────────────────────────────────
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@click.command()
@click.option("--server", required=True, help="Server URL (e.g. http://localhost:8000)")
@click.option("--output", default="./videos", help="Output directory for downloaded videos")
@click.option("--worker-name", default=None, help="Worker name (default: hostname)")
@click.option("--channel", default=None, help="Only download this channel (e.g. @geonews)")
@click.option("--batch-size", default=50, help="Videos per batch (default: 50)")
@click.option("--delay", default=30.0, help="Base delay between downloads in seconds (default: 30)")
@click.option("--jitter", default=10.0, help="Random jitter added to delay in seconds (default: 10)")
@click.option("--cookies", default=None, help="Path to cookies.txt file")
@click.option("--cookies-from-browser", default=None, help="Browser to extract cookies from (e.g. chrome, firefox)")
@click.option("--upload", is_flag=True, help="Upload videos to server after download (for ephemeral envs like Colab)")
@click.option("--keep-files", is_flag=True, help="Keep local files after upload (default: delete after upload)")
@click.option("--verbose", is_flag=True, help="Enable debug logging")
def main(server, output, worker_name, channel, batch_size, delay, jitter, cookies, cookies_from_browser, upload, keep_files, verbose):
    """Download YouTube videos from a ytminer server."""
    setup_logging(verbose)

    # Default the worker name to this machine's hostname.
    if worker_name is None:
        worker_name = platform.node() or "anonymous"

    output_dir = Path(output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Cookie strategy starts at "none" and escalates on bot detection.
    cookie_manager = CookieManager(
        cookies_file=cookies,
        cookies_from_browser=cookies_from_browser,
    )
    rate_limiter = RateLimiter(base_delay=delay, jitter=jitter)
    uploader = Uploader(
        server_url=server,
        worker_name=worker_name,
        enabled=upload,
        cleanup=not keep_files,
    )

    srv = ServerClient(server, worker_name)

    # Quick connectivity check: fail fast with exit code 1 if unreachable.
    try:
        srv.http.get(f"{srv.server_url}/status", timeout=5).raise_for_status()
        click.echo(f"Connected to server: {server}")
    except Exception as e:
        click.echo(f"Cannot reach server at {server}: {e}", err=True)
        sys.exit(1)

    # Check for updates at startup (best-effort; never blocks work).
    new_version = srv.check_version()
    if new_version:
        click.echo(f"Update available: v{__version__} -> v{new_version}")
        click.echo(f"Run: pip install --upgrade ytminer-client")
        click.echo()

    try:
        asyncio.run(download_loop(
            server=srv,
            output_dir=output_dir,
            cookie_manager=cookie_manager,
            rate_limiter=rate_limiter,
            uploader=uploader,
            channel=channel,
            batch_size=batch_size,
        ))
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop a worker; exit cleanly.
        click.echo("\nInterrupted by user. Goodbye!")
    finally:
        # Always release the HTTP connection pool.
        srv.close()
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
# Script entry point: run the click CLI when executed directly
# (normally invoked via the `ytminer-download` console script).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""yt-dlp wrapper with cookie fallback and error classification."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import random
|
|
8
|
+
import shutil
|
|
9
|
+
import sys
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger("ytminer-client.downloader")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _find_yt_dlp() -> str:
|
|
19
|
+
"""Find yt-dlp binary in the same venv as this Python process."""
|
|
20
|
+
venv_bin = Path(sys.executable).parent / "yt-dlp"
|
|
21
|
+
if venv_bin.exists():
|
|
22
|
+
return str(venv_bin)
|
|
23
|
+
return shutil.which("yt-dlp") or "yt-dlp"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ─── Error Classification ──────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def classify_error(stderr: str) -> str:
    """Classify yt-dlp error output into a category.

    Categories, checked in priority order:
      "bot_blocked" — rate limiting / bot checks (worth retrying with cookies)
      "permanent"   — the video can never be downloaded (skip it)
      "network"     — transient connectivity problems (retry later)
      "unknown"     — anything else
    """
    text = stderr.lower()

    bot_markers = ("sign in to confirm", "bot", "429", "too many")
    permanent_markers = (
        "unavailable", "private", "removed", "deleted",
        "copyright", "geo restricted", "members only",
        "age restricted", "terminated", "not found",
        "has been removed", "is not available",
    )
    network_markers = ("timeout", "timed out", "connection", "network")

    for marker in bot_markers:
        if marker in text:
            return "bot_blocked"
    for marker in permanent_markers:
        if marker in text:
            return "permanent"
    for marker in network_markers:
        if marker in text:
            return "network"
    return "unknown"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ─── Rate Limiter ──────────────────────────────────────────────
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class RateLimiter:
    """Simple rate limiter with jitter and adaptive backoff.

    The pause between requests is base_delay plus random jitter; each
    transient failure doubles an extra exponential backoff term.
    """
    base_delay: float = 5.0
    jitter: float = 3.0
    consecutive_errors: int = 0
    max_backoff: float = 300.0  # 5 min cap

    async def wait(self):
        """Wait the appropriate amount of time before next request."""
        pause = self.base_delay + random.uniform(0, self.jitter)
        if self.consecutive_errors > 0:
            extra = min(2 ** self.consecutive_errors, self.max_backoff)
            logger.debug(f"Backoff: {extra:.0f}s (consecutive errors: {self.consecutive_errors})")
            pause += extra
        await asyncio.sleep(pause)

    def report_success(self):
        """Reset the error streak after a successful download."""
        self.consecutive_errors = 0

    def report_error(self, category: str):
        """Grow the backoff for transient error categories only."""
        if category in ("bot_blocked", "network"):
            self.consecutive_errors += 1
        # permanent errors don't affect backoff
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ─── Cookie Manager ────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
AUTO_BROWSERS = ["chrome", "firefox", "brave", "edge", "safari", "chromium"]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class CookieManager:
    """Manages cookie escalation strategy.

    Starts with no cookies, escalates on bot detection:
    none → explicit cookies file → cookies-from-browser (auto-detect)
    """
    # Path to an explicit cookies.txt supplied by the user, if any.
    cookies_file: str | None = None
    # Browser name the user explicitly asked to pull cookies from.
    cookies_from_browser: str | None = None
    # Current strategy: "none", "cookies_file", or "cookies_from_browser".
    _mode: str = "none"
    # True once all strategies are exhausted (warning logged only once).
    _escalated: bool = False
    # Browser auto-detected by warmup(), if any.
    _auto_browser: str | None = None
    # Whether warmup() has already attempted detection.
    _browser_detected: bool = False

    def get_args(self) -> list[str]:
        """Return yt-dlp cookie arguments for current mode."""
        if self._mode == "cookies_file" and self.cookies_file:
            return ["--cookies", self.cookies_file]
        elif self._mode == "cookies_from_browser":
            # Explicit user choice takes precedence over auto-detection.
            browser = self.cookies_from_browser or self._auto_browser
            if browser:
                return ["--cookies-from-browser", browser]
        # Mode "none" (or no usable browser): no cookie args at all.
        return []

    async def warmup(self):
        """Detect available browser cookies once at startup (non-blocking)."""
        if self.cookies_from_browser or self.cookies_file:
            return  # user provided explicit cookies, no need to auto-detect
        logger.info("Detecting browser cookies (one-time)...")
        self._auto_browser = await self._detect_browser_async()
        self._browser_detected = True
        if self._auto_browser:
            logger.info(f"Found browser cookies: {self._auto_browser}")
        else:
            logger.info("No browser cookies found (will download without cookies)")

    def escalate(self) -> bool:
        """Try next cookie strategy. Returns True if escalation happened.

        Order: explicit cookies file first, then an explicit browser,
        then the auto-detected browser. Falls through to a one-time
        warning when nothing is left to try.
        """
        if self._mode == "none":
            if self.cookies_file:
                self._mode = "cookies_file"
                logger.info(f"Escalating to cookies file: {self.cookies_file}")
                return True
            elif self.cookies_from_browser:
                self._mode = "cookies_from_browser"
                logger.info(f"Escalating to cookies from browser: {self.cookies_from_browser}")
                return True
            elif self._auto_browser:
                self._mode = "cookies_from_browser"
                logger.info(f"Escalating to auto-detected browser cookies: {self._auto_browser}")
                return True
        elif self._mode == "cookies_file":
            if self.cookies_from_browser:
                self._mode = "cookies_from_browser"
                logger.info(f"Escalating to cookies from browser: {self.cookies_from_browser}")
                return True
            elif self._auto_browser:
                self._mode = "cookies_from_browser"
                logger.info(f"Escalating to auto-detected browser cookies: {self._auto_browser}")
                return True
        # No strategy left (or already at cookies_from_browser): warn once.
        if not self._escalated:
            self._escalated = True
            logger.warning("Bot detection and no cookie strategies worked.")
        return False

    async def _detect_browser_async(self) -> str | None:
        """Try each browser to see if yt-dlp can extract cookies (non-blocking)."""
        yt_dlp_bin = _find_yt_dlp()

        for browser in AUTO_BROWSERS:
            try:
                # Probe with a metadata-only fetch against a known public video.
                proc = await asyncio.create_subprocess_exec(
                    yt_dlp_bin, "--cookies-from-browser", browser,
                    "--skip-download", "--no-warnings", "-q",
                    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )
                _, stderr_bytes = await asyncio.wait_for(proc.communicate(), timeout=15)
                stderr = stderr_bytes.decode(errors="replace").lower()
                # NOTE(review): substring heuristic — the bare "error" check is
                # broad and may reject a browser whose cookies actually work.
                if "no supported browser" not in stderr and "could not find" not in stderr and "error" not in stderr:
                    return browser
            except Exception:
                # Timeout or spawn failure: just try the next browser.
                continue
        return None

    @property
    def mode(self) -> str:
        # Read-only view of the current escalation mode.
        return self._mode
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# ─── Downloader ─────────────────────────────────────────────────
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@dataclass
class DownloadResult:
    """Outcome of a single yt-dlp download attempt."""
    video_id: str
    status: str  # "ok", "failed", "skipped"
    error_category: str | None = None  # set when the attempt hit an error
    file_size: int = 0    # bytes of the finished video file on disk
    elapsed: float = 0.0  # wall-clock seconds spent on the attempt
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
async def download_video(
    video_id: str,
    output_dir: Path,
    cookie_manager: CookieManager,
    timeout: int = 600,
) -> DownloadResult:
    """Download a video + metadata with yt-dlp.

    Args:
        video_id: YouTube video id.
        output_dir: Directory where <id>.mp4 and <id>.info.json are written.
        cookie_manager: Supplies cookie args; escalated on bot detection.
        timeout: Max seconds to wait for the yt-dlp subprocess.

    Returns DownloadResult with status and error info.
    """
    video_path = output_dir / f"{video_id}.mp4"
    info_path = output_dir / f"{video_id}.info.json"

    # Skip if both files already exist
    if video_path.exists() and info_path.exists():
        return DownloadResult(video_id=video_id, status="skipped")

    t0 = time.monotonic()

    yt_dlp_bin = _find_yt_dlp()

    # Prefer mp4 video + m4a audio, falling back to any mp4, then anything.
    cmd = [
        yt_dlp_bin,
        "-f", "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/b",
        "--merge-output-format", "mp4",
        "-o", str(video_path),
        "--write-info-json",
        "--no-overwrites",
        "--no-playlist",
        "--socket-timeout", "30",
        "--retries", "2",
        "--no-warnings",
    ]
    cmd.extend(cookie_manager.get_args())
    cmd.append(f"https://www.youtube.com/watch?v={video_id}")

    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        elapsed = time.monotonic() - t0

        if proc.returncode == 0:
            size = video_path.stat().st_size if video_path.exists() else 0
            return DownloadResult(
                video_id=video_id, status="ok",
                file_size=size, elapsed=elapsed,
            )

        err_text = stderr.decode(errors="replace").strip()
        logger.warning(f"yt-dlp failed for {video_id}: {err_text}")
        category = classify_error(err_text)

        # Bot detection — try cookie escalation
        if category == "bot_blocked":
            escalated = cookie_manager.escalate()
            if escalated:
                # Retry once with new cookie strategy
                # NOTE(review): this recurses, so one retry happens per
                # successful escalation step rather than once overall.
                return await download_video(video_id, output_dir, cookie_manager, timeout)

        # Permanent errors are "skipped" (don't retry)
        if category == "permanent":
            return DownloadResult(
                video_id=video_id, status="skipped",
                error_category=category, elapsed=elapsed,
            )

        return DownloadResult(
            video_id=video_id, status="failed",
            error_category=category, elapsed=elapsed,
        )

    except asyncio.TimeoutError:
        # wait_for expired; note the yt-dlp subprocess may still be running.
        elapsed = time.monotonic() - t0
        return DownloadResult(
            video_id=video_id, status="failed",
            error_category="timeout", elapsed=elapsed,
        )
    except Exception as e:
        elapsed = time.monotonic() - t0
        logger.error(f"Unexpected error downloading {video_id}: {e}")
        return DownloadResult(
            video_id=video_id, status="failed",
            error_category="unknown", elapsed=elapsed,
        )
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
ytminer_client/__init__.py
|
|
3
|
+
ytminer_client/cli.py
|
|
4
|
+
ytminer_client/downloader.py
|
|
5
|
+
ytminer_client.egg-info/PKG-INFO
|
|
6
|
+
ytminer_client.egg-info/SOURCES.txt
|
|
7
|
+
ytminer_client.egg-info/dependency_links.txt
|
|
8
|
+
ytminer_client.egg-info/entry_points.txt
|
|
9
|
+
ytminer_client.egg-info/requires.txt
|
|
10
|
+
ytminer_client.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ytminer_client
|