java-extention 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jav_dl_tmux/__init__.py +4 -0
- jav_dl_tmux/__main__.py +264 -0
- jav_dl_tmux/downloader.py +477 -0
- jav_dl_tmux/scraper.py +437 -0
- java_extention-1.1.1.dist-info/METADATA +10 -0
- java_extention-1.1.1.dist-info/RECORD +8 -0
- java_extention-1.1.1.dist-info/WHEEL +4 -0
- java_extention-1.1.1.dist-info/entry_points.txt +2 -0
jav_dl_tmux/__init__.py
ADDED
jav_dl_tmux/__main__.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
jav-dl-tmux — MissAV search + threaded HLS/MP4 downloader.
|
|
4
|
+
No browser. No cookies. No setup. armv7l / Termux safe.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
jav-dl-tmux # interactive menu
|
|
8
|
+
jav-dl-tmux "STARS-123" # search immediately
|
|
9
|
+
jav-dl-tmux "STARS" --recent # recent uploads
|
|
10
|
+
jav-dl-tmux "STARS-123" -r 720 # prefer 720p
|
|
11
|
+
jav-dl-tmux "STARS-123" -t 4 # 4-thread download (default, safe on 2 GB RAM)
|
|
12
|
+
jav-dl-tmux "STARS-123" -o ~/dl # custom output dir
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
import platform
|
|
19
|
+
import argparse
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from rich.console import Console
|
|
23
|
+
from rich.panel import Panel
|
|
24
|
+
from rich.prompt import Prompt, IntPrompt
|
|
25
|
+
from rich.table import Table
|
|
26
|
+
from rich.live import Live
|
|
27
|
+
from rich import box
|
|
28
|
+
|
|
29
|
+
from .scraper import MissAV
|
|
30
|
+
from .downloader import download_video, sanitize_filename, _HAS_CFFI
|
|
31
|
+
|
|
32
|
+
# Shared Rich console used for every message this CLI prints.
console = Console()

# Default output directory: "downloads" under the user's home directory.
DOWNLOAD_DIR = Path("~/downloads").expanduser()

# Lower-cased machine type as reported by the platform module.
_ARCH = platform.machine().lower()

# 32-bit machine types for which the banner calls out the architecture.
_RAM_CONSTRAINED = _ARCH in {"armv7l", "armv6l", "i686", "i386"}

_DEFAULT_THREADS = 4 # safe on 2 GB RAM — raise to 8/16 on beefier machines
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ─────────────────────────────────────────────────────────────────
|
|
41
|
+
# Banner
|
|
42
|
+
# ─────────────────────────────────────────────────────────────────
|
|
43
|
+
|
|
44
|
+
def banner() -> None:
    """Print the start-up panel: tool name/version, HTTP backend, thread default."""
    if _HAS_CFFI:
        backend = "curl_cffi (Chrome TLS)"
    else:
        backend = "urllib3 (fallback)"

    # The architecture is only mentioned on the RAM-constrained machine types.
    arch_note = ""
    if _RAM_CONSTRAINED:
        arch_note = f" [dim]arch:[/dim] {_ARCH}"

    body = (
        f"[bold red]JAV Downloader[/bold red] [dim]v1.1.1[/dim]\n"
        f"[dim]backend:[/dim] {backend} "
        f"[dim]threads:[/dim] {_DEFAULT_THREADS}{arch_note}"
    )
    console.print(Panel.fit(body, border_style="red"))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ─────────────────────────────────────────────────────────────────
|
|
56
|
+
# Listing helpers
|
|
57
|
+
# ─────────────────────────────────────────────────────────────────
|
|
58
|
+
|
|
59
|
+
def show_results(results, title: str = "Results"):
    """Print *results* as a numbered table and hand the same list back.

    Args:
        results: Sequence of objects exposing ``code``, ``title`` and ``date``.
        title: Caption shown above the table (rendered as Rich markup, as before).

    Returns:
        ``results`` unchanged, or ``None`` after printing "No results." when
        the sequence is empty or falsy.
    """
    # Local import, only this function needs it; rich is already a dependency.
    from rich.markup import escape

    if not results:
        console.print("[red]No results.[/red]")
        return None

    table = Table(
        show_header=True, header_style="bold cyan",
        box=box.SIMPLE_HEAVY, show_lines=False, title=title,
    )
    table.add_column("#", style="dim", width=4)
    table.add_column("Code", style="bold yellow", width=14)
    table.add_column("Title", min_width=28)
    table.add_column("Date", width=12, style="dim")

    for position, video in enumerate(results, 1):
        # Cell strings are parsed as console markup. Scraped text often holds
        # "[...]" which would otherwise be eaten as a style tag (or raise on a
        # stray closing tag), so escape it. Truncate first so the cut can
        # never split an escape sequence.
        table.add_row(
            str(position),
            escape(str(video.code or "-")),
            escape(str(video.title or "")[:60]),
            escape(str(video.date or "-")),
        )
    console.print(table)
    return results
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def ask_index(n: int, prompt: str = "Select", default: int = 1) -> int:
    """Ask for a number between 1 and *n* and return it as a 0-based index."""
    allowed = list(map(str, range(1, n + 1)))
    answer = IntPrompt.ask(f"[bold]{prompt}[/bold]", default=default, choices=allowed)
    # The prompt is 1-based for humans; callers index lists with the result.
    return answer - 1
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def ask_resolution(variants: list[dict]) -> dict:
    """List the stream variants and return the one the user selects.

    Each variant must carry a ``"resolution"`` key; ``"bandwidth"`` is
    optional and shown in Mbps when it is an int. Returns ``{}`` when
    *variants* is empty.
    """
    if not variants:
        return {}

    listing = Table(
        title="Available resolutions", box=box.SIMPLE,
        show_header=True, header_style="bold magenta",
    )
    listing.add_column("#", width=4)
    listing.add_column("Resolution", width=12)
    listing.add_column("Bandwidth", width=14)

    for number, variant in enumerate(variants, start=1):
        bandwidth = variant.get("bandwidth", "?")
        if isinstance(bandwidth, int):
            shown = f"{bandwidth / 1_000_000:.1f} Mbps"
        else:
            shown = str(bandwidth)
        listing.add_row(str(number), variant["resolution"], shown)
    console.print(listing)

    allowed = [str(number) for number in range(1, len(variants) + 1)]
    answer = IntPrompt.ask("Choose resolution", default=1, choices=allowed)
    return variants[answer - 1]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ─────────────────────────────────────────────────────────────────
|
|
107
|
+
# Progress bar renderer
|
|
108
|
+
# ─────────────────────────────────────────────────────────────────
|
|
109
|
+
|
|
110
|
+
def _render_bar(state: dict) -> str:
|
|
111
|
+
d, t, s = state["done"], state["total"], state["speed"]
|
|
112
|
+
if not t:
|
|
113
|
+
pct, bar = 0, ""
|
|
114
|
+
else:
|
|
115
|
+
pct = min(100, int(d * 100 / t))
|
|
116
|
+
bar = "█" * (pct // 4) + "░" * (25 - pct // 4)
|
|
117
|
+
speed_str = (
|
|
118
|
+
f"{s / 1_000_000:.2f} MB/s" if s >= 1_000_000
|
|
119
|
+
else f"{s / 1_000:.0f} KB/s" if s >= 1_000
|
|
120
|
+
else f"{s:.0f} B/s"
|
|
121
|
+
)
|
|
122
|
+
return (
|
|
123
|
+
f"[cyan]{bar}[/cyan] {pct:3d}% "
|
|
124
|
+
f"[dim]{d / 1_048_576:6.1f} / {t / 1_048_576:6.1f} MB "
|
|
125
|
+
f"{speed_str}[/dim]"
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ─────────────────────────────────────────────────────────────────
|
|
130
|
+
# Core run
|
|
131
|
+
# ─────────────────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
def _pick_from(results, title: str):
    """Show *results* under *title* and return the entry the user picks.

    Returns ``None`` when there is nothing to choose from; ``show_results``
    has already printed the "No results." notice in that case.
    """
    listed = show_results(results, title) or []
    if not listed:
        return None
    return listed[ask_index(len(listed), "Pick", 1)]


def _select_video(api, args: argparse.Namespace):
    """Turn the CLI arguments (or interactive answers) into one search hit.

    ``--recent`` wins over a query; with neither, the user is asked what to do.
    Returns the chosen result object, or ``None`` when nothing was found.
    """
    from rich.markup import escape  # user text must not be parsed as markup

    if args.recent:
        results = api.recent() or []
        return _pick_from(results, f"Recent uploads ({len(results)})")

    if args.query:
        console.print(f"\n[bold]Searching:[/bold] {escape(args.query)}")
        results = show_results(
            api.search(args.query), f"Search '{escape(args.query)}'"
        ) or []
        if not results:
            return None
        # auto-pick when query is an exact code and only one result
        if len(results) == 1 and re.match(r'^[A-Z]{2,5}-\d{2,5}$', args.query.upper()):
            only = results[0]
            console.print(
                f"[green]Auto-selected:[/green] "
                f"{escape(str(only.code))} — {escape(str(only.title))}"
            )
            return only
        return results[ask_index(len(results), "Pick", 1)]

    # fully interactive
    mode = Prompt.ask("[bold]What do you want?[/bold]",
                      choices=["search", "recent"], default="search")
    if mode == "search":
        q = Prompt.ask("[bold]Search query[/bold]")
        return _pick_from(api.search(q), f"Search '{escape(q)}'")
    return _pick_from(api.recent(), "Recent uploads")


def run(args: argparse.Namespace) -> None:
    """Run one session: pick a video, resolve its stream, download it.

    Args:
        args: Parsed CLI namespace with ``recent``, ``query``, ``resolution``,
            ``threads`` and ``output``.

    Download failures are reported on the console and end the session
    without raising; other exceptions propagate to the caller.
    """
    from rich.markup import escape  # remote/user text must not be parsed as markup

    banner()
    api = MissAV()

    # ── 1. Pick something to download ────────────────────────────
    selected = _select_video(api, args)
    if selected is None:
        return

    # ── 2. Fetch full video page (m3u8) ──────────────────────────
    console.print(f"\n[dim]Fetching video page:[/dim] {escape(str(selected.url))}")
    full = api.video(selected.slug)
    m3u8 = getattr(full, "m3u8", "")
    if not m3u8:
        # Fall back to the search hit's code so this message cannot itself
        # fail if the page object lacks a ``code`` attribute.
        code = getattr(full, "code", None) or selected.code
        console.print(f"[red]No m3u8 stream found for {escape(str(code))}.[/red]")
        return
    shown_stream = f"{m3u8[:72]}…" if len(m3u8) > 72 else m3u8
    console.print(f"[dim]Stream:[/dim] {escape(shown_stream)}")

    # ── 3. Resolution picker ──────────────────────────────────────
    chosen_res = args.resolution
    if not chosen_res:
        variants = api.resolutions(m3u8)
        if variants:
            pick = ask_resolution(variants)
            resolution = pick["resolution"]
            # "1920x1080" -> "1080"; anything without an "x" means no preference.
            chosen_res = resolution.split("x")[-1] if "x" in resolution else None
            if isinstance(pick.get("bandwidth"), int):
                console.print(
                    f"[green]Chosen:[/green] {resolution} "
                    f"@ {pick['bandwidth'] / 1_000_000:.1f} Mbps"
                )

    # ── 4. Download ───────────────────────────────────────────────
    out_dir = Path(args.output).expanduser().resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    state: dict = {"done": 0, "total": 1, "speed": 0.0}

    with Live(_render_bar(state), refresh_per_second=8,
              console=console, transient=True) as live:
        def prog(d: int, t: int, s: float) -> None:
            state.update(done=d, total=t, speed=s)
            live.update(_render_bar(state))

        try:
            out = download_video(
                full, out_dir,
                threads=args.threads,
                preferred_resolution=str(chosen_res) if chosen_res else None,
                progress=prog,
            )
        except Exception as e:
            console.print(f"[bold red]Download error:[/bold red] {escape(str(e))}")
            return

    sz_mb = out.stat().st_size / 1_048_576
    console.print(Panel.fit(
        f"[bold green]Done![/bold green]\n"
        f"[dim]Saved:[/dim] [cyan]{escape(str(out))}[/cyan]\n"
        f"[dim]Size:[/dim] {sz_mb:.1f} MB",
        border_style="green",
    ))
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# ─────────────────────────────────────────────────────────────────
|
|
232
|
+
# Entry
|
|
233
|
+
# ─────────────────────────────────────────────────────────────────
|
|
234
|
+
|
|
235
|
+
def _positive_int(text: str) -> int:
    """argparse ``type=`` converter that accepts only integers >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be at least 1, got {value}")
    return value


def main() -> None:
    """Console entry point: parse the command line and run one session.

    Exits with status 130 when interrupted with Ctrl-C and 1 on any other
    uncaught error, so shell callers can tell an aborted or failed run from
    a successful one.
    """
    from rich.markup import escape  # error text must not be parsed as markup

    parser = argparse.ArgumentParser(
        description="MissAV downloader — no browser, no cookies, armv7l safe",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("query", nargs="?",
                        help="Search query / JAV code (e.g. STARS-123)")
    parser.add_argument("--recent", action="store_true",
                        help="Show recent uploads instead of searching")
    parser.add_argument("-r", "--resolution",
                        help="Preferred resolution height (1080, 720, 480 …)")
    # Reject 0/negative counts here with a usage error instead of letting the
    # thread pool fail later, in the middle of a download.
    parser.add_argument("-t", "--threads", type=_positive_int, default=_DEFAULT_THREADS,
                        help=f"Download threads (default {_DEFAULT_THREADS}, safe on 2 GB RAM)")
    parser.add_argument("-o", "--output", default=str(DOWNLOAD_DIR),
                        help=f"Output directory (default {DOWNLOAD_DIR})")
    args = parser.parse_args()

    try:
        run(args)
    except KeyboardInterrupt:
        console.print("\n[yellow]Aborted.[/yellow]")
        sys.exit(130)  # 128 + SIGINT: an aborted run is not a success
    except Exception as e:
        console.print(f"\n[bold red]Error:[/bold red] {escape(str(e))}")
        sys.exit(1)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
if __name__ == "__main__":
|
|
264
|
+
main()
|
|
@@ -0,0 +1,477 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Downloader — streaming HLS (.ts segments) + MP4 chunked.
|
|
3
|
+
Tuned for armv7l / 2 GB RAM:
|
|
4
|
+
- Default 4 threads (not 16) — 4× a typical 4 MB segment = 16 MB working set
|
|
5
|
+
- Segments streamed directly to disk, never fully held in RAM
|
|
6
|
+
- Concat phase reads one file at a time in 256 KB chunks
|
|
7
|
+
- Range-request pool capped at 4 concurrent connections
|
|
8
|
+
- ffmpeg path preferred when available (zero extra RAM, fastest)
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
import time
|
|
15
|
+
import shutil
|
|
16
|
+
import threading
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
19
|
+
from typing import Optional, Callable
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
from curl_cffi import requests as _cffi # type: ignore
|
|
23
|
+
_HAS_CFFI = True
|
|
24
|
+
except Exception:
|
|
25
|
+
_HAS_CFFI = False
|
|
26
|
+
|
|
27
|
+
import urllib3
|
|
28
|
+
from urllib3.util.retry import Retry
|
|
29
|
+
from urllib3.util.ssl_ import create_urllib3_context
|
|
30
|
+
|
|
31
|
+
UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
|
32
|
+
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
|
33
|
+
IMPERSONATE = "chrome120"
|
|
34
|
+
REFERER = "https://missav.live/"
|
|
35
|
+
|
|
36
|
+
# 256 KB stream chunks — small enough for 2 GB RAM
|
|
37
|
+
_STREAM_CHUNK = 256 * 1024
|
|
38
|
+
|
|
39
|
+
ProgressFn = Callable[[int, int, float], None]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _noop_progress(_d: int, _t: int, _s: float) -> None: ...
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ─────────────────────────────────────────────────────────────────
|
|
46
|
+
# Fetcher — shared across worker threads
|
|
47
|
+
# ─────────────────────────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
class Fetcher:
    """
    Single shared HTTP client for all download workers.

    Uses a curl_cffi session when that package imported successfully,
    otherwise a urllib3 PoolManager with a TLS 1.2 floor. Exactly one of
    ``session`` / ``_pool`` is set; the other is None.
    """

    def __init__(self, max_pool: int = 8):
        """Create the backend; *max_pool* sizes the urllib3 pool (fallback only)."""
        if _HAS_CFFI:
            self.session: Optional[object] = _cffi.Session()
            self.session.headers.update({  # type: ignore[union-attr]
                "User-Agent": UA,
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": REFERER,
            })
            self._pool: Optional[urllib3.PoolManager] = None
        else:
            import ssl  # local: only the urllib3 fallback needs it

            self.session = None
            ctx = create_urllib3_context()
            ctx.minimum_version = ssl.TLSVersion.TLSv1_2  # TLS 1.2 floor
            self._pool = urllib3.PoolManager(
                num_pools=max_pool,
                maxsize=max_pool,
                headers={"User-Agent": UA, "Referer": REFERER},
                retries=Retry(total=5, backoff_factor=0.3,
                              status_forcelist={429, 500, 502, 503, 504}),
                ssl_context=ctx,
            )

    # ── shared helpers ────────────────────────────────────────────
    @staticmethod
    def _require_ok(status: int, url: str) -> None:
        """Raise IOError unless *status* is exactly 200."""
        if status != 200:
            raise IOError(f"HTTP {status} for {url}")

    @staticmethod
    def _content_length(headers) -> int:
        """Return the Content-Length header as an int, or 0 if absent/non-numeric."""
        value = headers.get("Content-Length", "")
        return int(value) if value.isdigit() else 0

    # ── text ──────────────────────────────────────────────────────
    def text(self, url: str, timeout: float = 30.0) -> str:
        """GET *url* and return the body as text; raise IOError on non-200."""
        if self.session is not None:
            r = self.session.get(url, impersonate=IMPERSONATE, timeout=timeout)  # type: ignore
            self._require_ok(r.status_code, url)
            return r.text
        assert self._pool is not None
        r2 = self._pool.request("GET", url,
                                timeout=urllib3.Timeout(connect=10, read=timeout))
        self._require_ok(r2.status, url)
        return r2.data.decode("utf-8", errors="replace")

    # ── bytes (full, for small payloads like sub-playlists) ───────
    def bytes(self, url: str, timeout: float = 60.0) -> bytes:
        """GET *url* and return the whole body in memory; raise IOError on non-200."""
        if self.session is not None:
            r = self.session.get(url, impersonate=IMPERSONATE, timeout=timeout)  # type: ignore
            self._require_ok(r.status_code, url)
            return r.content
        assert self._pool is not None
        r2 = self._pool.request("GET", url,
                                timeout=urllib3.Timeout(connect=10, read=timeout),
                                preload_content=True)
        self._require_ok(r2.status, url)
        return r2.data

    # ── streaming write-to-file (memory-safe for large segments) ──
    def stream_to_file(self, url: str, dest: Path, timeout: float = 120.0) -> int:
        """
        Fetch url and write directly to dest in _STREAM_CHUNK chunks.
        Returns number of bytes written. Never holds the full payload in RAM.

        Raises IOError on a non-200 status. A partially written *dest* is
        left in place on failure; callers decide whether to delete it.
        """
        if self.session is not None:
            r = self.session.get(url, impersonate=IMPERSONATE,  # type: ignore
                                 timeout=timeout, stream=True,
                                 headers={"Referer": REFERER})
            try:
                self._require_ok(r.status_code, url)
                written = 0
                with dest.open("wb") as fh:
                    for chunk in r.iter_content(_STREAM_CHUNK):
                        if chunk:
                            fh.write(chunk)
                            written += len(chunk)
                return written
            finally:
                # A streamed response holds its connection until closed;
                # release it on success, on a bad status and on errors alike.
                r.close()

        assert self._pool is not None
        r2 = self._pool.request(
            "GET", url,
            headers={"Referer": REFERER},
            timeout=urllib3.Timeout(connect=10, read=timeout),
            preload_content=False,
        )
        if r2.status != 200:
            r2.drain_conn()
            raise IOError(f"HTTP {r2.status} for {url}")
        written = 0
        try:
            with dest.open("wb") as fh:
                while True:
                    chunk = r2.read(_STREAM_CHUNK)
                    if not chunk:
                        break
                    fh.write(chunk)
                    written += len(chunk)
        finally:
            r2.drain_conn()
            r2.release_conn()
        return written

    # ── HEAD / content-length ──────────────────────────────────────
    def head_size(self, url: str, timeout: float = 15.0) -> int:
        """Return the Content-Length reported by a HEAD request, or 0.

        Best effort by design: any request error, a non-200 status, or a
        missing/non-numeric header yields 0 so callers can fall back to a
        download of unknown size.
        """
        if self.session is not None:
            try:
                r = self.session.head(url, impersonate=IMPERSONATE,  # type: ignore
                                      timeout=timeout, allow_redirects=True)
                if r.status_code == 200:
                    return self._content_length(r.headers)
            except Exception:
                pass  # deliberate: the size is optional
            # cffi path gave no size — don't touch self._pool (it doesn't exist)
            return 0

        assert self._pool is not None
        try:
            r2 = self._pool.request("HEAD", url, redirect=True,
                                    timeout=urllib3.Timeout(connect=10, read=timeout))
            if r2.status == 200:
                return self._content_length(r2.headers)
        except Exception:
            pass  # deliberate: the size is optional
        return 0
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ─────────────────────────────────────────────────────────────────
|
|
178
|
+
# HLS / m3u8 downloader
|
|
179
|
+
# ─────────────────────────────────────────────────────────────────
|
|
180
|
+
|
|
181
|
+
class HLSDownloader:
    """
    Parse master/media playlist, stream every .ts segment to disk concurrently.
    Default 4 threads — safe on 2 GB RAM (4 × ~4 MB = ~16 MB peak working set).

    NOTE(review): ``#EXT-X-KEY`` (encrypted segments) and ``#EXT-X-MAP`` tags
    are skipped like any other tag by the manual path — confirm the target
    streams never rely on them when ffmpeg is unavailable.
    """

    def __init__(self, threads: int = 4, fetcher: Optional[Fetcher] = None):
        """Store the worker count; build a Fetcher sized for it unless one is given."""
        self.threads = threads
        self.fetcher = fetcher or Fetcher(max_pool=threads + 2)

    # ── playlist parsing ──────────────────────────────────────────
    def _parse_segments(self, playlist_url: str) -> list[tuple[str, float]]:
        """Return ``(segment_url, duration_seconds)`` pairs for *playlist_url*.

        A master playlist is resolved to its highest-BANDWIDTH variant, which
        is then parsed recursively.

        Raises:
            IOError: the playlist could not be fetched, or a master playlist
                lists no variant URI.
        """
        # Local import, matching this module's habit of importing rarely
        # needed helpers at the point of use.
        from urllib.parse import urljoin

        lines = self.fetcher.text(playlist_url).splitlines()

        if any("#EXT-X-STREAM-INF" in line for line in lines):
            # master playlist — pick highest bandwidth variant
            variants: list[tuple[int, str]] = []
            for i, line in enumerate(lines):
                if not line.startswith("#EXT-X-STREAM-INF"):
                    continue
                bw_m = re.search(r'BANDWIDTH=(\d+)', line)
                bw = int(bw_m.group(1)) if bw_m else 0
                # The variant URI is the next non-blank, non-tag line.
                for nxt in lines[i + 1:]:
                    nxt = nxt.strip()
                    if nxt and not nxt.startswith("#"):
                        # urljoin also handles absolute-path URIs and playlist
                        # URLs whose query string contains "/".
                        variants.append((bw, urljoin(playlist_url, nxt)))
                        break
            if not variants:
                raise IOError("master playlist: no variants found")
            best = max(variants, key=lambda v: v[0])[1]
            return self._parse_segments(best)

        # media playlist
        out: list[tuple[str, float]] = []
        dur = 0.0
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line.startswith("#EXTINF:"):
                # "#EXTINF:<duration>,[<title>]": the duration is everything
                # before the first comma, so an optional title no longer
                # zeroes it.
                try:
                    dur = float(line.split(":", 1)[1].split(",", 1)[0].split()[0])
                except (ValueError, IndexError):
                    dur = 0.0
            elif line.startswith("#"):
                continue
            else:
                out.append((urljoin(playlist_url, line), dur))
                dur = 0.0
        return out

    # ── main entry ────────────────────────────────────────────────
    def download(self, m3u8_url: str, out_path: Path,
                 progress: ProgressFn = _noop_progress) -> Path:
        """Download the stream at *m3u8_url* into *out_path* and return that path.

        Uses ffmpeg when it is on PATH, otherwise the threaded segment
        downloader. *progress* receives ``(done, total, speed)``; the total
        is an estimate until the download has finished.

        Raises:
            IOError: no segments were found, or ffmpeg exited non-zero.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        segments = self._parse_segments(m3u8_url)
        if not segments:
            raise IOError(f"no segments found in {m3u8_url}")

        # rough size estimate: 2 Mbit/s avg × duration
        total_est = max(int(sum(d for _, d in segments) * 250_000), 1)
        progress(0, total_est, 0.0)

        # prefer ffmpeg — zero extra RAM, fastest possible
        if shutil.which("ffmpeg"):
            return self._download_ffmpeg(m3u8_url, out_path, progress)

        return self._download_manual(segments, out_path, total_est, progress)

    # ── ffmpeg path ───────────────────────────────────────────────
    def _download_ffmpeg(self, m3u8_url: str, out_path: Path,
                         progress: ProgressFn) -> Path:
        """Let ffmpeg fetch and remux the stream (``-c copy``) into *out_path*.

        No intermediate progress is reported; *progress* is called once with
        the final size. Raises IOError when ffmpeg exits non-zero.
        """
        import subprocess
        cmd = [
            "ffmpeg", "-y", "-loglevel", "error",
            "-headers", f"User-Agent: {UA}\r\nReferer: {REFERER}\r\n",
            "-i", m3u8_url,
            "-c", "copy",
            str(out_path),
        ]
        # stdin is detached so ffmpeg cannot consume keystrokes meant for the
        # calling terminal UI.
        result = subprocess.run(cmd, stdin=subprocess.DEVNULL, check=False)
        if result.returncode != 0:
            # Don't leave a truncated file that looks like a finished download.
            out_path.unlink(missing_ok=True)
            raise IOError(f"ffmpeg exited {result.returncode}")
        sz = out_path.stat().st_size
        progress(sz, sz, 0.0)
        return out_path

    # ── manual threaded path ──────────────────────────────────────
    def _download_manual(self, segments: list[tuple[str, float]],
                         out_path: Path, total_est: int,
                         progress: ProgressFn) -> Path:
        """Fetch every segment with a thread pool, then concatenate in order.

        Segments that still fail after 5 attempts are skipped with a warning,
        so the output may be incomplete (best effort, as before).
        """
        # The parts directory is always recreated, so no segment ever pre-exists:
        # there is no cross-run resume, and a truncated segment left by an
        # interrupted run can never be mistaken for a complete one.
        tmp_dir = out_path.with_suffix(".parts")
        if tmp_dir.exists():
            shutil.rmtree(tmp_dir)
        tmp_dir.mkdir(parents=True, exist_ok=True)

        bytes_done = 0
        t0 = time.time()
        lock = threading.Lock()
        errors: list[str] = []
        max_attempts = 5

        def fetch_seg(job: tuple[int, str, float]) -> None:
            nonlocal bytes_done
            idx, url, _dur = job
            seg_path = tmp_dir / f"{idx:06d}.ts"

            for attempt in range(max_attempts):
                try:
                    written = self.fetcher.stream_to_file(url, seg_path, timeout=120.0)
                except Exception:
                    seg_path.unlink(missing_ok=True)
                    if attempt + 1 < max_attempts:
                        time.sleep(0.5 * (attempt + 1))
                    continue
                # Reporting happens outside the retry ``try`` so a failing
                # progress callback cannot trigger a re-download or count the
                # same segment twice.
                with lock:
                    bytes_done += written
                    snapshot = bytes_done
                elapsed = time.time() - t0 or 0.001
                progress(snapshot, total_est, snapshot / elapsed)
                return
            errors.append(url)

        jobs = [(i, u, d) for i, (u, d) in enumerate(segments)]
        with ThreadPoolExecutor(max_workers=self.threads) as pool:
            for fut in as_completed([pool.submit(fetch_seg, job) for job in jobs]):
                try:
                    fut.result()
                except Exception as e:
                    errors.append(str(e))

        if errors:
            print(f"\n[!] {len(errors)} segment(s) failed — file may be incomplete")

        # stream-concat in order — 256 KB at a time, minimal RAM
        with out_path.open("wb") as outf:
            for i in range(len(segments)):
                seg = tmp_dir / f"{i:06d}.ts"
                if not seg.exists():
                    continue
                with seg.open("rb") as sf:
                    shutil.copyfileobj(sf, outf, _STREAM_CHUNK)
                seg.unlink()

        shutil.rmtree(tmp_dir, ignore_errors=True)
        sz = out_path.stat().st_size
        progress(sz, sz, 0.0)
        return out_path
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# ─────────────────────────────────────────────────────────────────
|
|
345
|
+
# Direct MP4 chunked downloader
|
|
346
|
+
# ─────────────────────────────────────────────────────────────────
|
|
347
|
+
|
|
348
|
+
class DirectDownloader:
    """
    Range-request parallel download for direct MP4 URLs.
    Each range is fetched fully into memory (4–8 MiB per range) and then
    written at its own offset of a pre-sized ``.part`` file.
    """

    def __init__(self, threads: int = 4, fetcher: Optional[Fetcher] = None):
        """Store the worker count; build a Fetcher sized for it unless one is given."""
        self.threads = threads
        self.fetcher = fetcher or Fetcher(max_pool=threads + 2)

    def _fetch_range(self, url: str, headers: dict) -> tuple[int, bytes]:
        """GET *url* with *headers* on the active backend; return (status, body)."""
        if _HAS_CFFI and self.fetcher.session is not None:
            r = self.fetcher.session.get(  # type: ignore
                url, impersonate=IMPERSONATE,
                headers=headers, timeout=120,
            )
            return r.status_code, r.content
        assert self.fetcher._pool is not None
        r2 = self.fetcher._pool.request(
            "GET", url, headers=headers,
            timeout=urllib3.Timeout(connect=10, read=120),
            preload_content=True,
        )
        return r2.status, r2.data

    def download(self, url: str, out_path: Path,
                 progress: ProgressFn = _noop_progress) -> Path:
        """Download *url* into *out_path* and return that path.

        Falls back to one sequential stream when the size is unknown or the
        server ignores ``Range`` headers.

        Raises:
            IOError: a range still failed after 5 attempts.
        """
        out_path.parent.mkdir(parents=True, exist_ok=True)
        total = self.fetcher.head_size(url)
        if not total:
            return self._single_stream(url, out_path, progress)

        # chunk size: between 4 MiB and 8 MiB per range to keep RAM bounded
        per_thread = max(4 * 1024 * 1024, total // (self.threads * 4))
        chunk_size = min(per_thread, 8 * 1024 * 1024)
        ranges: list[tuple[int, int]] = [
            (pos, min(pos + chunk_size, total) - 1)
            for pos in range(0, total, chunk_size)
        ]

        tmp = out_path.with_suffix(out_path.suffix + ".part")
        # pre-size the file so every worker can seek-write its own region
        with tmp.open("wb") as f:
            f.truncate(total)

        bytes_done = 0
        t0 = time.time()
        lock = threading.Lock()
        # Set once a server is seen to ignore Range, so queued ranges stop
        # immediately instead of each pulling the whole file.
        range_ignored = threading.Event()
        max_attempts = 5

        def pull(rng: tuple[int, int]) -> None:
            nonlocal bytes_done
            if range_ignored.is_set():
                return
            start, end = rng
            expected = end - start + 1
            hdrs = {
                "Range": f"bytes={start}-{end}",
                "User-Agent": UA,
                "Referer": REFERER,
            }
            for attempt in range(max_attempts):
                try:
                    status, data = self._fetch_range(url, hdrs)
                    if status == 200 and expected != total:
                        # Whole body sent for a partial range: writing it at
                        # ``start`` would corrupt the file. Not retryable.
                        range_ignored.set()
                        return
                    if status not in (200, 206):
                        raise IOError(f"range status {status}")
                    if len(data) != expected:
                        raise IOError(
                            f"range {start}-{end}: got {len(data)} bytes, "
                            f"expected {expected}"
                        )
                    with open(tmp, "r+b") as f:
                        f.seek(start)
                        f.write(data)
                except Exception:
                    if attempt + 1 < max_attempts:
                        time.sleep(0.5 * (attempt + 1))
                    continue
                # Reporting happens outside the retry ``try`` so a failing
                # progress callback cannot cause a re-fetch or a double count.
                with lock:
                    bytes_done += len(data)
                    snapshot = bytes_done
                elapsed = time.time() - t0 or 0.001
                progress(snapshot, total, snapshot / elapsed)
                return
            raise IOError(f"range {start}-{end} failed after 5 attempts")

        try:
            with ThreadPoolExecutor(max_workers=self.threads) as pool:
                list(pool.map(pull, ranges))
        except BaseException:
            tmp.unlink(missing_ok=True)  # don't leave a half-filled .part behind
            raise

        if range_ignored.is_set():
            tmp.unlink(missing_ok=True)
            return self._single_stream(url, out_path, progress)

        tmp.replace(out_path)  # unlike rename(), overwrites on every platform
        progress(total, total, 0.0)
        return out_path

    def _single_stream(self, url: str, out_path: Path,
                       progress: ProgressFn) -> Path:
        """No usable Content-Length / Range support — stream directly to disk."""
        written = self.fetcher.stream_to_file(url, out_path, timeout=600.0)
        progress(written, written, 0.0)
        return out_path
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
# ─────────────────────────────────────────────────────────────────
|
|
442
|
+
# Helpers
|
|
443
|
+
# ─────────────────────────────────────────────────────────────────
|
|
444
|
+
|
|
445
|
+
def sanitize_filename(name: str, max_len: int = 120, max_bytes: int = 240) -> str:
    """Turn *name* into a string that is safe to use as a file name.

    Args:
        name: Arbitrary text, e.g. "<code>_<title>".
        max_len: Maximum number of characters kept.
        max_bytes: Maximum UTF-8 size kept. File-name limits are commonly
            counted in bytes (255 on most filesystems), and non-ASCII titles
            need several bytes per character; the default leaves room for an
            extension.

    Returns:
        The cleaned name, or ``"untitled"`` (cut to *max_len*) when nothing
        usable remains.
    """
    # Path separators, shell/Windows-reserved characters and all ASCII control
    # characters (which includes \r, \n and \t) collapse into one underscore.
    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]+', "_", name).strip(" ._")
    cleaned = (cleaned or "untitled")[:max_len]

    encoded = cleaned.encode("utf-8")
    if len(encoded) > max_bytes:
        # errors="ignore" drops a multi-byte character cut in half at the end.
        cleaned = encoded[:max_bytes].decode("utf-8", errors="ignore")

    # Truncation can expose a trailing dot/space/underscore again.
    return cleaned.rstrip(" ._")
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
# ─────────────────────────────────────────────────────────────────
|
|
451
|
+
# Public entrypoint
|
|
452
|
+
# ─────────────────────────────────────────────────────────────────
|
|
453
|
+
|
|
454
|
+
def download_video(video, out_dir: Path, threads: int = 4,
                   preferred_resolution: Optional[str] = None,
                   progress: ProgressFn = _noop_progress) -> Path:
    """Download a Video object. threads=4 by default (safe on 2 GB RAM).

    Args:
        video: Object exposing ``code``, ``title`` and ``m3u8``.
        out_dir: Directory that receives ``<code>_<title>.mp4``.
        threads: Worker count for the segment downloader.
        preferred_resolution: Wanted frame height such as ``"720"`` or
            ``"720p"``. If no variant has exactly that height, the original
            playlist URL is used unchanged.
        progress: Callback receiving ``(done, total, speed)``.

    Returns:
        Path of the written file.

    Raises:
        IOError: the video has no m3u8 source, or the download failed.
    """
    safe = sanitize_filename(f"{video.code}_{video.title}")
    out_path = out_dir / f"{safe}.mp4"

    m3u8 = getattr(video, "m3u8", "") or ""
    if not m3u8:
        raise IOError(f"no m3u8 source found for {video.code}")

    if preferred_resolution:
        from .scraper import MissAV

        def height_of(resolution) -> str:
            # "1920x1080" -> "1080", "720p" -> "720"
            return str(resolution).lower().rsplit("x", 1)[-1].strip().rstrip("p")

        wanted = height_of(preferred_resolution)
        variants = MissAV().resolutions(m3u8) or []
        # Compare whole heights: a suffix test would let "80" select 1080 or
        # 480, and would never match "720p".
        pick = next(
            (v for v in variants if height_of(v["resolution"]) == wanted), None,
        )
        if pick and pick.get("url"):
            m3u8 = pick["url"]

    hls = HLSDownloader(threads=threads)
    return hls.download(m3u8, out_path, progress=progress)
|
jav_dl_tmux/scraper.py
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scraper — MissAV endpoints, live search, recent uploads, single-video metadata.
|
|
3
|
+
|
|
4
|
+
MissAV sits behind Cloudflare. curl_cffi (Chrome TLS impersonation) bypasses it
|
|
5
|
+
cleanly when available. On armv7l / Termux where curl_cffi won't build, we fall
|
|
6
|
+
back to urllib3 with realistic headers — good enough for most requests.
|
|
7
|
+
|
|
8
|
+
Endpoints:
|
|
9
|
+
GET https://missav.live/en -> recent uploads
|
|
10
|
+
GET https://missav.live/en/search/{query} -> search results
|
|
11
|
+
GET https://missav.live/en/{slug} -> single video page (m3u8 here)
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
import time
|
|
17
|
+
import base64
|
|
18
|
+
import binascii
|
|
19
|
+
from typing import Optional
|
|
20
|
+
from urllib.parse import urljoin, quote_plus
|
|
21
|
+
|
|
22
|
+
# ── optional Chrome-TLS backend ──────────────────────────────────────────────
|
|
23
|
+
try:
|
|
24
|
+
from curl_cffi import requests as _cffi # type: ignore
|
|
25
|
+
_HAS_CFFI = True
|
|
26
|
+
except Exception:
|
|
27
|
+
_HAS_CFFI = False
|
|
28
|
+
|
|
29
|
+
import urllib3
|
|
30
|
+
from urllib3.exceptions import HTTPError
|
|
31
|
+
from urllib3.util.retry import Retry
|
|
32
|
+
from urllib3.util.ssl_ import create_urllib3_context
|
|
33
|
+
|
|
34
|
+
# ── constants ─────────────────────────────────────────────────────────────────
|
|
35
|
+
# Browser identity presented to the site: desktop Chrome 120 on Windows.
UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
# Site root; every endpoint URL is built from it.
BASE = "https://missav.live"
# Value passed as ``impersonate=`` on curl_cffi requests.
IMPERSONATE = "chrome120"

# Default request headers, in the order they are declared here.
_HEADERS = {
    # -- content negotiation --
    "User-Agent": UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    # -- navigation context --
    "Referer": f"{BASE}/",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    # -- fetch metadata --
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    # -- client hints matching the User-Agent above --
    "Sec-Ch-Ua": '"Not_A brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
}
|
|
58
|
+
|
|
59
|
+
# Retry policy given to the urllib3 PoolManager: up to four retries with a
# 0.5 backoff factor, for GET/HEAD only, on the listed status codes.
# raise_on_status=False is set so the caller inspects the final status itself.
_RETRY = Retry(
    total=4,
    backoff_factor=0.5,
    allowed_methods={"GET", "HEAD"},
    status_forcelist={429, 500, 502, 503, 504},
    raise_on_status=False,
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ─────────────────────────────────────────────────────────────────
|
|
69
|
+
# HTTP wrapper
|
|
70
|
+
# ─────────────────────────────────────────────────────────────────
|
|
71
|
+
|
|
72
|
+
class Http:
    """
    Throttled HTTP client with two interchangeable backends.

    Chrome-impersonating curl_cffi session when curl_cffi is available.
    Falls back to a urllib3 PoolManager with browser-like headers on
    armv7l / unsupported arches.

    Requests are spaced at least ``_min_delay`` seconds apart and attempted up
    to ``retries`` times; when every attempt fails the last error is raised.
    """

    # Accept-Encoding advertised on the urllib3 path.  urllib3 only decodes
    # Brotli when an optional brotli package is installed, and this package
    # does not depend on one, so "br" is not offered there.
    _POOL_ACCEPT_ENCODING = "gzip, deflate"

    def __init__(self, retries: int = 4, timeout: float = 25.0):
        """
        Args:
            retries: Number of attempts made per request.
            timeout: Per-request timeout in seconds (read timeout on the
                urllib3 path, overall timeout passed to curl_cffi).
        """
        self.timeout = timeout
        self._retries = retries
        self._min_delay = 1.5   # minimum seconds between two requests
        self._last_call = 0.0   # monotonic timestamp of the previous request

        if _HAS_CFFI:
            self._session = _cffi.Session()
            self._session.headers.update(_HEADERS)
            self._pool: Optional[urllib3.PoolManager] = None
            self._warmup()
        else:
            self._session = None
            ctx = create_urllib3_context()
            ctx.minimum_version = 0x0303  # TLS 1.2
            self._pool = urllib3.PoolManager(
                num_pools=6,
                maxsize=4,
                headers=self._pool_headers(),
                retries=_RETRY,
                timeout=urllib3.Timeout(connect=15, read=timeout),
                ssl_context=ctx,
            )

    # ── header helper ─────────────────────────────────────────────
    def _pool_headers(self, extra: Optional[dict] = None) -> dict:
        """Return the complete header set for one urllib3 request.

        urllib3 uses per-request ``headers`` *instead of* the pool defaults,
        so the defaults have to be merged in explicitly; passing only the
        extra headers would drop User-Agent and every other default.
        """
        merged = dict(_HEADERS)
        merged["Accept-Encoding"] = self._POOL_ACCEPT_ENCODING
        if extra:
            merged.update(extra)
        return merged

    # ── warmup ────────────────────────────────────────────────────
    def _warmup(self) -> None:
        """Visit the site root and ``/en`` once with the curl_cffi session.

        Best effort: failures are ignored on purpose so that a failed warmup
        never prevents the real request from being attempted.
        """
        if self._session is None:
            return
        for url in (BASE + "/", BASE + "/en"):
            try:
                self._session.get(url, impersonate=IMPERSONATE, timeout=self.timeout)
                time.sleep(0.8)
            except Exception:
                pass

    # ── throttle ──────────────────────────────────────────────────
    def _throttle(self) -> None:
        """Sleep as needed so calls are at least ``_min_delay`` seconds apart."""
        # A monotonic clock keeps the wait bounded even if the wall clock
        # is adjusted between two requests.
        elapsed = time.monotonic() - self._last_call
        if elapsed < self._min_delay:
            time.sleep(self._min_delay - elapsed)
        self._last_call = time.monotonic()

    @staticmethod
    def _is_challenge(body: str) -> bool:
        """Return True when ``body`` contains one of the challenge-page markers."""
        return (
            "Just a moment" in body[:600]
            or "cf-chl-bypass" in body
            or "<title>Just a moment" in body
        )

    # ── core fetch ────────────────────────────────────────────────
    def _fetch(self, url: str, method: str = "GET") -> str:
        """Request ``url`` and return the response body as text.

        Non-200 responses and challenge pages count as failed attempts.

        Raises:
            HTTPError / IOError: The last error seen once all attempts failed.
        """
        last_err: Exception = HTTPError(f"no attempts made for {url}")

        # Search pages and single-video pages are requested as if the user
        # navigated there from the /en landing page.
        extra: dict = {}
        if "/search/" in url or re.search(r"/en/[a-z0-9\-]+$", url):
            extra["Referer"] = BASE + "/en"

        for attempt in range(self._retries):
            self._throttle()
            try:
                if self._session is not None:
                    fn = self._session.get if method == "GET" else self._session.head
                    r = fn(
                        url,
                        impersonate=IMPERSONATE,
                        timeout=self.timeout,
                        headers=extra,
                    )
                    if r.status_code != 200:
                        last_err = HTTPError(f"HTTP {r.status_code} for {url}")
                        time.sleep(1.5 * (attempt + 1))
                        continue
                    if self._is_challenge(r.text):
                        last_err = HTTPError(f"CF challenge for {url}")
                        time.sleep(2.0 * (attempt + 1))
                        if attempt == 1:
                            # Second challenge in a row: redo the warmup
                            # visits before trying again.
                            self._warmup()
                        continue
                    return r.text
                else:
                    assert self._pool is not None
                    r2 = self._pool.request(
                        method, url,
                        headers=self._pool_headers(extra),
                        timeout=urllib3.Timeout(connect=15, read=self.timeout),
                    )
                    if r2.status != 200:
                        last_err = HTTPError(f"HTTP {r2.status} for {url}")
                        time.sleep(0.8 * (attempt + 1))
                        continue
                    body = r2.data.decode("utf-8", errors="replace")
                    if self._is_challenge(body):
                        last_err = HTTPError(f"CF challenge for {url}")
                        time.sleep(1.2 * (attempt + 1))
                        continue
                    return body
            except (HTTPError, IOError) as e:
                last_err = e
                time.sleep(0.8 * (attempt + 1))
            except Exception as e:
                last_err = IOError(str(e))
                time.sleep(1.0 * (attempt + 1))

        raise last_err

    def get_text(self, url: str) -> str:
        """GET ``url`` and return the body as text (see ``_fetch``)."""
        return self._fetch(url, "GET")

    def get_bytes(self, url: str, referer: str = BASE + "/") -> bytes:
        """Fetch raw bytes — used for thumbnails / m3u8 binary payloads.

        Args:
            url: Resource to download.
            referer: Value sent as the ``Referer`` header.

        Raises:
            HTTPError / IOError: The last error seen once all attempts failed.
        """
        last_err: Exception = HTTPError(f"no attempts for {url}")
        hdrs = {"Referer": referer}

        for attempt in range(self._retries):
            self._throttle()
            try:
                if self._session is not None:
                    r = self._session.get(
                        url, impersonate=IMPERSONATE,
                        timeout=self.timeout, headers=hdrs,
                    )
                    if r.status_code != 200:
                        last_err = HTTPError(f"HTTP {r.status_code} for {url}")
                        time.sleep(0.5 * (attempt + 1))
                        continue
                    return r.content
                else:
                    assert self._pool is not None
                    r2 = self._pool.request(
                        "GET", url,
                        headers=self._pool_headers(hdrs),
                        timeout=urllib3.Timeout(connect=15, read=self.timeout),
                    )
                    if r2.status != 200:
                        last_err = HTTPError(f"HTTP {r2.status} for {url}")
                        time.sleep(0.5 * (attempt + 1))
                        continue
                    return r2.data
            except Exception as e:
                last_err = IOError(str(e))
                time.sleep(0.5 * (attempt + 1))

        raise last_err
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# ─────────────────────────────────────────────────────────────────
|
|
223
|
+
# Data containers
|
|
224
|
+
# ─────────────────────────────────────────────────────────────────
|
|
225
|
+
|
|
226
|
+
class Video:
    """Plain record describing one video.

    Every slot is always set: fields not passed to the constructor are
    ``None``, and keyword arguments that are not slot names are ignored.
    """

    __slots__ = ("title", "slug", "url", "thumb", "duration", "date",
                 "actresses", "tags", "code", "size_hint", "m3u8")

    def __init__(self, **kw):
        for k in self.__slots__:
            setattr(self, k, kw.get(k))

    def __repr__(self) -> str:
        # Short identifying summary for logs and debugging sessions.
        return (f"{type(self).__name__}(code={self.code!r}, "
                f"slug={self.slug!r}, title={self.title!r})")
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# ─────────────────────────────────────────────────────────────────
|
|
236
|
+
# Regex patterns
|
|
237
|
+
# ─────────────────────────────────────────────────────────────────
|
|
238
|
+
|
|
239
|
+
# -- listing cards ------------------------------------------------------------
# Anchor pointing at a video page: group 1 = slug, group 2 = anchor text that
# starts with a code such as "ABC-123".
CARD_RE = re.compile(
    r'<a[^>]+href="https://missav\.live/en/([a-z0-9\-]+)"[^>]*alt="[^"]*"\s*>\s*'
    r'([A-Z]{2,5}-\d{2,5}[^<]*)',
    re.I | re.S,
)
# Image URL (jpg/jpeg/png/webp) taken from src, data-src or data-original.
CARD_THUMB_RE = re.compile(
    r'<img[^>]+(?:src|data-src|data-original)="([^"]+\.(?:jpg|jpeg|png|webp))"',
    re.I,
)
# "m:ss" or "h:mm:ss" standing alone between two tags.
CARD_DUR_RE = re.compile(r'>\s*(\d{1,3}:\d{2}(?::\d{2})?)\s*<')

# -- generic fields -----------------------------------------------------------
# ISO-style date, YYYY-MM-DD.
DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')
# First two colon-separated numeric fields of a duration.
DURATION_RE = re.compile(r'(\d{1,3}):(\d{2})')
# Upper-case product code such as "ABC-123".
CODE_RE = re.compile(r'\b([A-Z]{2,5}-\d{2,5})\b')

# -- stream discovery ---------------------------------------------------------
# Absolute http(s) URL ending in .m3u8, with an optional query string.
M3U8_URL_RE = re.compile(r'https?://[^\s"\'<>)]+?\.m3u8(?:\?[^\s"\'<>)]*)?', re.I)
# Base64 literal (20+ chars) passed to atob().
M3U8_B64_RE = re.compile(r'atob\(["\']([A-Za-z0-9+/=]{20,})["\']\)')
# String assigned to one of the known player variable / property names.
M3U8_VAR_RE = re.compile(
    r'(?:videoSource|videoUrl|streamUrl|playerUrl|m3u8Url)\s*[:=]\s*["\']([^"\']+)["\']'
)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _strip_tags(s: str) -> str:
|
|
260
|
+
return re.sub(r'<[^>]+>', '', s).strip()
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# ─────────────────────────────────────────────────────────────────
|
|
264
|
+
# Parsers
|
|
265
|
+
# ─────────────────────────────────────────────────────────────────
|
|
266
|
+
|
|
267
|
+
def parse_video(html: str, slug: str, base: str = BASE) -> Video:
    """Build a ``Video`` from the HTML of a single-video page.

    Args:
        html: Page markup.
        slug: Slug of the page; used for the URL and as a fallback for the
            title and code.
        base: Site root used to build ``Video.url``.

    Returns:
        A ``Video`` with title, code, thumb, duration (seconds or ``None``)
        and date filled in.  ``m3u8`` is not set here.
    """
    # Local import: the ``html`` parameter shadows the module name.
    from html import unescape

    # Title: og:title, then twitter:title, then the first <h1>.
    title = ""
    for pat in (r'<meta property="og:title" content="([^"]+)"',
                r'<meta name="twitter:title" content="([^"]+)"'):
        hit = re.search(pat, html)
        if hit:
            title = hit.group(1)
            break
    else:
        h1_m = re.search(r'<h1[^>]*>(.*?)</h1>', html, re.IGNORECASE | re.DOTALL)
        if h1_m:
            title = h1_m.group(1)
    # Markup is stripped first, then entities such as &amp; / &#39; are
    # decoded so the title (and the filename derived from it) is plain text.
    title = unescape(_strip_tags(title)).strip() or slug.replace("-", " ").title()

    code_m = CODE_RE.search(title)
    code = code_m.group(1) if code_m else slug.upper()

    thumb = ""
    for pat in (r'<meta property="og:image" content="([^"]+)"',
                r'<meta name="twitter:image" content="([^"]+)"'):
        m = re.search(pat, html)
        if m:
            # Attribute values are HTML-escaped (e.g. "&amp;" inside a query).
            thumb = unescape(m.group(1))
            break

    duration = None
    dur_m = DURATION_RE.search(html)
    if dur_m:
        first, second = int(dur_m.group(1)), int(dur_m.group(2))
        # DURATION_RE captures only two fields.  When a third ":ss" field
        # follows directly, the value is h:mm:ss rather than m:ss.
        tail = re.match(r':(\d{2})(?!\d)', html[dur_m.end():dur_m.end() + 4])
        if tail:
            duration = first * 3600 + second * 60 + int(tail.group(1))
        else:
            duration = first * 60 + second

    date_m = DATE_RE.search(html)
    date = date_m.group(1) if date_m else ""

    return Video(
        title=title, slug=slug, code=code,
        url=f"{base}/en/{slug}",
        thumb=thumb, duration=duration, date=date,
        actresses=[], tags=[], size_hint=None,
    )
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def parse_listing(html: str, base: str = BASE) -> list[Video]:
    """Extract the videos shown on a listing page (search results / recent).

    Each ``CARD_RE`` match yields one ``Video``; thumbnail, duration and date
    are looked up in a window of markup around the match.  When no card
    matches at all, every video-page link is returned with slug-derived
    fields instead.

    Args:
        html: Page markup.
        base: Site root used for ``Video.url`` and relative thumbnails.

    Returns:
        Videos in page order, de-duplicated by slug.
    """
    # Local import: the ``html`` parameter shadows the module name.
    from html import unescape

    out: dict[str, Video] = {}

    for m in CARD_RE.finditer(html):
        slug = m.group(1)
        if not slug or slug in out:
            continue

        # Decode entities and collapse line breaks / runs of spaces so the
        # whole anchor text is one clean line.
        anchor_text = " ".join(unescape(_strip_tags(m.group(2))).split())

        # CARD_RE matches case-insensitively, so the code/title split must
        # too; otherwise a lower-case code stays glued to the title.
        code_m = re.match(r'([A-Z]{2,5}-\d{2,5})\s*(.*)', anchor_text, re.IGNORECASE)
        if code_m:
            code = code_m.group(1).upper()
            title = code_m.group(2).strip()
        else:
            code = slug.upper()
            title = anchor_text

        # Markup surrounding the anchor, searched for this card's details.
        chunk = html[max(0, m.start() - 2500): m.end() + 500]

        thumb_m = CARD_THUMB_RE.search(chunk)
        thumb = thumb_m.group(1) if thumb_m else ""
        if thumb and not thumb.startswith("http"):
            thumb = urljoin(base, thumb)

        dur_m = CARD_DUR_RE.search(chunk)
        duration = None
        if dur_m:
            parts = [int(p) for p in dur_m.group(1).split(":")]
            if len(parts) == 3:
                duration = parts[0] * 3600 + parts[1] * 60 + parts[2]
            elif len(parts) == 2:
                duration = parts[0] * 60 + parts[1]

        date_m = DATE_RE.search(chunk)
        out[slug] = Video(
            title=title or slug, slug=slug, code=code,
            url=f"{base}/en/{slug}",
            thumb=thumb, duration=duration,
            date=date_m.group(1) if date_m else "",
        )

    if not out:
        # Fallback: no card matched, so take every /en/<slug> link except
        # the known site-section pages.
        section_slugs = frozenset(
            ("vip", "actresses", "genres", "makers", "klive", "clive", "saved")
        )
        for m in re.finditer(r'href="https://missav\.live/en/([a-z0-9\-]+)"', html):
            slug = m.group(1)
            if slug in out or slug in section_slugs:
                continue
            out[slug] = Video(
                title=slug, slug=slug, code=slug.upper(),
                url=f"{base}/en/{slug}", thumb="", date="",
            )

    return list(out.values())
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# ─────────────────────────────────────────────────────────────────
|
|
365
|
+
# m3u8 extractor
|
|
366
|
+
# ─────────────────────────────────────────────────────────────────
|
|
367
|
+
|
|
368
|
+
def _find_m3u8(html: str) -> str:
    """Return the stream URL found in ``html``, or ``""`` when none is found.

    Sources are tried in this order:
      1. a literal ``http(s)://….m3u8`` URL,
      2. a base64 string passed to ``atob()`` that decodes to a URL,
      3. a string assigned to one of the known player variables.
    """
    direct = M3U8_URL_RE.search(html)
    if direct:
        return direct.group(0)

    # A page may contain several atob() calls that have nothing to do with
    # the player, so every candidate is decoded and only one that looks like
    # a URL is accepted.  Anything else falls through to the next source.
    for candidate in M3U8_B64_RE.finditer(html):
        try:
            decoded = base64.b64decode(candidate.group(1)).decode(
                "utf-8", errors="replace"
            ).strip()
        except (binascii.Error, ValueError):
            continue
        if decoded.startswith(("http://", "https://", "//")) or ".m3u8" in decoded.lower():
            return decoded

    varm = M3U8_VAR_RE.search(html)
    return varm.group(1) if varm else ""
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
# ─────────────────────────────────────────────────────────────────
|
|
384
|
+
# Public API
|
|
385
|
+
# ─────────────────────────────────────────────────────────────────
|
|
386
|
+
|
|
387
|
+
class MissAV:
    """Main scraper object.

    Wraps an ``Http`` client and exposes search, recent uploads, single-video
    lookup and playlist variant listing.
    """

    def __init__(self, http: Optional["Http"] = None):
        """Use ``http`` when given, otherwise create a default ``Http()``."""
        self.http = http if http is not None else Http()

    def search(self, query: str) -> "list[Video]":
        """Return the search results for ``query``.

        A blank query returns the recent uploads instead.  Any fetch failure
        yields an empty list (best effort, nothing is raised).
        """
        if not query.strip():
            return self.recent()
        url = f"{BASE}/en/search/{quote_plus(query)}"
        try:
            html = self.http.get_text(url)
        except Exception:
            return []
        return parse_listing(html)

    def recent(self, page: int = 1) -> "list[Video]":
        """Return the recent uploads shown on listing page ``page``.

        Any fetch failure yields an empty list (best effort).
        """
        # Page 1 uses the same /en listing endpoint as the later pages.
        url = f"{BASE}/en" if page == 1 else f"{BASE}/en?page={page}"
        try:
            html = self.http.get_text(url)
        except Exception:
            return []
        return parse_listing(html)

    def video(self, slug: str) -> "Video":
        """Fetch the page for ``slug`` and return its ``Video`` with ``m3u8`` set.

        Fetch errors propagate to the caller.
        """
        url = f"{BASE}/en/{slug}"
        html = self.http.get_text(url)
        v = parse_video(html, slug)
        v.m3u8 = _find_m3u8(html)
        return v

    def resolutions(self, m3u8_url: str) -> "list[dict]":
        """List the variant streams declared in the playlist at ``m3u8_url``.

        Returns:
            One dict per ``#EXT-X-STREAM-INF`` entry with keys
            ``resolution`` (e.g. ``"1280x720"``, ``"?"`` when absent),
            ``bandwidth`` (int when numeric, else the raw string / ``"?"``)
            and ``url`` (absolute variant URL, ``None`` when no URI line
            follows).  An empty list when the playlist cannot be fetched or
            declares no variants.
        """
        try:
            txt = self.http.get_text(m3u8_url)
        except Exception:
            return []

        out: list = []
        for raw in txt.splitlines():
            line = raw.strip()
            if not line:
                continue
            if line.startswith("#EXT-X-STREAM-INF"):
                # A value is either a quoted string (which may contain
                # commas) or an unquoted token running up to the next comma.
                # RESOLUTION is unquoted "WxH", so a digits-only value
                # pattern would cut it down to the width alone.
                attrs = dict(re.findall(r'([A-Z0-9\-]+)=("[^"]*"|[^",]*)', line))
                res = attrs.get("RESOLUTION", "?").strip('"') or "?"
                bw = attrs.get("BANDWIDTH", "?")
                try:
                    bw = int(bw)
                except (TypeError, ValueError):
                    pass
                out.append({"resolution": res, "bandwidth": bw, "url": None})
            elif not line.startswith("#") and out and out[-1]["url"] is None:
                # First URI line after a STREAM-INF tag belongs to that
                # variant; resolve it relative to the playlist URL.
                out[-1]["url"] = urljoin(m3u8_url, line)
        return out
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: java_extention
|
|
3
|
+
Version: 1.1.1
|
|
4
|
+
Summary: MissAV HLS downloader. armv7l/Termux safe.
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Requires-Dist: rich>=13.0
|
|
7
|
+
Requires-Dist: urllib3>=2.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
jav_dl_tmux/__init__.py,sha256=6VUkFXkixUir6dJ9CWMqL2FjxZEliFJhEZa6Ixfjgq8,133
|
|
2
|
+
jav_dl_tmux/__main__.py,sha256=1q_oFO1yRT7d-vCKepBvwKbnHnW0Sd6gFlEi-P_Pr70,11213
|
|
3
|
+
jav_dl_tmux/scraper.py,sha256=prT_Fe75oz_9XvBxM3QQfcczZYoJVMyJGWcaNWywUd0,17579
|
|
4
|
+
jav_dl_tmux/downloader.py,sha256=_IDYQCasDqkueFgO5NoV8_lBzlbxBihPY_I85zCzl4Q,20598
|
|
5
|
+
java_extention-1.1.1.dist-info/METADATA,sha256=KLwMB5c-LSylxcX6RxQeoqqZ1cSX-0Hu_8TZAf039Nk,321
|
|
6
|
+
java_extention-1.1.1.dist-info/WHEEL,sha256=2jHdxKWnCIK_lI5LTqwkSmjgOUSsTtjdBrux51aJqKg,77
|
|
7
|
+
java_extention-1.1.1.dist-info/entry_points.txt,sha256=EO58EgQkmXdZyUBuHbA07YVBqSxy7pn5AA8-pXFwKjo,58
|
|
8
|
+
java_extention-1.1.1.dist-info/RECORD,,
|