java2-extention 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- java2_extention/__init__.py +1 -0
- java2_extention/__main__.py +325 -0
- java2_extention/cache.py +52 -0
- java2_extention/config.py +75 -0
- java2_extention/downloader.py +525 -0
- java2_extention/http_client.py +232 -0
- java2_extention/scraper.py +345 -0
- java2_extention-1.0.0.dist-info/METADATA +53 -0
- java2_extention-1.0.0.dist-info/RECORD +12 -0
- java2_extention-1.0.0.dist-info/WHEEL +5 -0
- java2_extention-1.0.0.dist-info/entry_points.txt +2 -0
- java2_extention-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import platform
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
import urllib3
|
|
8
|
+
from urllib3.util.retry import Retry
|
|
9
|
+
from urllib3.util.ssl_ import create_urllib3_context
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
# "Termux mode" heuristic: an ARM machine string, an Android system name, or
# an interpreter path containing "com.termux" all switch it on.
_IS_TERMUX = bool(
    platform.machine() in ("aarch64", "armv7l")
    or platform.system() == "Android"
    or "com.termux" in sys.executable
)

# Backend availability flags; each stays False unless its probe below succeeds.
_HAS_SYS_CURL = _HAS_CFFI = _HAS_CS = False

if _IS_TERMUX:
    # In Termux mode only the system curl binary is probed: it must be on
    # PATH and answer "--version" with output that mentions "curl".
    _curl_bin = shutil.which("curl")
    if _curl_bin:
        try:
            out = subprocess.run(
                [_curl_bin, "--version"],
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                timeout=5,
                check=True,
            ).stdout.decode()
        except Exception:
            # Best-effort probe: any failure simply leaves the flag False.
            pass
        else:
            _HAS_SYS_CURL = "curl" in out.lower()
else:
    # Outside Termux mode curl_cffi is tried first.
    try:
        from curl_cffi import requests as _cffi_requests
    except Exception:
        pass
    else:
        _HAS_CFFI = True

# cloudscraper is only attempted when neither curl flavour is usable.
if not (_HAS_CFFI or _HAS_SYS_CURL):
    try:
        import cloudscraper as _cs_mod
    except Exception:
        pass
    else:
        _HAS_CS = True
|
|
47
|
+
|
|
48
|
+
# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# User-Agent sent by every backend: a desktop Chrome 124 on Windows 10 string.
UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)

# Default request headers handed to the curl_cffi session, the cloudscraper
# session and the urllib3 pool. "Accept-Encoding: identity" asks the server
# for an uncompressed body.
_PLAIN_HEADERS = {
    "User-Agent": UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "identity",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
}

# Connection-pool sizing for the urllib3 fallback; halved in Termux mode.
_POOL_SIZE = 4 if _IS_TERMUX else 8

# Retry policy for the urllib3 fallback: up to 4 retries with backoff factor
# 1.0 on the listed status codes for GET/HEAD/POST. raise_on_status=False
# makes urllib3 return the last response instead of raising once retries on
# those statuses are exhausted.
_RETRY = Retry(
    total=4, backoff_factor=1.0,
    status_forcelist={429, 500, 502, 503, 504},
    allowed_methods={"GET", "HEAD", "POST"},
    raise_on_status=False,
)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _backend_name() -> str:
    """Return a human-readable label for the HTTP backend selected at import time.

    Precedence is system curl, then curl_cffi, then cloudscraper, with
    urllib3 as the fallback label.
    """
    if _HAS_SYS_CURL:
        curl_path = shutil.which("curl")
        return f"system curl ({curl_path})"
    for available, label in (
        (_HAS_CFFI, "curl_cffi (Chrome TLS)"),
        (_HAS_CS, "cloudscraper"),
    ):
        if available:
            return label
    return "urllib3"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _slog(msg: str) -> None:
|
|
87
|
+
print(msg, flush=True)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _dump_body(status: int, text: str) -> None:
    """Log the complete response body between a header and a footer line.

    Args:
        status: HTTP status code shown in the header line.
        text: Response body; logged verbatim, its length shown in the footer.
    """
    header = f" [net] ── full response body (status={status}) ──────────────"
    footer = f" [net] ── end of body ({len(text)} chars) ──────────────────"
    for line in (header, text, footer):
        _slog(line)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _curl_get(url: str, extra_headers: Optional[dict] = None,
              timeout: float = 30.0) -> tuple[int, str]:
    """Fetch *url* with the system ``curl`` binary.

    curl is run silently (``-sS``), follows redirects (``-L``) and appends a
    ``__STATUS__:<code>`` trailer to its output via ``--write-out`` so the
    HTTP status can be separated from the body.

    Args:
        url: Address to fetch.
        extra_headers: Optional header name/value pairs, each passed to curl
            as an additional ``-H`` option.
        timeout: Per-request limit in seconds. The subprocess itself is given
            five extra seconds before it is abandoned.

    Returns:
        ``(status, body)``. ``status`` is 0 when the trailer is missing or
        its code cannot be parsed as an integer.

    Raises:
        RuntimeError: If curl cannot be started or exceeds the subprocess
            timeout; the underlying exception is chained as ``__cause__``.
    """
    curl_bin = shutil.which("curl") or "curl"
    # curl treats "--max-time 0" as "no limit", so a sub-second timeout must
    # not be truncated to zero.
    max_time = max(1, int(timeout))
    cmd = [
        curl_bin,
        "-sS", "-L",
        "--max-time", str(max_time),
        "--compressed",
        "-A", UA,
        "-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "-H", "Accept-Language: en-US,en;q=0.9",
        "-H", "Cache-Control: no-cache",
        "-H", "Pragma: no-cache",
        "-H", "Sec-Fetch-Dest: document",
        "-H", "Sec-Fetch-Mode: navigate",
        "-H", "Sec-Fetch-Site: none",
        "-H", "Sec-Fetch-User: ?1",
        "-H", "Upgrade-Insecure-Requests: 1",
        "--write-out", "\n__STATUS__:%{http_code}",
        url,
    ]
    if extra_headers:
        for k, v in extra_headers.items():
            cmd += ["-H", f"{k}: {v}"]

    # Only the subprocess call can raise here; parsing below handles its own errors.
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout + 5)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError("system curl timed out") from e
    except Exception as e:
        raise RuntimeError(f"system curl failed: {e}") from e

    raw = result.stdout.decode("utf-8", errors="replace")
    # Split on the LAST trailer so a body containing the marker stays intact.
    body, marker, status_str = raw.rpartition("\n__STATUS__:")
    if not marker:
        return 0, raw
    try:
        status = int(status_str.strip())
    except ValueError:
        status = 0
    return status, body
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class HttpClient:
    """Rate-limited GET client on top of whichever backend was detected at import.

    Backend choice follows the module-level flags: system curl, curl_cffi,
    cloudscraper, and finally a urllib3 connection pool.
    """

    def __init__(self, timeout: float = 30.0, min_delay: float = 3.0):
        """Create the backend session and log which backend is in use.

        Args:
            timeout: Per-request timeout in seconds (read timeout for urllib3).
            min_delay: Minimum number of seconds enforced between requests.
        """
        self.timeout = timeout
        self._delay = min_delay
        self._last = 0.0     # time.monotonic() reading taken after the last throttle
        self._first = True   # the first request always waits the full delay

        _slog(f" [net] backend: {_backend_name()}")
        if _IS_TERMUX:
            _slog(f" [net] Termux mode — pool_size={_POOL_SIZE}, stream-to-disk active")

        self._cffi = None
        self._cs = None
        self._pool = None

        if _HAS_CFFI:
            self._cffi = _cffi_requests.Session(
                impersonate="chrome124",
                headers=_PLAIN_HEADERS,
            )
        elif _HAS_CS:
            self._cs = _cs_mod.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False}
            )
            self._cs.headers.update(_PLAIN_HEADERS)
        elif not _HAS_SYS_CURL:
            # Last resort: plain urllib3 pool.
            ctx = create_urllib3_context()
            ctx.minimum_version = 0x0303  # numeric value of ssl.TLSVersion.TLSv1_2
            self._pool = urllib3.PoolManager(
                num_pools=_POOL_SIZE,
                maxsize=_POOL_SIZE,
                headers=_PLAIN_HEADERS,
                retries=_RETRY,
                timeout=urllib3.Timeout(connect=15, read=timeout),
                ssl_context=ctx,
            )

    def _throttle(self) -> None:
        """Sleep as needed so consecutive requests are at least ``_delay`` apart.

        The interval is measured with ``time.monotonic()`` so wall-clock
        adjustments cannot shorten or lengthen the wait.
        """
        if self._first:
            wait = self._delay
        else:
            wait = self._delay - (time.monotonic() - self._last)
        if wait > 0:
            _slog(f" [net] waiting {wait:.1f}s (throttle)…")
            time.sleep(wait)
        self._first = False
        self._last = time.monotonic()

    @staticmethod
    def _report(status: int, text: str) -> None:
        """Log the status line and, for any non-200 status, the whole body."""
        _slog(f" [net] → {status} ({len(text)} chars)")
        if status != 200:
            _dump_body(status, text)

    def get(self, url: str, extra_headers: Optional[dict] = None,
            allow_redirects: bool = True) -> tuple[int, str]:
        """GET *url* through the active backend.

        Args:
            url: Address to fetch.
            extra_headers: Headers merged over ``Accept-Encoding: identity``.
            allow_redirects: Passed to curl_cffi, cloudscraper and urllib3.
                The system-curl path always follows redirects because
                ``_curl_get`` hard-codes ``-L``.

        Returns:
            ``(status_code, body_text)``.
        """
        self._throttle()
        _slog(f" [net] GET {url}")
        hdrs: dict = {"Accept-Encoding": "identity"}
        if extra_headers:
            hdrs.update(extra_headers)

        if _HAS_SYS_CURL:
            status, text = _curl_get(url, extra_headers=hdrs, timeout=self.timeout)
        elif self._cffi is not None:
            r = self._cffi.get(url, headers=hdrs, timeout=self.timeout,
                               allow_redirects=allow_redirects)
            status, text = r.status_code, r.text
        elif self._cs is not None:
            r = self._cs.get(url, headers=hdrs, timeout=self.timeout,
                             allow_redirects=allow_redirects)
            status, text = r.status_code, r.text
        else:
            # __init__ builds the pool whenever no other backend exists.
            assert self._pool is not None
            r2 = self._pool.request(
                "GET", url, headers=hdrs,
                timeout=urllib3.Timeout(connect=15, read=self.timeout),
                redirect=allow_redirects,
            )
            status, text = r2.status, r2.data.decode("utf-8", errors="replace")

        self._report(status, text)
        return status, text
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""
|
|
2
|
+
scraper.py — freepornvideos.xxx scraper
|
|
3
|
+
Handles: search, recent/latest, video detail + URL extraction.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass, field, asdict
|
|
9
|
+
from typing import Optional
|
|
10
|
+
from urllib.parse import urljoin, quote_plus
|
|
11
|
+
|
|
12
|
+
from bs4 import BeautifulSoup
|
|
13
|
+
|
|
14
|
+
# Root URL that listing, search and relative links are resolved against.
SITE_BASE = "https://www.freepornvideos.xxx"

# Matches a video page URL: optional "www.", optional two-letter path prefix
# (presumably a language code — verify), then /videos/<digits>/<slug>/.
# Group 1 is the numeric id, group 2 the slug.
VIDEO_PAGE_RE = re.compile(
    r"^https?://(?:www\.)?freepornvideos\.xxx/(?:[a-z]{2}/)?videos/(\d+)/([a-z0-9\-_%]+)/?$",
    re.I,
)

# Finds the first quality label in a string (e.g. "720p", "4k", "hd").
# NOTE(review): there are no word boundaries, so "hd"/"sd" also match inside
# longer words and "\d{3,4}p" matches any 3-4 digits followed by "p".
QUALITY_RE = re.compile(
    r"(\d{3,4}p|2160p|1440p|1080p|720p|480p|360p|240p|4k|2k|hd|sd)", re.I
)

# Finds absolute or protocol-relative URLs that end in a known media
# extension, with an optional query string; stops at whitespace or quotes.
VIDEO_URL_RE = re.compile(
    r"""(?P<url>(?:https?:)?//[^\s'"]+?\.(?:mp4|m3u8|m3u|m4v|webm|mov|mpd)(?:\?[^\s'"]*)?)""",
    re.I,
)

# Maps a lower-case quality label to the number used when ranking entries;
# "hd" is ranked as 1080 and "sd" as 480.
QUALITY_RANK = {
    "4k": 2160, "2160p": 2160,
    "2k": 1440, "1440p": 1440,
    "1080p": 1080, "hd": 1080,
    "720p": 720,
    "480p": 480, "sd": 480,
    "360p": 360,
    "240p": 240,
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _clean(s: Optional[str]) -> str:
|
|
42
|
+
if not s:
|
|
43
|
+
return ""
|
|
44
|
+
return re.sub(r"\s+", " ", s).strip()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _abs(base: str, maybe: Optional[str]) -> Optional[str]:
|
|
48
|
+
if not maybe:
|
|
49
|
+
return None
|
|
50
|
+
maybe = maybe.strip()
|
|
51
|
+
if not maybe:
|
|
52
|
+
return None
|
|
53
|
+
return urljoin(base, maybe)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _detect_quality(text: str) -> Optional[str]:
    """Return the first quality label found in *text*, lower-cased, or ``None``.

    Empty input and text without a ``QUALITY_RE`` match both give ``None``.
    """
    if text:
        found = QUALITY_RE.search(text)
        if found is not None:
            return found.group(1).lower()
    return None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _ext_from_url(u: str) -> str:
|
|
64
|
+
p = u.lower().split("?")[0]
|
|
65
|
+
if ".m3u8" in p or ".m3u" in p:
|
|
66
|
+
return "m3u8"
|
|
67
|
+
if ".mpd" in p:
|
|
68
|
+
return "mpd"
|
|
69
|
+
if ".webm" in p:
|
|
70
|
+
return "webm"
|
|
71
|
+
if ".mp4" in p or ".m4v" in p or ".mov" in p:
|
|
72
|
+
return "mp4"
|
|
73
|
+
return "other"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _score_url(url: str, quality: Optional[str], source: str) -> int:
    """Rank a candidate media URL; higher is better, negative means unusable.

    The score is the quality rank plus a container bonus (mp4 > webm >
    m3u8/mpd). Entries whose *source* is "iframe" or "embed" are forced to
    -1, and URLs that look like previews or thumbnails lose 200 000 points.

    Args:
        url: Candidate media URL.
        quality: Quality label such as "720p" or "hd"; may be ``None``.
        source: Where the URL was found (e.g. "og:video", "script").

    Returns:
        The integer score.
    """
    ext = _ext_from_url(url)
    q = (quality or "").lower()
    s = QUALITY_RANK.get(q)
    if s is None:
        # QUALITY_RE accepts any "<digits>p" label, but the table lists only
        # the common ones; rank the rest by their number instead of 0 so that
        # e.g. "540p" sorts between 480p and 720p. Capped so the quality part
        # never outweighs the container bonus.
        digits = q[:-1]
        s = min(int(digits), 9_999) if q.endswith("p") and digits.isdecimal() else 0
    if ext == "mp4":
        s += 100_000
    elif ext == "webm":
        s += 90_000
    elif ext in ("m3u8", "mpd"):
        s += 10_000
    if source in ("iframe", "embed"):
        s = -1
    u = url.lower()
    if "preview" in u or "screenshot" in u or "thumb" in u or "/previews/" in u:
        s -= 200_000
    return s
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def is_video_page_url(url: str) -> bool:
    """Tell whether *url* has the shape of a video page URL (``VIDEO_PAGE_RE``)."""
    return VIDEO_PAGE_RE.match(url) is not None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def video_id_from_url(url: str) -> Optional[str]:
    """Return the numeric id captured from a video page URL, or ``None`` if it does not match."""
    hit = VIDEO_PAGE_RE.match(url)
    if hit is None:
        return None
    return hit.group(1)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class VideoEntry:
    """One downloadable media URL found on a video page."""

    title: str
    url: str
    quality: Optional[str] = None  # label such as "720p"; None when undetected
    ext: str = "mp4"               # container kind as reported by _ext_from_url
    source: str = ""               # where the URL was found, e.g. "og:video"

    def to_dict(self) -> dict:
        """Return the entry's fields as a plain dict."""
        return asdict(self)

    @staticmethod
    def from_dict(d: dict) -> "VideoEntry":
        """Build an entry from *d*, ignoring keys that are not entry fields."""
        kwargs = {}
        for name in ("title", "url", "quality", "ext", "source"):
            if name in d:
                kwargs[name] = d[name]
        return VideoEntry(**kwargs)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass
class Video:
    """A video listing item plus the media entries extracted for it."""

    video_id: str
    title: str
    page_url: str
    thumbnail: Optional[str] = None
    duration: Optional[str] = None
    entries: list = field(default_factory=list)  # filled by FPV.video_details with entry dicts

    def to_dict(self) -> dict:
        """Return the video's fields as a plain dict."""
        return asdict(self)

    @staticmethod
    def from_dict(d: dict) -> "Video":
        """Rebuild a video from *d*; id, title and page_url are required keys."""
        return Video(
            video_id=d["video_id"],
            title=d["title"],
            page_url=d["page_url"],
            thumbnail=d.get("thumbnail"),
            duration=d.get("duration"),
            entries=d.get("entries", []),
        )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class _VideoExtractor:
    """Collects candidate media URLs from a video page's HTML."""

    def __init__(self, base_url: str) -> None:
        """Remember *base_url*; relative media URLs are resolved against it."""
        self.base_url = base_url

    def _from_og_video(self, soup: BeautifulSoup) -> list[VideoEntry]:
        """Return entries for ``og:video*`` meta tags; quality defaults to "sd"."""
        out = []
        for meta in soup.find_all("meta"):
            prop = (meta.get("property") or "").lower()
            if prop in ("og:video", "og:video:url", "og:video:secure_url"):
                url = _abs(self.base_url, meta.get("content"))
                if url:
                    out.append(VideoEntry(
                        title="", url=url,
                        quality=_detect_quality(url) or "sd",
                        ext=_ext_from_url(url),
                        source="og:video",
                    ))
        return out

    def _from_video_tags(self, soup: BeautifulSoup) -> list[VideoEntry]:
        """Return entries for ``<video>``/``<source>`` tags that carry a src.

        Quality is taken from the tag's title, label or res attribute, then
        from the URL itself.
        """
        out = []
        for tag in soup.find_all(["video", "source"]):
            src = tag.get("src") or tag.get("data-src") or tag.get("data-video-src")
            if not src:
                continue
            url = _abs(self.base_url, src)
            if not url:
                continue
            quality = (
                _detect_quality(tag.get("title", ""))
                or _detect_quality(tag.get("label", ""))
                or _detect_quality(tag.get("res", ""))
                or _detect_quality(url)
            )
            # VideoEntry has no "extra" field; passing one raised TypeError
            # for every tag that reached this point, so only declared fields
            # are supplied.
            out.append(VideoEntry(
                title="", url=url, quality=quality,
                ext=_ext_from_url(url), source=tag.name,
            ))
        return out

    def _from_inline_scripts(self, soup: BeautifulSoup) -> list[VideoEntry]:
        """Return entries for media URLs found in the text of ``<script>`` tags.

        URLs containing "thumb", "preview", "screenshot" or "poster" are skipped.
        """
        out = []
        for script in soup.find_all("script"):
            text = script.get_text() or ""
            if not text:
                continue
            for m in VIDEO_URL_RE.finditer(text):
                url = m.group("url")
                if url.startswith("//"):
                    # Protocol-relative URL: assume https.
                    url = "https:" + url
                url = _abs(self.base_url, url)
                if not url:
                    continue
                # skip obvious thumbnails/previews
                ul = url.lower()
                if any(x in ul for x in ("thumb", "preview", "screenshot", "poster")):
                    continue
                out.append(VideoEntry(
                    title="", url=url,
                    quality=_detect_quality(url),
                    ext=_ext_from_url(url),
                    source="script",
                ))
        return out

    def extract(self, html: str) -> list[VideoEntry]:
        """Parse *html* and return de-duplicated entries, best score first.

        The page title is taken from og:title, twitter:title, the first
        ``<h1>`` or ``<title>`` (first non-empty wins) and assigned to every
        entry that has no title. Entries are de-duplicated on the lower-cased
        URL without its query string and sorted by ``_score_url`` descending.
        """
        soup = BeautifulSoup(html, "html.parser")
        title = ""
        for sel, attr in (
            ('meta[property="og:title"]', "content"),
            ('meta[name="twitter:title"]', "content"),
            ("h1", None),
            ("title", None),
        ):
            el = soup.select_one(sel)
            if el:
                v = el.get(attr) if attr else el.get_text(" ", strip=True)
                if v:
                    title = _clean(str(v))
                    break

        raw: list[VideoEntry] = []
        raw.extend(self._from_og_video(soup))
        raw.extend(self._from_video_tags(soup))
        raw.extend(self._from_inline_scripts(soup))

        seen: set[str] = set()
        unique: list[VideoEntry] = []
        for e in raw:
            key = e.url.split("?")[0].lower()
            if key in seen:
                continue
            seen.add(key)
            if not e.title:
                e.title = title
            unique.append(e)

        return sorted(unique, key=lambda e: _score_url(e.url, e.quality, e.source), reverse=True)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _parse_listing(html: str, base: str = SITE_BASE) -> list[Video]:
    """Extract the videos linked from a listing or search results page.

    Every ``<a href>`` whose resolved URL (fragment and query removed) is a
    video page URL yields one ``Video``; repeated ids are skipped. Thumbnail
    and duration are looked up inside the anchor itself. Any error while
    handling an anchor skips just that anchor.

    Args:
        html: Page markup.
        base: URL that relative hrefs and thumbnails are resolved against.

    Returns:
        Videos in document order.
    """
    document = BeautifulSoup(html, "html.parser")
    known_ids: set[str] = set()
    found: list[Video] = []
    for anchor in document.find_all("a", href=True):
        try:
            resolved = urljoin(base, anchor["href"])
            page_url = resolved.partition("#")[0].partition("?")[0]
            if not is_video_page_url(page_url):
                continue
            vid = video_id_from_url(page_url)
            if not vid or vid in known_ids:
                continue
            known_ids.add(vid)

            label = _clean(anchor.get_text(" ", strip=True))
            if not label:
                label = f"Video {vid}"

            # Thumbnail: first <img> inside the link, lazy-load attribute first.
            thumb = None
            picture = anchor.find("img")
            if picture:
                thumb = picture.get("data-src") or picture.get("src")
                if thumb:
                    thumb = _abs(base, thumb)

            # Duration: first descendant whose class matches one of the patterns.
            hits = (
                anchor.find(class_=re.compile(pattern, re.I))
                for pattern in ("duration", "time", "video-duration")
            )
            node = next((hit for hit in hits if hit), None)
            length = _clean(node.get_text()) if node else None

            found.append(Video(
                video_id=vid,
                title=label,
                page_url=page_url,
                thumbnail=thumb,
                duration=length,
            ))
        except Exception:
            continue
    return found
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class FPV:
    """freepornvideos.xxx scraper — search + recent + video details."""

    def __init__(self, http) -> None:
        """Store *http*, an object whose ``get(url)`` returns ``(status, html)``."""
        self._http = http

    # ── recent ────────────────────────────────────────────────
    def recent(self, page: int = 1) -> list[Video]:
        """Return the videos on page *page* of the latest-updates listing.

        Returns an empty list when the request does not answer 200 or the
        body is empty.
        """
        if page == 1:
            url = f"{SITE_BASE}/latest-updates/"
        else:
            url = f"{SITE_BASE}/latest-updates/{page}/"
        status, html = self._http.get(url)
        if status != 200 or not html:
            return []
        return _parse_listing(html)

    # ── search ────────────────────────────────────────────────
    def search(self, query: str, page: int = 1) -> list[Video]:
        """Search for *query* and return the videos on result page *page*.

        Several URL patterns are tried in order; the first one that answers
        200 with at least one video wins. Request errors on one candidate are
        ignored and the next candidate is tried.

        Args:
            query: Free-text search terms.
            page: 1-based result page.

        Returns:
            The videos found, or an empty list when no candidate yields any.
        """
        slug = quote_plus(query.strip().replace(" ", "-").lower())
        if page > 1:
            # Only the slug form carries a page number. The un-paged
            # candidates would silently return page-1 results again, so they
            # are not tried for later pages.
            candidates = [f"{SITE_BASE}/search/{slug}/{page}/"]
        else:
            # try common search URL patterns for WordPress-based adult sites
            candidates = [
                f"{SITE_BASE}/search/{slug}/",
                f"{SITE_BASE}/search/{slug}/{page}/",
                f"{SITE_BASE}/?s={quote_plus(query)}",
                f"{SITE_BASE}/search/?q={quote_plus(query)}",
            ]
        for url in candidates:
            try:
                status, html = self._http.get(url)
                if status == 200 and html:
                    videos = _parse_listing(html)
                    if videos:
                        return videos
            except Exception:
                # Best-effort: a failing candidate must not abort the search.
                continue
        return []

    # ── video detail ──────────────────────────────────────────
    def video_details(self, video: Video) -> Video:
        """Fetch the video page and extract all download URLs. Updates video in-place.

        On a non-200 response or empty body *video* is returned unchanged.
        Otherwise ``video.entries`` is replaced with entry dicts, keeping
        only known media containers with a non-negative score.
        """
        status, html = self._http.get(video.page_url)
        if status != 200 or not html:
            return video

        extractor = _VideoExtractor(video.page_url)
        entries = extractor.extract(html)

        # filter to actual video files only (no iframes, no thumbs)
        usable = [
            e for e in entries
            if _ext_from_url(e.url) in ("mp4", "webm", "m3u8", "mpd")
            and _score_url(e.url, e.quality, e.source) >= 0
        ]

        video.entries = [e.to_dict() for e in usable]
        return video
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: java2-extention
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: java2 extension PKG really important PKG for men
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: urllib3>=2.0
|
|
9
|
+
Requires-Dist: cloudscraper>=1.2
|
|
10
|
+
Requires-Dist: rich>=13.0
|
|
11
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
12
|
+
Requires-Dist: lxml>=5.0
|
|
13
|
+
Requires-Dist: curl-cffi>=0.6; platform_machine != "armv7l" and platform_machine != "aarch64" and platform_system != "Android"
|
|
14
|
+
|
|
15
|
+
# java2-extention
|
|
16
|
+
|
|
17
|
+
freepornvideos.xxx CLI downloader — direct MP4 + HLS, Termux/2GB RAM safe.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install java2-extention
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
java2 # interactive menu
|
|
29
|
+
java2 "milf" # search
|
|
30
|
+
java2 --recent # recent uploads
|
|
31
|
+
java2 --recent -p 2 # recent page 2
|
|
32
|
+
java2 -t 8 # 8 threads
|
|
33
|
+
java2 -o /sdcard/Download "teen"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Menu
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
1. Search
|
|
40
|
+
2. Recent uploads
|
|
41
|
+
3. Settings
|
|
42
|
+
4. Clear cache
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Pick video → pick quality → downloads to your download folder.
|
|
46
|
+
|
|
47
|
+
## Termux setup
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pkg install python curl ffmpeg
|
|
51
|
+
pip install java2-extention
|
|
52
|
+
java2
|
|
53
|
+
```
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
java2_extention/__init__.py,sha256=J-j-u0itpEFT6irdmWmixQqYMadNl1X91TxUmoiLHMI,22
|
|
2
|
+
java2_extention/__main__.py,sha256=ELNbpNfqfdkGFXdIY1nkMFvp7Y_xCbcgxWMyJw1igN8,11236
|
|
3
|
+
java2_extention/cache.py,sha256=HrNz9RSKkt82bAkfrUXxwXxsV4BTFbIXwhqsAIgGzag,1229
|
|
4
|
+
java2_extention/config.py,sha256=C1fwQGDbN4Exdu3WJVjSnMJbX0CBLmVizH9tTNpSEc8,2041
|
|
5
|
+
java2_extention/downloader.py,sha256=wiglLdgzqw1mCucyd0KwZam46H979NwIIbPkIaXaFjE,18113
|
|
6
|
+
java2_extention/http_client.py,sha256=7B54Mo6j3viEo3zOuipjEsgiVPwpoj8zX7OzdldQcis,7689
|
|
7
|
+
java2_extention/scraper.py,sha256=WDQxWxGzA6XPVgug59KSqVTzK677jex4MvpFtc6zITA,10928
|
|
8
|
+
java2_extention-1.0.0.dist-info/METADATA,sha256=fYPm8Yq96sMVUXMHhiyTCLVPbhUKuqx4q2UzCG34lQA,1098
|
|
9
|
+
java2_extention-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
java2_extention-1.0.0.dist-info/entry_points.txt,sha256=2XiI66yyLAvzmo_k8GoWq5_euVmhnfRWHgPufQEA5Pk,56
|
|
11
|
+
java2_extention-1.0.0.dist-info/top_level.txt,sha256=oMEJ3Vnd46dL_O1gnpSAwY8W7aVyJ8idc8noeB0CFnk,16
|
|
12
|
+
java2_extention-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
java2_extention
|