hipda 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hipda-0.1.11.dist-info/METADATA +54 -0
- hipda-0.1.11.dist-info/RECORD +10 -0
- hipda-0.1.11.dist-info/WHEEL +4 -0
- hipda-0.1.11.dist-info/entry_points.txt +2 -0
- hipda_cli/__init__.py +5 -0
- hipda_cli/auth.py +115 -0
- hipda_cli/cli.py +236 -0
- hipda_cli/client.py +90 -0
- hipda_cli/models.py +24 -0
- hipda_cli/parser.py +131 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hipda
|
|
3
|
+
Version: 0.1.11
|
|
4
|
+
Summary: CLI reader for 4D4Y/HiPDA Discovery forum posts.
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
7
|
+
Requires-Dist: browser-cookie3>=0.20
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# hipda
|
|
11
|
+
|
|
12
|
+
CLI reader for the 4D4Y/HiPDA Discovery channel (`fid=2`).
|
|
13
|
+
|
|
14
|
+
The site uses browser/session checks, so direct unauthenticated requests may return a Cloudflare challenge. Log in once through Chrome:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uvx --from . hipda login
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
From PyPI:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uvx hipda login
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
That opens 4D4Y in Google Chrome. After you finish logging in, return to the terminal and press Enter. Then read Discovery:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
uvx --from . hipda list --limit 20
|
|
30
|
+
uvx --from . hipda read 3446553
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
From PyPI:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
uvx hipda list --limit 20
|
|
37
|
+
uvx hipda read 3446553
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
`hipda list` also tries to import automatically if Chrome is already logged in, so most of the time you can skip straight to reading. The old `hipda discovery list` and `hipda discovery read` commands still work.
|
|
41
|
+
|
|
42
|
+
The cookie is stored at `~/.config/hipda/cookie` and the user agent is stored at `~/.config/hipda/user-agent`, both with `0600` permissions. You can override them per command with `HIPDA_COOKIE` / `--cookie` and `HIPDA_USER_AGENT` / `--user-agent`.
|
|
43
|
+
|
|
44
|
+
You can also pass a browser user agent:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
HIPDA_USER_AGENT='Mozilla/5.0 ...' uvx --from . hipda list
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The CLI disables HTTPS certificate verification by default because certificate verification for 4D4Y often fails in Python environments even when Chrome can still connect. To verify certificates, pass `--verify-tls` together with a trusted root certificate via `--ca-file`:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uvx --from . hipda --verify-tls --ca-file /path/to/root-ca.pem list
|
|
54
|
+
```
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
hipda_cli/__init__.py,sha256=wOQd_Wj4i4Y_coXW38Gsu3Io76IKVCxgdS5T8ZnJwzg,113
|
|
2
|
+
hipda_cli/auth.py,sha256=yoqnHNc6GccR4aY9q05GfLMPbr-kLhidaDa11BkjsUA,3491
|
|
3
|
+
hipda_cli/cli.py,sha256=Sbo_YN7GtOX1StujM4XrZlp18YQgn_CtngGYirajN2A,9251
|
|
4
|
+
hipda_cli/client.py,sha256=lmeWZgLFq8GDCXtAhEOSHyA2Qk_O5D6juFzNr3rrfio,3286
|
|
5
|
+
hipda_cli/models.py,sha256=jcy9SN3dyTT6lkzlvtZh_67CbpEaPu7dl8c51V2vutE,406
|
|
6
|
+
hipda_cli/parser.py,sha256=wsVn3qFaY42Rg-8DGHUM1SwlvGT4Lgt8FKjR-4hRIFY,4559
|
|
7
|
+
hipda-0.1.11.dist-info/METADATA,sha256=-W0T_mFtLjD_g-eUb5KPzPvvfFFzk_fimESvpoVbINg,1679
|
|
8
|
+
hipda-0.1.11.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
9
|
+
hipda-0.1.11.dist-info/entry_points.txt,sha256=53g4SRiLfISMiXV5kpCDdioyWFh_fuPur-t1v2SIB7g,45
|
|
10
|
+
hipda-0.1.11.dist-info/RECORD,,
|
hipda_cli/__init__.py
ADDED
hipda_cli/auth.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import plistlib
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import browser_cookie3
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
LOGIN_URL = "https://www.4d4y.com/forum/forumdisplay.php?fid=2"
|
|
12
|
+
|
|
13
|
+
CHROME_INFO_PLIST_PATHS = (
|
|
14
|
+
Path("/Applications/Google Chrome.app/Contents/Info.plist"),
|
|
15
|
+
Path.home() / "Applications/Google Chrome.app/Contents/Info.plist",
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def default_cookie_path() -> Path:
    """Return the path of the stored cookie file inside the hipda config directory."""
    cookie_file = _config_path("cookie")
    return cookie_file
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def default_user_agent_path() -> Path:
    """Return the path of the stored user-agent file inside the hipda config directory."""
    user_agent_file = _config_path("user-agent")
    return user_agent_file
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _config_path(name: str) -> Path:
    """Build the path of a hipda config file called *name*.

    The base directory is ``$XDG_CONFIG_HOME`` when that variable is set to a
    non-empty value, otherwise ``~/.config``; the file lives in its ``hipda``
    subdirectory.
    """
    xdg_root = os.environ.get("XDG_CONFIG_HOME")
    base_dir = Path(xdg_root) if xdg_root else Path.home() / ".config"
    return base_dir / "hipda" / name
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def normalize_cookie(cookie: str) -> str:
    """Strip whitespace and an optional leading ``Cookie:`` header name.

    The header-name check is case-insensitive; everything after the first
    colon is kept and stripped.
    """
    text = cookie.strip()
    if not text.lower().startswith("cookie:"):
        return text
    _, _, header_value = text.partition(":")
    return header_value.strip()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def load_cookie(path: Path | None = None) -> str:
    """Read and normalize the saved cookie.

    Uses *path* when given, otherwise the default cookie path. Returns an
    empty string when the file does not exist.
    """
    target = path or default_cookie_path()
    if target.exists():
        return normalize_cookie(target.read_text(encoding="utf-8"))
    return ""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def save_cookie(cookie: str, path: Path | None = None) -> Path:
    """Normalize *cookie* and store it with owner-only (0600) permissions.

    Args:
        cookie: Cookie header value, optionally prefixed with ``Cookie:``.
        path: Destination file; defaults to the default cookie path.

    Returns:
        The path the cookie was written to.

    Raises:
        ValueError: If the normalized cookie is empty.
    """
    normalized = normalize_cookie(cookie)
    if not normalized:
        raise ValueError("cookie is empty")

    cookie_path = path or default_cookie_path()
    cookie_path.parent.mkdir(parents=True, exist_ok=True)
    # Create the file with mode 0600 up front so the session cookie is never
    # on disk with umask-default (possibly world-readable) permissions.
    fd = os.open(cookie_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with open(fd, "w", encoding="utf-8") as file:
        # A pre-existing file keeps its old mode on open; tighten it before
        # any secret content is written.
        cookie_path.chmod(0o600)
        file.write(normalized + "\n")
    return cookie_path
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def load_user_agent(path: Path | None = None) -> str:
    """Read the saved user-agent string, stripped of surrounding whitespace.

    Uses *path* when given, otherwise the default user-agent path. Returns an
    empty string when the file does not exist.
    """
    target = path or default_user_agent_path()
    if target.exists():
        return target.read_text(encoding="utf-8").strip()
    return ""
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def save_user_agent(user_agent: str, path: Path | None = None) -> Path:
    """Store the stripped *user_agent* with owner-only (0600) permissions.

    Args:
        user_agent: User-Agent header value.
        path: Destination file; defaults to the default user-agent path.

    Returns:
        The path the user agent was written to.

    Raises:
        ValueError: If *user_agent* is empty after stripping.
    """
    normalized = user_agent.strip()
    if not normalized:
        raise ValueError("user-agent is empty")

    user_agent_path = path or default_user_agent_path()
    user_agent_path.parent.mkdir(parents=True, exist_ok=True)
    # Create the file with mode 0600 from the start instead of writing with
    # umask-default permissions and restricting afterwards.
    fd = os.open(user_agent_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with open(fd, "w", encoding="utf-8") as file:
        # A pre-existing file keeps its old mode on open; tighten it first.
        user_agent_path.chmod(0o600)
        file.write(normalized + "\n")
    return user_agent_path
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def cookie_header_from_browser(domain: str = "4d4y.com") -> str:
    """Build a ``Cookie`` header value from Chrome's cookies for *domain*.

    Cookies are kept when their domain equals *domain* (ignoring leading dots)
    or ends with ``"." + domain``. Pairs are joined with ``"; "``.
    """
    subdomain_suffix = "." + domain
    pairs = [
        f"{item.name}={item.value}"
        for item in browser_cookie3.chrome(domain_name=domain)
        if item.domain.lstrip(".") == domain or item.domain.endswith(subdomain_suffix)
    ]
    return "; ".join(pairs)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def chrome_user_agent() -> str:
    """Return a macOS Chrome User-Agent string.

    The major version is taken from ``CFBundleShortVersionString`` of the
    first existing Chrome ``Info.plist`` that provides one; otherwise the
    built-in fallback major version is used.
    """
    major = "147"
    for candidate in CHROME_INFO_PLIST_PATHS:
        if candidate.exists():
            with candidate.open("rb") as handle:
                bundle_info = plistlib.load(handle)
            version = str(bundle_info.get("CFBundleShortVersionString", ""))
            if version:
                # Only the part before the first dot is used in the UA string.
                major = version.split(".", 1)[0]
                break
    platform_part = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    engine_part = f"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{major}.0.0.0 Safari/537.36"
    return platform_part + engine_part
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def import_browser_auth(domain: str = "4d4y.com") -> tuple[str, str]:
    """Import the cookie header and user agent from Chrome and persist both.

    Returns:
        ``(cookie, user_agent)`` as saved.

    Raises:
        ValueError: If no cookies for *domain* were found in Chrome.
    """
    header = cookie_header_from_browser(domain)
    if header:
        agent = chrome_user_agent()
        save_cookie(header)
        save_user_agent(agent)
        return header, agent
    raise ValueError(f"no {domain} cookies found in Chrome")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def open_login_page() -> None:
    """Open the forum login URL in Google Chrome via the ``open`` command.

    The command's exit status is ignored (``check=False``).
    """
    command = ["open", "-a", "Google Chrome", LOGIN_URL]
    subprocess.run(command, check=False)
|
hipda_cli/cli.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from .auth import import_browser_auth, open_login_page, save_cookie, save_user_agent
|
|
7
|
+
from .client import BASE_URL, HipdaClient, HipdaClientError
|
|
8
|
+
from .parser import is_login_required_page, parse_forum_listing, parse_thread
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _add_list_arguments(parser: argparse.ArgumentParser) -> None:
    """Add the options shared by ``list`` and ``discovery list``."""
    parser.add_argument("--page", type=int, default=1, help="Forum page number.")
    parser.add_argument("--limit", type=int, default=30, help="Maximum number of threads to print.")


def _add_read_arguments(parser: argparse.ArgumentParser) -> None:
    """Add the arguments shared by ``read`` and ``discovery read``."""
    parser.add_argument("thread", help="Thread id, or a full viewthread.php URL.")
    parser.add_argument("--page", type=int, default=1, help="Thread page number.")


def build_parser() -> argparse.ArgumentParser:
    """Build the ``hipda`` command-line parser.

    Visible commands are ``login``, ``list`` and ``read``. The ``auth`` and
    legacy ``discovery`` commands are accepted but hidden from the help output.
    """
    parser = argparse.ArgumentParser(prog="hipda", description="Read 4D4Y/HiPDA forum posts from the terminal.")
    parser.add_argument("--cookie", help="Logged-in Cookie header. Defaults to HIPDA_COOKIE.")
    parser.add_argument("--user-agent", help="User-Agent header. Defaults to HIPDA_USER_AGENT or Chrome-like UA.")
    parser.add_argument("--ca-file", help="PEM CA bundle to trust for HTTPS. Defaults to HIPDA_CA_FILE.")
    parser.add_argument(
        "--insecure-tls",
        action="store_true",
        help="Disable HTTPS certificate verification. This is the default for 4D4Y.",
    )
    parser.add_argument(
        "--verify-tls",
        action="store_true",
        help="Enable HTTPS certificate verification.",
    )

    subparsers = parser.add_subparsers(dest="command", metavar="{login,list,read}")

    subparsers.add_parser("login", help="Import 4D4Y login cookies from Chrome.")
    _add_list_arguments(subparsers.add_parser("list", help="List Discovery threads."))
    _add_read_arguments(subparsers.add_parser("read", help="Read a thread by tid or URL."))

    # Hidden commands: argparse only lists a sub-command in the help output
    # when ``help=`` is passed to add_parser(), so omitting it hides the
    # command without touching argparse's private attributes.
    auth = subparsers.add_parser("auth")
    auth_subparsers = auth.add_subparsers(dest="auth_command", required=True)
    save_cookie_parser = auth_subparsers.add_parser("save-cookie", help="Save a pasted 4D4Y Cookie header.")
    save_cookie_parser.add_argument("cookie", nargs="?", help="Cookie header value. Reads stdin if omitted.")
    save_user_agent_parser = auth_subparsers.add_parser(
        "save-user-agent", help="Save the Chrome User-Agent used with the cookie."
    )
    save_user_agent_parser.add_argument("user_agent", nargs="?", help="User-Agent value. Reads stdin if omitted.")

    discovery = subparsers.add_parser("discovery")
    discovery_subparsers = discovery.add_subparsers(dest="discovery_command", required=True)
    _add_list_arguments(discovery_subparsers.add_parser("list", help="List Discovery threads."))
    _add_read_arguments(discovery_subparsers.add_parser("read", help="Read a thread by tid or URL."))

    return parser
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _thread_params(thread: str, page: int) -> dict[str, str | int]:
    """Build the ``viewthread.php`` query parameters for *thread*.

    *thread* is either a bare thread id or a URL containing ``tid=``. For a
    URL, the tid is the text after ``tid=`` up to the next ``&`` or ``#``, so
    neither further query parameters nor a fragment such as ``#pid123`` leak
    into the id. Surrounding whitespace is ignored.
    """
    tid = thread.strip()
    if "tid=" in tid:
        tid = tid.split("tid=", 1)[1]
        for terminator in ("&", "#"):
            tid = tid.split(terminator, 1)[0]
    return {"tid": tid, "page": page}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_discovery_page(
    *,
    page: int,
    path: str,
    params: dict[str, str | int],
    cookie: str | None,
    user_agent: str | None,
    ca_file: str | None,
    insecure_tls: bool,
    verify_tls: bool = False,
) -> tuple[str, HipdaClient]:
    """Fetch a forum page, retrying once with credentials imported from Chrome.

    The first response is returned as-is when it is not a login-required page,
    when an explicit *cookie* was supplied, or when importing browser
    credentials fails for any reason. Otherwise the page is fetched again with
    the imported cookie (and the imported user agent unless *user_agent* was
    given).

    Returns:
        ``(html, client)`` where *client* is the client that produced *html*.
    """
    tls_options = {
        "ca_file": ca_file,
        "insecure_tls": insecure_tls or not verify_tls,
        "verify_tls": verify_tls,
    }

    first_client = HipdaClient.from_env(cookie=cookie, user_agent=user_agent, **tls_options)
    first_html = first_client.get(path, params)
    if not is_login_required_page(first_html) or cookie:
        return first_html, first_client

    # Best effort: any failure while importing from Chrome falls back to the
    # original (login-required) response.
    try:
        browser_cookie, browser_user_agent = import_browser_auth()
    except Exception:
        return first_html, first_client

    retry_client = HipdaClient.from_env(
        cookie=browser_cookie,
        user_agent=user_agent or browser_user_agent,
        **tls_options,
    )
    return retry_client.get(path, params), retry_client
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def wait_for_login_confirmation() -> None:
    """Block until the user presses Enter; a no-op when stdin is not a TTY."""
    if not sys.stdin.isatty():
        return
    input("Log in to 4D4Y in Chrome, then press Enter here...")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _report_login_required() -> int:
    """Print the "not logged in" hint to stderr and return exit code 2."""
    print(
        "hipda: 4D4Y says this request is not logged in. "
        "Open Chrome, log in to 4D4Y, then run `hipda login`.",
        file=sys.stderr,
    )
    return 2


def _run_login() -> int:
    """Open the login page, wait for confirmation, then import Chrome credentials."""
    try:
        open_login_page()
        wait_for_login_confirmation()
        import_browser_auth()
    except Exception as exc:
        print(
            "hipda: could not import 4D4Y cookies from Chrome. "
            "Open Chrome, log in to https://www.4d4y.com/forum/forumdisplay.php?fid=2, then run `hipda login` again.",
            file=sys.stderr,
        )
        print(f"hipda: {exc}", file=sys.stderr)
        return 2
    print("Imported 4D4Y login from Chrome.")
    return 0


def _save_setting(value: str | None, saver, label: str) -> int:
    """Save *value* (or stdin when it is None) with *saver* and report the path."""
    text = value if value is not None else sys.stdin.read()
    try:
        path = saver(text)
    except ValueError as exc:
        print(f"hipda: {exc}", file=sys.stderr)
        return 2
    print(f"Saved {label} to {path}")
    return 0


def _print_threads(html: str, limit: int) -> None:
    """Print one tab-separated line per thread parsed from a forum listing."""
    for thread in parse_forum_listing(html, base_url=BASE_URL)[:limit]:
        stats = ""
        if thread.replies is not None and thread.views is not None:
            stats = f" {thread.replies}/{thread.views}"
        last = f" last: {thread.last_author} {thread.last_at}".rstrip() if thread.last_author else ""
        print(f"{thread.tid}\t{thread.title}\t{thread.author} {thread.created_at}{stats}{last}")


def _print_posts(html: str) -> None:
    """Print every post of a thread page: a numbered header, the content, a blank line."""
    for index, post in enumerate(parse_thread(html), start=1):
        print(f"#{index} {post.author} {post.published_at}".rstrip())
        print(post.content)
        print()


def run(args: argparse.Namespace) -> int:
    """Execute the parsed command and return the process exit code.

    Returns 0 on success and 2 on any reported failure (missing command,
    failed login import, empty value to save, fetch error, or a page that
    requires login).

    Raises:
        AssertionError: If the namespace names a command this function does
            not handle.
    """
    if args.command is None:
        # Reachable when only global options are given (the sub-command is
        # optional); report it instead of failing with an AssertionError.
        print("hipda: missing command; run `hipda --help` for usage.", file=sys.stderr)
        return 2

    if args.command == "login":
        return _run_login()

    if args.command == "auth" and args.auth_command == "save-cookie":
        return _save_setting(args.cookie, save_cookie, "cookie")

    if args.command == "auth" and args.auth_command == "save-user-agent":
        return _save_setting(args.user_agent, save_user_agent, "user-agent")

    # Built up front so the error path below can tell whether any cookie was
    # configured, even when the fetch itself fails.
    client = HipdaClient.from_env(
        cookie=args.cookie,
        user_agent=args.user_agent,
        ca_file=args.ca_file,
        insecure_tls=args.insecure_tls or not args.verify_tls,
        verify_tls=args.verify_tls,
    )

    command = args.discovery_command if args.command == "discovery" else args.command
    if command not in {"list", "read"}:
        raise AssertionError(f"Unhandled command: {args}")

    if command == "list":
        path = "forumdisplay.php"
        params: dict[str, str | int] = {"fid": 2, "page": args.page}
    else:
        path = "viewthread.php"
        params = _thread_params(args.thread, args.page)

    try:
        html, client = load_discovery_page(
            page=args.page,
            path=path,
            params=params,
            cookie=args.cookie,
            user_agent=args.user_agent,
            ca_file=args.ca_file,
            insecure_tls=args.insecure_tls or not args.verify_tls,
            verify_tls=args.verify_tls,
        )
    except HipdaClientError as exc:
        print(f"hipda: {exc}", file=sys.stderr)
        if not client.cookie:
            print("hipda: set HIPDA_COOKIE or pass --cookie with a logged-in 4D4Y Cookie header.", file=sys.stderr)
        return 2

    if is_login_required_page(html):
        return _report_login_required()

    if command == "list":
        _print_threads(html, args.limit)
    else:
        _print_posts(html)
    return 0
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: print help when no arguments are given, otherwise run.

    *argv* defaults to ``sys.argv[1:]``. Returns the process exit code.
    """
    parser = build_parser()
    arguments = sys.argv[1:] if argv is None else argv
    if arguments:
        return run(parser.parse_args(arguments))
    parser.print_help()
    return 0
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
if __name__ == "__main__":
|
|
236
|
+
raise SystemExit(main())
|
hipda_cli/client.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import ssl
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from urllib.error import HTTPError, URLError
|
|
7
|
+
from urllib.parse import urlencode
|
|
8
|
+
from urllib.request import Request, urlopen
|
|
9
|
+
|
|
10
|
+
from .auth import load_cookie, load_user_agent
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
BASE_URL = "https://www.4d4y.com/forum/"
|
|
14
|
+
DEFAULT_USER_AGENT = (
|
|
15
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
|
16
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class HipdaClientError(RuntimeError):
    """Error raised by ``HipdaClient`` when a page cannot be fetched."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class HipdaClient:
    """Minimal HTTP client for fetching forum pages as text."""

    # Value of the Cookie request header; the header is omitted when empty.
    cookie: str = ""
    # Value of the User-Agent request header.
    user_agent: str = DEFAULT_USER_AGENT
    # Optional PEM CA bundle used when certificate verification is enabled.
    ca_file: str | None = None
    # When True, certificate and hostname verification are disabled.
    insecure_tls: bool = True
    # Prefix prepended to every requested path.
    base_url: str = BASE_URL
    # Timeout in seconds passed to urlopen.
    timeout: float = 20.0

    @classmethod
    def from_env(
        cls,
        cookie: str | None = None,
        user_agent: str | None = None,
        ca_file: str | None = None,
        insecure_tls: bool = True,
        verify_tls: bool = False,
    ) -> "HipdaClient":
        """Create a client from explicit values, environment variables and saved files.

        Each setting uses the first non-empty source: the argument, then the
        ``HIPDA_COOKIE`` / ``HIPDA_USER_AGENT`` / ``HIPDA_CA_FILE`` environment
        variable, then (cookie and user agent only) the saved file, then the
        built-in default user agent. Verification is disabled only when
        *verify_tls* is false and either *insecure_tls* is true or
        ``HIPDA_INSECURE_TLS`` is ``1``/``true``/``yes``.
        """
        return cls(
            cookie=cookie or os.environ.get("HIPDA_COOKIE", "") or load_cookie(),
            user_agent=user_agent or os.environ.get("HIPDA_USER_AGENT", "") or load_user_agent() or DEFAULT_USER_AGENT,
            ca_file=ca_file or os.environ.get("HIPDA_CA_FILE"),
            insecure_tls=(
                not verify_tls
                and (insecure_tls or os.environ.get("HIPDA_INSECURE_TLS", "").lower() in {"1", "true", "yes"})
            ),
        )

    def ssl_context(self) -> ssl.SSLContext | None:
        """Return the SSL context for requests, or None for urlopen's default.

        With ``insecure_tls`` the context performs no certificate or hostname
        checks; otherwise a context trusting ``ca_file`` is returned when one
        is configured.
        """
        if self.insecure_tls:
            context = ssl.create_default_context()
            # check_hostname must be turned off before verify_mode can be CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return context
        if self.ca_file:
            return ssl.create_default_context(cafile=self.ca_file)
        return None

    def get(self, path: str, params: dict[str, str | int] | None = None) -> str:
        """GET ``base_url + path`` with optional query *params* and return the decoded body.

        The body is decoded with the charset from the response headers
        (UTF-8 when absent or unknown), replacing undecodable bytes.

        Raises:
            HipdaClientError: On HTTP error statuses, TLS verification
                failures, and other network errors, including timeouts or
                connection resets while reading the body.
        """
        url = self.base_url + path
        if params:
            url = f"{url}?{urlencode(params)}"

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "User-Agent": self.user_agent,
        }
        if self.cookie:
            headers["Cookie"] = self.cookie

        try:
            with urlopen(Request(url, headers=headers), timeout=self.timeout, context=self.ssl_context()) as response:
                body = response.read()
                encoding = response.headers.get_content_charset() or "utf-8"
            try:
                return body.decode(encoding, errors="replace")
            except LookupError:
                # The server announced a charset Python does not know.
                return body.decode("utf-8", errors="replace")
        except HTTPError as exc:
            body = exc.read().decode("utf-8", errors="replace")[:300]
            raise HipdaClientError(f"HTTP {exc.code} fetching {url}: {body}") from exc
        except URLError as exc:
            if isinstance(exc.reason, ssl.SSLCertVerificationError):
                raise HipdaClientError(
                    f"Could not verify TLS certificate for {url}: {exc.reason}. "
                    "If you use a trusted local proxy, pass --ca-file /path/to/root.pem. "
                    "As a last resort, pass --insecure-tls."
                ) from exc
            raise HipdaClientError(f"Could not fetch {url}: {exc.reason}") from exc
        except OSError as exc:
            # Timeouts and connection resets raised while reading the body are
            # plain OSErrors, not URLErrors; report them the same way.
            raise HipdaClientError(f"Could not fetch {url}: {exc}") from exc
|
hipda_cli/models.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass(frozen=True)
class ThreadSummary:
    """Immutable summary of one thread entry from a forum listing page."""

    # Required identification fields.
    tid: str
    title: str
    url: str

    # Optional details; they keep these defaults when not supplied.
    author: str = ""
    created_at: str = ""
    replies: int | None = None
    views: int | None = None
    last_author: str = ""
    last_at: str = ""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class Post:
    """Immutable record of one post: its author, publication time text and content."""

    author: str
    published_at: str
    content: str
|
|
24
|
+
|
hipda_cli/parser.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from urllib.parse import parse_qs, urljoin, urlparse
|
|
5
|
+
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
|
|
8
|
+
from .models import Post, ThreadSummary
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
THREAD_RE = re.compile(r"(?:^|[?&])tid=(\d+)")
|
|
12
|
+
WHITESPACE_RE = re.compile(r"[ \t\r\f\v]+")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def clean_text(value: str) -> str:
    """Normalize whitespace in *value* line by line.

    Non-breaking spaces become regular spaces, runs of horizontal whitespace
    collapse to one space, each line is stripped, and empty lines are dropped.
    """
    normalized = value.replace("\xa0", " ")
    collapsed = (WHITESPACE_RE.sub(" ", raw_line).strip() for raw_line in normalized.splitlines())
    return "\n".join(piece for piece in collapsed if piece)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _tid_from_href(href: str) -> str | None:
    """Extract the thread id from a link.

    The ``tid`` query parameter is preferred; if it is missing or empty, the
    ``THREAD_RE`` pattern is searched in the raw href. Returns None when no
    id is found.
    """
    query_values = parse_qs(urlparse(href).query).get("tid", [None])
    if query_values[0]:
        return query_values[0]
    found = THREAD_RE.search(href)
    if found is None:
        return None
    return found.group(1)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _split_cell_lines(cell) -> list[str]:
    """Return the cleaned, non-empty text lines of a table cell."""
    raw_text = cell.get_text("\n")
    return clean_text(raw_text).splitlines()
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _parse_counts(value: str) -> tuple[int | None, int | None]:
    """Parse the first ``<digits> / <digits>`` pair in *value* into two ints.

    Returns ``(None, None)`` when no such pair is present.
    """
    found = re.search(r"(\d+)\s*/\s*(\d+)", value)
    if found:
        first, second = (int(number) for number in found.groups())
        return first, second
    return None, None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def parse_forum_listing(html: str, base_url: str) -> list[ThreadSummary]:
    """Extract thread summaries from a forum listing page.

    Every ``viewthread.php`` link with a thread id and non-empty text yields
    one summary; only the first link per thread id is used. When the link
    sits in a table row, the cells after the link's own cell are read in
    order as author/creation time, reply/view counts and last poster/time.
    """
    document = BeautifulSoup(html, "html.parser")
    summaries: list[ThreadSummary] = []
    known_tids: set[str] = set()

    for link in document.find_all("a", href=True):
        href = link["href"]
        if "viewthread.php" not in href:
            continue

        tid = _tid_from_href(href)
        title = clean_text(link.get_text())
        if not (tid and title) or tid in known_tids:
            continue

        author = created_at = last_author = last_at = ""
        replies = views = None

        row = link.find_parent("tr")
        if row:
            cells = row.find_all(["td", "th"], recursive=False)
            title_cell = link.find_parent(["td", "th"])
            # Only cells to the right of the one holding the link carry details.
            following = cells[cells.index(title_cell) + 1 :] if title_cell in cells else []

            if following:
                author_lines = _split_cell_lines(following[0])
                # Pad so a missing author or date falls back to "".
                author, created_at = (author_lines + ["", ""])[:2]
            if len(following) > 1:
                replies, views = _parse_counts(clean_text(following[1].get_text(" ")))
            if len(following) > 2:
                last_lines = _split_cell_lines(following[2])
                last_author, last_at = (last_lines + ["", ""])[:2]

        known_tids.add(tid)
        summary = ThreadSummary(
            tid=tid,
            title=title,
            url=urljoin(base_url, href),
            author=author,
            created_at=created_at,
            replies=replies,
            views=views,
            last_author=last_author,
            last_at=last_at,
        )
        summaries.append(summary)

    return summaries
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def is_login_required_page(html: str) -> bool:
    """Return True when the page text contains one of the forum's login/permission notices."""
    page_text = clean_text(BeautifulSoup(html, "html.parser").get_text("\n"))
    notices = ("您还未登录", "无权访问该版块")
    return any(notice in page_text for notice in notices)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def parse_thread(html: str) -> list[Post]:
    """Extract the posts of a thread page.

    Each element whose id starts with ``post_<digits>`` and that contains a
    message node becomes one ``Post``; author and publication time fall back
    to empty strings when their nodes are not found.
    """
    document = BeautifulSoup(html, "html.parser")
    collected: list[Post] = []
    post_id_pattern = re.compile(r"^post_\d+")

    for block in document.find_all(id=post_id_pattern):
        body = block.select_one(".t_msgfont") or block.select_one("[id^=postmessage_]")
        if not body:
            continue

        author_tag = block.select_one(".postauthor > .postinfo a") or block.select_one(".postauthor a")
        if not author_tag:
            wrapper = block.select_one(".postauthor")
            # A bare table cell is not accepted as the author element.
            if wrapper and wrapper.name != "td":
                author_tag = wrapper

        # Try the selectors from most to least specific; keep the first match.
        info_tag = None
        for selector in (
            ".authorinfo [id^=authorposton]",
            ".postcontent .postinfo",
            ".postinfo",
        ):
            info_tag = block.select_one(selector)
            if info_tag:
                break

        published_at = ""
        if info_tag:
            info_text = clean_text(info_tag.get_text("\n"))
            if info_text:
                first_line = info_text.splitlines()[0]
                published_at = re.sub(r"^发表于\s*", "", first_line).strip()

        author_name = clean_text(author_tag.get_text()) if author_tag else ""
        collected.append(
            Post(
                author=author_name,
                published_at=published_at,
                content=clean_text(body.get_text("\n")),
            )
        )

    return collected
|