ituring-fetch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ituring_fetch/__init__.py +1 -0
- ituring_fetch/api.py +88 -0
- ituring_fetch/auth.py +147 -0
- ituring_fetch/cli.py +147 -0
- ituring_fetch/converter.py +151 -0
- ituring_fetch/debug.py +36 -0
- ituring_fetch/models.py +17 -0
- ituring_fetch/py.typed +0 -0
- ituring_fetch/scraper.py +79 -0
- ituring_fetch-0.1.0.dist-info/METADATA +71 -0
- ituring_fetch-0.1.0.dist-info/RECORD +13 -0
- ituring_fetch-0.1.0.dist-info/WHEEL +4 -0
- ituring_fetch-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""ituring-fetch: CLI tool to scrape purchased ebooks from ituring.com.cn."""
|
ituring_fetch/api.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""API client for ituring.com.cn REST API."""
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import httpx
|
|
5
|
+
|
|
6
|
+
from .debug import log, log_request
|
|
7
|
+
from .models import Book, Chapter
|
|
8
|
+
|
|
9
|
+
BASE_URL = "https://api.ituring.com.cn/api"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class IturingAPI:
    """Small HTTP client for the ituring.com.cn REST API.

    Wraps an ``httpx.Client`` bound to ``BASE_URL``. Authentication is sent
    as a Bearer token header (when ``access_token`` is non-empty) and as
    cookies (when ``cookies`` is given).

    The public ``get_*`` methods never raise for transport errors, non-200
    responses, or malformed payloads: they log the problem through the debug
    logger and return ``None`` / ``[]`` as their docstrings state.

    Instances can be used as context managers so the connection pool is
    released deterministically; ``close()`` does the same explicitly.
    """

    def __init__(self, access_token: str = "", cookies: list[dict[str, Any]] | None = None):
        """Build the underlying HTTP client.

        Args:
            access_token: Bearer token. When empty, no Authorization header
                is sent.
            cookies: Cookie dicts with at least ``"name"`` and ``"value"``
                keys; ``"domain"`` and ``"path"`` are optional.
        """
        headers: dict[str, str] = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        }
        if access_token:
            headers["Authorization"] = f"Bearer {access_token}"

        self.client = httpx.Client(
            base_url=BASE_URL,
            timeout=30.0,
            headers=headers,
        )
        # Also set cookies for dual auth
        for cookie in cookies or []:
            self.client.cookies.set(
                name=cookie["name"],
                value=cookie["value"],
                domain=cookie.get("domain", ""),
                path=cookie.get("path", "/"),
            )

    def close(self) -> None:
        """Close the underlying HTTP client and its connections."""
        self.client.close()

    def __enter__(self) -> "IturingAPI":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def _get(self, url: str, params: dict[str, Any] | None = None) -> "httpx.Response | None":
        """Issue a GET request; return None (and log) on a transport error."""
        try:
            return self.client.get(url, params=params)
        except httpx.HTTPError as exc:
            log(f"GET {url} failed: {exc!r}")
            return None

    @staticmethod
    def _decode_json(resp: "httpx.Response") -> Any:
        """Decode a JSON response body; return None (and log) if it is not JSON."""
        try:
            return resp.json()
        except ValueError as exc:  # json.JSONDecodeError is a ValueError
            log(f"response body is not valid JSON: {exc}")
            return None

    def _parse_book(self, data: dict[str, Any]) -> Book:
        """Convert a ``/Book/{id}`` payload into a ``Book``.

        Raises:
            KeyError: If ``id``/``name`` (book) or ``id``/``subject``
                (chapter) are missing from the payload.
        """
        ebook = data.get("ebook") or {}
        # "chapters" may be absent or null; treat both as "no chapters".
        chapters = [
            Chapter(
                id=ch_data["id"],
                subject=ch_data["subject"],
                is_free=ch_data.get("isFree", True),
                is_completed=ch_data.get("isCompleted", True),
            )
            for ch_data in ebook.get("chapters") or []
        ]
        return Book(
            id=data["id"],
            name=data["name"],
            author=data.get("authorNameString", ""),
            chapters=chapters,
        )

    def get_book(self, book_id: int) -> Book | None:
        """Fetch book details by ID. Returns None on failure."""
        url = f"/Book/{book_id}"
        log_request("GET", url)
        resp = self._get(url)
        if resp is None:
            return None
        log_request("GET", url, resp.status_code, resp.text[:500] if resp.status_code != 200 else "")
        if resp.status_code != 200:
            return None

        data = self._decode_json(resp)
        if not isinstance(data, dict):
            return None
        try:
            return self._parse_book(data)
        except (KeyError, TypeError, AttributeError) as exc:
            log(f"unexpected /Book payload shape: {exc!r}")
            return None

    def get_shelf_ebooks(self, page: int = 1) -> list[Book]:
        """Fetch purchased ebooks list. Returns empty list on failure."""
        url = "/User/ShelfEBook"
        params = {"page": page, "query": "", "desc": "true"}
        log_request("GET", f"{url}?page={page}")
        resp = self._get(url, params=params)
        if resp is None:
            return []
        log_request("GET", url, resp.status_code, resp.text[:500])
        if resp.status_code != 200:
            return []

        data = self._decode_json(resp)
        if not isinstance(data, dict):
            return []
        # "bookItems" may be absent or null; treat both as an empty shelf.
        items = data.get("bookItems") or []
        log(f"ShelfEBook response keys: {list(data.keys())}, bookItems count: {len(items)}")
        try:
            return [
                Book(
                    id=item["id"],
                    name=item["name"],
                    author=item.get("authorNameString", ""),
                )
                for item in items
            ]
        except (KeyError, TypeError, AttributeError) as exc:
            log(f"unexpected ShelfEBook payload shape: {exc!r}")
            return []

    def get_profile(self) -> dict[str, Any] | None:
        """Fetch user profile. Returns None if not logged in."""
        log_request("GET", "/User/Profile")
        resp = self._get("/User/Profile")
        if resp is None:
            return None
        log_request("GET", "/User/Profile", resp.status_code, resp.text[:200])
        if resp.status_code != 200:
            return None
        data = self._decode_json(resp)
        return data if isinstance(data, dict) else None
|
ituring_fetch/auth.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Authentication: credential persistence and login/logout operations."""
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .debug import log, log_step
|
|
7
|
+
|
|
8
|
+
CREDS_DIR = Path.home() / ".ituring"
|
|
9
|
+
CREDS_FILE = CREDS_DIR / "credentials.json"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _ensure_dir() -> None:
    """Create the credentials directory if it does not exist yet.

    The directory stores login tokens, so it is created with owner-only
    permissions (0700, subject to the process umask). An already existing
    directory is left untouched.
    """
    CREDS_DIR.mkdir(mode=0o700, parents=True, exist_ok=True)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def save_credentials(cookies: list[dict[str, Any]], access_token: str = "",
|
|
17
|
+
refresh_token: str = "", user_id: str = "",
|
|
18
|
+
path: Path | None = None) -> None:
|
|
19
|
+
"""Save credentials (cookies + tokens) to JSON file."""
|
|
20
|
+
target = path or CREDS_FILE
|
|
21
|
+
target.parent.mkdir(parents=True, exist_ok=True)
|
|
22
|
+
data = {
|
|
23
|
+
"cookies": cookies,
|
|
24
|
+
"access_token": access_token,
|
|
25
|
+
"refresh_token": refresh_token,
|
|
26
|
+
"user_id": user_id,
|
|
27
|
+
}
|
|
28
|
+
target.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_credentials(path: Path | None = None) -> dict[str, Any]:
|
|
32
|
+
"""Load credentials from JSON file. Returns empty dict if file missing."""
|
|
33
|
+
target = path or CREDS_FILE
|
|
34
|
+
if not target.exists():
|
|
35
|
+
return {}
|
|
36
|
+
return json.loads(target.read_text(encoding="utf-8"))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def load_cookies() -> list[dict[str, Any]]:
    """Return the ``"cookies"`` entry of the saved credentials.

    Falls back to an empty list when the entry is absent.
    """
    return load_credentials().get("cookies", [])
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def load_access_token() -> str:
    """Return the ``"access_token"`` entry of the saved credentials.

    Falls back to an empty string when the entry is absent.
    """
    return load_credentials().get("access_token", "")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def delete_credentials(path: Path | None = None) -> None:
|
|
52
|
+
"""Delete the credentials file."""
|
|
53
|
+
target = path or CREDS_FILE
|
|
54
|
+
target.unlink(missing_ok=True)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def credentials_exist() -> bool:
    """Tell whether something exists at the ``CREDS_FILE`` path."""
    present = CREDS_FILE.exists()
    return present
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _read_local_storage(page: Any, key: str) -> str:
    """Return the page's localStorage value for *key*, or "" when unset."""
    return page.evaluate("(key) => localStorage.getItem(key) || ''", key)


def login() -> None:
    """Open a browser for manual login and save cookies + tokens on success.

    Flow: open the login page, wait (up to 5 minutes) until the URL no
    longer contains "login", then read ``accessToken``, ``refreshToken`` and
    ``userID`` from localStorage. If the access token is not there yet, all
    three values are re-read once after a further 5 seconds.

    Credentials are only written when an access token was obtained, so an
    aborted or incomplete login does not overwrite previously saved, still
    valid credentials. Browser errors (timeout, window closed) are reported
    to the user; errors while writing the credentials file propagate.
    """
    from playwright.sync_api import Error as PlaywrightError
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(channel="chrome", headless=False)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
            )
            page = context.new_page()

            page.goto("https://www.ituring.com.cn/login", wait_until="domcontentloaded")
            log_step("已打开登录页面")
            print("已打开登录页面,请在浏览器中完成登录...")
            print("登录成功后,工具将自动保存凭据。")

            try:
                # Wait for navigation away from login page
                page.wait_for_url(
                    lambda url: "login" not in url,
                    timeout=300_000,
                )
                # Wait for SPA to initialize and tokens to be stored
                page.wait_for_timeout(3000)

                # Extract tokens from localStorage
                log_step("登录重定向完成,正在提取 token...")
                access_token = _read_local_storage(page, "accessToken")
                refresh_token = _read_local_storage(page, "refreshToken")
                user_id = _read_local_storage(page, "userID")
                log(f"accessToken: {'有' if access_token else '无'} (len={len(access_token)})")
                log(f"refreshToken: {'有' if refresh_token else '无'}")
                log(f"userID: {user_id}")

                if not access_token:
                    # Retry after a short wait — SPA might still be initializing.
                    # Re-read all three values: if the access token was not
                    # ready, the other two were most likely read too early as well.
                    log_step("token 未就绪,等待 5s 重试...")
                    page.wait_for_timeout(5000)
                    access_token = _read_local_storage(page, "accessToken")
                    refresh_token = _read_local_storage(page, "refreshToken")
                    user_id = _read_local_storage(page, "userID")
                    log(f"重试后 accessToken: {'有' if access_token else '无'}")

                cookies = list(context.cookies())
            except PlaywrightError as e:
                # Covers the wait timeout and the user closing the window.
                print(f"登录超时或已取消: {e}")
                return

            if not access_token:
                print("登录可能未完成(未检测到 access token),请重试。")
                return

            log(f"保存了 {len(cookies)} 个 cookie, token={bool(access_token)}")
            save_credentials(cookies, access_token, refresh_token, user_id)
            print("登录成功!凭据已保存。")
        finally:
            browser.close()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def logout() -> None:
    """Remove the saved credentials file and confirm on stdout."""
    delete_credentials()
    print("已登出,凭据已删除。")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def status() -> int:
    """Check the login state and print it.

    Returns:
        0 when the saved access token is accepted by the profile endpoint,
        1 when no token is saved or the profile request fails.
    """
    creds = load_credentials()
    if not (creds and creds.get("access_token")):
        print("未登录。请执行: ituring-fetch login")
        return 1

    # Imported here rather than at module top, as in the original layout.
    from .api import IturingAPI

    client = IturingAPI(access_token=creds["access_token"], cookies=creds.get("cookies", []))
    profile = client.get_profile()
    if profile is None:
        print("登录已过期,请重新执行: ituring-fetch login")
        return 1

    # Prefer the nickname; fall back to the numeric user id.
    display_name = profile.get("nickName") or str(profile.get("userId", "未知"))
    print(f"已登录: {display_name}")
    return 0
|
ituring_fetch/cli.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""CLI entry point for ituring-fetch."""
|
|
2
|
+
import tempfile
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.progress import Progress
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
from .auth import login as do_login, logout as do_logout, status as do_status, load_cookies, load_access_token
|
|
11
|
+
from .api import IturingAPI
|
|
12
|
+
from .converter import assemble_html, check_pandoc, convert
|
|
13
|
+
from .debug import log_step
|
|
14
|
+
from .scraper import fetch_all_chapters
|
|
15
|
+
|
|
16
|
+
console = Console()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Root click command group; the subcommands below attach to it through
# @main.command(). The docstring is shown by click as the --help text, so it
# is user-facing and kept in its original language.
@click.group()
def main() -> None:
    """ituring-fetch: 从图灵社区抓取已购电子书并转换为 epub/pdf/html。"""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# "login" subcommand: delegates to auth.login (imported as do_login).
# The docstring is click's user-facing help text.
@main.command()
def login() -> None:
    """打开浏览器窗口让使用者登录图灵社区。"""
    do_login()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# "logout" subcommand: delegates to auth.logout (imported as do_logout).
# The docstring is click's user-facing help text.
@main.command()
def logout() -> None:
    """删除保存的图灵社区账号信息。"""
    do_logout()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# "status" subcommand: delegates to auth.status (imported as do_status).
# The docstring is click's user-facing help text.
@main.command()
def status() -> None:
    """检查登录状态。"""
    # do_status() returns 0 when logged in and 1 otherwise. Propagate a
    # non-zero result as the process exit code so scripts can test it.
    exit_code = do_status()
    if exit_code:
        raise SystemExit(exit_code)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# "list" subcommand. The function is called list_command so it does not
# shadow the built-in `list`. The command name is given explicitly because
# the name click derives from `list_command` differs between click versions
# (older 8.x releases expose "list-command"; 8.2+ strips the "_command"
# suffix), which previously required a separate add_command() alias.
# The docstring is click's user-facing help text.
@main.command(name="list")
def list_command() -> None:
    """罗列已购电子书,呈现书籍名称和唯一ID。"""
    cookies = load_cookies()
    token = load_access_token()
    if not token:
        console.print("[red]未登录,请先执行: ituring-fetch login[/red]")
        return

    api = IturingAPI(access_token=token, cookies=cookies)
    log_step("正在获取已购电子书列表...")
    # NOTE(review): only the first shelf page (page=1 default) is requested.
    books = api.get_shelf_ebooks()

    if not books:
        console.print("[yellow]没有找到已购电子书,或登录已过期。[/yellow]")
        return

    table = Table(title="已购电子书")
    table.add_column("ID", style="cyan", width=6)
    table.add_column("书名", style="green")
    table.add_column("作者", style="dim")

    for book in books:
        table.add_row(str(book.id), book.name, book.author)

    console.print(table)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _safe_filename(name: str, fallback: str) -> str:
    """Return *name* made safe for use as a single file name.

    Path separators, characters reserved on common filesystems, and control
    characters are replaced by "_"; surrounding spaces and dots are
    stripped. *fallback* is returned when nothing usable remains.
    """
    import re

    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name).strip(" .")
    return cleaned or fallback


# "fetch" subcommand: download every chapter of BOOK_ID, merge them into one
# HTML document and convert it to the requested format.
#
# Arguments (as wired by click):
#   book_id      integer id as printed by the "list" command
#   output_type  "epub", "pdf" or "html" (--type, required)
#   output_file  target path (--output/-o); defaults to "<book name>.<type>"
#                in the current directory, with the name made filename-safe
#
# The docstring is click's user-facing help text.
@main.command()
@click.argument("book_id", type=int)
@click.option("--type", "output_type", type=click.Choice(["epub", "pdf", "html"]), required=True, help="输出格式")
@click.option("--output", "-o", "output_file", type=click.Path(), default=None, help="输出文件名")
def fetch(book_id: int, output_type: str, output_file: str | None) -> None:
    """爬取 ID 对应书籍,生成指定格式的单一文件。"""
    # Only the epub path shells out to pandoc; pdf is rendered by Playwright
    # and html is a plain copy, so neither needs pandoc installed.
    if output_type == "epub" and not check_pandoc():
        console.print("[red]请先安装 pandoc: sudo apt install pandoc[/red]")
        return

    cookies = load_cookies()
    token = load_access_token()
    if not token:
        console.print("[red]未登录,请先执行: ituring-fetch login[/red]")
        return

    api = IturingAPI(access_token=token, cookies=cookies)
    log_step(f"正在获取书籍信息 (ID: {book_id})...")
    console.print(f"正在获取书籍信息 (ID: {book_id})...")
    book = api.get_book(book_id)

    if book is None:
        console.print(f"[red]未找到该书籍 (ID: {book_id}),请用 list 命令确认[/red]")
        return

    console.print(f"书名: [green]{book.name}[/green]")
    console.print(f"作者: {book.author}")
    console.print(f"章节数: {len(book.chapters)}")

    chapter_ids = [ch.id for ch in book.chapters]

    console.print("\n正在抓取章节内容...")
    with Progress() as progress:
        task = progress.add_task("抓取中...", total=len(chapter_ids))

        def update_progress(current: int, total: int) -> None:
            progress.update(task, completed=current, description=f"抓取中... ({current}/{total})")

        chapter_html_list = fetch_all_chapters(cookies, book_id, chapter_ids, update_progress)

    # Pair chapters with their HTML
    chapters_data = []
    for ch, html_content in zip(book.chapters, chapter_html_list):
        if not html_content:
            console.print(f"[yellow] 警告: 章节 '{ch.subject}' 内容为空[/yellow]")
        chapters_data.append({"subject": ch.subject, "html": html_content})

    valid_chapters = [c for c in chapters_data if c["html"]]
    if not valid_chapters:
        console.print("[red]所有章节抓取失败,无法生成文件[/red]")
        return

    console.print(f"\n成功抓取 {len(valid_chapters)}/{len(chapters_data)} 个章节")

    # Assemble HTML
    metadata = {"title": book.name, "author": book.author}
    full_html = assemble_html(valid_chapters, metadata)

    # Determine output path
    if output_file:
        out_path = Path(output_file)
    else:
        # Book titles can contain "/" and similar characters, which would
        # otherwise be read as directories or be rejected by the filesystem.
        out_path = Path(f"{_safe_filename(book.name, f'book_{book_id}')}.{output_type}")

    # delete=False: the converter reopens the file by path after it is closed.
    with tempfile.NamedTemporaryFile(suffix=".html", mode="w", encoding="utf-8", delete=False) as f:
        temp_html = Path(f.name)
        f.write(full_html)

    try:
        console.print(f"\n正在转换为 {output_type} 格式...")
        convert(temp_html, out_path, output_type, metadata, cookies)
        console.print(f"[green]✓ 已生成: {out_path}[/green]")
    finally:
        temp_html.unlink(missing_ok=True)
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""HTML assembly and format conversion (epub via pandoc, pdf via Playwright)."""
|
|
2
|
+
import base64
|
|
3
|
+
import html as html_module
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
from playwright.sync_api import sync_playwright
|
|
12
|
+
|
|
13
|
+
from .debug import log, log_step
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def check_pandoc() -> bool:
    """Report whether a ``pandoc`` executable can be found on PATH."""
    pandoc_path = shutil.which("pandoc")
    return pandoc_path is not None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def assemble_html(chapters: list[dict[str, str]], metadata: dict[str, str]) -> str:
    """Merge chapter fragments into one standalone HTML document.

    The result contains the title and author, a table of contents linking
    to ``#ch<index>`` anchors, and then each chapter as an ``<h2>`` heading
    followed by its HTML. Title, author and chapter subjects are
    HTML-escaped; chapter HTML is inserted verbatim.

    Args:
        chapters: List of {"subject": str, "html": str}
        metadata: {"title": str, "author": str}; missing keys default to "".

    Returns:
        The complete HTML document as a string.
    """
    title = html_module.escape(metadata.get("title", ""))
    author = html_module.escape(metadata.get("author", ""))

    toc_parts: list[str] = []
    body_parts: list[str] = []
    for index, chapter in enumerate(chapters):
        heading = html_module.escape(chapter["subject"])
        anchor = f"ch{index}"
        toc_parts.append(f'<li><a href="#{anchor}">{heading}</a></li>')
        body_parts.extend((f'<h2 id="{anchor}">{heading}</h2>', chapter["html"]))

    toc_html = "".join(toc_parts)
    body_html = "".join(body_parts)

    return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>{title}</title>
<style>
body {{ font-family: "Noto Serif CJK SC", "Source Han Serif SC", serif; line-height: 1.8; max-width: 800px; margin: 0 auto; padding: 2em; }}
h1 {{ text-align: center; }}
h2 {{ margin-top: 2em; border-bottom: 1px solid #ccc; padding-bottom: 0.3em; }}
.toc {{ background: #f5f5f5; padding: 1em 2em; margin: 1em 0; }}
img {{ max-width: 100%; }}
pre {{ background: #f4f4f4; padding: 1em; overflow-x: auto; }}
code {{ font-family: "Noto Mono", "Source Code Pro", monospace; font-size: 0.9em; }}
</style>
</head>
<body>
<h1>{title}</h1>
<p style="text-align:center;color:#888;">{author}</p>

<div class="toc">
<h3>目录</h3>
<ol>
{toc_html}
</ol>
</div>

{body_html}
</body>
</html>"""
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def convert(html_path: Path, output_path: Path, output_type: str,
|
|
72
|
+
metadata: dict[str, str], cookies: list[dict[str, Any]] | None = None) -> None:
|
|
73
|
+
"""Convert HTML file to target format.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
html_path: Input HTML file path
|
|
77
|
+
output_path: Output file path
|
|
78
|
+
output_type: "epub", "pdf", or "html"
|
|
79
|
+
metadata: {"title": str, "author": str}
|
|
80
|
+
cookies: Playwright-format cookies for authenticated image fetch
|
|
81
|
+
"""
|
|
82
|
+
if output_type == "html":
|
|
83
|
+
shutil.copy(html_path, output_path)
|
|
84
|
+
return
|
|
85
|
+
|
|
86
|
+
if output_type == "pdf":
|
|
87
|
+
_convert_pdf_playwright(html_path, output_path, cookies or [])
|
|
88
|
+
return
|
|
89
|
+
|
|
90
|
+
if output_type == "epub":
|
|
91
|
+
_convert_epub(html_path, output_path, metadata, cookies or [])
|
|
92
|
+
return
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _convert_epub(html_path: Path, output_path: Path, metadata: dict[str, str],
                  cookies: list[dict[str, Any]]) -> None:
    """Convert HTML to EPUB by running the ``pandoc`` executable.

    pandoc is first run with its output captured. If that run exits
    non-zero, its stderr is written to the debug log and pandoc is run a
    second time with ``--extract-media=/dev/null`` appended.

    Args:
        html_path: Input HTML file.
        output_path: EPUB file to write.
        metadata: ``"title"`` and ``"author"`` are passed via ``--metadata``;
            the author flag is omitted when the author is empty.
        cookies: Accepted for signature parity with the PDF converter; not
            used by this function.

    Raises:
        subprocess.CalledProcessError: If the second pandoc run also fails.
    """
    book_title = metadata.get("title", "")
    book_author = metadata.get("author", "")

    cmd = [
        "pandoc",
        str(html_path),
        "-o", str(output_path),
        "--from=html",
        "--to=epub",
        "--metadata", f"title={book_title}",
    ]
    if book_author:
        cmd += ["--metadata", f"author={book_author}"]

    # Allow image fetch failures — pandoc will use placeholders
    log_step(f"pandoc epub: {' '.join(cmd)}")
    first_run = subprocess.run(cmd, capture_output=True, text=True)
    if first_run.returncode == 0:
        return

    log(f"pandoc stderr: {first_run.stderr[:1000]}")
    # Retry without image fetching
    # NOTE(review): --extract-media expects a directory; confirm that pointing
    # it at /dev/null really disables image fetching (and what it does on
    # Windows).
    subprocess.run([*cmd, "--extract-media=/dev/null"], check=True)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _convert_pdf_playwright(html_path: Path, output_path: Path,
                            cookies: list[dict[str, Any]]) -> None:
    """Convert HTML to PDF using Playwright's built-in PDF generation.

    The document is loaded into a headless Chrome page, the page is given
    time to fetch remote images, and is then printed as A4 with 2 cm margins.

    Args:
        html_path: Input HTML file (read as UTF-8).
        output_path: PDF file to write.
        cookies: Playwright-format cookies added to the browser context so
            that authenticated image requests succeed; may be empty.
    """
    log_step("使用 Playwright 生成 PDF...")
    html_content = html_path.read_text(encoding="utf-8")

    with sync_playwright() as p:
        browser = p.chromium.launch(channel="chrome", headless=True)
        try:
            context = browser.new_context()
            if cookies:
                context.add_cookies(cookies)  # type: ignore[reportArgumentType]

            page = context.new_page()
            # Inject the markup directly instead of navigating to a base64
            # data: URL. A whole book encoded into a URL can exceed the
            # browser's maximum URL length, and like the data: URL this avoids
            # the file:// restrictions on loading remote images.
            page.set_content(html_content, wait_until="networkidle", timeout=60000)
            page.wait_for_timeout(2000)

            page.pdf(
                path=str(output_path),
                format="A4",
                margin={"top": "2cm", "bottom": "2cm", "left": "2cm", "right": "2cm"},
                print_background=True,
            )
        finally:
            # Close the browser even if loading or printing failed.
            browser.close()
    log_step(f"PDF 已生成: {output_path}")
|
ituring_fetch/debug.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Debug logging controlled by ITURING_DEBUG environment variable.
|
|
2
|
+
|
|
3
|
+
Set ITURING_DEBUG=1 to enable debug output to stderr.
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import time
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _enabled() -> bool:
|
|
11
|
+
return os.environ.get("ITURING_DEBUG", "") in ("1", "true", "yes")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def log(msg: str) -> None:
    """Print *msg* to stderr with a ``[DEBUG HH:MM:SS]`` prefix.

    Does nothing unless debug output is enabled.
    """
    if not _enabled():
        return
    stamp = time.strftime("%H:%M:%S")
    print(f"[DEBUG {stamp}] {msg}", file=sys.stderr)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def log_request(method: str, url: str, status: int | None = None, body: str = "") -> None:
    """Write an HTTP request (and optionally its response) to the debug log.

    Args:
        method: HTTP verb, printed as given.
        url: Request URL or path, printed as given.
        status: Response status code; appended after an arrow when truthy.
        body: Response text; when non-empty, at most its first 500
            characters are printed on a second line, followed by "..." if
            it was truncated.

    Does nothing unless debug output is enabled.
    """
    if not _enabled():
        return

    stamp = time.strftime("%H:%M:%S")
    suffix = ""
    if status:
        suffix = f" → {status}"
    print(f"[DEBUG {stamp}] {method} {url}{suffix}", file=sys.stderr)

    if not body:
        return
    ellipsis = "..." if len(body) > 500 else ""
    preview = body[:500] + ellipsis
    print(f"[DEBUG {stamp}] {preview}", file=sys.stderr)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def log_step(msg: str) -> None:
    """Print a processing-step marker line ("●" + *msg*) to stderr.

    Does nothing unless debug output is enabled.
    """
    if not _enabled():
        return
    stamp = time.strftime("%H:%M:%S")
    print(f"[DEBUG {stamp}] ● {msg}", file=sys.stderr)
|
ituring_fetch/models.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
@dataclass
class Chapter:
    """A single chapter entry of a book."""

    # Chapter identifier; the API client fills it from the payload's "id".
    id: int
    # Chapter title; filled from the payload's "subject".
    subject: str
    # Filled from "isFree"; defaults to True when the payload omits it.
    is_free: bool = True
    # Filled from "isCompleted"; defaults to True when the payload omits it.
    is_completed: bool = True
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Book:
    """A book together with its (possibly empty) list of chapters."""

    # Book identifier, as used in the /Book/{id} API path.
    id: int
    # Book title.
    name: str
    # Author display string; empty when unknown.
    author: str = ""
    # Chapters of the book; a fresh empty list per instance by default.
    chapters: list[Chapter] = field(default_factory=list)
|
ituring_fetch/py.typed
ADDED
|
File without changes
|
ituring_fetch/scraper.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Playwright-based chapter content scraper."""
|
|
2
|
+
import random
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any, Callable
|
|
5
|
+
|
|
6
|
+
from playwright.sync_api import sync_playwright, BrowserContext
|
|
7
|
+
|
|
8
|
+
from .debug import log, log_step
|
|
9
|
+
|
|
10
|
+
CHAPTER_URL = "https://www.ituring.com.cn/book/miniarticle/{chapter_id}?bookID={book_id}&type=minibook"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def create_context(cookies: list[dict[str, Any]]) -> tuple:
    """Create a Playwright browser context with saved cookies.

    Starts the Playwright driver, launches headless Chrome
    (``channel="chrome"``) and opens a context preloaded with *cookies*.

    Args:
        cookies: Playwright-format cookie dicts.

    Returns:
        (playwright, browser, context) — caller must clean up all three.

    Raises:
        Exception: Whatever Playwright raises during launch or context
            setup. Anything already started is shut down before the
            exception propagates, so a failed call leaks no processes.
    """
    p = sync_playwright().start()
    browser = None
    try:
        browser = p.chromium.launch(channel="chrome", headless=True)
        context = browser.new_context(
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
        )
        context.add_cookies(cookies)  # type: ignore[reportArgumentType]
    except BaseException:
        # The caller never receives the handles on failure, so release them
        # here before re-raising.
        if browser is not None:
            browser.close()
        p.stop()
        raise
    return p, browser, context
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def fetch_chapter_html(context: BrowserContext, book_id: int, chapter_id: int) -> str:
    """Fetch HTML content of a single chapter. Returns empty string on failure.

    Opens the chapter page in a new tab, waits for the network to go idle
    plus a further 1.5 s, and returns the inner HTML of the
    ``.article-content`` element.

    Args:
        context: Browser context to open the page in.
        book_id: Book identifier placed in the chapter URL.
        chapter_id: Chapter identifier placed in the chapter URL.

    Returns:
        The chapter's inner HTML. "" when the element is absent or when
        loading/evaluating the page raised; the error is written to the
        debug log in that case.
    """
    url = CHAPTER_URL.format(chapter_id=chapter_id, book_id=book_id)
    page = context.new_page()
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
        page.wait_for_timeout(1500)

        return page.evaluate("""() => {
            const el = document.querySelector('.article-content');
            return el ? el.innerHTML : '';
        }""")
    except Exception as exc:
        # Best-effort by design: one failing chapter must not abort the whole
        # book. Record the reason so it shows up with ITURING_DEBUG.
        log(f"章节 {chapter_id} 抓取失败: {exc!r}")
        return ""
    finally:
        page.close()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def fetch_all_chapters(
    cookies: list[dict[str, Any]], book_id: int, chapter_ids: list[int],
    progress_callback: Callable[[int, int], None] | None = None,
) -> list[str]:
    """Fetch all chapters' HTML content.

    Creates a dedicated browser context, fetches each chapter, then cleans up.
    Sleeps 1-3s between requests to avoid rate limiting.

    Args:
        cookies: Playwright-format cookies for the browser context.
        book_id: Book identifier.
        chapter_ids: Chapter identifiers, fetched in the given order.
        progress_callback: Optional ``callback(done, total)``, invoked after
            each chapter has been fetched with the number of chapters
            processed so far.

    Returns:
        List of HTML strings (one per chapter, empty string for failures).
    """
    total = len(chapter_ids)
    if total == 0:
        # Nothing to fetch; do not launch a browser for an empty list.
        return []

    log_step(f"创建浏览器上下文,准备抓取 {total} 个章节...")
    p, browser, context = create_context(cookies)
    results: list[str] = []

    try:
        for index, ch_id in enumerate(chapter_ids):
            html = fetch_chapter_html(context, book_id, ch_id)
            results.append(html)
            log(f"章节 {ch_id}: {'OK' if html else '空'} ({len(html)} chars)")

            # Report only after the chapter is done, so the reported count
            # is never ahead of the work actually finished.
            if progress_callback:
                progress_callback(index + 1, total)

            # No delay after the final chapter.
            if index < total - 1:
                delay = random.uniform(1.0, 3.0)
                log(f"等待 {delay:.1f}s...")
                time.sleep(delay)
    finally:
        context.close()
        browser.close()
        p.stop()

    return results
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ituring-fetch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI tool to scrape purchased ebooks from ituring.com.cn
|
|
5
|
+
Author-email: yms_hi <yms_hi@Outlook.com>
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: click>=8
|
|
8
|
+
Requires-Dist: httpx>=0.27
|
|
9
|
+
Requires-Dist: playwright>=1.40
|
|
10
|
+
Requires-Dist: rich>=13
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# ituring-fetch
|
|
14
|
+
|
|
15
|
+
把图灵社区已购电子书扒下来,转成 epub/pdf/html。
|
|
16
|
+
|
|
17
|
+
## 安装
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
git clone <repo>
|
|
21
|
+
cd ituring_fetch
|
|
22
|
+
uv sync
|
|
23
|
+
playwright install chrome  # 代码以 channel="chrome" 启动浏览器,需要 Google Chrome;系统已安装 Chrome 时可跳过
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
依赖:Python 3.11+、pandoc(epub 格式需要)、Google Chrome(Playwright 以 `channel="chrome"` 启动,仅安装 Playwright 自带的 Chromium 不够)。
|
|
27
|
+
|
|
28
|
+
## 使用
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# 登录(会弹出浏览器窗口)
|
|
32
|
+
uv run ituring-fetch login
|
|
33
|
+
|
|
34
|
+
# 看看登录没
|
|
35
|
+
uv run ituring-fetch status
|
|
36
|
+
|
|
37
|
+
# 列出已购电子书
|
|
38
|
+
uv run ituring-fetch list
|
|
39
|
+
|
|
40
|
+
# 抓取一本书
|
|
41
|
+
uv run ituring-fetch fetch 1143 --type=epub
|
|
42
|
+
uv run ituring-fetch fetch 1143 --type=pdf
|
|
43
|
+
uv run ituring-fetch fetch 1143 --type=html
|
|
44
|
+
|
|
45
|
+
# 指定输出文件名
|
|
46
|
+
uv run ituring-fetch fetch 1143 --type=pdf -o mybook.pdf
|
|
47
|
+
|
|
48
|
+
# 登出
|
|
49
|
+
uv run ituring-fetch logout
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`list` 输出的第一列就是书籍 ID。
|
|
53
|
+
|
|
54
|
+
## 调试
|
|
55
|
+
|
|
56
|
+
出问题时设个环境变量看详细日志:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
ITURING_DEBUG=1 uv run ituring-fetch fetch 1143 --type=pdf
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
会往 stderr 打请求/响应细节。
|
|
63
|
+
|
|
64
|
+
## 原理
|
|
65
|
+
|
|
66
|
+
登录时用 Playwright 打开浏览器,你手动登录后,程序从 localStorage 里取出 access token,跟 cookie 一起存到 `~/.ituring/credentials.json`。
|
|
67
|
+
|
|
68
|
+
后续操作:
|
|
69
|
+
- `list` 和书籍信息直接调 `api.ituring.com.cn` 的接口,带 Bearer token
|
|
70
|
+
- 章节内容用 Playwright headless 渲染页面,抽 `.article-content` 的 HTML
|
|
71
|
+
- 合并所有章节,pandoc 转 epub,Playwright 内置 PDF 引擎出 pdf
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
ituring_fetch/__init__.py,sha256=2KhedlgmuNnHU09K4VujfJNDvD9HgXik1q7L7loQg1Q,78
|
|
2
|
+
ituring_fetch/api.py,sha256=S-G9zHkStOO-knmodyA0Jd_NnqsOEmsvlEQ883IQ-P0,3180
|
|
3
|
+
ituring_fetch/auth.py,sha256=R-5OFfmdGWGldZnfEvaemEtegATL430Yy3Qtxrsj728,5220
|
|
4
|
+
ituring_fetch/cli.py,sha256=ziy77Co5pItcNS90tzR4i6saDdORt8PLyi_YGwRuG_g,4931
|
|
5
|
+
ituring_fetch/converter.py,sha256=61uO6X1LFATwy6NFMtjCPVtFRRDx1_zpsEWShLgEnRc,4905
|
|
6
|
+
ituring_fetch/debug.py,sha256=9ptYkRhMCrnR1dNNOI7Op-KsAR_E1a2YLs-I8o8Cm4g,1122
|
|
7
|
+
ituring_fetch/models.py,sha256=lZ8ka6wj21OKBACQSGwSrng-rFUOt30YbYfuk1i1z7I,283
|
|
8
|
+
ituring_fetch/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
ituring_fetch/scraper.py,sha256=eUO7OYeCeVn8oVRedhCGPrz4wUthadIYqh7_Yc6SQ3w,2696
|
|
10
|
+
ituring_fetch-0.1.0.dist-info/METADATA,sha256=NoE9DQMIwJXgTB_N2zuzt4UrTCCPq9SkPQ6JYpQsOOo,1745
|
|
11
|
+
ituring_fetch-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
12
|
+
ituring_fetch-0.1.0.dist-info/entry_points.txt,sha256=qKDY1RYVbZZlm0iejgmlO2dfIFg3k1L-ulc2Bq7YShw,57
|
|
13
|
+
ituring_fetch-0.1.0.dist-info/RECORD,,
|