novel-downloader 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +58 -24
- novel_downloader/config/adapter.py +16 -10
- novel_downloader/config/models.py +10 -5
- novel_downloader/core/downloaders/__init__.py +2 -0
- novel_downloader/core/downloaders/base_async_downloader.py +157 -0
- novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
- novel_downloader/core/downloaders/common_downloader.py +2 -3
- novel_downloader/core/factory/__init__.py +14 -2
- novel_downloader/core/factory/downloader_factory.py +95 -8
- novel_downloader/core/factory/requester_factory.py +65 -21
- novel_downloader/core/interfaces/__init__.py +4 -0
- novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/async_requester_protocol.py +70 -0
- novel_downloader/core/interfaces/requester_protocol.py +3 -3
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +2 -0
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +2 -0
- novel_downloader/core/requesters/__init__.py +5 -1
- novel_downloader/core/requesters/base_async_session.py +299 -0
- novel_downloader/core/requesters/base_browser.py +3 -3
- novel_downloader/core/requesters/base_session.py +5 -5
- novel_downloader/core/requesters/common_requester/__init__.py +5 -1
- novel_downloader/core/requesters/common_requester/common_async_session.py +98 -0
- novel_downloader/core/requesters/common_requester/common_session.py +2 -2
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +3 -3
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +4 -4
- novel_downloader/resources/config/settings.yaml +20 -14
- novel_downloader/utils/crypto_utils.py +4 -4
- novel_downloader/utils/fontocr/ocr_v2.py +6 -0
- {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/METADATA +27 -7
- {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/RECORD +35 -29
- {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/WHEEL +0 -0
- {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,299 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core.requesters.base_async_session
|
5
|
+
---------------------------------------------------
|
6
|
+
|
7
|
+
This module defines the BaseAsyncSession class, which provides asynchronous
|
8
|
+
HTTP request capabilities using aiohttp. It maintains a persistent
|
9
|
+
client session and supports retries, headers, timeout configurations,
|
10
|
+
cookie handling, and defines abstract methods for subclasses.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import abc
|
14
|
+
import asyncio
|
15
|
+
import time
|
16
|
+
from typing import Any, Dict, Optional, Union
|
17
|
+
|
18
|
+
import aiohttp
|
19
|
+
from aiohttp import ClientResponse, ClientSession, ClientTimeout, TCPConnector
|
20
|
+
|
21
|
+
from novel_downloader.config.models import RequesterConfig
|
22
|
+
from novel_downloader.core.interfaces import AsyncRequesterProtocol
|
23
|
+
from novel_downloader.utils.constants import DEFAULT_USER_HEADERS
|
24
|
+
|
25
|
+
|
26
|
+
class RateLimiter:
    """
    Simple async rate limiter that spaces request starts evenly.

    Enforces a minimum gap of ``1 / rate_per_sec`` seconds between
    consecutive ``wait()`` completions, across all coroutines that share
    the instance. Idle time does not accumulate burst capacity, so this is
    a fixed-interval limiter rather than a true token bucket.
    """

    def __init__(self, rate_per_sec: float):
        """
        Create a limiter for the given request rate.

        :param rate_per_sec: Maximum number of requests to start per second.
                             Must be greater than zero.
        :raises ValueError: If ``rate_per_sec`` is zero, negative, or NaN.
        """
        # ``not x > 0`` (instead of ``x <= 0``) also rejects NaN. Without this
        # guard a rate of 0 fails with an opaque ZeroDivisionError and a
        # negative rate yields a negative interval, i.e. no limiting at all.
        if not rate_per_sec > 0:
            raise ValueError(
                f"rate_per_sec must be a positive number, got {rate_per_sec!r}"
            )
        self._interval = 1.0 / rate_per_sec
        self._lock = asyncio.Lock()
        # Starting the clock now means the very first wait() is also spaced
        # one interval away from construction time.
        self._last = time.monotonic()

    async def wait(self) -> None:
        """
        Block until the next request is allowed to start.

        The lock is held while sleeping, so concurrent callers queue up and
        are released one interval apart.
        """
        async with self._lock:
            delay = self._interval - (time.monotonic() - self._last)
            if delay > 0:
                await asyncio.sleep(delay)
            self._last = time.monotonic()
|
45
|
+
|
46
|
+
|
47
|
+
class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
    """
    BaseAsyncSession wraps basic HTTP operations using aiohttp.ClientSession,
    supporting retry logic, timeout, persistent connections, and cookie management.

    The underlying ClientSession is created lazily on the first request and
    dropped again by ``shutdown()`` or when the object is pickled.

    Attributes:
        _session (Optional[ClientSession]): The persistent aiohttp client
            session, or None until the first request / after shutdown.
        _rate_limiter (Optional[RateLimiter]): Request limiter, present only
            when the config defines ``max_rps``.
        _timeout (float): Timeout for each request in seconds.
        _retry_times (int): Number of retry attempts on failure.
        _retry_interval (float): Delay (in seconds) between retries.
        _headers (Dict[str, str]): Default HTTP headers to send.
        _cookies (Dict[str, str]): Cookies applied to the session.
    """

    def _init_session(
        self,
        config: RequesterConfig,
        cookies: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Initialize the async session with configuration.

        Only stores settings; no network resources are created here.

        :param config: Configuration object for session behavior
                       (timeouts, retries, headers, etc.)
        :param cookies: Optional initial cookies to set on the session.
        """
        self._config = config
        self._timeout = config.timeout
        self._retry_times = config.retry_times
        self._retry_interval = config.retry_interval
        self._cookies = cookies or {}
        self._headers = DEFAULT_USER_HEADERS.copy()
        self._session: Optional[ClientSession] = None
        self._rate_limiter: Optional[RateLimiter] = None

    async def _setup(self) -> None:
        """
        Set up the aiohttp.ClientSession with timeout, connector, headers, and cookies.

        Also creates the rate limiter when the config provides ``max_rps``.
        """
        max_rps = getattr(self._config, "max_rps", None)
        if max_rps is not None:
            self._rate_limiter = RateLimiter(max_rps)

        timeout = ClientTimeout(total=self._timeout)
        connector = TCPConnector(
            limit_per_host=getattr(self._config, "max_connections", 10)
        )
        self._session = ClientSession(
            timeout=timeout,
            connector=connector,
            headers=self._headers,
            cookies=self._cookies,
        )

    async def _ensure_session(self) -> ClientSession:
        """
        Return the live ClientSession, creating it on first use.

        :return: The initialized aiohttp.ClientSession.
        :raises RuntimeError: If ``_setup()`` did not produce a session.
        """
        if self._session is None:
            await self._setup()
        if self._session is None:
            raise RuntimeError("Session not initialized after setup")
        return self._session

    async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
        """
        Attempt to log in asynchronously.
        Override in subclasses that require authentication.

        :param max_retries: Maximum number of login attempts (unused here).
        :param manual_login: Whether to request a manual login (unused here).
        :returns: True if login succeeded, False otherwise.
        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Login is not supported by this session type. "
            "Override login() in your subclass to enable it."
        )

    @abc.abstractmethod
    async def get_book_info(
        self, book_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Fetch the raw HTML (or JSON) of the book info page asynchronously.

        :param book_id: The book identifier.
        :param wait_time: Base number of seconds to wait before returning content.
        :return: The page content as a string.
        """
        ...

    @abc.abstractmethod
    async def get_book_chapter(
        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Fetch the raw HTML (or JSON) of a single chapter asynchronously.

        :param book_id: The book identifier.
        :param chapter_id: The chapter identifier.
        :param wait_time: Base number of seconds to wait before returning content.
        :return: The chapter content as a string.
        """
        ...

    async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
        """
        Optional: Retrieve the HTML content of the authenticated user's bookcase page.
        Subclasses that support user login/bookcase should override this.

        :param wait_time: Base number of seconds to wait before returning content.
        :return: The HTML of the bookcase page.
        :raises NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError(
            "Bookcase fetching is not supported by this session type. "
            "Override get_bookcase() in your subclass to enable it."
        )

    async def fetch(self, url: str, **kwargs: Any) -> str:
        """
        Fetch the content from the given URL asynchronously, with retry support.

        Up to ``retry_times + 1`` attempts are made, sleeping
        ``retry_interval`` seconds between them. Every attempt, including
        retries, passes through the rate limiter when one is configured.

        :param url: The target URL to fetch.
        :param kwargs: Additional keyword arguments to pass to `session.get`.
        :return: The response body as text.
        :raises aiohttp.ClientError: On final failure (includes HTTP error
                                     statuses via ``raise_for_status``).
        :raises asyncio.TimeoutError: If the final attempt timed out.
        :raises RuntimeError: If the session could not be initialized.
        """
        session = await self._ensure_session()

        for attempt in range(self._retry_times + 1):
            # Throttle inside the loop so retries cannot exceed max_rps.
            if self._rate_limiter:
                await self._rate_limiter.wait()
            try:
                async with session.get(url, **kwargs) as resp:
                    resp.raise_for_status()
                    text: str = await resp.text()
                    return text
            # aiohttp reports an exceeded total timeout as asyncio.TimeoutError,
            # which is not a ClientError subclass; catch it so timeouts are
            # retried like any other transient failure.
            except (aiohttp.ClientError, asyncio.TimeoutError):
                if attempt < self._retry_times:
                    await asyncio.sleep(self._retry_interval)
                    continue
                raise

        # Only reachable when retry_times is negative (the loop never runs).
        raise RuntimeError("Unreachable code reached in fetch()")

    async def get(
        self, url: str, params: Optional[Dict[str, Any]] = None, **kwargs: Any
    ) -> ClientResponse:
        """
        Send an HTTP GET request asynchronously.

        No retry is performed and the HTTP status is not checked; the caller
        receives the response object as returned by aiohttp.

        :param url: The target URL.
        :param params: Query parameters to include in the request.
        :param kwargs: Additional args passed to session.get().
        :return: aiohttp.ClientResponse object.
        :raises RuntimeError: If the session is not initialized.
        """
        session = await self._ensure_session()

        if self._rate_limiter:
            await self._rate_limiter.wait()
        return await session.get(url, params=params, **kwargs)

    async def post(
        self,
        url: str,
        data: Optional[Union[Dict[str, Any], bytes]] = None,
        json: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ClientResponse:
        """
        Send an HTTP POST request asynchronously.

        No retry is performed and the HTTP status is not checked; the caller
        receives the response object as returned by aiohttp.

        :param url: The target URL.
        :param data: Form data to include in the request body.
        :param json: JSON body to include in the request.
        :param kwargs: Additional args passed to session.post().
        :return: aiohttp.ClientResponse object.
        :raises RuntimeError: If the session is not initialized.
        """
        session = await self._ensure_session()

        if self._rate_limiter:
            await self._rate_limiter.wait()
        return await session.post(url, data=data, json=json, **kwargs)

    @property
    def session(self) -> ClientSession:
        """
        Return the active aiohttp.ClientSession.

        Unlike the request methods, this accessor does not create the
        session on demand.

        :raises RuntimeError: If the session is uninitialized.
        """
        if self._session is None:
            raise RuntimeError("Session is not initialized or has been shut down.")
        return self._session

    @property
    def timeout(self) -> float:
        """Return the default timeout setting."""
        return self._timeout

    @property
    def retry_times(self) -> int:
        """Return the maximum number of retry attempts."""
        return self._retry_times

    @property
    def retry_interval(self) -> float:
        """Return the base interval (in seconds) between retries."""
        return self._retry_interval

    async def update_cookies(
        self, cookies: Dict[str, str], overwrite: bool = True
    ) -> None:
        """
        Update cookies for the current session and internal cache.

        Keys and values are coerced to ``str`` before being stored.

        :param cookies: New cookies to merge.
        :param overwrite: If True, replace existing; else, only set missing.
        """
        # update internal cache
        if overwrite:
            self._cookies.update({str(k): str(v) for k, v in cookies.items()})
        else:
            for k, v in cookies.items():
                self._cookies.setdefault(str(k), str(v))

        # apply to live session
        if self._session:
            self._session.cookie_jar.update_cookies(self._cookies)

    async def shutdown(self) -> None:
        """
        Shutdown and clean up the session. Closes connection pool.

        Safe to call when no session exists; a later request re-creates it.
        """
        if self._session:
            await self._session.close()
            self._session = None

    def __getstate__(self) -> Dict[str, Any]:
        """
        Prepare object state for serialization: remove unpickleable session.

        The rate limiter is dropped as well; both are rebuilt lazily.
        """
        state = self.__dict__.copy()
        state.pop("_session", None)
        state.pop("_rate_limiter", None)
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """
        Restore object state. Session will be lazily reinitialized on next request.
        """
        self.__dict__.update(state)
        self._session = None
        # __getstate__ removed this attribute; restore it so request methods
        # do not hit AttributeError when the config has no max_rps (in which
        # case _setup() never assigns it).
        self._rate_limiter = None
|
@@ -111,7 +111,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
111
111
|
)
|
112
112
|
|
113
113
|
@abc.abstractmethod
|
114
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
114
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
115
115
|
"""
|
116
116
|
Fetch the raw HTML (or JSON) of the book info page.
|
117
117
|
|
@@ -123,7 +123,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
123
123
|
|
124
124
|
@abc.abstractmethod
|
125
125
|
def get_book_chapter(
|
126
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
126
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
127
127
|
) -> str:
|
128
128
|
"""
|
129
129
|
Fetch the raw HTML (or JSON) of a single chapter.
|
@@ -135,7 +135,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
135
135
|
"""
|
136
136
|
...
|
137
137
|
|
138
|
-
def get_bookcase(self, wait_time: Optional[
|
138
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
139
139
|
"""
|
140
140
|
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
141
141
|
|
@@ -28,7 +28,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
28
28
|
|
29
29
|
Attributes:
|
30
30
|
_session (requests.Session): The persistent HTTP session.
|
31
|
-
_timeout (
|
31
|
+
_timeout (float): Timeout for each request in seconds.
|
32
32
|
"""
|
33
33
|
|
34
34
|
def _init_session(
|
@@ -81,7 +81,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
81
81
|
)
|
82
82
|
|
83
83
|
@abc.abstractmethod
|
84
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
84
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
85
85
|
"""
|
86
86
|
Fetch the raw HTML (or JSON) of the book info page.
|
87
87
|
|
@@ -93,7 +93,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
93
93
|
|
94
94
|
@abc.abstractmethod
|
95
95
|
def get_book_chapter(
|
96
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
96
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
97
97
|
) -> str:
|
98
98
|
"""
|
99
99
|
Fetch the raw HTML (or JSON) of a single chapter.
|
@@ -105,7 +105,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
105
105
|
"""
|
106
106
|
...
|
107
107
|
|
108
|
-
def get_bookcase(self, wait_time: Optional[
|
108
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
109
109
|
"""
|
110
110
|
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
111
111
|
|
@@ -171,7 +171,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
171
171
|
return self._session
|
172
172
|
|
173
173
|
@property
|
174
|
-
def timeout(self) ->
|
174
|
+
def timeout(self) -> float:
|
175
175
|
"""Return the default timeout setting."""
|
176
176
|
return self._timeout
|
177
177
|
|
@@ -9,6 +9,10 @@ request operations to novel websites. It serves as a unified access
|
|
9
9
|
point to import `CommonSession` without exposing lower-level modules.
|
10
10
|
"""
|
11
11
|
|
12
|
+
from .common_async_session import CommonAsyncSession
|
12
13
|
from .common_session import CommonSession
|
13
14
|
|
14
|
-
__all__ = [
|
15
|
+
__all__ = [
|
16
|
+
"CommonAsyncSession",
|
17
|
+
"CommonSession",
|
18
|
+
]
|
@@ -0,0 +1,98 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core.requesters.common_requester.common_async_session
|
5
|
+
----------------------------------------------------------------------
|
6
|
+
|
7
|
+
This module defines a `CommonAsyncSession` class for handling HTTP requests
|
8
|
+
to common novel sites **asynchronously**. It provides methods to retrieve
|
9
|
+
raw book info pages and chapter contents using a flexible URL templating
|
10
|
+
system defined by a site profile, with retry logic and random delays.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import asyncio
|
14
|
+
import random
|
15
|
+
from typing import Dict, Optional
|
16
|
+
|
17
|
+
from novel_downloader.config import RequesterConfig, SiteProfile
|
18
|
+
from novel_downloader.core.requesters.base_async_session import BaseAsyncSession
|
19
|
+
|
20
|
+
|
21
|
+
class CommonAsyncSession(BaseAsyncSession):
    """
    Async requester for sites described by a URL-template profile.

    :ivar _site: The unique identifier or name of the site.
    :ivar _profile: Mapping that holds the site's URL templates
                    (``book_info_url`` and ``chapter_url`` are read here).
    """

    def __init__(
        self,
        config: RequesterConfig,
        site: str,
        profile: SiteProfile,
        cookies: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Build the session from its configuration and site profile.

        :param config: The RequesterConfig instance containing settings.
        :param site: The identifier or domain of the target site.
        :param profile: The site's metadata and URL templates.
        :param cookies: Optional cookies to preload into the session.
        """
        self._init_session(config=config, cookies=cookies)
        self._site, self._profile = site, profile

    async def _pause_with_jitter(self, wait_time: Optional[float]) -> None:
        """
        Sleep for the base wait scaled by a random factor in [0.5, 1.5].

        :param wait_time: Base seconds to sleep; when None the configured
                          ``wait_time`` is used instead.
        """
        if wait_time is None:
            wait_time = self._config.wait_time
        await asyncio.sleep(wait_time * random.uniform(0.5, 1.5))

    async def get_book_info(
        self, book_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Download the book info page and return its raw text.

        Retries are handled by ``fetch``; after a successful download the
        coroutine sleeps for a jittered interval before returning.

        :param book_id: The book identifier.
        :param wait_time: Base seconds to sleep (with 0.5-1.5x random factor).
        :return: The page content as a string.
        """
        page = await self.fetch(self.book_info_url.format(book_id=book_id))
        await self._pause_with_jitter(wait_time)
        return page

    async def get_book_chapter(
        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Download a single chapter page and return its raw text.

        Retries are handled by ``fetch``; after a successful download the
        coroutine sleeps for a jittered interval before returning.

        :param book_id: The book identifier.
        :param chapter_id: The chapter identifier.
        :param wait_time: Base seconds to sleep (with 0.5-1.5x random factor).
        :return: The chapter content as a string.
        """
        target = self.chapter_url.format(book_id=book_id, chapter_id=chapter_id)
        page = await self.fetch(target)
        await self._pause_with_jitter(wait_time)
        return page

    @property
    def site(self) -> str:
        """Name of the site this session targets."""
        return self._site

    @property
    def book_info_url(self) -> str:
        """URL template used to request a book's info page."""
        return self._profile["book_info_url"]

    @property
    def chapter_url(self) -> str:
        """URL template used to request a chapter page."""
        return self._profile["chapter_url"]
|
@@ -47,7 +47,7 @@ class CommonSession(BaseSession):
|
|
47
47
|
self._site = site
|
48
48
|
self._profile = profile
|
49
49
|
|
50
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
50
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
51
51
|
"""
|
52
52
|
Fetch the raw HTML (or JSON) of the book info page.
|
53
53
|
|
@@ -75,7 +75,7 @@ class CommonSession(BaseSession):
|
|
75
75
|
raise RuntimeError("Unexpected error: get_book_info failed without returning")
|
76
76
|
|
77
77
|
def get_book_chapter(
|
78
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
78
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
79
79
|
) -> str:
|
80
80
|
"""
|
81
81
|
Fetch the raw HTML (or JSON) of a single chapter.
|
@@ -266,7 +266,7 @@ class QidianBrowser(BaseBrowser):
|
|
266
266
|
"""
|
267
267
|
return self.QIDIAN_BOOKCASE_URL
|
268
268
|
|
269
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
269
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
270
270
|
"""
|
271
271
|
Retrieve the HTML of a Qidian book info page.
|
272
272
|
|
@@ -311,7 +311,7 @@ class QidianBrowser(BaseBrowser):
|
|
311
311
|
time.sleep(pause)
|
312
312
|
|
313
313
|
def get_book_chapter(
|
314
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
314
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
315
315
|
) -> str:
|
316
316
|
"""
|
317
317
|
Retrieve the HTML content of a specific chapter.
|
@@ -347,7 +347,7 @@ class QidianBrowser(BaseBrowser):
|
|
347
347
|
logger.warning("[fetch] Error fetching chapter from '%s': %s", url, e)
|
348
348
|
return ""
|
349
349
|
|
350
|
-
def get_bookcase(self, wait_time: Optional[
|
350
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
351
351
|
"""
|
352
352
|
Retrieve the HTML content of the logged‑in user's Qidian bookcase page.
|
353
353
|
|
@@ -70,7 +70,7 @@ class QidianSession(BaseSession):
|
|
70
70
|
3. Updates both the live ``requests.Session`` and the internal cache;
|
71
71
|
4. Delegates the actual request to ``super().get``.
|
72
72
|
"""
|
73
|
-
if self._session is None: # defensive
|
73
|
+
if self._session is None: # defensive - mirrors BaseSession check
|
74
74
|
raise RuntimeError("Session is not initialized or has been shut down.")
|
75
75
|
|
76
76
|
# ---- 1. refresh token cookie --------------------------------------
|
@@ -108,7 +108,7 @@ class QidianSession(BaseSession):
|
|
108
108
|
self.get("https://www.qidian.com")
|
109
109
|
return True
|
110
110
|
|
111
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
111
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
112
112
|
"""
|
113
113
|
Fetch the raw HTML of the book info page.
|
114
114
|
|
@@ -140,7 +140,7 @@ class QidianSession(BaseSession):
|
|
140
140
|
raise RuntimeError("Unexpected fall-through in get_book_info")
|
141
141
|
|
142
142
|
def get_book_chapter(
|
143
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
143
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
144
144
|
) -> str:
|
145
145
|
"""
|
146
146
|
Fetch the HTML of a single chapter.
|
@@ -174,7 +174,7 @@ class QidianSession(BaseSession):
|
|
174
174
|
|
175
175
|
raise RuntimeError("Unexpected fall-through in get_book_chapter")
|
176
176
|
|
177
|
-
def get_bookcase(self, wait_time: Optional[
|
177
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
178
178
|
"""
|
179
179
|
Retrieve the user's *bookcase* page.
|
180
180
|
|
@@ -1,9 +1,10 @@
|
|
1
1
|
# 网络请求层设置
|
2
2
|
requests:
|
3
|
-
wait_time: 5
|
3
|
+
wait_time: 5.0 # 每次请求等待时间 (秒)
|
4
4
|
retry_times: 3 # 请求失败重试次数
|
5
|
-
retry_interval: 5
|
6
|
-
timeout: 30
|
5
|
+
retry_interval: 5.0
|
6
|
+
timeout: 30.0 # 页面加载超时时间 (秒)
|
7
|
+
max_rps: null # 最大请求速率 (requests per second), 为 null 则不限制
|
7
8
|
# DrissionPage 专用设置
|
8
9
|
headless: false # 是否以无头模式启动浏览器
|
9
10
|
user_data_folder: "" # 浏览器用户数据目录: 为空则使用默认目录
|
@@ -14,15 +15,29 @@ requests:
|
|
14
15
|
|
15
16
|
# 全局通用设置
|
16
17
|
general:
|
17
|
-
request_interval: 5
|
18
|
+
request_interval: 5.0 # 同一本书各章节请求间隔 (秒)
|
18
19
|
raw_data_dir: "./raw_data" # 原始章节 HTML/JSON 存放目录
|
19
20
|
output_dir: "./downloads" # 最终输出文件存放目录
|
20
21
|
cache_dir: "./novel_cache" # 本地缓存目录 (字体 / 图片等)
|
21
|
-
|
22
|
+
download_workers: 4 # 并发下载线程数
|
23
|
+
parser_workers: 4 # 并发解析线程数
|
24
|
+
use_process_pool: false # 是否使用多进程池来处理任务
|
22
25
|
skip_existing: true # 是否跳过已存在章节
|
23
26
|
debug:
|
24
27
|
save_html: false # 是否将抓取到的原始 HTML 保留到磁盘
|
25
28
|
log_level: "INFO" # 日志级别: DEBUG, INFO, WARNING, ERROR
|
29
|
+
font_ocr:
|
30
|
+
decode_font: false # 是否尝试本地解码混淆字体
|
31
|
+
use_freq: false # 是否使用频率分析
|
32
|
+
ocr_version: "v2.0" # "v1.0" / "v2.0"
|
33
|
+
use_ocr: true # 是否使用 OCR 辅助识别文本
|
34
|
+
use_vec: false # 是否使用 Vector 辅助识别文本
|
35
|
+
save_font_debug: false # 是否保存字体解码调试数据
|
36
|
+
batch_size: 32
|
37
|
+
gpu_mem: 500 # GPU 显存限制 (MB)
|
38
|
+
gpu_id: null # 使用哪个 GPU
|
39
|
+
ocr_weight: 0.6
|
40
|
+
vec_weight: 0.4
|
26
41
|
|
27
42
|
# 各站点的特定配置
|
28
43
|
sites:
|
@@ -35,15 +50,6 @@ sites:
|
|
35
50
|
- "0000000000"
|
36
51
|
mode: "browser" # browser / session
|
37
52
|
login_required: true # 是否需要登录才能访问
|
38
|
-
decode_font: false # 是否尝试本地解码混淆字体
|
39
|
-
use_freq: false # 是否使用频率分析
|
40
|
-
ocr_version: "v2.0" # "v1.0" / "v2.0"
|
41
|
-
use_ocr: true # 是否使用 OCR 辅助识别文本
|
42
|
-
use_vec: false # 是否使用 Vector 辅助识别文本
|
43
|
-
save_font_debug: false # 是否保存字体解码调试数据
|
44
|
-
batch_size: 32
|
45
|
-
ocr_weight: 0.6
|
46
|
-
vec_weight: 0.4
|
47
53
|
#
|
48
54
|
sample_site:
|
49
55
|
book_ids:
|
@@ -108,17 +108,17 @@ def patch_qd_payload_token(
|
|
108
108
|
if not key:
|
109
109
|
key = _get_key()
|
110
110
|
|
111
|
-
# Step 1
|
111
|
+
# Step 1 - decrypt --------------------------------------------------
|
112
112
|
decrypted_json: str = rc4_crypt(key, enc_token, mode="decrypt")
|
113
113
|
payload: Dict[str, Any] = json.loads(decrypted_json)
|
114
114
|
|
115
|
-
# Step 2
|
115
|
+
# Step 2 - rebuild timing fields -----------------------------------
|
116
116
|
loadts = int(time.time() * 1000) # ms since epoch
|
117
117
|
# Simulate the JS duration: N(600, 150) pushed into [300, 1000]
|
118
118
|
duration = max(300, min(1000, int(random.normalvariate(600, 150))))
|
119
119
|
timestamp = loadts + duration
|
120
120
|
|
121
|
-
# Step 3
|
121
|
+
# Step 3 - recalculate ------------------------------------
|
122
122
|
fp_key = _d("ZmluZ2VycHJpbnQ=")
|
123
123
|
ab_key = _d("YWJub3JtYWw=")
|
124
124
|
ck_key = _d("Y2hlY2tzdW0=")
|
@@ -138,7 +138,7 @@ def patch_qd_payload_token(
|
|
138
138
|
ck_key: ck_val,
|
139
139
|
}
|
140
140
|
|
141
|
-
# Step 4
|
141
|
+
# Step 4 - encrypt and return --------------------------------------
|
142
142
|
return rc4_crypt(
|
143
143
|
key, json.dumps(new_payload, separators=(",", ":")), mode="encrypt"
|
144
144
|
)
|
@@ -221,6 +221,8 @@ class FontOCRV2:
|
|
221
221
|
use_ocr: bool = True,
|
222
222
|
use_vec: bool = False,
|
223
223
|
batch_size: int = 32,
|
224
|
+
gpu_mem: int = 500,
|
225
|
+
gpu_id: Optional[int] = None,
|
224
226
|
ocr_weight: float = 0.6,
|
225
227
|
vec_weight: float = 0.4,
|
226
228
|
ocr_version: str = "v1.0",
|
@@ -232,6 +234,8 @@ class FontOCRV2:
|
|
232
234
|
self.use_ocr = use_ocr
|
233
235
|
self.use_vec = use_vec
|
234
236
|
self.batch_size = batch_size
|
237
|
+
self.gpu_mem = gpu_mem
|
238
|
+
self.gpu_id = gpu_id
|
235
239
|
self.ocr_weight = ocr_weight
|
236
240
|
self.vec_weight = vec_weight
|
237
241
|
self.ocr_version = ocr_version
|
@@ -279,6 +283,8 @@ class FontOCRV2:
|
|
279
283
|
rec_batch_num=self.batch_size,
|
280
284
|
use_space_char=False,
|
281
285
|
use_gpu=gpu_available,
|
286
|
+
gpu_mem=self.gpu_mem,
|
287
|
+
gpu_id=self.gpu_id,
|
282
288
|
)
|
283
289
|
|
284
290
|
def _load_char_freq_db(self) -> bool:
|