novel-downloader 1.1.1__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +58 -24
  3. novel_downloader/config/adapter.py +16 -10
  4. novel_downloader/config/models.py +10 -5
  5. novel_downloader/core/downloaders/__init__.py +2 -0
  6. novel_downloader/core/downloaders/base_async_downloader.py +157 -0
  7. novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
  8. novel_downloader/core/downloaders/common_downloader.py +2 -3
  9. novel_downloader/core/factory/__init__.py +14 -2
  10. novel_downloader/core/factory/downloader_factory.py +95 -8
  11. novel_downloader/core/factory/requester_factory.py +65 -21
  12. novel_downloader/core/interfaces/__init__.py +4 -0
  13. novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
  14. novel_downloader/core/interfaces/async_requester_protocol.py +70 -0
  15. novel_downloader/core/interfaces/requester_protocol.py +3 -3
  16. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +2 -0
  17. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +2 -0
  18. novel_downloader/core/requesters/__init__.py +5 -1
  19. novel_downloader/core/requesters/base_async_session.py +299 -0
  20. novel_downloader/core/requesters/base_browser.py +3 -3
  21. novel_downloader/core/requesters/base_session.py +5 -5
  22. novel_downloader/core/requesters/common_requester/__init__.py +5 -1
  23. novel_downloader/core/requesters/common_requester/common_async_session.py +98 -0
  24. novel_downloader/core/requesters/common_requester/common_session.py +2 -2
  25. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +3 -3
  26. novel_downloader/core/requesters/qidian_requester/qidian_session.py +4 -4
  27. novel_downloader/resources/config/settings.yaml +20 -14
  28. novel_downloader/utils/crypto_utils.py +4 -4
  29. novel_downloader/utils/fontocr/ocr_v2.py +6 -0
  30. {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/METADATA +27 -7
  31. {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/RECORD +35 -29
  32. {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/WHEEL +0 -0
  33. {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/entry_points.txt +0 -0
  34. {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/licenses/LICENSE +0 -0
  35. {novel_downloader-1.1.1.dist-info → novel_downloader-1.2.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,299 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.requesters.base_async_session
5
+ ---------------------------------------------------
6
+
7
+ This module defines the BaseAsyncSession class, which provides asynchronous
8
+ HTTP request capabilities using aiohttp. It maintains a persistent
9
+ client session and supports retries, headers, timeout configurations,
10
+ cookie handling, and defines abstract methods for subclasses.
11
+ """
12
+
13
+ import abc
14
+ import asyncio
15
+ import time
16
+ from typing import Any, Dict, Optional, Union
17
+
18
+ import aiohttp
19
+ from aiohttp import ClientResponse, ClientSession, ClientTimeout, TCPConnector
20
+
21
+ from novel_downloader.config.models import RequesterConfig
22
+ from novel_downloader.core.interfaces import AsyncRequesterProtocol
23
+ from novel_downloader.utils.constants import DEFAULT_USER_HEADERS
24
+
25
+
26
class RateLimiter:
    """
    Simple async rate limiter based on a minimum interval between calls.

    Every ``wait()`` call is serialized through a lock and is released no
    sooner than ``1 / rate_per_sec`` seconds after the previous one, so no
    more than ``rate_per_sec`` requests are started per second across all
    coroutines sharing the instance.
    """

    def __init__(self, rate_per_sec: float):
        """
        :param rate_per_sec: Maximum number of requests allowed per second.
        :raises ValueError: If ``rate_per_sec`` is not a positive number.
        """
        # ``not x > 0`` (rather than ``x <= 0``) also rejects NaN.
        if not rate_per_sec > 0:
            raise ValueError(
                f"rate_per_sec must be a positive number, got {rate_per_sec!r}"
            )
        self._interval = 1.0 / rate_per_sec
        self._lock = asyncio.Lock()
        # The clock starts at construction, so the very first wait() may
        # also sleep for up to one interval.
        self._last = time.monotonic()

    async def wait(self) -> None:
        """
        Block until the caller is allowed to start its next request.

        The lock is held while sleeping on purpose: it queues concurrent
        callers so each one is spaced a full interval after the previous.
        """
        async with self._lock:
            remaining = self._interval - (time.monotonic() - self._last)
            if remaining > 0:
                await asyncio.sleep(remaining)
            self._last = time.monotonic()
45
+
46
+
47
class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
    """
    BaseAsyncSession wraps basic HTTP operations using aiohttp.ClientSession,
    supporting retry logic, timeout, persistent connections, and cookie management.

    The underlying ClientSession is created lazily on the first request
    (see ``_setup``) and released by ``shutdown``.

    Attributes:
        _session (Optional[ClientSession]): The persistent aiohttp client
            session, or None before first use / after shutdown.
        _timeout (float): Timeout for each request in seconds.
        _retry_times (int): Number of retry attempts on failure.
        _retry_interval (float): Delay (in seconds) between retries.
        _headers (Dict[str, str]): Default HTTP headers to send.
        _cookies (Dict[str, str]): Cached cookies applied to the session.
        _rate_limiter (Optional[RateLimiter]): Limiter built from the
            config's ``max_rps`` value, if any.
    """

    def _init_session(
        self,
        config: RequesterConfig,
        cookies: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Initialize the async session with configuration.

        No network resources are allocated here; the ClientSession itself
        is created later by ``_setup``.

        :param config: Configuration object for session behavior
                       (timeouts, retries, headers, etc.)
        :param cookies: Optional initial cookies to set on the session.
        """
        self._config = config
        self._timeout = config.timeout
        self._retry_times = config.retry_times
        self._retry_interval = config.retry_interval
        self._cookies = cookies or {}
        self._headers = DEFAULT_USER_HEADERS.copy()
        self._session: Optional[ClientSession] = None
        self._rate_limiter: Optional[RateLimiter] = None

    async def _setup(self) -> None:
        """
        Set up the aiohttp.ClientSession with timeout, connector, headers, and cookies.
        """
        max_rps = getattr(self._config, "max_rps", None)
        # A non-positive rate cannot be converted into an interval
        # (1 / 0 would raise), so it is treated like "no limit".
        if max_rps is not None and max_rps > 0:
            self._rate_limiter = RateLimiter(max_rps)

        timeout = ClientTimeout(total=self._timeout)
        connector = TCPConnector(
            limit_per_host=getattr(self._config, "max_connections", 10)
        )
        self._session = ClientSession(
            timeout=timeout,
            connector=connector,
            headers=self._headers,
            cookies=self._cookies,
        )

    async def _ensure_session(self) -> ClientSession:
        """
        Return the live ClientSession, creating it on first use.

        :return: The initialized aiohttp.ClientSession.
        :raises RuntimeError: If ``_setup`` did not produce a session.
        """
        if self._session is None:
            await self._setup()
        if self._session is None:
            raise RuntimeError("Session not initialized after setup")
        return self._session

    async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
        """
        Attempt to log in asynchronously.
        Override in subclasses that require authentication.

        :param max_retries: Maximum number of login attempts.
        :param manual_login: Whether a manual login flow is requested.
        :returns: True if login succeeded, False otherwise.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            "Login is not supported by this session type. "
            "Override login() in your subclass to enable it."
        )

    @abc.abstractmethod
    async def get_book_info(
        self, book_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Fetch the raw HTML (or JSON) of the book info page asynchronously.

        :param book_id: The book identifier.
        :param wait_time: Base number of seconds to wait before returning content.
        :return: The page content as a string.
        """
        ...

    @abc.abstractmethod
    async def get_book_chapter(
        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Fetch the raw HTML (or JSON) of a single chapter asynchronously.

        :param book_id: The book identifier.
        :param chapter_id: The chapter identifier.
        :param wait_time: Base number of seconds to wait before returning content.
        :return: The chapter content as a string.
        """
        ...

    async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
        """
        Optional: Retrieve the HTML content of the authenticated user's bookcase page.
        Subclasses that support user login/bookcase should override this.

        :param wait_time: Base number of seconds to wait before returning content.
        :return: The HTML of the bookcase page.
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            "Bookcase fetching is not supported by this session type. "
            "Override get_bookcase() in your subclass to enable it."
        )

    async def fetch(self, url: str, **kwargs: Any) -> str:
        """
        Fetch the content from the given URL asynchronously, with retry support.

        Each attempt (including retries) passes through the rate limiter,
        and request timeouts are retried like any other client error.

        :param url: The target URL to fetch.
        :param kwargs: Additional keyword arguments to pass to `session.get`.
        :return: The response body as text.
        :raises aiohttp.ClientError: On final failure of the request.
        :raises asyncio.TimeoutError: If the final attempt timed out.
        :raises RuntimeError: If the session could not be initialized.
        """
        session = await self._ensure_session()

        # Always make at least one attempt, even if retry_times is negative.
        attempts = max(self._retry_times, 0) + 1
        for attempt in range(attempts):
            # Throttle every attempt so retries cannot exceed max_rps.
            if self._rate_limiter:
                await self._rate_limiter.wait()
            try:
                async with session.get(url, **kwargs) as resp:
                    resp.raise_for_status()
                    text: str = await resp.text()
                    return text
            # ClientTimeout(total=...) expiry surfaces as asyncio.TimeoutError,
            # which is not an aiohttp.ClientError, so it is caught explicitly.
            except (aiohttp.ClientError, asyncio.TimeoutError):
                if attempt + 1 < attempts:
                    await asyncio.sleep(self._retry_interval)
                    continue
                raise

        raise RuntimeError("Unreachable code reached in fetch()")

    async def get(
        self, url: str, params: Optional[Dict[str, Any]] = None, **kwargs: Any
    ) -> ClientResponse:
        """
        Send an HTTP GET request asynchronously.

        :param url: The target URL.
        :param params: Query parameters to include in the request.
        :param kwargs: Additional args passed to session.get().
        :return: aiohttp.ClientResponse object.
        :raises RuntimeError: If the session is not initialized.
        """
        session = await self._ensure_session()

        if self._rate_limiter:
            await self._rate_limiter.wait()
        return await session.get(url, params=params, **kwargs)

    async def post(
        self,
        url: str,
        data: Optional[Union[Dict[str, Any], bytes]] = None,
        json: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ClientResponse:
        """
        Send an HTTP POST request asynchronously.

        :param url: The target URL.
        :param data: Form data to include in the request body.
        :param json: JSON body to include in the request.
        :param kwargs: Additional args passed to session.post().
        :return: aiohttp.ClientResponse object.
        :raises RuntimeError: If the session is not initialized.
        """
        session = await self._ensure_session()

        if self._rate_limiter:
            await self._rate_limiter.wait()
        return await session.post(url, data=data, json=json, **kwargs)

    @property
    def session(self) -> ClientSession:
        """
        Return the active aiohttp.ClientSession.

        :raises RuntimeError: If the session is uninitialized.
        """
        if self._session is None:
            raise RuntimeError("Session is not initialized or has been shut down.")
        return self._session

    @property
    def timeout(self) -> float:
        """Return the default timeout setting."""
        return self._timeout

    @property
    def retry_times(self) -> int:
        """Return the maximum number of retry attempts."""
        return self._retry_times

    @property
    def retry_interval(self) -> float:
        """Return the base interval (in seconds) between retries."""
        return self._retry_interval

    async def update_cookies(
        self, cookies: Dict[str, str], overwrite: bool = True
    ) -> None:
        """
        Update cookies for the current session and internal cache.

        :param cookies: New cookies to merge.
        :param overwrite: If True, replace existing; else, only set missing.
        """
        # update internal cache (keys and values are coerced to str)
        if overwrite:
            self._cookies.update({str(k): str(v) for k, v in cookies.items()})
        else:
            for k, v in cookies.items():
                self._cookies.setdefault(str(k), str(v))

        # apply to live session
        if self._session:
            self._session.cookie_jar.update_cookies(self._cookies)

    async def shutdown(self) -> None:
        """
        Shutdown and clean up the session. Closes connection pool.
        """
        if self._session:
            await self._session.close()
            self._session = None

    def __getstate__(self) -> Dict[str, Any]:
        """
        Prepare object state for serialization: remove unpickleable session.
        """
        state = self.__dict__.copy()
        state.pop("_session", None)
        state.pop("_rate_limiter", None)
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """
        Restore object state. Session will be lazily reinitialized on next request.
        """
        self.__dict__.update(state)
        self._session = None
        # __getstate__ drops the limiter too; restore the attribute so that
        # request methods do not hit AttributeError when the config defines
        # no max_rps and _setup therefore never assigns it.
        self._rate_limiter = None
@@ -111,7 +111,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
111
111
  )
112
112
 
113
113
  @abc.abstractmethod
114
- def get_book_info(self, book_id: str, wait_time: Optional[int] = None) -> str:
114
+ def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
115
115
  """
116
116
  Fetch the raw HTML (or JSON) of the book info page.
117
117
 
@@ -123,7 +123,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
123
123
 
124
124
  @abc.abstractmethod
125
125
  def get_book_chapter(
126
- self, book_id: str, chapter_id: str, wait_time: Optional[int] = None
126
+ self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
127
127
  ) -> str:
128
128
  """
129
129
  Fetch the raw HTML (or JSON) of a single chapter.
@@ -135,7 +135,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
135
135
  """
136
136
  ...
137
137
 
138
- def get_bookcase(self, wait_time: Optional[int] = None) -> str:
138
+ def get_bookcase(self, wait_time: Optional[float] = None) -> str:
139
139
  """
140
140
  Optional: Retrieve the HTML content of the authenticated user's bookcase page.
141
141
 
@@ -28,7 +28,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
28
28
 
29
29
  Attributes:
30
30
  _session (requests.Session): The persistent HTTP session.
31
- _timeout (int): Timeout for each request in seconds.
31
+ _timeout (float): Timeout for each request in seconds.
32
32
  """
33
33
 
34
34
  def _init_session(
@@ -81,7 +81,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
81
81
  )
82
82
 
83
83
  @abc.abstractmethod
84
- def get_book_info(self, book_id: str, wait_time: Optional[int] = None) -> str:
84
+ def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
85
85
  """
86
86
  Fetch the raw HTML (or JSON) of the book info page.
87
87
 
@@ -93,7 +93,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
93
93
 
94
94
  @abc.abstractmethod
95
95
  def get_book_chapter(
96
- self, book_id: str, chapter_id: str, wait_time: Optional[int] = None
96
+ self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
97
97
  ) -> str:
98
98
  """
99
99
  Fetch the raw HTML (or JSON) of a single chapter.
@@ -105,7 +105,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
105
105
  """
106
106
  ...
107
107
 
108
- def get_bookcase(self, wait_time: Optional[int] = None) -> str:
108
+ def get_bookcase(self, wait_time: Optional[float] = None) -> str:
109
109
  """
110
110
  Optional: Retrieve the HTML content of the authenticated user's bookcase page.
111
111
 
@@ -171,7 +171,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
171
171
  return self._session
172
172
 
173
173
  @property
174
- def timeout(self) -> int:
174
+ def timeout(self) -> float:
175
175
  """Return the default timeout setting."""
176
176
  return self._timeout
177
177
 
@@ -9,6 +9,10 @@ request operations to novel websites. It serves as a unified access
9
9
  point to import `CommonSession` without exposing lower-level modules.
10
10
  """
11
11
 
12
+ from .common_async_session import CommonAsyncSession
12
13
  from .common_session import CommonSession
13
14
 
14
- __all__ = ["CommonSession"]
15
+ __all__ = [
16
+ "CommonAsyncSession",
17
+ "CommonSession",
18
+ ]
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.requesters.common_requester.common_async_session
5
+ ----------------------------------------------------------------------
6
+
7
+ This module defines a `CommonAsyncSession` class for handling HTTP requests
8
+ to common novel sites **asynchronously**. It provides methods to retrieve
9
+ raw book info pages and chapter contents using a flexible URL templating
10
+ system defined by a site profile, with retry logic and random delays.
11
+ """
12
+
13
+ import asyncio
14
+ import random
15
+ from typing import Dict, Optional
16
+
17
+ from novel_downloader.config import RequesterConfig, SiteProfile
18
+ from novel_downloader.core.requesters.base_async_session import BaseAsyncSession
19
+
20
+
21
class CommonAsyncSession(BaseAsyncSession):
    """
    Async requester for "common" novel sites driven by a site profile.

    :ivar _site: The unique identifier or name of the site.
    :ivar _profile: Metadata and URL templates related to the site.
    """

    def __init__(
        self,
        config: RequesterConfig,
        site: str,
        profile: SiteProfile,
        cookies: Optional[Dict[str, str]] = None,
    ) -> None:
        """
        Create a session bound to one site and its URL templates.

        :param config: The RequesterConfig instance containing settings.
        :param site: The identifier or domain of the target site.
        :param profile: The site's metadata and URL templates.
        :param cookies: Optional cookies to preload into the session.
        """
        self._init_session(config=config, cookies=cookies)
        self._site = site
        self._profile = profile

    async def _jittered_pause(self, wait_time: Optional[float]) -> None:
        """
        Sleep for the base wait time scaled by a random 0.5-1.5x factor.

        :param wait_time: Base seconds to sleep; falls back to the
                          configured ``wait_time`` when None.
        """
        if wait_time is None:
            base = self._config.wait_time
        else:
            base = wait_time
        await asyncio.sleep(base * random.uniform(0.5, 1.5))

    async def get_book_info(
        self, book_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Download the raw HTML of a book's info page.

        The request goes through ``fetch`` (which handles retries); a
        jittered pause follows before the content is returned.

        :param book_id: The book identifier.
        :param wait_time: Base seconds to sleep (with 0.5-1.5x random factor).
        :return: The page content as a string.
        """
        target = self.book_info_url.format(book_id=book_id)
        page = await self.fetch(target)
        await self._jittered_pause(wait_time)
        return page

    async def get_book_chapter(
        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
    ) -> str:
        """
        Download the raw HTML of a single chapter.

        The request goes through ``fetch`` (which handles retries); a
        jittered pause follows before the content is returned.

        :param book_id: The book identifier.
        :param chapter_id: The chapter identifier.
        :param wait_time: Base seconds to sleep (with 0.5-1.5x random factor).
        :return: The chapter content as a string.
        """
        target = self.chapter_url.format(book_id=book_id, chapter_id=chapter_id)
        page = await self.fetch(target)
        await self._jittered_pause(wait_time)
        return page

    @property
    def site(self) -> str:
        """Name/identifier of the site this session targets."""
        return self._site

    @property
    def book_info_url(self) -> str:
        """URL template (``book_info_url`` profile entry) for book info pages."""
        return self._profile["book_info_url"]

    @property
    def chapter_url(self) -> str:
        """URL template (``chapter_url`` profile entry) for chapter pages."""
        return self._profile["chapter_url"]
@@ -47,7 +47,7 @@ class CommonSession(BaseSession):
47
47
  self._site = site
48
48
  self._profile = profile
49
49
 
50
- def get_book_info(self, book_id: str, wait_time: Optional[int] = None) -> str:
50
+ def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
51
51
  """
52
52
  Fetch the raw HTML (or JSON) of the book info page.
53
53
 
@@ -75,7 +75,7 @@ class CommonSession(BaseSession):
75
75
  raise RuntimeError("Unexpected error: get_book_info failed without returning")
76
76
 
77
77
  def get_book_chapter(
78
- self, book_id: str, chapter_id: str, wait_time: Optional[int] = None
78
+ self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
79
79
  ) -> str:
80
80
  """
81
81
  Fetch the raw HTML (or JSON) of a single chapter.
@@ -266,7 +266,7 @@ class QidianBrowser(BaseBrowser):
266
266
  """
267
267
  return self.QIDIAN_BOOKCASE_URL
268
268
 
269
- def get_book_info(self, book_id: str, wait_time: Optional[int] = None) -> str:
269
+ def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
270
270
  """
271
271
  Retrieve the HTML of a Qidian book info page.
272
272
 
@@ -311,7 +311,7 @@ class QidianBrowser(BaseBrowser):
311
311
  time.sleep(pause)
312
312
 
313
313
  def get_book_chapter(
314
- self, book_id: str, chapter_id: str, wait_time: Optional[int] = None
314
+ self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
315
315
  ) -> str:
316
316
  """
317
317
  Retrieve the HTML content of a specific chapter.
@@ -347,7 +347,7 @@ class QidianBrowser(BaseBrowser):
347
347
  logger.warning("[fetch] Error fetching chapter from '%s': %s", url, e)
348
348
  return ""
349
349
 
350
- def get_bookcase(self, wait_time: Optional[int] = None) -> str:
350
+ def get_bookcase(self, wait_time: Optional[float] = None) -> str:
351
351
  """
352
352
  Retrieve the HTML content of the logged‑in user's Qidian bookcase page.
353
353
 
@@ -70,7 +70,7 @@ class QidianSession(BaseSession):
70
70
  3. Updates both the live ``requests.Session`` and the internal cache;
71
71
  4. Delegates the actual request to ``super().get``.
72
72
  """
73
- if self._session is None: # defensive mirrors BaseSession check
73
+ if self._session is None: # defensive - mirrors BaseSession check
74
74
  raise RuntimeError("Session is not initialized or has been shut down.")
75
75
 
76
76
  # ---- 1. refresh token cookie --------------------------------------
@@ -108,7 +108,7 @@ class QidianSession(BaseSession):
108
108
  self.get("https://www.qidian.com")
109
109
  return True
110
110
 
111
- def get_book_info(self, book_id: str, wait_time: Optional[int] = None) -> str:
111
+ def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
112
112
  """
113
113
  Fetch the raw HTML of the book info page.
114
114
 
@@ -140,7 +140,7 @@ class QidianSession(BaseSession):
140
140
  raise RuntimeError("Unexpected fall-through in get_book_info")
141
141
 
142
142
  def get_book_chapter(
143
- self, book_id: str, chapter_id: str, wait_time: Optional[int] = None
143
+ self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
144
144
  ) -> str:
145
145
  """
146
146
  Fetch the HTML of a single chapter.
@@ -174,7 +174,7 @@ class QidianSession(BaseSession):
174
174
 
175
175
  raise RuntimeError("Unexpected fall-through in get_book_chapter")
176
176
 
177
- def get_bookcase(self, wait_time: Optional[int] = None) -> str:
177
+ def get_bookcase(self, wait_time: Optional[float] = None) -> str:
178
178
  """
179
179
  Retrieve the user's *bookcase* page.
180
180
 
@@ -1,9 +1,10 @@
1
1
  # 网络请求层设置
2
2
  requests:
3
- wait_time: 5 # 每次请求等待时间 (秒)
3
+ wait_time: 5.0 # 每次请求等待时间 (秒)
4
4
  retry_times: 3 # 请求失败重试次数
5
- retry_interval: 5
6
- timeout: 30 # 页面加载超时时间 (秒)
5
+ retry_interval: 5.0
6
+ timeout: 30.0 # 页面加载超时时间 (秒)
7
+ max_rps: null # 最大请求速率 (requests per second), 为 null 则不限制
7
8
  # DrissionPage 专用设置
8
9
  headless: false # 是否以无头模式启动浏览器
9
10
  user_data_folder: "" # 浏览器用户数据目录: 为空则使用默认目录
@@ -14,15 +15,29 @@ requests:
14
15
 
15
16
  # 全局通用设置
16
17
  general:
17
- request_interval: 5 # 同一本书各章节请求间隔 (秒)
18
+ request_interval: 5.0 # 同一本书各章节请求间隔 (秒)
18
19
  raw_data_dir: "./raw_data" # 原始章节 HTML/JSON 存放目录
19
20
  output_dir: "./downloads" # 最终输出文件存放目录
20
21
  cache_dir: "./novel_cache" # 本地缓存目录 (字体 / 图片等)
21
- max_threads: 4 # 最大并发下载线程数 (未实现)
22
+ download_workers: 4 # 并发下载线程数
23
+ parser_workers: 4 # 并发解析线程数
24
+ use_process_pool: false # 是否使用多进程池来处理任务
22
25
  skip_existing: true # 是否跳过已存在章节
23
26
  debug:
24
27
  save_html: false # 是否将抓取到的原始 HTML 保留到磁盘
25
28
  log_level: "INFO" # 日志级别: DEBUG, INFO, WARNING, ERROR
29
+ font_ocr:
30
+ decode_font: false # 是否尝试本地解码混淆字体
31
+ use_freq: false # 是否使用频率分析
32
+ ocr_version: "v2.0" # "v1.0" / "v2.0"
33
+ use_ocr: true # 是否使用 OCR 辅助识别文本
34
+ use_vec: false # 是否使用 Vector 辅助识别文本
35
+ save_font_debug: false # 是否保存字体解码调试数据
36
+ batch_size: 32
37
+ gpu_mem: 500 # GPU 显存限制 (MB)
38
+ gpu_id: null # 使用哪个 GPU
39
+ ocr_weight: 0.6
40
+ vec_weight: 0.4
26
41
 
27
42
  # 各站点的特定配置
28
43
  sites:
@@ -35,15 +50,6 @@ sites:
35
50
  - "0000000000"
36
51
  mode: "browser" # browser / session
37
52
  login_required: true # 是否需要登录才能访问
38
- decode_font: false # 是否尝试本地解码混淆字体
39
- use_freq: false # 是否使用频率分析
40
- ocr_version: "v2.0" # "v1.0" / "v2.0"
41
- use_ocr: true # 是否使用 OCR 辅助识别文本
42
- use_vec: false # 是否使用 Vector 辅助识别文本
43
- save_font_debug: false # 是否保存字体解码调试数据
44
- batch_size: 32
45
- ocr_weight: 0.6
46
- vec_weight: 0.4
47
53
  #
48
54
  sample_site:
49
55
  book_ids:
@@ -108,17 +108,17 @@ def patch_qd_payload_token(
108
108
  if not key:
109
109
  key = _get_key()
110
110
 
111
- # Step 1 decrypt --------------------------------------------------
111
+ # Step 1 - decrypt --------------------------------------------------
112
112
  decrypted_json: str = rc4_crypt(key, enc_token, mode="decrypt")
113
113
  payload: Dict[str, Any] = json.loads(decrypted_json)
114
114
 
115
- # Step 2 rebuild timing fields -----------------------------------
115
+ # Step 2 - rebuild timing fields -----------------------------------
116
116
  loadts = int(time.time() * 1000) # ms since epoch
117
117
  # Simulate the JS duration: N(600, 150) pushed into [300, 1000]
118
118
  duration = max(300, min(1000, int(random.normalvariate(600, 150))))
119
119
  timestamp = loadts + duration
120
120
 
121
- # Step 3 recalculate ------------------------------------
121
+ # Step 3 - recalculate ------------------------------------
122
122
  fp_key = _d("ZmluZ2VycHJpbnQ=")
123
123
  ab_key = _d("YWJub3JtYWw=")
124
124
  ck_key = _d("Y2hlY2tzdW0=")
@@ -138,7 +138,7 @@ def patch_qd_payload_token(
138
138
  ck_key: ck_val,
139
139
  }
140
140
 
141
- # Step 4 encrypt and return --------------------------------------
141
+ # Step 4 - encrypt and return --------------------------------------
142
142
  return rc4_crypt(
143
143
  key, json.dumps(new_payload, separators=(",", ":")), mode="encrypt"
144
144
  )
@@ -221,6 +221,8 @@ class FontOCRV2:
221
221
  use_ocr: bool = True,
222
222
  use_vec: bool = False,
223
223
  batch_size: int = 32,
224
+ gpu_mem: int = 500,
225
+ gpu_id: Optional[int] = None,
224
226
  ocr_weight: float = 0.6,
225
227
  vec_weight: float = 0.4,
226
228
  ocr_version: str = "v1.0",
@@ -232,6 +234,8 @@ class FontOCRV2:
232
234
  self.use_ocr = use_ocr
233
235
  self.use_vec = use_vec
234
236
  self.batch_size = batch_size
237
+ self.gpu_mem = gpu_mem
238
+ self.gpu_id = gpu_id
235
239
  self.ocr_weight = ocr_weight
236
240
  self.vec_weight = vec_weight
237
241
  self.ocr_version = ocr_version
@@ -279,6 +283,8 @@ class FontOCRV2:
279
283
  rec_batch_num=self.batch_size,
280
284
  use_space_char=False,
281
285
  use_gpu=gpu_available,
286
+ gpu_mem=self.gpu_mem,
287
+ gpu_id=self.gpu_id,
282
288
  )
283
289
 
284
290
  def _load_char_freq_db(self) -> bool: