novel-downloader 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +2 -0
- novel_downloader/config/adapter.py +41 -13
- novel_downloader/config/models.py +13 -8
- novel_downloader/core/downloaders/base_async_downloader.py +1 -1
- novel_downloader/core/downloaders/common_downloader.py +1 -2
- novel_downloader/core/downloaders/qidian_downloader.py +1 -2
- novel_downloader/core/factory/downloader_factory.py +13 -11
- novel_downloader/core/interfaces/async_requester_protocol.py +9 -4
- novel_downloader/core/interfaces/requester_protocol.py +7 -4
- novel_downloader/core/parsers/base_parser.py +3 -3
- novel_downloader/core/parsers/common_parser/helper.py +7 -5
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +1 -1
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +5 -3
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +1 -1
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +5 -3
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +74 -18
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +2 -2
- novel_downloader/core/requesters/base_async_session.py +11 -6
- novel_downloader/core/requesters/base_browser.py +12 -8
- novel_downloader/core/requesters/base_session.py +9 -6
- novel_downloader/core/requesters/common_requester/common_async_session.py +4 -2
- novel_downloader/core/requesters/common_requester/common_session.py +4 -4
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +38 -19
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +6 -6
- novel_downloader/core/savers/common_saver/common_epub.py +1 -1
- novel_downloader/locales/en.json +4 -0
- novel_downloader/locales/zh.json +4 -0
- novel_downloader/resources/config/settings.yaml +16 -13
- novel_downloader/utils/constants.py +2 -1
- novel_downloader/utils/fontocr/ocr_v2.py +6 -0
- novel_downloader/utils/time_utils/datetime_utils.py +1 -1
- novel_downloader/utils/time_utils/sleep_utils.py +27 -11
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/METADATA +1 -1
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/RECORD +39 -39
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/WHEEL +1 -1
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,7 @@ cookie handling, and defines abstract methods for subclasses.
|
|
13
13
|
import abc
|
14
14
|
import asyncio
|
15
15
|
import time
|
16
|
-
from typing import Any, Dict, Optional, Union
|
16
|
+
from typing import Any, Dict, Literal, Optional, Union
|
17
17
|
|
18
18
|
import aiohttp
|
19
19
|
from aiohttp import ClientResponse, ClientSession, ClientTimeout, TCPConnector
|
@@ -51,13 +51,16 @@ class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
|
|
51
51
|
|
52
52
|
Attributes:
|
53
53
|
_session (ClientSession): The persistent aiohttp client session.
|
54
|
-
_timeout (
|
54
|
+
_timeout (float): Timeout for each request in seconds.
|
55
55
|
_retry_times (int): Number of retry attempts on failure.
|
56
56
|
_retry_interval (float): Delay (in seconds) between retries.
|
57
57
|
_headers (Dict[str, str]): Default HTTP headers to send.
|
58
58
|
_cookies (Dict[str, str]): Optional cookie jar for the session.
|
59
59
|
"""
|
60
60
|
|
61
|
+
def is_async(self) -> Literal[True]:
|
62
|
+
return True
|
63
|
+
|
61
64
|
def _init_session(
|
62
65
|
self,
|
63
66
|
config: RequesterConfig,
|
@@ -111,7 +114,9 @@ class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
|
|
111
114
|
)
|
112
115
|
|
113
116
|
@abc.abstractmethod
|
114
|
-
async def get_book_info(
|
117
|
+
async def get_book_info(
|
118
|
+
self, book_id: str, wait_time: Optional[float] = None
|
119
|
+
) -> str:
|
115
120
|
"""
|
116
121
|
Fetch the raw HTML (or JSON) of the book info page asynchronously.
|
117
122
|
|
@@ -123,7 +128,7 @@ class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
|
|
123
128
|
|
124
129
|
@abc.abstractmethod
|
125
130
|
async def get_book_chapter(
|
126
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
131
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
127
132
|
) -> str:
|
128
133
|
"""
|
129
134
|
Fetch the raw HTML (or JSON) of a single chapter asynchronously.
|
@@ -135,7 +140,7 @@ class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
|
|
135
140
|
"""
|
136
141
|
...
|
137
142
|
|
138
|
-
async def get_bookcase(self, wait_time: Optional[
|
143
|
+
async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
139
144
|
"""
|
140
145
|
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
141
146
|
Subclasses that support user login/bookcase should override this.
|
@@ -238,7 +243,7 @@ class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
|
|
238
243
|
return self._session
|
239
244
|
|
240
245
|
@property
|
241
|
-
def timeout(self) ->
|
246
|
+
def timeout(self) -> float:
|
242
247
|
"""Return the default timeout setting."""
|
243
248
|
return self._timeout
|
244
249
|
|
@@ -11,9 +11,10 @@ specialized purposes.
|
|
11
11
|
|
12
12
|
import abc
|
13
13
|
import logging
|
14
|
-
from typing import Any, Dict, Optional
|
14
|
+
from typing import Any, Dict, Literal, Optional, cast
|
15
15
|
|
16
|
-
from DrissionPage import Chromium, ChromiumOptions
|
16
|
+
from DrissionPage import Chromium, ChromiumOptions
|
17
|
+
from DrissionPage._pages.mix_tab import MixTab
|
17
18
|
|
18
19
|
from novel_downloader.config.models import RequesterConfig
|
19
20
|
from novel_downloader.core.interfaces import RequesterProtocol
|
@@ -42,6 +43,9 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
42
43
|
_page (ChromiumPage): The active browser tab.
|
43
44
|
"""
|
44
45
|
|
46
|
+
def is_async(self) -> Literal[False]:
|
47
|
+
return False
|
48
|
+
|
45
49
|
def _init_browser(self, config: RequesterConfig) -> None:
|
46
50
|
"""
|
47
51
|
Initialize the browser with specified options from RequesterConfig.
|
@@ -99,7 +103,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
99
103
|
Set up the browser instance and open the default tab.
|
100
104
|
"""
|
101
105
|
self._browser = Chromium(self._options)
|
102
|
-
self._page = self._browser.get_tab()
|
106
|
+
self._page = cast(MixTab, self._browser.get_tab())
|
103
107
|
|
104
108
|
def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
|
105
109
|
"""
|
@@ -111,7 +115,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
111
115
|
)
|
112
116
|
|
113
117
|
@abc.abstractmethod
|
114
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
118
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
115
119
|
"""
|
116
120
|
Fetch the raw HTML (or JSON) of the book info page.
|
117
121
|
|
@@ -123,7 +127,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
123
127
|
|
124
128
|
@abc.abstractmethod
|
125
129
|
def get_book_chapter(
|
126
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
130
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
127
131
|
) -> str:
|
128
132
|
"""
|
129
133
|
Fetch the raw HTML (or JSON) of a single chapter.
|
@@ -135,7 +139,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
135
139
|
"""
|
136
140
|
...
|
137
141
|
|
138
|
-
def get_bookcase(self, wait_time: Optional[
|
142
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
139
143
|
"""
|
140
144
|
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
141
145
|
|
@@ -151,7 +155,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
151
155
|
)
|
152
156
|
|
153
157
|
@property
|
154
|
-
def page(self) ->
|
158
|
+
def page(self) -> Optional[MixTab]:
|
155
159
|
"""
|
156
160
|
Return the current Chromium page object.
|
157
161
|
|
@@ -160,7 +164,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
160
164
|
return self._page
|
161
165
|
|
162
166
|
@property
|
163
|
-
def browser(self) -> Chromium:
|
167
|
+
def browser(self) -> Optional[Chromium]:
|
164
168
|
"""
|
165
169
|
Return the Chromium browser instance.
|
166
170
|
|
@@ -10,7 +10,7 @@ persistent session and supports retries, headers, and timeout configurations.
|
|
10
10
|
"""
|
11
11
|
|
12
12
|
import abc
|
13
|
-
from typing import Any, Dict, Optional, Union
|
13
|
+
from typing import Any, Dict, Literal, Optional, Union
|
14
14
|
|
15
15
|
import requests
|
16
16
|
from requests import Response, Session
|
@@ -28,9 +28,12 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
28
28
|
|
29
29
|
Attributes:
|
30
30
|
_session (requests.Session): The persistent HTTP session.
|
31
|
-
_timeout (
|
31
|
+
_timeout (float): Timeout for each request in seconds.
|
32
32
|
"""
|
33
33
|
|
34
|
+
def is_async(self) -> Literal[False]:
|
35
|
+
return False
|
36
|
+
|
34
37
|
def _init_session(
|
35
38
|
self, config: RequesterConfig, cookies: Optional[Dict[str, str]] = None
|
36
39
|
) -> None:
|
@@ -81,7 +84,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
81
84
|
)
|
82
85
|
|
83
86
|
@abc.abstractmethod
|
84
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
87
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
85
88
|
"""
|
86
89
|
Fetch the raw HTML (or JSON) of the book info page.
|
87
90
|
|
@@ -93,7 +96,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
93
96
|
|
94
97
|
@abc.abstractmethod
|
95
98
|
def get_book_chapter(
|
96
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
99
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
97
100
|
) -> str:
|
98
101
|
"""
|
99
102
|
Fetch the raw HTML (or JSON) of a single chapter.
|
@@ -105,7 +108,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
105
108
|
"""
|
106
109
|
...
|
107
110
|
|
108
|
-
def get_bookcase(self, wait_time: Optional[
|
111
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
109
112
|
"""
|
110
113
|
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
111
114
|
|
@@ -171,7 +174,7 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
171
174
|
return self._session
|
172
175
|
|
173
176
|
@property
|
174
|
-
def timeout(self) ->
|
177
|
+
def timeout(self) -> float:
|
175
178
|
"""Return the default timeout setting."""
|
176
179
|
return self._timeout
|
177
180
|
|
@@ -45,7 +45,9 @@ class CommonAsyncSession(BaseAsyncSession):
|
|
45
45
|
self._site = site
|
46
46
|
self._profile = profile
|
47
47
|
|
48
|
-
async def get_book_info(
|
48
|
+
async def get_book_info(
|
49
|
+
self, book_id: str, wait_time: Optional[float] = None
|
50
|
+
) -> str:
|
49
51
|
"""
|
50
52
|
Fetch the raw HTML of the book info page asynchronously.
|
51
53
|
|
@@ -62,7 +64,7 @@ class CommonAsyncSession(BaseAsyncSession):
|
|
62
64
|
return html
|
63
65
|
|
64
66
|
async def get_book_chapter(
|
65
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
67
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
66
68
|
) -> str:
|
67
69
|
"""
|
68
70
|
Fetch the raw HTML of a single chapter asynchronously.
|
@@ -47,7 +47,7 @@ class CommonSession(BaseSession):
|
|
47
47
|
self._site = site
|
48
48
|
self._profile = profile
|
49
49
|
|
50
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
50
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
51
51
|
"""
|
52
52
|
Fetch the raw HTML (or JSON) of the book info page.
|
53
53
|
|
@@ -64,7 +64,7 @@ class CommonSession(BaseSession):
|
|
64
64
|
with self.session.get(url, timeout=self.timeout) as response:
|
65
65
|
response.raise_for_status()
|
66
66
|
content = response.text
|
67
|
-
sleep_with_random_delay(base)
|
67
|
+
sleep_with_random_delay(base, add_spread=1.0)
|
68
68
|
return content
|
69
69
|
except Exception as e:
|
70
70
|
if attempt == self.retry_times:
|
@@ -75,7 +75,7 @@ class CommonSession(BaseSession):
|
|
75
75
|
raise RuntimeError("Unexpected error: get_book_info failed without returning")
|
76
76
|
|
77
77
|
def get_book_chapter(
|
78
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
78
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
79
79
|
) -> str:
|
80
80
|
"""
|
81
81
|
Fetch the raw HTML (or JSON) of a single chapter.
|
@@ -94,7 +94,7 @@ class CommonSession(BaseSession):
|
|
94
94
|
with self.session.get(url, timeout=self.timeout) as response:
|
95
95
|
response.raise_for_status()
|
96
96
|
content = response.text
|
97
|
-
sleep_with_random_delay(base)
|
97
|
+
sleep_with_random_delay(base, add_spread=1.0)
|
98
98
|
return content
|
99
99
|
except Exception as e:
|
100
100
|
if attempt == self.retry_times:
|
@@ -15,10 +15,12 @@ import random
|
|
15
15
|
import time
|
16
16
|
from typing import Optional
|
17
17
|
|
18
|
+
from DrissionPage._elements.chromium_element import ChromiumElement
|
18
19
|
from DrissionPage.common import Keys
|
19
20
|
|
20
21
|
from novel_downloader.config.models import RequesterConfig
|
21
22
|
from novel_downloader.core.requesters.base_browser import BaseBrowser
|
23
|
+
from novel_downloader.utils.i18n import t
|
22
24
|
from novel_downloader.utils.time_utils import sleep_with_random_delay
|
23
25
|
|
24
26
|
logger = logging.getLogger(__name__)
|
@@ -56,6 +58,8 @@ class QidianBrowser(BaseBrowser):
|
|
56
58
|
|
57
59
|
:return: True if the user appears to be logged in, False otherwise.
|
58
60
|
"""
|
61
|
+
if self._page is None:
|
62
|
+
raise RuntimeError("Browser page not initialized.")
|
59
63
|
try:
|
60
64
|
self._handle_overlay_mask()
|
61
65
|
sign_in_elem = self._page.ele("@class=sign-in")
|
@@ -83,6 +87,8 @@ class QidianBrowser(BaseBrowser):
|
|
83
87
|
:param max_retries: Maximum number of times to try clicking the login button.
|
84
88
|
:return: True if login succeeds or is already in place; False otherwise.
|
85
89
|
"""
|
90
|
+
if self._page is None:
|
91
|
+
raise RuntimeError("Browser page not initialized.")
|
86
92
|
original_url = self._page.url
|
87
93
|
try:
|
88
94
|
self._page.get("https://www.qidian.com/")
|
@@ -107,7 +113,8 @@ class QidianBrowser(BaseBrowser):
|
|
107
113
|
|
108
114
|
# return to original page
|
109
115
|
try:
|
110
|
-
|
116
|
+
if original_url:
|
117
|
+
self._page.get(original_url)
|
111
118
|
except Exception as e:
|
112
119
|
logger.debug("[auth] Failed to restore page URL: %s", e)
|
113
120
|
|
@@ -117,6 +124,8 @@ class QidianBrowser(BaseBrowser):
|
|
117
124
|
"""
|
118
125
|
Detect and close any full-page overlay mask that might block the login UI.
|
119
126
|
"""
|
127
|
+
if self._page is None:
|
128
|
+
raise RuntimeError("Browser page not initialized.")
|
120
129
|
try:
|
121
130
|
mask = self._page.ele("@@tag()=div@@class=mask", timeout=2)
|
122
131
|
if not mask:
|
@@ -143,10 +152,12 @@ class QidianBrowser(BaseBrowser):
|
|
143
152
|
|
144
153
|
:param attempt: The current attempt number (for logging).
|
145
154
|
"""
|
155
|
+
if self._page is None:
|
156
|
+
raise RuntimeError("Browser page not initialized.")
|
146
157
|
try:
|
147
158
|
logger.debug("[auth] Attempting login click (#%s).", attempt)
|
148
159
|
login_btn = self._page.ele("@id=login-btn", timeout=5)
|
149
|
-
if login_btn:
|
160
|
+
if isinstance(login_btn, ChromiumElement):
|
150
161
|
login_btn.click()
|
151
162
|
logger.debug("[auth] Login button clicked.")
|
152
163
|
else:
|
@@ -170,6 +181,8 @@ class QidianBrowser(BaseBrowser):
|
|
170
181
|
:param max_retries: Number of times to check for login success.
|
171
182
|
:return: True if login was detected, False otherwise.
|
172
183
|
"""
|
184
|
+
if self._page is None:
|
185
|
+
raise RuntimeError("Browser page not initialized.")
|
173
186
|
original_headless = self._headless
|
174
187
|
|
175
188
|
# 1. Switch to headful mode if needed
|
@@ -193,13 +206,11 @@ class QidianBrowser(BaseBrowser):
|
|
193
206
|
logger.info("[auth] Detected successful login.")
|
194
207
|
self._logged_in = True
|
195
208
|
break
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
attempt,
|
200
|
-
max_retries,
|
209
|
+
if attempt == 1:
|
210
|
+
print(t("login_prompt_intro"))
|
211
|
+
input(
|
212
|
+
t("login_prompt_press_enter", attempt=attempt, max_retries=max_retries)
|
201
213
|
)
|
202
|
-
input()
|
203
214
|
else:
|
204
215
|
logger.warning("[auth] Manual login failed after %d attempts.", max_retries)
|
205
216
|
self._logged_in = False
|
@@ -266,7 +277,7 @@ class QidianBrowser(BaseBrowser):
|
|
266
277
|
"""
|
267
278
|
return self.QIDIAN_BOOKCASE_URL
|
268
279
|
|
269
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
280
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
270
281
|
"""
|
271
282
|
Retrieve the HTML of a Qidian book info page.
|
272
283
|
|
@@ -279,14 +290,16 @@ class QidianBrowser(BaseBrowser):
|
|
279
290
|
If None, uses `self._config.wait_time`.
|
280
291
|
:return: The HTML content of the book info page, or an empty string on error.
|
281
292
|
"""
|
293
|
+
if self._page is None:
|
294
|
+
raise RuntimeError("Browser page not initialized.")
|
282
295
|
url = self._build_book_info_url(book_id)
|
283
296
|
try:
|
284
297
|
# Navigate and fetch
|
285
298
|
self._page.get(url)
|
286
299
|
|
287
|
-
# Randomized human
|
300
|
+
# Randomized human-like delay
|
288
301
|
base = wait_time if wait_time is not None else self._config.wait_time
|
289
|
-
sleep_with_random_delay(base,
|
302
|
+
sleep_with_random_delay(base, mul_spread=1.2)
|
290
303
|
|
291
304
|
html = str(self._page.html)
|
292
305
|
logger.debug("[fetch] Fetched book info for ID %s from %s", book_id, url)
|
@@ -303,6 +316,8 @@ class QidianBrowser(BaseBrowser):
|
|
303
316
|
:param presses: Number of DOWN key presses.
|
304
317
|
:param pause: Seconds to wait between each press.
|
305
318
|
"""
|
319
|
+
if self._page is None:
|
320
|
+
raise RuntimeError("Browser page not initialized.")
|
306
321
|
for _ in range(presses):
|
307
322
|
try:
|
308
323
|
self._page.actions.key_down(Keys.DOWN)
|
@@ -311,14 +326,14 @@ class QidianBrowser(BaseBrowser):
|
|
311
326
|
time.sleep(pause)
|
312
327
|
|
313
328
|
def get_book_chapter(
|
314
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
329
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
315
330
|
) -> str:
|
316
331
|
"""
|
317
332
|
Retrieve the HTML content of a specific chapter.
|
318
333
|
|
319
334
|
Ensures the user is logged in, navigates to the chapter page,
|
320
335
|
waits a randomized delay to mimic human reading, then scrolls
|
321
|
-
to trigger any lazy
|
336
|
+
to trigger any lazy-loaded content.
|
322
337
|
|
323
338
|
:param book_id: The identifier of the book.
|
324
339
|
:param chapter_id: The identifier of the chapter.
|
@@ -326,14 +341,16 @@ class QidianBrowser(BaseBrowser):
|
|
326
341
|
falls back to `self._config.wait_time`.
|
327
342
|
:return: The HTML content of the chapter page, or empty string on error.
|
328
343
|
"""
|
344
|
+
if self._page is None:
|
345
|
+
raise RuntimeError("Browser page not initialized.")
|
329
346
|
url = self._build_chapter_url(book_id, chapter_id)
|
330
347
|
try:
|
331
348
|
# 1. Navigate to chapter URL
|
332
349
|
self._page.get(url)
|
333
350
|
|
334
|
-
# 2. Randomized human
|
351
|
+
# 2. Randomized human-like delay
|
335
352
|
base = wait_time if wait_time is not None else self._config.wait_time
|
336
|
-
# sleep_with_random_delay(base,
|
353
|
+
# sleep_with_random_delay(base, mul_spread=1.2)
|
337
354
|
|
338
355
|
# 3. Scroll down to load dynamic content
|
339
356
|
presses = int(random.uniform(base, base + 5) * 2)
|
@@ -347,15 +364,17 @@ class QidianBrowser(BaseBrowser):
|
|
347
364
|
logger.warning("[fetch] Error fetching chapter from '%s': %s", url, e)
|
348
365
|
return ""
|
349
366
|
|
350
|
-
def get_bookcase(self, wait_time: Optional[
|
367
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
351
368
|
"""
|
352
|
-
Retrieve the HTML content of the logged
|
369
|
+
Retrieve the HTML content of the logged-in user's Qidian bookcase page.
|
353
370
|
|
354
371
|
:param wait_time: Base number of seconds to wait before returning content.
|
355
372
|
If None, falls back to `self._config.wait_time`.
|
356
373
|
:return: The HTML markup of the bookcase page, or empty string on error.
|
357
374
|
:raises RuntimeError: If the user is not logged in.
|
358
375
|
"""
|
376
|
+
if self._page is None:
|
377
|
+
raise RuntimeError("Browser page not initialized.")
|
359
378
|
if not self._logged_in:
|
360
379
|
raise RuntimeError("User not logged in. Please call login() first.")
|
361
380
|
|
@@ -364,9 +383,9 @@ class QidianBrowser(BaseBrowser):
|
|
364
383
|
# Navigate to the bookcase page
|
365
384
|
self._page.get(url)
|
366
385
|
|
367
|
-
# Randomized human
|
386
|
+
# Randomized human-like delay
|
368
387
|
base = wait_time if wait_time is not None else self._config.wait_time
|
369
|
-
sleep_with_random_delay(base,
|
388
|
+
sleep_with_random_delay(base, mul_spread=1.2)
|
370
389
|
|
371
390
|
html = str(self._page.html)
|
372
391
|
logger.debug("[fetch] Fetched bookcase HTML from %s", url)
|
@@ -108,7 +108,7 @@ class QidianSession(BaseSession):
|
|
108
108
|
self.get("https://www.qidian.com")
|
109
109
|
return True
|
110
110
|
|
111
|
-
def get_book_info(self, book_id: str, wait_time: Optional[
|
111
|
+
def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
|
112
112
|
"""
|
113
113
|
Fetch the raw HTML of the book info page.
|
114
114
|
|
@@ -123,7 +123,7 @@ class QidianSession(BaseSession):
|
|
123
123
|
try:
|
124
124
|
resp = self.get(url)
|
125
125
|
resp.raise_for_status()
|
126
|
-
sleep_with_random_delay(base_delay,
|
126
|
+
sleep_with_random_delay(base_delay, mul_spread=1.2)
|
127
127
|
return resp.text
|
128
128
|
except Exception as exc:
|
129
129
|
logger.warning(
|
@@ -140,7 +140,7 @@ class QidianSession(BaseSession):
|
|
140
140
|
raise RuntimeError("Unexpected fall-through in get_book_info")
|
141
141
|
|
142
142
|
def get_book_chapter(
|
143
|
-
self, book_id: str, chapter_id: str, wait_time: Optional[
|
143
|
+
self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
|
144
144
|
) -> str:
|
145
145
|
"""
|
146
146
|
Fetch the HTML of a single chapter.
|
@@ -157,7 +157,7 @@ class QidianSession(BaseSession):
|
|
157
157
|
try:
|
158
158
|
resp = self.get(url)
|
159
159
|
resp.raise_for_status()
|
160
|
-
sleep_with_random_delay(base_delay,
|
160
|
+
sleep_with_random_delay(base_delay, mul_spread=1.2)
|
161
161
|
return resp.text
|
162
162
|
except Exception as exc:
|
163
163
|
logger.warning(
|
@@ -174,7 +174,7 @@ class QidianSession(BaseSession):
|
|
174
174
|
|
175
175
|
raise RuntimeError("Unexpected fall-through in get_book_chapter")
|
176
176
|
|
177
|
-
def get_bookcase(self, wait_time: Optional[
|
177
|
+
def get_bookcase(self, wait_time: Optional[float] = None) -> str:
|
178
178
|
"""
|
179
179
|
Retrieve the user's *bookcase* page.
|
180
180
|
|
@@ -186,7 +186,7 @@ class QidianSession(BaseSession):
|
|
186
186
|
try:
|
187
187
|
resp = self.get(self.QIDIAN_BOOKCASE_URL, allow_redirects=True)
|
188
188
|
resp.raise_for_status()
|
189
|
-
sleep_with_random_delay(base_delay,
|
189
|
+
sleep_with_random_delay(base_delay, mul_spread=1.2)
|
190
190
|
return resp.text
|
191
191
|
except Exception as exc:
|
192
192
|
logger.warning(
|
@@ -212,7 +212,7 @@ def common_save_as_epub(
|
|
212
212
|
|
213
213
|
# --- 5. Finalize EPUB ---
|
214
214
|
logger.info("%s Building TOC and spine...", TAG)
|
215
|
-
book.toc =
|
215
|
+
book.toc = toc_list
|
216
216
|
book.spine = spine
|
217
217
|
book.add_item(epub.EpubNcx())
|
218
218
|
book.add_item(epub.EpubNav())
|
novel_downloader/locales/en.json
CHANGED
@@ -58,6 +58,7 @@
|
|
58
58
|
"download_option_site": "Website source, default is '{default}'.",
|
59
59
|
"download_using_config": "Using config: {path}",
|
60
60
|
"download_site_info": "Site: {site}",
|
61
|
+
"download_site_mode": "Mode: {mode}",
|
61
62
|
"download_no_ids": "No book IDs provided. Exiting.",
|
62
63
|
"download_fail_get_ids": "Failed to get book IDs from config: {err}",
|
63
64
|
"download_only_example": "Only example book IDs found (e.g. '{example}').",
|
@@ -65,6 +66,9 @@
|
|
65
66
|
"download_downloading": "Downloading book {book_id} from {site}...",
|
66
67
|
"download_prompt_parse": "Parse...",
|
67
68
|
|
69
|
+
"login_prompt_intro": "Manual login is required. Please switch to the browser and log in.",
|
70
|
+
"login_prompt_press_enter": "Attempt {attempt}/{max_retries}: Press Enter after completing login in the browser...",
|
71
|
+
|
68
72
|
"clean_logs": "Clean log directory",
|
69
73
|
"clean_cache": "Clean scripts and browser cache",
|
70
74
|
"clean_state": "Clean state files (state.json)",
|
novel_downloader/locales/zh.json
CHANGED
@@ -58,6 +58,7 @@
|
|
58
58
|
"download_option_site": "网站来源, 默认为 '{default}'",
|
59
59
|
"download_using_config": "使用配置: {path}",
|
60
60
|
"download_site_info": "站点: {site}",
|
61
|
+
"download_site_mode": "使用模式: {mode}",
|
61
62
|
"download_no_ids": "未提供书籍 ID, 正在退出",
|
62
63
|
"download_fail_get_ids": "从配置获取书籍 ID 失败: {err}",
|
63
64
|
"download_only_example": "只发现示例书籍 ID (例如 '{example}')",
|
@@ -65,6 +66,9 @@
|
|
65
66
|
"download_downloading": "正在从 {site} 下载书籍 {book_id}...",
|
66
67
|
"download_prompt_parse": "结束...",
|
67
68
|
|
69
|
+
"login_prompt_intro": "需要手动登录, 请切换到浏览器窗口完成登录",
|
70
|
+
"login_prompt_press_enter": "第 {attempt}/{max_retries} 次尝试: 请在浏览器中完成登录后按回车键...",
|
71
|
+
|
68
72
|
"clean_logs": "清理日志目录",
|
69
73
|
"clean_cache": "清理脚本和浏览器缓存",
|
70
74
|
"clean_state": "清理状态文件 (state.json)",
|
@@ -1,9 +1,9 @@
|
|
1
1
|
# 网络请求层设置
|
2
2
|
requests:
|
3
|
-
wait_time: 5
|
3
|
+
wait_time: 5.0 # 每次请求等待时间 (秒)
|
4
4
|
retry_times: 3 # 请求失败重试次数
|
5
|
-
retry_interval: 5
|
6
|
-
timeout: 30
|
5
|
+
retry_interval: 5.0
|
6
|
+
timeout: 30.0 # 页面加载超时时间 (秒)
|
7
7
|
max_rps: null # 最大请求速率 (requests per second), 为 null 则不限制
|
8
8
|
# DrissionPage 专用设置
|
9
9
|
headless: false # 是否以无头模式启动浏览器
|
@@ -15,7 +15,7 @@ requests:
|
|
15
15
|
|
16
16
|
# 全局通用设置
|
17
17
|
general:
|
18
|
-
request_interval: 5
|
18
|
+
request_interval: 5.0 # 同一本书各章节请求间隔 (秒)
|
19
19
|
raw_data_dir: "./raw_data" # 原始章节 HTML/JSON 存放目录
|
20
20
|
output_dir: "./downloads" # 最终输出文件存放目录
|
21
21
|
cache_dir: "./novel_cache" # 本地缓存目录 (字体 / 图片等)
|
@@ -26,6 +26,18 @@ general:
|
|
26
26
|
debug:
|
27
27
|
save_html: false # 是否将抓取到的原始 HTML 保留到磁盘
|
28
28
|
log_level: "INFO" # 日志级别: DEBUG, INFO, WARNING, ERROR
|
29
|
+
font_ocr:
|
30
|
+
decode_font: false # 是否尝试本地解码混淆字体
|
31
|
+
use_freq: false # 是否使用频率分析
|
32
|
+
ocr_version: "v2.0" # "v1.0" / "v2.0"
|
33
|
+
use_ocr: true # 是否使用 OCR 辅助识别文本
|
34
|
+
use_vec: false # 是否使用 Vector 辅助识别文本
|
35
|
+
save_font_debug: false # 是否保存字体解码调试数据
|
36
|
+
batch_size: 32
|
37
|
+
gpu_mem: 500 # GPU 显存限制 (MB)
|
38
|
+
gpu_id: null # 使用哪个 GPU
|
39
|
+
ocr_weight: 0.6
|
40
|
+
vec_weight: 0.4
|
29
41
|
|
30
42
|
# 各站点的特定配置
|
31
43
|
sites:
|
@@ -38,15 +50,6 @@ sites:
|
|
38
50
|
- "0000000000"
|
39
51
|
mode: "browser" # browser / session
|
40
52
|
login_required: true # 是否需要登录才能访问
|
41
|
-
decode_font: false # 是否尝试本地解码混淆字体
|
42
|
-
use_freq: false # 是否使用频率分析
|
43
|
-
ocr_version: "v2.0" # "v1.0" / "v2.0"
|
44
|
-
use_ocr: true # 是否使用 OCR 辅助识别文本
|
45
|
-
use_vec: false # 是否使用 Vector 辅助识别文本
|
46
|
-
save_font_debug: false # 是否保存字体解码调试数据
|
47
|
-
batch_size: 32
|
48
|
-
ocr_weight: 0.6
|
49
|
-
vec_weight: 0.4
|
50
53
|
#
|
51
54
|
sample_site:
|
52
55
|
book_ids:
|
@@ -26,11 +26,12 @@ LOGGER_NAME = PACKAGE_NAME # Root logger name
|
|
26
26
|
# -----------------------------------------------------------------------------
|
27
27
|
# Base config directory (e.g. ~/AppData/Local/novel_downloader/)
|
28
28
|
BASE_CONFIG_DIR = Path(user_config_dir(APP_DIR_NAME, appauthor=False))
|
29
|
+
WORK_DIR = Path.cwd()
|
29
30
|
PACKAGE_ROOT: Path = Path(__file__).parent.parent
|
30
31
|
LOCALES_DIR: Path = PACKAGE_ROOT / "locales"
|
31
32
|
|
32
33
|
# Subdirectories under BASE_CONFIG_DIR
|
33
|
-
LOGGER_DIR =
|
34
|
+
LOGGER_DIR = WORK_DIR / "logs"
|
34
35
|
JS_SCRIPT_DIR = BASE_CONFIG_DIR / "scripts"
|
35
36
|
STATE_DIR = BASE_CONFIG_DIR / "state"
|
36
37
|
DATA_DIR = BASE_CONFIG_DIR / "data"
|
@@ -221,6 +221,8 @@ class FontOCRV2:
|
|
221
221
|
use_ocr: bool = True,
|
222
222
|
use_vec: bool = False,
|
223
223
|
batch_size: int = 32,
|
224
|
+
gpu_mem: int = 500,
|
225
|
+
gpu_id: Optional[int] = None,
|
224
226
|
ocr_weight: float = 0.6,
|
225
227
|
vec_weight: float = 0.4,
|
226
228
|
ocr_version: str = "v1.0",
|
@@ -232,6 +234,8 @@ class FontOCRV2:
|
|
232
234
|
self.use_ocr = use_ocr
|
233
235
|
self.use_vec = use_vec
|
234
236
|
self.batch_size = batch_size
|
237
|
+
self.gpu_mem = gpu_mem
|
238
|
+
self.gpu_id = gpu_id
|
235
239
|
self.ocr_weight = ocr_weight
|
236
240
|
self.vec_weight = vec_weight
|
237
241
|
self.ocr_version = ocr_version
|
@@ -279,6 +283,8 @@ class FontOCRV2:
|
|
279
283
|
rec_batch_num=self.batch_size,
|
280
284
|
use_space_char=False,
|
281
285
|
use_gpu=gpu_available,
|
286
|
+
gpu_mem=self.gpu_mem,
|
287
|
+
gpu_id=self.gpu_id,
|
282
288
|
)
|
283
289
|
|
284
290
|
def _load_char_freq_db(self) -> bool:
|
@@ -106,7 +106,7 @@ def calculate_time_difference(
|
|
106
106
|
"""
|
107
107
|
Calculate the difference between two datetime values.
|
108
108
|
|
109
|
-
:param from_time_str: Date‐time string "YYYY
|
109
|
+
:param from_time_str: Date‐time string "YYYY-MM-DD HH:MM:SS" for the start.
|
110
110
|
:param tz_str: Timezone of from_time_str, e.g. 'UTC+8'. Defaults to 'UTC'.
|
111
111
|
:param to_time_str: Optional date‐time string for the end; if None, uses now().
|
112
112
|
:param to_tz_str: Timezone of to_time_str. Defaults to 'UTC'.
|