novel-downloader 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +14 -0
- novel_downloader/cli/__init__.py +14 -0
- novel_downloader/cli/clean.py +134 -0
- novel_downloader/cli/download.py +98 -0
- novel_downloader/cli/interactive.py +67 -0
- novel_downloader/cli/main.py +45 -0
- novel_downloader/cli/settings.py +177 -0
- novel_downloader/config/__init__.py +52 -0
- novel_downloader/config/adapter.py +150 -0
- novel_downloader/config/loader.py +177 -0
- novel_downloader/config/models.py +170 -0
- novel_downloader/config/site_rules.py +97 -0
- novel_downloader/core/__init__.py +25 -0
- novel_downloader/core/downloaders/__init__.py +20 -0
- novel_downloader/core/downloaders/base_downloader.py +187 -0
- novel_downloader/core/downloaders/common_downloader.py +192 -0
- novel_downloader/core/downloaders/qidian_downloader.py +208 -0
- novel_downloader/core/factory/__init__.py +21 -0
- novel_downloader/core/factory/downloader_factory.py +62 -0
- novel_downloader/core/factory/parser_factory.py +62 -0
- novel_downloader/core/factory/requester_factory.py +62 -0
- novel_downloader/core/factory/saver_factory.py +49 -0
- novel_downloader/core/interfaces/__init__.py +28 -0
- novel_downloader/core/interfaces/downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/parser_protocol.py +40 -0
- novel_downloader/core/interfaces/requester_protocol.py +65 -0
- novel_downloader/core/interfaces/saver_protocol.py +61 -0
- novel_downloader/core/parsers/__init__.py +28 -0
- novel_downloader/core/parsers/base_parser.py +96 -0
- novel_downloader/core/parsers/common_parser/__init__.py +14 -0
- novel_downloader/core/parsers/common_parser/helper.py +321 -0
- novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
- novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
- novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
- novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
- novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
- novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
- novel_downloader/core/requesters/__init__.py +27 -0
- novel_downloader/core/requesters/base_browser.py +210 -0
- novel_downloader/core/requesters/base_session.py +243 -0
- novel_downloader/core/requesters/common_requester/__init__.py +14 -0
- novel_downloader/core/requesters/common_requester/common_session.py +126 -0
- novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
- novel_downloader/core/savers/__init__.py +20 -0
- novel_downloader/core/savers/base_saver.py +169 -0
- novel_downloader/core/savers/common_saver/__init__.py +13 -0
- novel_downloader/core/savers/common_saver/common_epub.py +232 -0
- novel_downloader/core/savers/common_saver/common_txt.py +176 -0
- novel_downloader/core/savers/common_saver/main_saver.py +86 -0
- novel_downloader/core/savers/epub_utils/__init__.py +27 -0
- novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
- novel_downloader/core/savers/epub_utils/initializer.py +98 -0
- novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
- novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
- novel_downloader/core/savers/qidian_saver.py +22 -0
- novel_downloader/locales/en.json +91 -0
- novel_downloader/locales/zh.json +91 -0
- novel_downloader/resources/config/rules.toml +196 -0
- novel_downloader/resources/config/settings.yaml +70 -0
- novel_downloader/resources/css_styles/main.css +104 -0
- novel_downloader/resources/css_styles/volume-intro.css +56 -0
- novel_downloader/resources/images/volume_border.png +0 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
- novel_downloader/resources/json/replace_word_map.json +4 -0
- novel_downloader/resources/text/blacklist.txt +22 -0
- novel_downloader/utils/__init__.py +0 -0
- novel_downloader/utils/cache.py +24 -0
- novel_downloader/utils/constants.py +158 -0
- novel_downloader/utils/crypto_utils.py +144 -0
- novel_downloader/utils/file_utils/__init__.py +43 -0
- novel_downloader/utils/file_utils/io.py +252 -0
- novel_downloader/utils/file_utils/normalize.py +68 -0
- novel_downloader/utils/file_utils/sanitize.py +77 -0
- novel_downloader/utils/fontocr/__init__.py +23 -0
- novel_downloader/utils/fontocr/ocr_v1.py +304 -0
- novel_downloader/utils/fontocr/ocr_v2.py +658 -0
- novel_downloader/utils/hash_store.py +288 -0
- novel_downloader/utils/hash_utils.py +103 -0
- novel_downloader/utils/i18n.py +41 -0
- novel_downloader/utils/logger.py +104 -0
- novel_downloader/utils/model_loader.py +72 -0
- novel_downloader/utils/network.py +287 -0
- novel_downloader/utils/state.py +156 -0
- novel_downloader/utils/text_utils/__init__.py +27 -0
- novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
- novel_downloader/utils/text_utils/diff_display.py +75 -0
- novel_downloader/utils/text_utils/font_mapping.py +31 -0
- novel_downloader/utils/text_utils/text_cleaning.py +57 -0
- novel_downloader/utils/time_utils/__init__.py +22 -0
- novel_downloader/utils/time_utils/datetime_utils.py +146 -0
- novel_downloader/utils/time_utils/sleep_utils.py +49 -0
- novel_downloader-1.1.1.dist-info/METADATA +137 -0
- novel_downloader-1.1.1.dist-info/RECORD +109 -0
- novel_downloader-1.1.1.dist-info/WHEEL +5 -0
- novel_downloader-1.1.1.dist-info/entry_points.txt +2 -0
- novel_downloader-1.1.1.dist-info/licenses/LICENSE +21 -0
- novel_downloader-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,287 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.network
|
5
|
+
------------------------------
|
6
|
+
|
7
|
+
Utilities for handling HTTP requests and downloading remote resources.
|
8
|
+
"""
|
9
|
+
|
10
|
+
import logging
|
11
|
+
import random
|
12
|
+
import time
|
13
|
+
from pathlib import Path
|
14
|
+
from typing import Dict, Literal, Optional, Union
|
15
|
+
from urllib.parse import unquote, urlparse
|
16
|
+
|
17
|
+
import requests
|
18
|
+
|
19
|
+
from .constants import DEFAULT_HEADERS, DEFAULT_IMAGE_SUFFIX
|
20
|
+
from .file_utils.io import _get_non_conflicting_path, _write_file, read_binary_file
|
21
|
+
|
22
|
+
logger = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
_DEFAULT_CHUNK_SIZE = 8192 # 8KB per chunk for streaming downloads
|
25
|
+
|
26
|
+
|
27
|
+
def http_get_with_retry(
    url: str,
    *,
    retries: int = 3,
    timeout: int = 10,
    backoff: float = 0.5,
    headers: Optional[Dict[str, str]] = None,
    stream: bool = False,
) -> Optional[requests.Response]:
    """
    Issue an HTTP GET, retrying with exponential backoff on failure.

    :param url: URL to request.
    :param retries: Number of retry attempts.
    :param timeout: Timeout in seconds per request.
    :param backoff: Base backoff delay between retries.
    :param headers: Optional HTTP headers.
    :param stream: Whether to stream the response.
    :return: Response object if successful, else None.
    """
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            resp = requests.get(url, timeout=timeout, headers=headers, stream=stream)
            resp.raise_for_status()
            return resp
        except requests.RequestException as exc:
            logger.warning("[http] Attempt %s/%s failed: %s", attempt, retries, exc)
            if attempt < retries:
                # Exponential backoff plus a little jitter so concurrent
                # retries don't all fire at once.
                delay = backoff * (2 ** (attempt - 1)) + random.uniform(0, 0.1)
                time.sleep(delay)
        except Exception as exc:
            # Non-HTTP failures (likely programming errors) are not retried.
            logger.error("[http] Unexpected error: %s", exc)
            break

    logger.error("[http] Failed after %s attempts: %s", retries, url)
    return None
|
65
|
+
|
66
|
+
|
67
|
+
def image_url_to_filename(url: str) -> str:
    """
    Derive a safe image filename from a URL.

    Falls back to a default base name and extension when the URL path
    carries no filename or no suffix.

    :param url: URL string
    :return: Safe filename string
    """
    url_path = unquote(urlparse(url).path)
    name = Path(url_path).name or "image"
    if not Path(name).suffix:
        name = f"{name}{DEFAULT_IMAGE_SUFFIX}"
    return name
|
86
|
+
|
87
|
+
|
88
|
+
def download_image_as_bytes(
    url: str,
    target_folder: Optional[Union[str, Path]] = None,
    *,
    timeout: int = 10,
    retries: int = 3,
    backoff: float = 0.5,
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
) -> Optional[bytes]:
    """
    Fetch an image and return its raw bytes, optionally persisting it.

    With on_exist='skip' and an already-present file, the bytes are read
    back from disk instead of being downloaded again.

    :param url: Image URL. Can start with 'http', '//', or without protocol.
    :param target_folder: Optional folder to save the image (str or Path).
    :param timeout: Request timeout in seconds.
    :param retries: Number of retry attempts.
    :param backoff: Base delay between retries (exponential backoff).
    :param on_exist: What to do if file exists: 'overwrite', 'skip', or 'rename'.
    :return: Image content as bytes, or None if failed.
    """
    # Ensure the URL carries a scheme ('//host/...' and bare 'host/...' forms).
    if url.startswith("//"):
        url = "https:" + url
    elif not url.startswith("http"):
        url = "https://" + url

    save_path: Optional[Path] = None
    if target_folder:
        save_path = Path(target_folder) / image_url_to_filename(url)
        if on_exist == "skip" and save_path.exists():
            logger.info(
                "[image] '%s' exists, skipping download and reading from disk.",
                save_path,
            )
            return read_binary_file(save_path)

    # Proceed with download
    response = http_get_with_retry(
        url,
        retries=retries,
        timeout=timeout,
        backoff=backoff,
        headers=DEFAULT_HEADERS,
        stream=False,
    )

    if not (response and response.ok):
        return None

    data = response.content
    if save_path:
        _write_file(
            content=data,
            filepath=save_path,
            mode="wb",
            on_exist=on_exist,
        )
    return data
|
154
|
+
|
155
|
+
|
156
|
+
def download_font_file(
    url: str,
    target_folder: Union[str, Path],
    *,
    timeout: int = 10,
    retries: int = 3,
    backoff: float = 0.5,
    on_exist: Literal["overwrite", "skip", "rename"] = "skip",
) -> Optional[Path]:
    """
    Download a font file from a URL and save it locally with retry and
    overwrite control.

    :param url: Fully-qualified font file URL.
    :param target_folder: Local folder to save the font file.
    :param timeout: Timeout for each request (in seconds).
    :param retries: Number of retry attempts.
    :param backoff: Base backoff time between retries (in seconds).
    :param on_exist: File conflict strategy: 'overwrite', 'skip', or 'rename'.
    :return: Path to the saved font file, or None if failed.
    """
    # Validate and parse URL
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        logger.warning("[font] Invalid URL: %s", url)
        return None

    # Determine filename from the (percent-decoded) URL path
    filename = Path(unquote(parsed.path)).name
    if not filename:
        logger.warning("[font] Could not extract filename from URL: %s", url)
        return None

    # Resolve save path
    target_folder = Path(target_folder)
    target_folder.mkdir(parents=True, exist_ok=True)
    font_path = target_folder / filename

    # If skip and file exists -> return immediately
    if on_exist == "skip" and font_path.exists():
        logger.info("[font] File exists, skipping download: %s", font_path)
        return font_path

    # Retry download with exponential backoff (streamed to bound memory use)
    response = http_get_with_retry(
        url,
        retries=retries,
        timeout=timeout,
        backoff=backoff,
        headers=DEFAULT_HEADERS,
        stream=True,
    )

    if response:
        try:
            if on_exist == "rename":
                font_path = _get_non_conflicting_path(font_path)

            with open(font_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=_DEFAULT_CHUNK_SIZE):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

            logger.info("[font] Font saved to: %s", font_path)
            return font_path

        except Exception as e:
            logger.error("[font] Error writing font to disk: %s", e)
        finally:
            # A streamed response holds its connection until fully consumed;
            # close explicitly so the pooled connection is released even if
            # writing fails midway.
            response.close()

    return None
|
225
|
+
|
226
|
+
|
227
|
+
def download_js_file(
    url: str,
    target_folder: Union[str, Path],
    *,
    timeout: int = 10,
    retries: int = 3,
    backoff: float = 0.5,
    on_exist: Literal["overwrite", "skip", "rename"] = "skip",
) -> Optional[Path]:
    """
    Download a JavaScript (.js) file from a URL and save it locally.

    :param url: Fully-qualified JS file URL.
    :param target_folder: Local folder to save the JS file.
    :param timeout: Timeout for each request (in seconds).
    :param retries: Number of retry attempts.
    :param backoff: Base backoff time between retries (in seconds).
    :param on_exist: File conflict strategy: 'overwrite', 'skip', or 'rename'.
    :return: Path to the saved JS file, or None if failed.
    """
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        logger.warning("[js] Invalid URL: %s", url)
        return None

    # Determine filename; fall back to a sensible default rather than
    # producing a hidden '.js' file when the URL path has no basename.
    filename = Path(unquote(parsed.path)).name
    if not filename:
        filename = "script"
    if not filename.endswith(".js"):
        filename += ".js"

    target_folder = Path(target_folder)
    target_folder.mkdir(parents=True, exist_ok=True)
    save_path = target_folder / filename

    if on_exist == "skip" and save_path.exists():
        logger.info("[js] File exists, skipping download: %s", save_path)
        return save_path

    response = http_get_with_retry(
        url,
        retries=retries,
        timeout=timeout,
        backoff=backoff,
        headers=DEFAULT_HEADERS,
        stream=False,
    )

    if response and response.ok:
        content = response.content

        if on_exist == "rename":
            save_path = _get_non_conflicting_path(save_path)

        try:
            _write_file(content=content, filepath=save_path, mode="wb")
            logger.info("[js] JS file saved to: %s", save_path)
            return save_path
        except Exception as e:
            logger.error("[js] Error writing JS to disk: %s", e)

    return None
|
@@ -0,0 +1,156 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.state
|
5
|
+
----------------------------
|
6
|
+
State management for user preferences and runtime flags.
|
7
|
+
|
8
|
+
Supported sections:
|
9
|
+
- general: global preferences (e.g. language)
|
10
|
+
- sites: per-site flags & data (e.g. manual_login, cookies)
|
11
|
+
"""
|
12
|
+
import json
|
13
|
+
from pathlib import Path
|
14
|
+
from typing import Any, Dict, Union
|
15
|
+
|
16
|
+
from .constants import STATE_FILE
|
17
|
+
|
18
|
+
|
19
|
+
class StateManager:
    """
    Manages persistent state for user preferences and runtime flags.
    Stores data as JSON at STATE_FILE.

    Supported sections:
      - general: global preferences (e.g. language)
      - sites: per-site flags & data (e.g. manual_login, cookies)
    """

    def __init__(self, path: Path = STATE_FILE) -> None:
        self._path = path          # location of the JSON state file
        self._data = self._load()  # in-memory state, flushed by _save()

    def _load(self) -> Dict[str, Any]:
        """
        Load the state file into a dictionary.

        Any failure (missing file, unreadable content, invalid JSON) yields
        an empty state instead of raising, so a corrupt file never blocks
        startup.

        :return: A dict representing the full persisted state.
        """
        if not self._path.exists():
            return {}
        try:
            text = self._path.read_text(encoding="utf-8")
            data = json.loads(text)
        except Exception:
            return {}
        # Guard against valid-but-wrong JSON (e.g. a top-level list or
        # scalar), which would break every dict access downstream.
        return data if isinstance(data, dict) else {}

    def _save(self) -> None:
        """
        Write the in-memory state to the state file as pretty-printed
        UTF-8 JSON, creating parent directories as needed.
        """
        self._path.parent.mkdir(parents=True, exist_ok=True)
        content = json.dumps(self._data, ensure_ascii=False, indent=2)
        self._path.write_text(content, encoding="utf-8")

    def _parse_cookie_string(self, cookie_str: str) -> Dict[str, str]:
        """
        Parse a Cookie header string into a dict.

        :param cookie_str: e.g. 'k1=v1; k2=v2; k3'
        :return: mapping cookie names to values (missing '=' yields empty string)
        """
        cookies: Dict[str, str] = {}
        for item in cookie_str.split(";"):
            item = item.strip()
            if not item:
                continue
            if "=" in item:
                k, v = item.split("=", 1)
                cookies[k.strip()] = v.strip()
            else:
                cookies[item] = ""
        return cookies

    def get_language(self) -> str:
        """
        Load the user's language preference, defaulting to 'zh'.

        :return: Language code string
        """
        lang = self._data.get("general", {}).get("lang", "zh")
        return str(lang)

    def set_language(self, lang: str) -> None:
        """
        Save the user's language preference.

        :param lang: Language code (e.g. 'zh', 'en')
        """
        self._data.setdefault("general", {})["lang"] = lang
        self._save()

    def get_manual_login_flag(self, site: str) -> bool:
        """
        Retrieve the manual login requirement flag for a specific site.

        :param site: Site identifier (e.g. 'qidian', 'bqg')
        :return: True if manual login is required (defaults to True)
        """
        val = self._data.get("sites", {}).get(site, {}).get("manual_login", True)
        return bool(val)

    def set_manual_login_flag(self, site: str, flag: bool) -> None:
        """
        Set the 'manual_login' flag for a specific site.

        :param site: Site identifier (e.g. 'qidian', 'bqg')
        :param flag: True if the site requires manual login.
        """
        sites = self._data.setdefault("sites", {})
        site_data = sites.setdefault(site, {})
        site_data["manual_login"] = flag
        self._save()

    def get_cookies(self, site: str) -> Dict[str, str]:
        """
        Retrieve the persisted cookies for a specific site.

        :param site: Site identifier (e.g. 'qidian', 'bqg')
        :return: A dict mapping cookie names to values. Returns empty dict if not set.
        """
        cookies = self._data.get("sites", {}).get(site, {}).get("cookies", {})
        return {str(k): str(v) for k, v in cookies.items()}

    def set_cookies(self, site: str, cookies: Union[str, Dict[str, str]]) -> None:
        """
        Persist (overwrite) the cookies for a specific site.

        :param site: Site identifier (e.g. 'qidian', 'bqg')
        :param cookies: Either a dict mapping cookie names to values,
                        or a string (JSON or 'k=v; k2=v2') to be parsed.
        :raises TypeError: if cookies is neither str nor dict
        """
        # 1) normalize to dict
        if isinstance(cookies, dict):
            cookies_dict = cookies
        elif isinstance(cookies, str):
            # Try JSON first; fall back to 'k=v; k2=v2' header format.
            try:
                parsed = json.loads(cookies)
                if isinstance(parsed, dict):
                    cookies_dict = parsed
                else:
                    raise ValueError
            except Exception:
                cookies_dict = self._parse_cookie_string(cookies)
        else:
            raise TypeError("`cookies` must be a dict or a str")

        # 2) persist (stringify keys/values so the state stays JSON-safe)
        sites = self._data.setdefault("sites", {})
        site_data = sites.setdefault(site, {})
        site_data["cookies"] = {str(k): str(v) for k, v in cookies_dict.items()}
        self._save()
|
154
|
+
|
155
|
+
|
156
|
+
# Module-level singleton using the default state path (STATE_FILE);
# presumably shared by importers so all reads/writes hit one file.
state_mgr = StateManager()
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.text_utils
|
5
|
+
---------------------------------
|
6
|
+
|
7
|
+
Utility modules for text formatting, font mapping, cleaning, and diff display.
|
8
|
+
|
9
|
+
Submodules:
|
10
|
+
- font_mapping: Replace obfuscated characters using font maps
|
11
|
+
- chapter_formatting: Build structured chapter strings from raw content
|
12
|
+
- text_cleaning: Remove promo text and check for spam lines
|
13
|
+
- diff_display: Generate inline diffs with aligned character markers
|
14
|
+
"""
|
15
|
+
|
16
|
+
from .chapter_formatting import format_chapter
|
17
|
+
from .diff_display import diff_inline_display
|
18
|
+
from .font_mapping import apply_font_mapping
|
19
|
+
from .text_cleaning import clean_chapter_title, is_promotional_line
|
20
|
+
|
21
|
+
__all__ = [
|
22
|
+
"apply_font_mapping",
|
23
|
+
"format_chapter",
|
24
|
+
"clean_chapter_title",
|
25
|
+
"is_promotional_line",
|
26
|
+
"diff_inline_display",
|
27
|
+
]
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.text_utils.chapter_formatting
|
5
|
+
----------------------------------------------------
|
6
|
+
|
7
|
+
Format chapter content with title, paragraph blocks, and optional author notes.
|
8
|
+
"""
|
9
|
+
|
10
|
+
from typing import List, Optional
|
11
|
+
|
12
|
+
|
13
|
+
def format_chapter(
    title: str, paragraphs: str, author_say: Optional[str] = None
) -> str:
    """
    Assemble a chapter body from its title, paragraph text, and an
    optional author note, with blank lines between every part.

    :param title: The chapter title.
    :param paragraphs: Raw multi-line string; lines are treated as paragraphs.
    :param author_say: Optional author comment to append at the end.
    :return: A single string where title, paragraphs, and author note
             are separated by blank lines.
    """
    body: List[str] = [title.strip()]

    # Keep only non-empty paragraph lines, trimmed of surrounding whitespace.
    body.extend(
        stripped
        for stripped in (ln.strip() for ln in paragraphs.splitlines())
        if stripped
    )

    if author_say:
        note_lines = [ln.strip() for ln in author_say.splitlines() if ln.strip()]
        if note_lines:
            # Visually separate and label the author's note.
            body.append("---")
            body.append("作者说:")
            body.extend(note_lines)

    return "\n\n".join(body)
|
42
|
+
|
43
|
+
|
44
|
+
__all__ = [
|
45
|
+
"format_chapter",
|
46
|
+
]
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.text_utils.diff_display
|
5
|
+
----------------------------------------------
|
6
|
+
|
7
|
+
Generate inline character-level diff between two strings with visual markers.
|
8
|
+
"""
|
9
|
+
|
10
|
+
import difflib
|
11
|
+
import unicodedata
|
12
|
+
|
13
|
+
|
14
|
+
def _char_width_space(
|
15
|
+
c: str, normal_char: str = " ", asian_char: str = "\u3000"
|
16
|
+
) -> str:
|
17
|
+
"""
|
18
|
+
Return a placeholder space character matching the width of `c`.
|
19
|
+
|
20
|
+
Fullwidth (F) or Wide (W) characters map to `asian_char`, else `normal_char`.
|
21
|
+
|
22
|
+
:param c: A single character.
|
23
|
+
:param normal_char: Replacement for narrow chars (default U+0020).
|
24
|
+
:param asian_char: Replacement for wide chars (default U+3000).
|
25
|
+
:return: The appropriate space character.
|
26
|
+
"""
|
27
|
+
return asian_char if unicodedata.east_asian_width(c) in ("F", "W") else normal_char
|
28
|
+
|
29
|
+
|
30
|
+
def diff_inline_display(old_str: str, new_str: str) -> str:
    """
    Render a character-level inline diff of two strings, with '^' markers
    aligned under the characters that were deleted, inserted, or replaced.

    :param old_str: Original string (prefixed '-' will be trimmed).
    :param new_str: Modified string (prefixed '+' will be trimmed).
    :return: A multiline diff display with aligned markers.
    """
    space_1 = " "
    space_2 = "\u3000"
    mark_1 = "^"
    mark_2 = "\ufe3f"  # '人' / '\ufe3f' / '宀' / '立' / '八'

    # Clean leading +/- if present
    s1 = old_str.lstrip("-").strip()
    s2 = new_str.lstrip("+").strip()

    def _fill(segment: str, narrow: str, wide: str) -> str:
        # One filler char per source char, width-matched via East-Asian width.
        return "".join(
            wide if unicodedata.east_asian_width(c) in ("F", "W") else narrow
            for c in segment
        )

    left_marks: List[str] = []
    right_marks: List[str] = []
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, s1, s2).get_opcodes():
        if tag == "equal":
            left_marks.append(_fill(s1[i1:i2], space_1, space_2))
            right_marks.append(_fill(s2[j1:j2], space_1, space_2))
        elif tag == "delete":
            left_marks.append(_fill(s1[i1:i2], mark_1, mark_2))
        elif tag == "insert":
            right_marks.append(_fill(s2[j1:j2], mark_1, mark_2))
        elif tag == "replace":
            left_marks.append(_fill(s1[i1:i2], mark_1, mark_2))
            right_marks.append(_fill(s2[j1:j2], mark_1, mark_2))

    marker_s1 = "".join(left_marks)
    marker_s2 = "".join(right_marks)
    return f"-{s1}\n {marker_s1}\n+{s2}\n {marker_s2}"
|
71
|
+
|
72
|
+
|
73
|
+
__all__ = [
|
74
|
+
"diff_inline_display",
|
75
|
+
]
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.text_utils.font_mapping
|
5
|
+
----------------------------------------------
|
6
|
+
|
7
|
+
Utility for decoding obfuscated text by applying character-level font mapping.
|
8
|
+
|
9
|
+
This is commonly used to reverse font-based obfuscation in scraped content,
|
10
|
+
where characters are visually disguised via custom font glyphs but can be
|
11
|
+
recovered using a known mapping.
|
12
|
+
"""
|
13
|
+
|
14
|
+
from typing import Dict
|
15
|
+
|
16
|
+
|
17
|
+
def apply_font_mapping(text: str, font_map: Dict[str, str]) -> str:
    """
    Decode obfuscated text by substituting each character through
    `font_map`; characters without a mapping pass through unchanged.

    :param text: The input string, possibly containing obfuscated font chars.
    :param font_map: A dict mapping obfuscated chars to real chars.
    :return: The de-obfuscated text.
    """
    decoded = []
    for ch in text:
        decoded.append(font_map.get(ch, ch))
    return "".join(decoded)
|
27
|
+
|
28
|
+
|
29
|
+
__all__ = [
|
30
|
+
"apply_font_mapping",
|
31
|
+
]
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.text_utils.text_cleaning
|
5
|
+
-----------------------------------------------
|
6
|
+
|
7
|
+
Tools for detecting and removing promotional or ad-like content from text.
|
8
|
+
"""
|
9
|
+
|
10
|
+
import re
|
11
|
+
|
12
|
+
from novel_downloader.utils.file_utils.io import load_blacklisted_words
|
13
|
+
|
14
|
+
# --- Constants & Precompiled Patterns ---
|
15
|
+
|
16
|
+
_BLACKLISTED_WORDS = load_blacklisted_words()
|
17
|
+
|
18
|
+
_BRACKET_PATTERN = re.compile(r"[\((](.*?)[\))]")
|
19
|
+
_K_PROMO_PATTERN = re.compile(r"\b\d{1,4}k\b", re.IGNORECASE)
|
20
|
+
|
21
|
+
|
22
|
+
def clean_chapter_title(title: str) -> str:
    """
    Strip bracketed promotional snippets out of a chapter title.

    Any parenthesised segment (Chinese or English brackets) containing a
    blacklisted word is removed wholesale.

    :param title: Original title, possibly containing ad text in brackets.
    :return: Title with offending bracketed sections removed.
    """
    result = title
    for inner in _BRACKET_PATTERN.findall(title):
        has_promo = any(word in inner for word in _BLACKLISTED_WORDS)
        if has_promo:
            result = re.sub(rf"[\((]{re.escape(inner)}[\))]", "", result)
    return result.strip()
|
37
|
+
|
38
|
+
|
39
|
+
def is_promotional_line(line: str) -> bool:
    """
    Heuristically decide whether a text line is promotional/ad content.

    :param line: A single line of text.
    :return: True if it contains promo keywords or a '###k' vote count pattern.
    """
    lowered = line.lower()
    has_keyword = any(word in lowered for word in _BLACKLISTED_WORDS)
    return bool(has_keyword or _K_PROMO_PATTERN.search(lowered))
|
52
|
+
|
53
|
+
|
54
|
+
__all__ = [
|
55
|
+
"clean_chapter_title",
|
56
|
+
"is_promotional_line",
|
57
|
+
]
|