novel_downloader-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +132 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +153 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +173 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +22 -0
  15. novel_downloader/core/downloaders/base_async_downloader.py +157 -0
  16. novel_downloader/core/downloaders/base_downloader.py +187 -0
  17. novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
  18. novel_downloader/core/downloaders/common_downloader.py +191 -0
  19. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  20. novel_downloader/core/factory/__init__.py +33 -0
  21. novel_downloader/core/factory/downloader_factory.py +149 -0
  22. novel_downloader/core/factory/parser_factory.py +62 -0
  23. novel_downloader/core/factory/requester_factory.py +106 -0
  24. novel_downloader/core/factory/saver_factory.py +49 -0
  25. novel_downloader/core/interfaces/__init__.py +32 -0
  26. novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
  27. novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
  28. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  29. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  30. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  31. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  32. novel_downloader/core/parsers/__init__.py +28 -0
  33. novel_downloader/core/parsers/base_parser.py +96 -0
  34. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  35. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  36. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  37. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  39. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  40. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  41. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  42. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  43. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  44. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  45. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  46. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  47. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  48. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  49. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  50. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  51. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  52. novel_downloader/core/requesters/__init__.py +31 -0
  53. novel_downloader/core/requesters/base_async_session.py +297 -0
  54. novel_downloader/core/requesters/base_browser.py +210 -0
  55. novel_downloader/core/requesters/base_session.py +243 -0
  56. novel_downloader/core/requesters/common_requester/__init__.py +18 -0
  57. novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
  58. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  59. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  60. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  61. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  62. novel_downloader/core/savers/__init__.py +20 -0
  63. novel_downloader/core/savers/base_saver.py +169 -0
  64. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  65. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  66. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  67. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  68. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  69. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  70. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  71. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  72. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  73. novel_downloader/core/savers/qidian_saver.py +22 -0
  74. novel_downloader/locales/en.json +91 -0
  75. novel_downloader/locales/zh.json +91 -0
  76. novel_downloader/resources/config/rules.toml +196 -0
  77. novel_downloader/resources/config/settings.yaml +73 -0
  78. novel_downloader/resources/css_styles/main.css +104 -0
  79. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  80. novel_downloader/resources/images/volume_border.png +0 -0
  81. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  82. novel_downloader/resources/json/replace_word_map.json +4 -0
  83. novel_downloader/resources/text/blacklist.txt +22 -0
  84. novel_downloader/utils/__init__.py +0 -0
  85. novel_downloader/utils/cache.py +24 -0
  86. novel_downloader/utils/constants.py +158 -0
  87. novel_downloader/utils/crypto_utils.py +144 -0
  88. novel_downloader/utils/file_utils/__init__.py +43 -0
  89. novel_downloader/utils/file_utils/io.py +252 -0
  90. novel_downloader/utils/file_utils/normalize.py +68 -0
  91. novel_downloader/utils/file_utils/sanitize.py +77 -0
  92. novel_downloader/utils/fontocr/__init__.py +23 -0
  93. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  94. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  95. novel_downloader/utils/hash_store.py +288 -0
  96. novel_downloader/utils/hash_utils.py +103 -0
  97. novel_downloader/utils/i18n.py +41 -0
  98. novel_downloader/utils/logger.py +104 -0
  99. novel_downloader/utils/model_loader.py +72 -0
  100. novel_downloader/utils/network.py +287 -0
  101. novel_downloader/utils/state.py +156 -0
  102. novel_downloader/utils/text_utils/__init__.py +27 -0
  103. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  104. novel_downloader/utils/text_utils/diff_display.py +75 -0
  105. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  106. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  107. novel_downloader/utils/time_utils/__init__.py +22 -0
  108. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  109. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  110. novel_downloader-1.1.0.dist-info/METADATA +157 -0
  111. novel_downloader-1.1.0.dist-info/RECORD +115 -0
  112. novel_downloader-1.1.0.dist-info/WHEEL +5 -0
  113. novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
  114. novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
  115. novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
novel_downloader/utils/network.py
@@ -0,0 +1,287 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.network
+ ------------------------------
+
+ Utilities for handling HTTP requests and downloading remote resources.
+ """
+
+ import logging
+ import random
+ import time
+ from pathlib import Path
+ from typing import Dict, Literal, Optional, Union
+ from urllib.parse import unquote, urlparse
+
+ import requests
+
+ from .constants import DEFAULT_HEADERS, DEFAULT_IMAGE_SUFFIX
+ from .file_utils.io import _get_non_conflicting_path, _write_file, read_binary_file
+
+ logger = logging.getLogger(__name__)
+
+ _DEFAULT_CHUNK_SIZE = 8192  # 8 KB per chunk for streaming downloads
+
+
+ def http_get_with_retry(
+     url: str,
+     *,
+     retries: int = 3,
+     timeout: int = 10,
+     backoff: float = 0.5,
+     headers: Optional[Dict[str, str]] = None,
+     stream: bool = False,
+ ) -> Optional[requests.Response]:
+     """
+     Perform a GET request with retry support.
+
+     :param url: URL to request.
+     :param retries: Number of retry attempts.
+     :param timeout: Timeout in seconds per request.
+     :param backoff: Base backoff delay between retries.
+     :param headers: Optional HTTP headers.
+     :param stream: Whether to stream the response.
+     :return: Response object if successful, else None.
+     """
+     for attempt in range(1, retries + 1):
+         try:
+             response = requests.get(
+                 url, timeout=timeout, headers=headers, stream=stream
+             )
+             response.raise_for_status()
+             return response
+         except requests.RequestException as e:
+             logger.warning("[http] Attempt %s/%s failed: %s", attempt, retries, e)
+             if attempt < retries:
+                 sleep_time = backoff * (2 ** (attempt - 1)) + random.uniform(0, 0.1)
+                 time.sleep(sleep_time)
+         except Exception as e:
+             logger.error("[http] Unexpected error: %s", e)
+             break
+
+     logger.error("[http] Failed after %s attempts: %s", retries, url)
+     return None
+
+
+ def image_url_to_filename(url: str) -> str:
+     """
+     Parse and sanitize an image filename from a URL.
+     If no filename or suffix exists, fall back to a default name and extension.
+
+     :param url: URL string
+     :return: Safe filename string
+     """
+     parsed_url = urlparse(url)
+     path = unquote(parsed_url.path)
+     filename = Path(path).name
+
+     if not filename:
+         filename = "image"
+
+     if not Path(filename).suffix:
+         filename += DEFAULT_IMAGE_SUFFIX
+
+     return filename
+
+
+ def download_image_as_bytes(
+     url: str,
+     target_folder: Optional[Union[str, Path]] = None,
+     *,
+     timeout: int = 10,
+     retries: int = 3,
+     backoff: float = 0.5,
+     on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
+ ) -> Optional[bytes]:
+     """
+     Download an image from a given URL and return its content as bytes.
+
+     If on_exist='skip' and the file already exists, it will be read from disk
+     instead of being downloaded again.
+
+     :param url: Image URL. Can start with 'http', '//', or without protocol.
+     :param target_folder: Optional folder to save the image (str or Path).
+     :param timeout: Request timeout in seconds.
+     :param retries: Number of retry attempts.
+     :param backoff: Base delay between retries (exponential backoff).
+     :param on_exist: What to do if file exists: 'overwrite', 'skip', or 'rename'.
+     :return: Image content as bytes, or None if failed.
+     """
+     # Normalize URL
+     if url.startswith("//"):
+         url = "https:" + url
+     elif not url.startswith("http"):
+         url = "https://" + url
+
+     save_path = None
+     if target_folder:
+         target_folder = Path(target_folder)
+         filename = image_url_to_filename(url)
+         save_path = target_folder / filename
+
+         if on_exist == "skip" and save_path.exists():
+             logger.info(
+                 "[image] '%s' exists, skipping download and reading from disk.",
+                 save_path,
+             )
+             return read_binary_file(save_path)
+
+     # Proceed with download
+     response = http_get_with_retry(
+         url,
+         retries=retries,
+         timeout=timeout,
+         backoff=backoff,
+         headers=DEFAULT_HEADERS,
+         stream=False,
+     )
+
+     if response and response.ok:
+         content = response.content
+
+         if save_path:
+             _write_file(
+                 content=content,
+                 filepath=save_path,
+                 mode="wb",
+                 on_exist=on_exist,
+             )
+
+         return content
+
+     return None
+
+
+ def download_font_file(
+     url: str,
+     target_folder: Union[str, Path],
+     *,
+     timeout: int = 10,
+     retries: int = 3,
+     backoff: float = 0.5,
+     on_exist: Literal["overwrite", "skip", "rename"] = "skip",
+ ) -> Optional[Path]:
+     """
+     Download a font file from a URL and save it locally, with retry and overwrite control.
+
+     :param url: Fully-qualified font file URL.
+     :param target_folder: Local folder to save the font file.
+     :param timeout: Timeout for each request (in seconds).
+     :param retries: Number of retry attempts.
+     :param backoff: Base backoff time between retries (in seconds).
+     :param on_exist: File conflict strategy: 'overwrite', 'skip', or 'rename'.
+     :return: Path to the saved font file, or None if failed.
+     """
+     # Validate and parse URL
+     parsed = urlparse(url)
+     if not parsed.scheme or not parsed.netloc:
+         logger.warning("[font] Invalid URL: %s", url)
+         return None
+
+     # Determine filename
+     filename = Path(unquote(parsed.path)).name
+     if not filename:
+         logger.warning("[font] Could not extract filename from URL: %s", url)
+         return None
+
+     # Resolve save path
+     target_folder = Path(target_folder)
+     target_folder.mkdir(parents=True, exist_ok=True)
+     font_path = target_folder / filename
+
+     # If skip and file exists -> return immediately
+     if on_exist == "skip" and font_path.exists():
+         logger.info("[font] File exists, skipping download: %s", font_path)
+         return font_path
+
+     # Retry download with exponential backoff
+     response = http_get_with_retry(
+         url,
+         retries=retries,
+         timeout=timeout,
+         backoff=backoff,
+         headers=DEFAULT_HEADERS,
+         stream=True,
+     )
+
+     if response:
+         try:
+             if on_exist == "rename":
+                 font_path = _get_non_conflicting_path(font_path)
+
+             with open(font_path, "wb") as f:
+                 for chunk in response.iter_content(chunk_size=_DEFAULT_CHUNK_SIZE):
+                     if chunk:
+                         f.write(chunk)
+
+             logger.info("[font] Font saved to: %s", font_path)
+             return font_path
+
+         except Exception as e:
+             logger.error("[font] Error writing font to disk: %s", e)
+
+     return None
+
+
+ def download_js_file(
+     url: str,
+     target_folder: Union[str, Path],
+     *,
+     timeout: int = 10,
+     retries: int = 3,
+     backoff: float = 0.5,
+     on_exist: Literal["overwrite", "skip", "rename"] = "skip",
+ ) -> Optional[Path]:
+     """
+     Download a JavaScript (.js) file from a URL and save it locally.
+
+     :param url: Fully-qualified JS file URL.
+     :param target_folder: Local folder to save the JS file.
+     :param timeout: Timeout for each request (in seconds).
+     :param retries: Number of retry attempts.
+     :param backoff: Base backoff time between retries (in seconds).
+     :param on_exist: File conflict strategy: 'overwrite', 'skip', or 'rename'.
+     :return: Path to the saved JS file, or None if failed.
+     """
+     parsed = urlparse(url)
+     if not parsed.scheme or not parsed.netloc:
+         logger.warning("[js] Invalid URL: %s", url)
+         return None
+
+     # Determine filename
+     filename = Path(unquote(parsed.path)).name
+     if not filename.endswith(".js"):
+         filename += ".js"
+
+     target_folder = Path(target_folder)
+     target_folder.mkdir(parents=True, exist_ok=True)
+     save_path = target_folder / filename
+
+     if on_exist == "skip" and save_path.exists():
+         logger.info("[js] File exists, skipping download: %s", save_path)
+         return save_path
+
+     response = http_get_with_retry(
+         url,
+         retries=retries,
+         timeout=timeout,
+         backoff=backoff,
+         headers=DEFAULT_HEADERS,
+         stream=False,
+     )
+
+     if response and response.ok:
+         content = response.content
+
+         if on_exist == "rename":
+             save_path = _get_non_conflicting_path(save_path)
+
+         try:
+             _write_file(content=content, filepath=save_path, mode="wb")
+             logger.info("[js] JS file saved to: %s", save_path)
+             return save_path
+         except Exception as e:
+             logger.error("[js] Error writing JS to disk: %s", e)
+
+     return None
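
A minimal usage sketch for this module (not part of the package; the URLs and folder names are placeholders):

    from novel_downloader.utils.network import download_font_file, download_image_as_bytes

    # Protocol-relative and scheme-less URLs are normalized to https:// first.
    data = download_image_as_bytes("//example.com/cover.jpg", "covers", on_exist="skip")
    if data is not None:
        print(f"cover: {len(data)} bytes")

    # Font downloads stream to disk in 8 KB chunks; with the default on_exist='skip',
    # an existing file is returned without re-downloading.
    font = download_font_file("https://example.com/obfuscated.woff2", "fonts")
    print(font)  # e.g. fonts/obfuscated.woff2, or None on failure
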
novel_downloader/utils/state.py
@@ -0,0 +1,156 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.state
+ ----------------------------
+ State management for user preferences and runtime flags.
+
+ Supported sections:
+ - general: global preferences (e.g. language)
+ - sites: per-site flags & data (e.g. manual_login, cookies)
+ """
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Union
+
+ from .constants import STATE_FILE
+
+
+ class StateManager:
+     """
+     Manages persistent state for user preferences and runtime flags.
+     Stores data in JSON at STATE_FILE.
+     """
+
+     def __init__(self, path: Path = STATE_FILE) -> None:
+         self._path = path
+         self._data = self._load()
+
+     def _load(self) -> Dict[str, Any]:
+         """
+         Load the state file into a Python dictionary.
+
+         :return: A dict representing the full persisted state.
+         """
+         if not self._path.exists():
+             return {}
+         try:
+             text = self._path.read_text(encoding="utf-8")
+             return json.loads(text) or {}
+         except Exception:
+             return {}
+
+     def _save(self) -> None:
+         """
+         Save the current in-memory state to the state file.
+
+         Creates parent directories as needed and writes pretty-printed JSON.
+         """
+         self._path.parent.mkdir(parents=True, exist_ok=True)
+         content = json.dumps(self._data, ensure_ascii=False, indent=2)
+         self._path.write_text(content, encoding="utf-8")
+
+     def _parse_cookie_string(self, cookie_str: str) -> Dict[str, str]:
+         """
+         Parse a Cookie header string into a dict.
+
+         :param cookie_str: e.g. 'k1=v1; k2=v2; k3'
+         :return: mapping cookie names to values (missing '=' yields empty string)
+         :rtype: Dict[str, str]
+         """
+         cookies: Dict[str, str] = {}
+         for item in cookie_str.split(";"):
+             item = item.strip()
+             if not item:
+                 continue
+             if "=" in item:
+                 k, v = item.split("=", 1)
+                 cookies[k.strip()] = v.strip()
+             else:
+                 cookies[item] = ""
+         return cookies
+
+     def get_language(self) -> str:
+         """
+         Load the user's language preference, defaulting to 'zh'.
+
+         :return: Language code string
+         """
+         lang = self._data.get("general", {}).get("lang", "zh")
+         return str(lang)
+
+     def set_language(self, lang: str) -> None:
+         """
+         Save the user's language preference.
+
+         :param lang: Language code (e.g. 'zh', 'en')
+         """
+         self._data.setdefault("general", {})["lang"] = lang
+         self._save()
+
+     def get_manual_login_flag(self, site: str) -> bool:
+         """
+         Retrieve the manual login requirement flag for a specific site.
+
+         :param site: Site identifier (e.g. 'qidian', 'bqg')
+         :return: True if manual login is required (defaults to True)
+         """
+         val = self._data.get("sites", {}).get(site, {}).get("manual_login", True)
+         return bool(val)
+
+     def set_manual_login_flag(self, site: str, flag: bool) -> None:
+         """
+         Set the 'manual_login' flag for a specific site.
+
+         :param site: Site identifier (e.g. 'qidian', 'bqg')
+         :param flag: True if the site requires manual login.
+         """
+         sites = self._data.setdefault("sites", {})
+         site_data = sites.setdefault(site, {})
+         site_data["manual_login"] = flag
+         self._save()
+
+     def get_cookies(self, site: str) -> Dict[str, str]:
+         """
+         Retrieve the persisted cookies for a specific site.
+
+         :param site: Site identifier (e.g. 'qidian', 'bqg')
+         :return: A dict mapping cookie names to values. Returns empty dict if not set.
+         """
+         cookies = self._data.get("sites", {}).get(site, {}).get("cookies", {})
+         return {str(k): str(v) for k, v in cookies.items()}
+
+     def set_cookies(self, site: str, cookies: Union[str, Dict[str, str]]) -> None:
+         """
+         Persist (overwrite) the cookies for a specific site.
+
+         :param site: Site identifier (e.g. 'qidian', 'bqg')
+         :param cookies: Either a dict mapping cookie names to values,
+             or a string (JSON or 'k=v; k2=v2') to be parsed.
+         :raises TypeError: if cookies is neither str nor dict
+         """
+         # 1) normalize to dict
+         if isinstance(cookies, dict):
+             cookies_dict = cookies
+         elif isinstance(cookies, str):
+             # try JSON first
+             try:
+                 parsed = json.loads(cookies)
+                 if isinstance(parsed, dict):
+                     cookies_dict = parsed
+                 else:
+                     raise ValueError
+             except Exception:
+                 # fall back to 'k=v; k2=v2' format
+                 cookies_dict = self._parse_cookie_string(cookies)
+         else:
+             raise TypeError("`cookies` must be a dict or a str")
+
+         # 2) persist
+         sites = self._data.setdefault("sites", {})
+         site_data = sites.setdefault(site, {})
+         site_data["cookies"] = {str(k): str(v) for k, v in cookies_dict.items()}
+         self._save()
+
+
+ state_mgr = StateManager()
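
A short usage sketch (illustrative; the backing JSON file lives at STATE_FILE from novel_downloader.utils.constants):

    from novel_downloader.utils.state import state_mgr

    # Cookie strings in 'k=v; k2=v2' form are parsed into a dict before persisting.
    state_mgr.set_cookies("qidian", "uid=123; token=abc")
    print(state_mgr.get_cookies("qidian"))  # {'uid': '123', 'token': 'abc'}

    state_mgr.set_language("en")
    print(state_mgr.get_language())  # en
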
novel_downloader/utils/text_utils/__init__.py
@@ -0,0 +1,27 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.text_utils
+ ---------------------------------
+
+ Utility modules for text formatting, font mapping, cleaning, and diff display.
+
+ Submodules:
+ - font_mapping: Replace obfuscated characters using font maps
+ - chapter_formatting: Build structured chapter strings from raw content
+ - text_cleaning: Remove promo text and check for spam lines
+ - diff_display: Generate inline diffs with aligned character markers
+ """
+
+ from .chapter_formatting import format_chapter
+ from .diff_display import diff_inline_display
+ from .font_mapping import apply_font_mapping
+ from .text_cleaning import clean_chapter_title, is_promotional_line
+
+ __all__ = [
+     "apply_font_mapping",
+     "format_chapter",
+     "clean_chapter_title",
+     "is_promotional_line",
+     "diff_inline_display",
+ ]
novel_downloader/utils/text_utils/chapter_formatting.py
@@ -0,0 +1,46 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.text_utils.chapter_formatting
+ ----------------------------------------------------
+
+ Format chapter content with title, paragraph blocks, and optional author notes.
+ """
+
+ from typing import List, Optional
+
+
+ def format_chapter(
+     title: str, paragraphs: str, author_say: Optional[str] = None
+ ) -> str:
+     """
+     Build a formatted chapter string with title, paragraphs, and optional author note.
+
+     :param title: The chapter title.
+     :param paragraphs: Raw multi-line string; lines are treated as paragraphs.
+     :param author_say: Optional author comment to append at the end.
+     :return: A single string where title, paragraphs, and author note
+              are separated by blank lines.
+     """
+     parts: List[str] = [title.strip()]
+
+     # add each nonempty paragraph line
+     for ln in paragraphs.splitlines():
+         line = ln.strip()
+         if line:
+             parts.append(line)
+
+     # add author_say lines if present
+     if author_say:
+         author_lines = [ln.strip() for ln in author_say.splitlines() if ln.strip()]
+         if author_lines:
+             parts.append("---")
+             parts.append("作者说:")
+             parts.extend(author_lines)
+
+     return "\n\n".join(parts)
+
+
+ __all__ = [
+     "format_chapter",
+ ]
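
For example (placeholder inputs), format_chapter joins the title, each nonempty line, and the author note into blank-line-separated blocks:

    from novel_downloader.utils.text_utils import format_chapter

    print(format_chapter("第一章 启程", "第一段。\n第二段。", author_say="感谢支持"))
    # 第一章 启程
    #
    # 第一段。
    #
    # 第二段。
    #
    # ---
    #
    # 作者说:
    #
    # 感谢支持
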
novel_downloader/utils/text_utils/diff_display.py
@@ -0,0 +1,75 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.text_utils.diff_display
+ ----------------------------------------------
+
+ Generate inline character-level diff between two strings with visual markers.
+ """
+
+ import difflib
+ import unicodedata
+
+
+ def _char_width_space(
+     c: str, normal_char: str = " ", asian_char: str = "\u3000"
+ ) -> str:
+     """
+     Return a placeholder space character matching the width of `c`.
+
+     Fullwidth (F) or Wide (W) characters map to `asian_char`, else `normal_char`.
+
+     :param c: A single character.
+     :param normal_char: Replacement for narrow chars (default U+0020).
+     :param asian_char: Replacement for wide chars (default U+3000).
+     :return: The appropriate space character.
+     """
+     return asian_char if unicodedata.east_asian_width(c) in ("F", "W") else normal_char
+
+
+ def diff_inline_display(old_str: str, new_str: str) -> str:
+     """
+     Show an inline diff between two strings,
+     marking deleted and inserted characters with aligned '^' markers.
+
+     :param old_str: Original string (prefixed '-' will be trimmed).
+     :param new_str: Modified string (prefixed '+' will be trimmed).
+     :return: A multiline diff display with aligned markers.
+     """
+     space_1 = " "
+     space_2 = "\u3000"
+     mark_1 = "^"
+     mark_2 = "\ufe3f"  # '人' / '\ufe3f' / '宀' / '立' / '八'
+
+     # Clean leading +/- if present
+     s1 = old_str.lstrip("-").strip()
+     s2 = new_str.lstrip("+").strip()
+
+     sm = difflib.SequenceMatcher(None, s1, s2)
+     marker_s1 = ""
+     marker_s2 = ""
+
+     for tag, i1, i2, j1, j2 in sm.get_opcodes():
+         if tag == "equal":
+             s1_seg = s1[i1:i2]
+             s2_seg = s2[j1:j2]
+             marker_s1 += "".join(_char_width_space(c, space_1, space_2) for c in s1_seg)
+             marker_s2 += "".join(_char_width_space(c, space_1, space_2) for c in s2_seg)
+         elif tag == "delete":
+             seg = s1[i1:i2]
+             marker_s1 += "".join(_char_width_space(c, mark_1, mark_2) for c in seg)
+         elif tag == "insert":
+             seg = s2[j1:j2]
+             marker_s2 += "".join(_char_width_space(c, mark_1, mark_2) for c in seg)
+         elif tag == "replace":
+             s1_seg = s1[i1:i2]
+             s2_seg = s2[j1:j2]
+             marker_s1 += "".join(_char_width_space(c, mark_1, mark_2) for c in s1_seg)
+             marker_s2 += "".join(_char_width_space(c, mark_1, mark_2) for c in s2_seg)
+     output_str = f"-{s1}\n {marker_s1}\n+{s2}\n {marker_s2}"
+     return output_str
+
+
+ __all__ = [
+     "diff_inline_display",
+ ]
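
For example (illustrative strings), the marker lines align under the characters that differ:

    from novel_downloader.utils.text_utils import diff_inline_display

    print(diff_inline_display("-abcd", "+abXd"))
    # -abcd
    #    ^
    # +abXd
    #    ^
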
novel_downloader/utils/text_utils/font_mapping.py
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.text_utils.font_mapping
+ ----------------------------------------------
+
+ Utility for decoding obfuscated text by applying character-level font mapping.
+
+ This is commonly used to reverse font-based obfuscation in scraped content,
+ where characters are visually disguised via custom font glyphs but can be
+ recovered using a known mapping.
+ """
+
+ from typing import Dict
+
+
+ def apply_font_mapping(text: str, font_map: Dict[str, str]) -> str:
+     """
+     Replace each character in `text` using `font_map`,
+     leaving unmapped characters unchanged.
+
+     :param text: The input string, possibly containing obfuscated font chars.
+     :param font_map: A dict mapping obfuscated chars to real chars.
+     :return: The de-obfuscated text.
+     """
+     return "".join(font_map.get(ch, ch) for ch in text)
+
+
+ __all__ = [
+     "apply_font_mapping",
+ ]
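
For example, with a made-up mapping from private-use glyphs back to real characters:

    from novel_downloader.utils.text_utils import apply_font_mapping

    font_map = {"\ue3f2": "的", "\ue51b": "是"}  # hypothetical obfuscated -> real pairs
    print(apply_font_mapping("这\ue3f2确\ue51b", font_map))  # 这的确是
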
novel_downloader/utils/text_utils/text_cleaning.py
@@ -0,0 +1,57 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.utils.text_utils.text_cleaning
+ -----------------------------------------------
+
+ Tools for detecting and removing promotional or ad-like content from text.
+ """
+
+ import re
+
+ from novel_downloader.utils.file_utils.io import load_blacklisted_words
+
+ # --- Constants & Precompiled Patterns ---
+
+ _BLACKLISTED_WORDS = load_blacklisted_words()
+
+ _BRACKET_PATTERN = re.compile(r"[\((](.*?)[\))]")
+ _K_PROMO_PATTERN = re.compile(r"\b\d{1,4}k\b", re.IGNORECASE)
+
+
+ def clean_chapter_title(title: str) -> str:
+     """
+     Remove bracketed promotional content from a chapter title.
+
+     If any blacklisted word appears inside parentheses (Chinese or English),
+     the entire bracketed section is stripped.
+
+     :param title: Original title, possibly containing ad text in brackets.
+     :return: Title with offending bracketed sections removed.
+     """
+     cleaned = title
+     for content in _BRACKET_PATTERN.findall(title):
+         if any(bw in content for bw in _BLACKLISTED_WORDS):
+             cleaned = re.sub(rf"[\((]{re.escape(content)}[\))]", "", cleaned)
+     return cleaned.strip()
+
+
+ def is_promotional_line(line: str) -> bool:
+     """
+     Check if a line of text likely contains promotional or ad-like content.
+
+     :param line: A single line of text.
+     :return: True if it contains promo keywords or a '###k' vote count pattern.
+     """
+     low = line.lower()
+     if any(kw in low for kw in _BLACKLISTED_WORDS):
+         return True
+     if _K_PROMO_PATTERN.search(low):
+         return True
+     return False
+
+
+ __all__ = [
+     "clean_chapter_title",
+     "is_promotional_line",
+ ]
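
For example (illustrative; whether a given title is cleaned depends on the bundled blacklist.txt, assumed here to contain '求月票'):

    from novel_downloader.utils.text_utils import clean_chapter_title, is_promotional_line

    print(clean_chapter_title("第三章 反击(求月票)"))  # 第三章 反击
    print(is_promotional_line("冲榜中, 目标 200k !"))  # True: '200k' matches the vote-count pattern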