novel-downloader 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +98 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +150 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +170 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +20 -0
  15. novel_downloader/core/downloaders/base_downloader.py +187 -0
  16. novel_downloader/core/downloaders/common_downloader.py +192 -0
  17. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  18. novel_downloader/core/factory/__init__.py +21 -0
  19. novel_downloader/core/factory/downloader_factory.py +62 -0
  20. novel_downloader/core/factory/parser_factory.py +62 -0
  21. novel_downloader/core/factory/requester_factory.py +62 -0
  22. novel_downloader/core/factory/saver_factory.py +49 -0
  23. novel_downloader/core/interfaces/__init__.py +28 -0
  24. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  25. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  26. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  27. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  28. novel_downloader/core/parsers/__init__.py +28 -0
  29. novel_downloader/core/parsers/base_parser.py +96 -0
  30. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  31. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  32. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  33. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  34. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  35. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  36. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  37. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  39. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  40. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  41. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  42. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  43. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  44. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  45. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  46. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  47. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  48. novel_downloader/core/requesters/__init__.py +27 -0
  49. novel_downloader/core/requesters/base_browser.py +210 -0
  50. novel_downloader/core/requesters/base_session.py +243 -0
  51. novel_downloader/core/requesters/common_requester/__init__.py +14 -0
  52. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  53. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  54. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  55. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  56. novel_downloader/core/savers/__init__.py +20 -0
  57. novel_downloader/core/savers/base_saver.py +169 -0
  58. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  59. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  60. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  61. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  62. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  63. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  64. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  65. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  66. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  67. novel_downloader/core/savers/qidian_saver.py +22 -0
  68. novel_downloader/locales/en.json +91 -0
  69. novel_downloader/locales/zh.json +91 -0
  70. novel_downloader/resources/config/rules.toml +196 -0
  71. novel_downloader/resources/config/settings.yaml +70 -0
  72. novel_downloader/resources/css_styles/main.css +104 -0
  73. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  74. novel_downloader/resources/images/volume_border.png +0 -0
  75. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  76. novel_downloader/resources/json/replace_word_map.json +4 -0
  77. novel_downloader/resources/text/blacklist.txt +22 -0
  78. novel_downloader/utils/__init__.py +0 -0
  79. novel_downloader/utils/cache.py +24 -0
  80. novel_downloader/utils/constants.py +158 -0
  81. novel_downloader/utils/crypto_utils.py +144 -0
  82. novel_downloader/utils/file_utils/__init__.py +43 -0
  83. novel_downloader/utils/file_utils/io.py +252 -0
  84. novel_downloader/utils/file_utils/normalize.py +68 -0
  85. novel_downloader/utils/file_utils/sanitize.py +77 -0
  86. novel_downloader/utils/fontocr/__init__.py +23 -0
  87. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  88. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  89. novel_downloader/utils/hash_store.py +288 -0
  90. novel_downloader/utils/hash_utils.py +103 -0
  91. novel_downloader/utils/i18n.py +41 -0
  92. novel_downloader/utils/logger.py +104 -0
  93. novel_downloader/utils/model_loader.py +72 -0
  94. novel_downloader/utils/network.py +287 -0
  95. novel_downloader/utils/state.py +156 -0
  96. novel_downloader/utils/text_utils/__init__.py +27 -0
  97. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  98. novel_downloader/utils/text_utils/diff_display.py +75 -0
  99. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  100. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  101. novel_downloader/utils/time_utils/__init__.py +22 -0
  102. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  103. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  104. novel_downloader-1.1.1.dist-info/METADATA +137 -0
  105. novel_downloader-1.1.1.dist-info/RECORD +109 -0
  106. novel_downloader-1.1.1.dist-info/WHEEL +5 -0
  107. novel_downloader-1.1.1.dist-info/entry_points.txt +2 -0
  108. novel_downloader-1.1.1.dist-info/licenses/LICENSE +21 -0
  109. novel_downloader-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.file_utils.sanitize
5
+ ------------------------------------------
6
+
7
+ Utility functions for cleaning and validating filenames for safe use
8
+ on different operating systems.
9
+
10
+ This module provides a cross-platform `sanitize_filename` function
11
+ that replaces or removes illegal characters from filenames, trims
12
+ lengths, and avoids reserved names on Windows systems.
13
+ """
14
+
15
+ import logging
16
+ import os
17
+ import re
18
+ from typing import Optional
19
+
20
logger = logging.getLogger(__name__)

# Windows reserved device names (compared case-insensitively).
_WIN_RESERVED_NAMES = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}

_SANITIZE_PATTERN_WIN = re.compile(r'[<>:"/\\|?*\x00-\x1F]')
_SANITIZE_PATTERN_POSIX = re.compile(r"[/\x00]")


def sanitize_filename(filename: str, max_length: Optional[int] = 255) -> str:
    """
    Sanitize the given filename by replacing characters
    that are invalid in file paths with '_'.

    The rules depend on the operating system the code runs on:
      - On Windows, it replaces the characters <>:"/\\|?* and the control
        characters 0x00-0x1F, and prefixes reserved device names
        (CON, PRN, AUX, NUL, COM1-9, LPT1-9) with '_'.
      - On POSIX systems, it replaces the forward slash '/' and NUL.

    Leading and trailing spaces and dots are stripped. If nothing is left,
    the placeholder "_untitled" is returned.

    :param filename: The input filename to sanitize.
    :param max_length: Optional maximum length of the output filename.
                       Defaults to 255. ``None`` or ``0`` disables truncation.
    :return: The sanitized filename as a string.
    """
    is_windows = os.name == "nt"
    pattern = _SANITIZE_PATTERN_WIN if is_windows else _SANITIZE_PATTERN_POSIX

    cleaned = pattern.sub("_", filename).strip(" .")

    if is_windows:
        # The reserved-name check uses the part before the FIRST dot:
        # "NUL.tar.gz" is as reserved as "NUL".
        stem, dot, rest = cleaned.partition(".")
        if stem.upper() in _WIN_RESERVED_NAMES:
            cleaned = f"_{stem}{dot}{rest}"

    if max_length and max_length > 0 and len(cleaned) > max_length:
        # The extension is whatever follows the LAST dot. Splitting on the
        # first dot would treat most of a multi-dot name as the "extension",
        # make the keep-length negative and let the result exceed max_length.
        base, dot, ext = cleaned.rpartition(".")
        suffix = f"{dot}{ext}"
        if dot and base and len(suffix) < max_length:
            cleaned = f"{base[: max_length - len(suffix)]}{suffix}"
        else:
            # No usable extension (or it cannot fit): plain truncation,
            # without leaving a trailing dot/space behind.
            cleaned = cleaned[:max_length].rstrip(" .")

    if not cleaned:
        cleaned = "_untitled"
    logger.debug("[file] Sanitized filename: %r -> %r", filename, cleaned)
    return cleaned
75
+
76
+
77
+ __all__ = ["sanitize_filename"]
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.fontocr
5
+ ------------------------------
6
+
7
+ Utilities for font-based OCR, primarily used to decode custom font obfuscation.
8
+
9
+ Supports:
10
+ - Font rendering and perceptual hash matching
11
+ - PaddleOCR-based character recognition
12
+ - Frequency-based scoring for ambiguous results
13
+ - Debugging and font mapping persistence
14
+
15
+ Exposes the selected OCR engine version via `FontOCR`.
16
+ """
17
+
18
+ # from .ocr_v1 import FontOCRV1 as FontOCR
19
+ from .ocr_v2 import FontOCRV2 as FontOCR
20
+
21
+ __version__ = "v2"
22
+
23
+ __all__ = ["FontOCR"]
@@ -0,0 +1,304 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.fontocr.ocr_v1
5
+ -------------------------------------
6
+
7
+ This class provides utility methods for optical character recognition (OCR)
8
+ and font mapping, primarily used for decrypting custom font encryption
9
+ on web pages (e.g., the Qidian website).
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
16
+
17
+ import numpy as np
18
+ import paddle
19
+ from fontTools.ttLib import TTFont
20
+ from paddleocr import PaddleOCR
21
+ from PIL import Image, ImageDraw, ImageFont
22
+
23
+ from novel_downloader.utils.constants import (
24
+ REC_CHAR_MODEL_FILES,
25
+ REC_IMAGE_SHAPE_MAP,
26
+ )
27
+ from novel_downloader.utils.hash_store import img_hash_store
28
+ from novel_downloader.utils.model_loader import get_rec_chinese_char_model_dir
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class FontOCRV1:
    """
    Version 1 of the FontOCR utility.

    Renders single glyphs from font files, recognises them through a hash
    lookup with a PaddleOCR fallback, and builds a mapping from obfuscated
    characters to the recognised ones.

    The PaddleOCR model and the character-frequency table are stored on the
    class, so they are loaded once and shared by every instance.

    :param cache_dir: base path to store font-map JSON data
    :param use_freq: if True, weight OCR scores by character frequency
    :param ocr_version: model version key; selects the model directory and
                        the entry of REC_IMAGE_SHAPE_MAP
    :param threshold: minimum confidence threshold [0.0-1.0]
    :param font_debug: if True, dump per-char debug images under cache_dir
    :param kwargs: accepted for call compatibility and ignored
    """

    # Default constants
    CHAR_IMAGE_SIZE = 64
    CHAR_FONT_SIZE = 52
    _freq_weight = 0.05

    # shared resources
    _global_char_freq_db: Dict[str, int] = {}
    _global_ocr: Optional[PaddleOCR] = None

    def __init__(
        self,
        cache_dir: Union[str, Path],
        use_freq: bool = False,
        ocr_version: str = "v1.0",
        threshold: float = 0.0,
        font_debug: bool = False,
        **kwargs: Any,
    ) -> None:
        self.use_freq = use_freq
        self.ocr_version = ocr_version
        self.threshold = threshold
        self.font_debug = font_debug
        # Placeholder until the frequency table is available.
        self._max_freq = 5

        self._cache_dir = Path(cache_dir)
        self._cache_dir.mkdir(parents=True, exist_ok=True)
        self._fixed_map_dir = self._cache_dir / "fixed_font_map"
        self._fixed_map_dir.mkdir(exist_ok=True)

        if font_debug:
            self._debug_dir = self._cache_dir / "font_debug" / "badcase"
            self._debug_dir.mkdir(parents=True, exist_ok=True)

        # load shared OCR model + frequency DB once
        self._load_ocr_model()
        if self.use_freq:
            if not FontOCRV1._global_char_freq_db:
                self._load_char_freq_db()
            else:
                # The table was loaded by an earlier instance; this instance
                # still needs the matching maximum, otherwise the placeholder
                # above would skew every frequency bonus.
                self._max_freq = max(FontOCRV1._global_char_freq_db.values())

    def _get_char_model_dir(self) -> Any:
        """
        Return the recognition-model directory for ``self.ocr_version``,
        resolving it on first use and caching it on the instance.

        Resolution is independent of whether this instance is the one that
        initialised the shared OCR model.
        """
        model_dir = getattr(self, "_char_model_dir", None)
        if model_dir is None:
            model_dir = get_rec_chinese_char_model_dir(self.ocr_version)
            self._char_model_dir = model_dir
        return model_dir

    def _load_ocr_model(self) -> None:
        """
        Initialize the shared PaddleOCR model if not already loaded.

        :raises FileNotFoundError: if a file listed in REC_CHAR_MODEL_FILES
                                   is missing from the model directory.
        """
        if FontOCRV1._global_ocr is not None:
            return

        gpu_available = paddle.device.is_compiled_with_cuda()
        model_dir = self._get_char_model_dir()

        for fname in REC_CHAR_MODEL_FILES:
            full_path = model_dir / fname
            if not full_path.exists():
                raise FileNotFoundError(f"[FontOCR] Required file missing: {full_path}")

        char_dict_file = model_dir / "rec_custom_keys.txt"
        FontOCRV1._global_ocr = PaddleOCR(
            use_angle_cls=False,
            lang="ch",
            det=False,
            use_gpu=gpu_available,
            show_log=self.font_debug,
            rec_model_dir=str(model_dir),
            rec_char_dict_path=str(char_dict_file),
            rec_image_shape=REC_IMAGE_SHAPE_MAP[self.ocr_version],
            max_text_length=1,
            use_space_char=False,
        )

    def _load_char_freq_db(self) -> bool:
        """
        Load character frequency data from ``char_freq.json`` in the model
        directory into the class-level table shared by all instances, and
        record the largest frequency value on this instance.

        Any failure is logged as a warning rather than raised.

        :return: True if successfully loaded, False otherwise.
        """
        try:
            char_freq_map_file = self._get_char_model_dir() / "char_freq.json"
            with char_freq_map_file.open("r", encoding="utf-8") as f:
                FontOCRV1._global_char_freq_db = json.load(f)
            self._max_freq = max(FontOCRV1._global_char_freq_db.values())
            return True
        except Exception as e:
            logger.warning("[FontOCR] Failed to load char freq DB: %s", e)
            return False

    @staticmethod
    def _generate_char_image(
        char: str,
        render_font: ImageFont.FreeTypeFont,
        is_reflect: bool = False,
    ) -> Optional[Image.Image]:
        """
        Render a single character, centred, into a square grayscale image
        (black glyph on white background).

        :param char: character to draw
        :param render_font: font used for rendering
        :param is_reflect: if True, flip the image horizontally
        :return: the image, or None when every pixel has the same value
                 (i.e. nothing was drawn)
        """
        size = FontOCRV1.CHAR_IMAGE_SIZE
        img = Image.new("L", (size, size), color=255)
        draw = ImageDraw.Draw(img)
        bbox = draw.textbbox((0, 0), char, font=render_font)
        w, h = bbox[2] - bbox[0], bbox[3] - bbox[1]
        # Centre the glyph's bounding box, compensating for its own offset.
        x = (size - w) // 2 - bbox[0]
        y = (size - h) // 2 - bbox[1]
        draw.text((x, y), char, fill=0, font=render_font)
        if is_reflect:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)

        img_np = np.array(img)
        if np.unique(img_np).size == 1:
            return None

        return img

    def ocr_text(
        self, img: Image.Image, top_k: int = 1
    ) -> Union[str, List[Tuple[str, float]]]:
        """
        Run PaddleOCR on a single image and return the best match(es).

        If ``use_freq`` is set and the frequency table is loaded, each score
        gets a bonus of ``_freq_weight * (max_freq - freq) / max_freq``;
        characters missing from the table get no bonus. Candidates scoring
        below ``threshold`` are dropped.

        :param img: image of one rendered character
        :param top_k: number of candidates to return
        :return: for ``top_k == 1`` the recognised text, or "" when there is
                 no candidate or OCR fails; otherwise a list of up to
                 ``top_k`` ``(text, score)`` tuples (empty on failure)
        """
        if not FontOCRV1._global_ocr:
            self._load_ocr_model()
        try:
            img_np = np.asarray(img)
            assert FontOCRV1._global_ocr is not None
            result = FontOCRV1._global_ocr.ocr(
                img_np, cls=False, det=False
            )  # returns List[List[ (text, score) ]]
            candidates = result[0] if result else []
            # attach frequency weight if enabled
            if self.use_freq and FontOCRV1._global_char_freq_db:
                adjusted = []
                for ch, score in candidates:
                    freq = FontOCRV1._global_char_freq_db.get(ch, self._max_freq)
                    bonus = (
                        FontOCRV1._freq_weight
                        * (self._max_freq - freq)
                        / self._max_freq
                    )
                    adjusted.append((ch, score + bonus))
                candidates = adjusted
            # filter by threshold
            filtered = [c for c in candidates if c[1] >= self.threshold]
            if top_k == 1:
                # Same "no result" value as the error path below.
                return filtered[0][0] if filtered else ""
            return filtered[:top_k]
        except Exception as e:
            logger.error("[FontOCR] OCR failure: %s", e)
            return "" if top_k == 1 else []

    def query(
        self, img: Image.Image, top_k: int = 1
    ) -> Union[str, List[Tuple[str, float]]]:
        """
        First try a hash-based lookup via img_hash_store;
        if there is no hit, fall back to ocr_text().

        :param img: image of one rendered character
        :param top_k: number of candidates requested
        :return: for ``top_k == 1`` a single label; otherwise a list of
                 candidates (the hash store's matches on a hit, else the
                 result of ocr_text())
        """
        # quick hash lookup
        matches = img_hash_store.query(img, k=top_k)
        if matches:
            # matches is List[(label, dist)]
            return matches[0][0] if top_k == 1 else matches

        # fallback to OCR
        return self.ocr_text(img, top_k=top_k)

    def generate_font_map(
        self,
        fixed_font_path: Union[str, Path],
        random_font_path: Union[str, Path],
        char_set: Set[str],
        refl_set: Set[str],
        chapter_id: Optional[str] = None,
    ) -> Dict[str, str]:
        """
        Generates a mapping from encrypted (randomized) font characters to
        their real recognized characters by rendering and OCR-based matching.

        Results for characters found in the fixed font are cached in a JSON
        file named after that font under ``<cache_dir>/fixed_font_map`` and
        reused on later calls.

        :param fixed_font_path: Path to the reference (fixed) font.
        :param random_font_path: Path to the obfuscated (random) font.
        :param char_set: Characters to process normally.
        :param refl_set: Characters to process as horizontally flipped.
        :param chapter_id: Chapter ID, used only to name debug images.

        :returns mapping_result: { obf_char: real_char, ... }; empty if the
                                 fonts cannot be loaded.
        """
        mapping_result: Dict[str, str] = {}
        fixed_map_file = self._fixed_map_dir / f"{Path(fixed_font_path).stem}.json"

        # 1) load or init fixed_font_map
        fixed_map: Dict[str, str] = {}
        if fixed_map_file.exists():
            try:
                with open(fixed_map_file, "r", encoding="utf-8") as f:
                    fixed_map = json.load(f)
            except Exception as e:
                logger.debug("[FontOCR] Failed to load fixed map file: %s", e)
                fixed_map = {}

        def _load_font(
            font_path: Union[str, Path],
        ) -> Tuple[Set[str], ImageFont.FreeTypeFont]:
            # Only the cmap is needed from fontTools, so the TTFont is closed
            # as soon as the character set has been extracted.
            ttf = TTFont(font_path)
            try:
                chars = {chr(code) for code in ttf.getBestCmap().keys()}
            finally:
                ttf.close()
            return chars, ImageFont.truetype(str(font_path), self.CHAR_FONT_SIZE)

        # prepare font renderers and cmap sets
        try:
            fixed_chars, fixed_font = _load_font(fixed_font_path)
            random_chars, random_font = _load_font(random_font_path)
        except Exception as e:
            logger.error("[FontOCR] Failed to load TTF fonts: %s", e)
            return mapping_result

        def _process(chars: Set[str], reflect: bool = False) -> None:
            for ch in chars:
                try:
                    if ch in fixed_map:
                        mapping_result[ch] = fixed_map[ch]
                        logger.debug(
                            "[FontOCR] Using cached mapping: '%s' -> '%s'",
                            ch,
                            fixed_map[ch],
                        )
                        continue

                    # The fixed font takes precedence over the random one.
                    if ch in fixed_chars:
                        font_to_use = fixed_font
                    elif ch in random_chars:
                        font_to_use = random_font
                    else:
                        logger.debug("[FontOCR] Skipping unknown char: '%s'", ch)
                        continue

                    img = self._generate_char_image(ch, font_to_use, is_reflect=reflect)
                    if img is None:
                        logger.debug("[FontOCR] Skipping unknown char: '%s'", ch)
                        continue

                    real = self.query(img, top_k=1)
                    if real:
                        real_char = (
                            str(real[0]) if isinstance(real, (list, tuple)) else real
                        )
                        mapping_result[ch] = real_char
                        # Only fixed-font glyphs are cached in fixed_map.
                        if ch in fixed_chars:
                            fixed_map[ch] = real_char
                        logger.debug("[FontOCR] Mapped '%s' -> '%s'", ch, real_char)
                    elif self.font_debug and chapter_id:
                        dbg_path = self._debug_dir / f"{ord(ch):05X}_{chapter_id}.png"
                        img.save(dbg_path)
                        logger.debug("[FontOCR] Saved debug image: %s", dbg_path)
                except Exception as e:
                    logger.warning("[FontOCR] Failed to process char '%s': %s", ch, e)

        # process normal + reflected chars
        _process(char_set, reflect=False)
        _process(refl_set, reflect=True)

        # persist updated fixed_map
        try:
            with open(fixed_map_file, "w", encoding="utf-8") as f:
                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error("[FontOCR] Failed to save fixed map: %s", e)

        return mapping_result