novel-downloader 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +98 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +150 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +170 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +20 -0
  15. novel_downloader/core/downloaders/base_downloader.py +187 -0
  16. novel_downloader/core/downloaders/common_downloader.py +192 -0
  17. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  18. novel_downloader/core/factory/__init__.py +21 -0
  19. novel_downloader/core/factory/downloader_factory.py +62 -0
  20. novel_downloader/core/factory/parser_factory.py +62 -0
  21. novel_downloader/core/factory/requester_factory.py +62 -0
  22. novel_downloader/core/factory/saver_factory.py +49 -0
  23. novel_downloader/core/interfaces/__init__.py +28 -0
  24. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  25. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  26. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  27. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  28. novel_downloader/core/parsers/__init__.py +28 -0
  29. novel_downloader/core/parsers/base_parser.py +96 -0
  30. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  31. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  32. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  33. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  34. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  35. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  36. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  37. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  39. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  40. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  41. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  42. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  43. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  44. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  45. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  46. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  47. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  48. novel_downloader/core/requesters/__init__.py +27 -0
  49. novel_downloader/core/requesters/base_browser.py +210 -0
  50. novel_downloader/core/requesters/base_session.py +243 -0
  51. novel_downloader/core/requesters/common_requester/__init__.py +14 -0
  52. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  53. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  54. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  55. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  56. novel_downloader/core/savers/__init__.py +20 -0
  57. novel_downloader/core/savers/base_saver.py +169 -0
  58. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  59. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  60. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  61. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  62. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  63. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  64. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  65. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  66. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  67. novel_downloader/core/savers/qidian_saver.py +22 -0
  68. novel_downloader/locales/en.json +91 -0
  69. novel_downloader/locales/zh.json +91 -0
  70. novel_downloader/resources/config/rules.toml +196 -0
  71. novel_downloader/resources/config/settings.yaml +70 -0
  72. novel_downloader/resources/css_styles/main.css +104 -0
  73. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  74. novel_downloader/resources/images/volume_border.png +0 -0
  75. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  76. novel_downloader/resources/json/replace_word_map.json +4 -0
  77. novel_downloader/resources/text/blacklist.txt +22 -0
  78. novel_downloader/utils/__init__.py +0 -0
  79. novel_downloader/utils/cache.py +24 -0
  80. novel_downloader/utils/constants.py +158 -0
  81. novel_downloader/utils/crypto_utils.py +144 -0
  82. novel_downloader/utils/file_utils/__init__.py +43 -0
  83. novel_downloader/utils/file_utils/io.py +252 -0
  84. novel_downloader/utils/file_utils/normalize.py +68 -0
  85. novel_downloader/utils/file_utils/sanitize.py +77 -0
  86. novel_downloader/utils/fontocr/__init__.py +23 -0
  87. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  88. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  89. novel_downloader/utils/hash_store.py +288 -0
  90. novel_downloader/utils/hash_utils.py +103 -0
  91. novel_downloader/utils/i18n.py +41 -0
  92. novel_downloader/utils/logger.py +104 -0
  93. novel_downloader/utils/model_loader.py +72 -0
  94. novel_downloader/utils/network.py +287 -0
  95. novel_downloader/utils/state.py +156 -0
  96. novel_downloader/utils/text_utils/__init__.py +27 -0
  97. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  98. novel_downloader/utils/text_utils/diff_display.py +75 -0
  99. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  100. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  101. novel_downloader/utils/time_utils/__init__.py +22 -0
  102. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  103. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  104. novel_downloader-1.1.1.dist-info/METADATA +137 -0
  105. novel_downloader-1.1.1.dist-info/RECORD +109 -0
  106. novel_downloader-1.1.1.dist-info/WHEEL +5 -0
  107. novel_downloader-1.1.1.dist-info/entry_points.txt +2 -0
  108. novel_downloader-1.1.1.dist-info/licenses/LICENSE +21 -0
  109. novel_downloader-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,158 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.constants
5
+ --------------------------------
6
+
7
+ Constants and default paths used throughout the NovelDownloader project.
8
+ """
9
+
10
+ from importlib.resources import files
11
+ from pathlib import Path
12
+
13
+ from platformdirs import user_config_dir
14
+
15
# -----------------------------------------------------------------------------
# Application identity
# -----------------------------------------------------------------------------
# Importable Python package name.
PACKAGE_NAME = "novel_downloader"
# Human-readable display name.
APP_NAME = "NovelDownloader"
# Directory name handed to platformdirs.
APP_DIR_NAME = "novel_downloader"
# The root logger is named after the package.
LOGGER_NAME = PACKAGE_NAME
22
+
23
+
24
# -----------------------------------------------------------------------------
# Base directories
# -----------------------------------------------------------------------------
# Per-user base config directory (e.g. ~/AppData/Local/novel_downloader/)
BASE_CONFIG_DIR = Path(user_config_dir(APP_DIR_NAME, appauthor=False))
# Package root: the directory two levels above this module file.
PACKAGE_ROOT: Path = Path(__file__).parent.parent
LOCALES_DIR: Path = PACKAGE_ROOT.joinpath("locales")

# Subdirectories under BASE_CONFIG_DIR
LOGGER_DIR = BASE_CONFIG_DIR.joinpath("logs")
JS_SCRIPT_DIR = BASE_CONFIG_DIR.joinpath("scripts")
STATE_DIR = BASE_CONFIG_DIR.joinpath("state")
DATA_DIR = BASE_CONFIG_DIR.joinpath("data")
CONFIG_DIR = BASE_CONFIG_DIR.joinpath("config")
MODEL_CACHE_DIR = BASE_CONFIG_DIR.joinpath("models")

# -----------------------------------------------------------------------------
# Default file paths
# -----------------------------------------------------------------------------
STATE_FILE = STATE_DIR.joinpath("state.json")
HASH_STORE_FILE = DATA_DIR.joinpath("image_hashes.json")
SETTING_FILE = CONFIG_DIR.joinpath("settings.json")
SITE_RULES_FILE = CONFIG_DIR.joinpath("site_rules.json")
DEFAULT_USER_DATA_DIR = DATA_DIR.joinpath("browser_data")
48
+
49
+
50
# -----------------------------------------------------------------------------
# Default preferences & headers
# -----------------------------------------------------------------------------
DEFAULT_USER_PROFILE_NAME = "Profile_1"
DEFAULT_IMAGE_SUFFIX = ".jpg"

# Desktop Chrome user agent, assembled from its three space-separated parts.
DEFAULT_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/134.0.0.0 Safari/537.36",
    )
)
# Minimal header set: only the user agent.
DEFAULT_HEADERS = {"User-Agent": DEFAULT_USER_AGENT}

DEFAULT_ACCEPT = (
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
)

# Fuller browser-like header set.
DEFAULT_USER_HEADERS = {
    "Accept": DEFAULT_ACCEPT,
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en,zh;q=0.9,zh-CN;q=0.8",
    "User-Agent": DEFAULT_USER_AGENT,
    "Connection": "keep-alive",
}
74
+
75
# -----------------------------------------------------------------------------
# Embedded resources (via importlib.resources)
# -----------------------------------------------------------------------------
# Default settings and site rules bundled with the package.
BASE_CONFIG_PATH = files("novel_downloader.resources.config").joinpath("settings.yaml")
BASE_RULE_PATH = files("novel_downloader.resources.config").joinpath("rules.toml")

DEFAULT_SETTINGS_PATHS = [BASE_CONFIG_PATH, BASE_RULE_PATH]

# CSS Styles
CSS_MAIN_PATH = files("novel_downloader.resources.css_styles").joinpath("main.css")
CSS_VOLUME_INTRO_PATH = files("novel_downloader.resources.css_styles").joinpath(
    "volume-intro.css"
)

# Images
VOLUME_BORDER_IMAGE_PATH = files("novel_downloader.resources.images").joinpath(
    "volume_border.png"
)

# JSON
REPLACE_WORD_MAP_PATH = files("novel_downloader.resources.json").joinpath(
    "replace_word_map.json"
)

# JavaScript
QD_DECRYPT_SCRIPT_PATH = files("novel_downloader.resources.js_scripts").joinpath(
    "qidian_decrypt_node.js"
)

# Text Files
BLACKLIST_PATH = files("novel_downloader.resources.text").joinpath("blacklist.txt")
109
+
110
# -----------------------------------------------------------------------------
# EPUB defaults
# -----------------------------------------------------------------------------
EPUB_IMAGE_FOLDER = "Images"
EPUB_TEXT_FOLDER = "Text"

EPUB_OPTIONS = {
    # The guide is an EPUB 2 section holding key navigation entries such as
    # the cover, the table of contents and the index.
    "epub2_guide": True,
    # Landmarks are the EPUB 3 nav structure that marks important pages
    # (e.g. table of contents, cover, start page).
    "epub3_landmark": True,
    # EPUB 3 allows a page list, so pagination stays relatively consistent
    # across different devices.
    "epub3_pages": True,
    # This title appears in the EPUB reader's navigation bar.
    "landmark_title": "Guide",
    # This title is shown in the EPUB reader's page navigation bar.
    "pages_title": "Pages",
    # Whether to set the reader's page-progression-direction automatically
    # from the order of book.spine.
    "spine_direction": True,
    # Controls the EPUB reader's default page-turn direction (LTR or RTL).
    "package_direction": False,
    # Whether to add a play order to the chapters of the EPUB book.
    "play_order": {"enabled": True, "start_from": 1},
}
134
+
135
# ---------------------------------------------------------------------
# Pretrained model registry (e.g. used in font recovery or OCR)
# ---------------------------------------------------------------------

# Hugging Face model repo for character recognition
REC_CHAR_MODEL_REPO = "saudadez/rec_chinese_char"

# Required files to be downloaded for the model
REC_CHAR_MODEL_FILES = [
    "inference.pdmodel",
    "inference.pdiparams",
    "rec_custom_keys.txt",
    "char_freq.json",
]

# Character vector files belonging to the same model
REC_CHAR_VECTOR_FILES = ["char_vectors.npy", "char_vectors.txt"]

# Model version -> image shape string
REC_IMAGE_SHAPE_MAP = {"v1.0": "3,32,32", "v2.0": "3,48,48"}
@@ -0,0 +1,144 @@
1
+ """
2
+ novel_downloader.utils.crypto_utils
3
+ -----------------------------------
4
+
5
+ Generic cryptographic utilities
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import base64
11
+ import hashlib
12
+ import json
13
+ import random
14
+ import time
15
+ from typing import Any, Dict, List
16
+
17
+
18
def rc4_crypt(
    key: str,
    data: str,
    *,
    mode: str = "encrypt",
    encoding: str = "utf-8",
) -> str:
    """
    Encrypt or decrypt data using RC4 and Base64.

    :param key: RC4 key (will be encoded using the specified encoding).
    :type key: str
    :param data: Plain-text (for 'encrypt') or Base64 cipher-text (for 'decrypt').
    :type data: str
    :param mode: Operation mode, either 'encrypt' or 'decrypt'. Defaults to 'encrypt'.
    :type mode: str, optional
    :param encoding: Character encoding for the key and the plain-text.
                     Defaults to 'utf-8'.
    :type encoding: str, optional

    :return: Base64 cipher-text (for encryption) or decoded plain-text (for decryption).
             Bytes that cannot be decoded on decryption are replaced, not raised.
    :rtype: str

    :raises ValueError: If mode is not 'encrypt' or 'decrypt', or if the key is empty.
    """
    if mode not in ("encrypt", "decrypt"):
        raise ValueError("Mode must be 'encrypt' or 'decrypt'.")

    key_bytes = key.encode(encoding)
    if not key_bytes:
        # The key schedule indexes the key modulo its length; an empty key
        # would otherwise surface as an opaque ZeroDivisionError.
        raise ValueError("RC4 key must not be empty.")

    def _rc4(data_bytes: bytes) -> bytes:
        # Key-Scheduling Algorithm (KSA)
        S = list(range(256))
        key_len = len(key_bytes)
        j = 0
        for i in range(256):
            j = (j + S[i] + key_bytes[i % key_len]) % 256
            S[i], S[j] = S[j], S[i]

        # Pseudo-Random Generation Algorithm (PRGA)
        i = j = 0
        out = bytearray()
        for byte in data_bytes:
            i = (i + 1) % 256
            j = (j + S[i]) % 256
            S[i], S[j] = S[j], S[i]
            out.append(byte ^ S[(S[i] + S[j]) % 256])

        return bytes(out)

    if mode == "encrypt":
        cipher_bytes = _rc4(data.encode(encoding))
        # Base64 output is pure ASCII. Decoding it with the caller's encoding
        # (e.g. utf-16) would garble the token, so always decode as ASCII.
        return base64.b64encode(cipher_bytes).decode("ascii")

    plain_bytes = _rc4(base64.b64decode(data))
    return plain_bytes.decode(encoding, errors="replace")
76
+
77
+
78
def _get_key() -> str:
    """
    Return the built-in default key, recovered from its masked form.

    The stored literal is Base64 text whose decoded bytes are each XOR-ed
    with 0x5A.

    :return: The unmasked key.
    :rtype: str
    """
    masked = base64.b64decode("Lj1qYxMuaXBjMg==")
    # latin-1 maps every byte value to the code point of the same number,
    # which is exactly what chr() does per byte.
    return bytes(b ^ 0x5A for b in masked).decode("latin-1")
83
+
84
+
85
def _d(b64str: str) -> str:
    """Decode a Base64 string and return the bytes as text (default UTF-8)."""
    raw = base64.b64decode(b64str)
    return raw.decode()
87
+
88
+
89
def patch_qd_payload_token(
    enc_token: str,
    new_uri: str,
    *,
    key: str = "",
) -> str:
    """
    Rebuild an encrypted token with fresh timing fields and a new checksum.

    The token is decrypted with RC4, its fingerprint/abnormal fields are
    carried over, the two time fields are regenerated, the checksum is
    recomputed over ``new_uri``, and the result is re-encrypted.

    :param enc_token: Encrypted token string from a live request.
    :type enc_token: str
    :param new_uri: URI used in checksum generation.
    :type new_uri: str
    :param key: RC4 key; when empty, the built-in default key is used.
    :type key: str, optional

    :return: Updated token with new timing and checksum values.
    :rtype: str
    """
    rc4_key = key if key else _get_key()

    # Step 1 - decrypt and parse the JSON payload
    payload: Dict[str, Any] = json.loads(rc4_crypt(rc4_key, enc_token, mode="decrypt"))

    # Step 2 - regenerate the timing fields
    loadts = int(time.time() * 1000)  # ms since epoch
    # Simulated duration: a N(600, 150) sample clamped into [300, 1000]
    sampled = int(random.normalvariate(600, 150))
    duration = max(300, min(1000, sampled))
    timestamp = loadts + duration

    # Step 3 - recompute the checksum (field names are stored Base64-encoded)
    fp_key, ab_key, ck_key, lt_key, ts_key = (
        _d(name)
        for name in (
            "ZmluZ2VycHJpbnQ=",
            "YWJub3JtYWw=",
            "Y2hlY2tzdW0=",
            "bG9hZHRz",
            "dGltZXN0YW1w",
        )
    )

    fp_val = payload.get(fp_key, "")
    ab_val = payload.get(ab_key, "0" * 32)
    digest_input = f"{new_uri}{loadts}{fp_val}".encode("utf-8")
    ck_val = hashlib.md5(digest_input).hexdigest()

    # Key order is preserved in the serialized JSON.
    new_payload = {
        lt_key: loadts,
        ts_key: timestamp,
        fp_key: fp_val,
        ab_key: ab_val,
        ck_key: ck_val,
    }

    # Step 4 - serialize compactly, encrypt and return
    compact_json = json.dumps(new_payload, separators=(",", ":"))
    return rc4_crypt(rc4_key, compact_json, mode="encrypt")
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.file_utils
5
+ ---------------------------------
6
+
7
+ High-level file I/O utility re-exports for convenience.
8
+
9
+ This module aggregates commonly used low-level file utilities such as:
10
+ - Path sanitization (for safe filenames)
11
+ - Text normalization (e.g. Windows/Linux line endings)
12
+ - JSON, plain text, and binary file reading/writing
13
+
14
+ Included utilities:
15
+ - sanitize_filename: remove invalid characters from filenames
16
+ - normalize_txt_line_endings: standardize line endings in text files
17
+ - save_as_json / save_as_txt: write dict or text to file
18
+ - read_text_file / read_json_file / read_binary_file: load content from file
19
+ """
20
+
21
+ from .io import (
22
+ load_blacklisted_words,
23
+ load_text_resource,
24
+ read_binary_file,
25
+ read_json_file,
26
+ read_text_file,
27
+ save_as_json,
28
+ save_as_txt,
29
+ )
30
+ from .normalize import normalize_txt_line_endings
31
+ from .sanitize import sanitize_filename
32
+
33
# Public API re-exported by this package.
__all__ = [
    # filename sanitization
    "sanitize_filename",
    # writers
    "save_as_json",
    "save_as_txt",
    # readers
    "read_text_file",
    "read_json_file",
    "read_binary_file",
    # packaged resources
    "load_text_resource",
    "load_blacklisted_words",
    # text normalization
    "normalize_txt_line_endings",
]
@@ -0,0 +1,252 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.file_utils.io
5
+ ------------------------------------
6
+
7
+ File I/O utilities for reading and writing text, JSON, and binary data.
8
+
9
+ Includes:
10
+ - Safe, atomic file saving with optional overwrite and auto-renaming
11
+ - JSON pretty-printing with size-aware formatting
12
+ - Simple helpers for reading files with fallback and logging
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ import tempfile
18
+ from importlib.resources import files
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Literal, Optional, Set, Union
21
+
22
+ from .sanitize import sanitize_filename
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ _JSON_INDENT_THRESHOLD = 50 * 1024 # bytes
27
+
28
+
29
def _get_non_conflicting_path(path: Path) -> Path:
    """
    Return ``path`` itself if it is free, otherwise the first free sibling
    named ``<stem>_1<suffix>``, ``<stem>_2<suffix>``, and so on.

    :param path: Desired target path.
    :return: A path that did not exist at the time of the check.
    """
    if not path.exists():
        return path

    base, ext = path.stem, path.suffix
    index = 1
    while True:
        candidate = path.with_name(f"{base}_{index}{ext}")
        if not candidate.exists():
            return candidate
        index += 1
41
+
42
+
43
def _write_file(
    content: Union[str, bytes, Dict[Any, Any], List[Any], Any],
    filepath: Union[str, Path],
    mode: Optional[str] = None,
    *,
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
    dump_json: bool = False,
    encoding: str = "utf-8",
) -> bool:
    """
    Write content to a file via a temporary file and a final rename,
    with optional JSON serialization.

    The data is first written to a temporary file in the destination
    directory and then moved over the target, so a failed write does not
    leave a half-written target behind. If anything fails, the temporary
    file is removed again.

    :param content: The content to write; can be text, bytes, or a
                    JSON-serializable object.
    :param filepath: Destination path (str or Path). The file name part is
                     passed through ``sanitize_filename``.
    :param mode: Accepted for backward compatibility but not consulted; the
                 write mode is derived from the content ('wb' for bytes,
                 'w' otherwise).
    :param on_exist: Behavior if file exists: 'overwrite', 'skip',
                     or 'rename'.
    :param dump_json: If True, serialize content as JSON. Output larger than
                      ``_JSON_INDENT_THRESHOLD`` bytes is written compactly
                      instead of indented.
    :param encoding: Text encoding for writing.
    :return: True if writing succeeds, False if it fails or is skipped.
    :raises TypeError: If ``dump_json`` is False and content is neither
                       str nor bytes.
    """
    path = Path(filepath)
    path = path.with_name(sanitize_filename(path.name))
    path.parent.mkdir(parents=True, exist_ok=True)

    if path.exists():
        if on_exist == "skip":
            logger.debug("[file] '%s' exists, skipping", path)
            return False
        if on_exist == "rename":
            path = _get_non_conflicting_path(path)
            logger.debug("[file] Renaming target to avoid conflict: %s", path)
        else:
            logger.debug("[file] '%s' exists, will overwrite", path)

    # Prepare content and write mode
    content_to_write: Union[str, bytes]
    if dump_json:
        # Pretty-print small documents; fall back to compact output once the
        # indented form exceeds the size threshold.
        json_str = json.dumps(content, ensure_ascii=False, indent=2)
        if len(json_str.encode(encoding)) > _JSON_INDENT_THRESHOLD:
            json_str = json.dumps(content, ensure_ascii=False, separators=(",", ":"))
        content_to_write = json_str
        write_mode = "w"
    else:
        if not isinstance(content, (str, bytes)):
            raise TypeError("Non-JSON content must be str or bytes.")
        content_to_write = content
        write_mode = "wb" if isinstance(content, bytes) else "w"

    is_binary = "b" in write_mode
    tmp_path: Optional[Path] = None
    try:
        with tempfile.NamedTemporaryFile(
            mode=write_mode,
            encoding=None if is_binary else encoding,
            newline=None if is_binary else "\n",
            delete=False,
            dir=path.parent,
        ) as tmp:
            # Record the name first so a failing write can still be cleaned up.
            tmp_path = Path(tmp.name)
            tmp.write(content_to_write)
        tmp_path.replace(path)
        tmp_path = None  # the temp file is now the target; nothing to clean up
        logger.info("[file] '%s' written successfully", path)
        return True
    except Exception as exc:
        logger.warning("[file] Error writing %r: %s", path, exc)
        if tmp_path is not None:
            # delete=False means the temp file survives the context manager;
            # remove it so failed writes do not litter the target directory.
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup: the original error was already logged.
                pass
        return False
112
+
113
+
114
def save_as_txt(
    content: str,
    filepath: Union[str, Path],
    *,
    encoding: str = "utf-8",
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
) -> bool:
    """
    Write plain text to ``filepath``.

    :param content: Text content to write.
    :param filepath: Destination file path.
    :param encoding: Text encoding to use (default: 'utf-8').
    :param on_exist: How to handle existing files: 'overwrite', 'skip', or 'rename'.
    :return: True if successful, False otherwise.
    """
    # Text is written as-is, without JSON serialization.
    return _write_file(
        content,
        filepath,
        "w",
        on_exist=on_exist,
        dump_json=False,
        encoding=encoding,
    )
138
+
139
+
140
def save_as_json(
    content: Any,
    filepath: Union[str, Path],
    *,
    encoding: str = "utf-8",
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
) -> bool:
    """
    Serialize ``content`` as JSON and write it to ``filepath``.

    :param content: Data to write as JSON.
    :param filepath: Destination file path.
    :param encoding: Text encoding to use (default: 'utf-8').
    :param on_exist: How to handle existing files: 'overwrite', 'skip', or 'rename'.
    :return: True if successful, False otherwise.
    """
    # Serialization is delegated to the shared writer.
    return _write_file(
        content,
        filepath,
        "w",
        on_exist=on_exist,
        dump_json=True,
        encoding=encoding,
    )
164
+
165
+
166
def read_text_file(
    filepath: Union[str, Path], encoding: str = "utf-8"
) -> Optional[str]:
    """
    Read a text file and return its content.

    Any failure is logged as a warning and reported as ``None`` rather
    than raised.

    :param filepath: Path to file.
    :param encoding: Encoding to use (default: 'utf-8').
    :return: Text content or None on failure.
    """
    source = Path(filepath)
    try:
        text = source.read_text(encoding=encoding)
    except Exception as e:
        logger.warning("[file] Failed to read %r: %s", source, e)
        return None
    return text
182
+
183
+
184
def read_json_file(
    filepath: Union[str, Path], encoding: str = "utf-8"
) -> Optional[Any]:
    """
    Load a JSON file into Python objects.

    Read errors and JSON parse errors are both logged as a warning and
    reported as ``None``.

    :param filepath: Path to file.
    :param encoding: Encoding to use.
    :return: Python object or None on failure.
    """
    source = Path(filepath)
    try:
        raw_text = source.read_text(encoding=encoding)
        parsed = json.loads(raw_text)
    except Exception as e:
        logger.warning("[file] Failed to read %r: %s", source, e)
        return None
    return parsed
200
+
201
+
202
def read_binary_file(filepath: Union[str, Path]) -> Optional[bytes]:
    """
    Return the raw bytes of a file.

    Any failure is logged as a warning and reported as ``None``.

    :param filepath: Path to file.
    :return: Bytes or None on failure.
    """
    source = Path(filepath)
    try:
        payload = source.read_bytes()
    except Exception as e:
        logger.warning("[file] Failed to read %r: %s", source, e)
        return None
    return payload
215
+
216
+
217
def load_text_resource(
    filename: str,
    package: str = "novel_downloader.resources.text",
) -> str:
    """
    Read a packaged text resource and return it as a string.

    The resource is always decoded as UTF-8.

    :param filename: Name of the text file (e.g. "blacklist.txt").
    :param package: Package path where resources live (default: text resources).
                    For other resource types, point to the appropriate subpackage
                    (e.g. "novel_downloader.resources.css_styles").
    :return: File contents as a string.
    """
    return files(package).joinpath(filename).read_text(encoding="utf-8")
232
+
233
+
234
def load_blacklisted_words() -> Set[str]:
    """
    Load the packaged blacklist.txt as a set of words.

    :return: A set of non-empty, stripped lines from blacklist.txt.
    """
    stripped_lines = (
        raw.strip() for raw in load_text_resource("blacklist.txt").splitlines()
    )
    # Blank and whitespace-only lines are dropped.
    return {word for word in stripped_lines if word}
242
+
243
+
244
# Public API of this module; the underscore-prefixed helpers stay private.
__all__ = [
    # writers
    "save_as_txt",
    "save_as_json",
    # readers
    "read_text_file",
    "read_json_file",
    "read_binary_file",
    # packaged resources
    "load_text_resource",
    "load_blacklisted_words",
]
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.utils.file_utils.normalize
5
+ -------------------------------------------
6
+
7
+ Utilities for normalizing the contents of text files for consistency
8
+ across platforms or output formats.
9
+
10
+ Currently includes line-ending normalization for .txt files.
11
+ """
12
+
13
+ import logging
14
+ from pathlib import Path
15
+ from typing import Union
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def normalize_txt_line_endings(folder_path: Union[str, Path]) -> None:
    """
    Convert all .txt files in the given folder (recursively)
    to use Unix-style LF (\\n) line endings.

    Files are read and written as UTF-8. A file that cannot be read,
    decoded or written is logged and counted as failed; the remaining
    files are still processed. Files that already use LF only are left
    untouched on disk and counted as successful.

    :param folder_path: Path to the folder containing .txt files.
                        Can be a str or Path.
    :return: None
    """
    path = Path(folder_path).resolve()
    if not path.is_dir():  # also False when the path does not exist
        logger.warning("[file] Invalid folder: %s", path)
        return

    count_success, count_fail = 0, 0

    for txt_file in path.rglob("*.txt"):
        if not txt_file.is_file():
            # The glob also matches directories whose name ends in ".txt".
            continue
        try:
            # newline="" turns off universal-newline translation, so CR and
            # CRLF are actually visible here. open() is used instead of
            # Path.write_text(newline=...), which needs Python 3.10+.
            with txt_file.open("r", encoding="utf-8", newline="") as src:
                content = src.read()
            normalized = content.replace("\r\n", "\n").replace("\r", "\n")
            if normalized != content:
                with txt_file.open("w", encoding="utf-8", newline="\n") as dst:
                    dst.write(normalized)
            logger.debug("[file] Normalized: %s", txt_file)
            count_success += 1
        except (OSError, UnicodeDecodeError) as e:
            logger.warning("[file] Failed: %s | %s", txt_file, e)
            count_fail += 1

    logger.info("[file] Completed. Success: %s, Failed: %s", count_success, count_fail)
    return
49
+
50
+
51
+ __all__ = ["normalize_txt_line_endings"]
52
+
53
if __name__ == "__main__":  # pragma: no cover
    # Command-line entry point: normalize one folder given as an argument.
    import argparse

    logging.basicConfig(
        format="[%(asctime)s] [%(levelname)s] %(name)s: %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(
        description="Normalize line endings of .txt files in a folder to LF."
    )
    cli.add_argument(
        "folder", type=str, help="Path to the folder containing .txt files."
    )
    normalize_txt_line_endings(cli.parse_args().folder)