novel-downloader 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +14 -0
- novel_downloader/cli/__init__.py +14 -0
- novel_downloader/cli/clean.py +134 -0
- novel_downloader/cli/download.py +98 -0
- novel_downloader/cli/interactive.py +67 -0
- novel_downloader/cli/main.py +45 -0
- novel_downloader/cli/settings.py +177 -0
- novel_downloader/config/__init__.py +52 -0
- novel_downloader/config/adapter.py +150 -0
- novel_downloader/config/loader.py +177 -0
- novel_downloader/config/models.py +170 -0
- novel_downloader/config/site_rules.py +97 -0
- novel_downloader/core/__init__.py +25 -0
- novel_downloader/core/downloaders/__init__.py +20 -0
- novel_downloader/core/downloaders/base_downloader.py +187 -0
- novel_downloader/core/downloaders/common_downloader.py +192 -0
- novel_downloader/core/downloaders/qidian_downloader.py +208 -0
- novel_downloader/core/factory/__init__.py +21 -0
- novel_downloader/core/factory/downloader_factory.py +62 -0
- novel_downloader/core/factory/parser_factory.py +62 -0
- novel_downloader/core/factory/requester_factory.py +62 -0
- novel_downloader/core/factory/saver_factory.py +49 -0
- novel_downloader/core/interfaces/__init__.py +28 -0
- novel_downloader/core/interfaces/downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/parser_protocol.py +40 -0
- novel_downloader/core/interfaces/requester_protocol.py +65 -0
- novel_downloader/core/interfaces/saver_protocol.py +61 -0
- novel_downloader/core/parsers/__init__.py +28 -0
- novel_downloader/core/parsers/base_parser.py +96 -0
- novel_downloader/core/parsers/common_parser/__init__.py +14 -0
- novel_downloader/core/parsers/common_parser/helper.py +321 -0
- novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
- novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
- novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
- novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
- novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
- novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
- novel_downloader/core/requesters/__init__.py +27 -0
- novel_downloader/core/requesters/base_browser.py +210 -0
- novel_downloader/core/requesters/base_session.py +243 -0
- novel_downloader/core/requesters/common_requester/__init__.py +14 -0
- novel_downloader/core/requesters/common_requester/common_session.py +126 -0
- novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
- novel_downloader/core/savers/__init__.py +20 -0
- novel_downloader/core/savers/base_saver.py +169 -0
- novel_downloader/core/savers/common_saver/__init__.py +13 -0
- novel_downloader/core/savers/common_saver/common_epub.py +232 -0
- novel_downloader/core/savers/common_saver/common_txt.py +176 -0
- novel_downloader/core/savers/common_saver/main_saver.py +86 -0
- novel_downloader/core/savers/epub_utils/__init__.py +27 -0
- novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
- novel_downloader/core/savers/epub_utils/initializer.py +98 -0
- novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
- novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
- novel_downloader/core/savers/qidian_saver.py +22 -0
- novel_downloader/locales/en.json +91 -0
- novel_downloader/locales/zh.json +91 -0
- novel_downloader/resources/config/rules.toml +196 -0
- novel_downloader/resources/config/settings.yaml +70 -0
- novel_downloader/resources/css_styles/main.css +104 -0
- novel_downloader/resources/css_styles/volume-intro.css +56 -0
- novel_downloader/resources/images/volume_border.png +0 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
- novel_downloader/resources/json/replace_word_map.json +4 -0
- novel_downloader/resources/text/blacklist.txt +22 -0
- novel_downloader/utils/__init__.py +0 -0
- novel_downloader/utils/cache.py +24 -0
- novel_downloader/utils/constants.py +158 -0
- novel_downloader/utils/crypto_utils.py +144 -0
- novel_downloader/utils/file_utils/__init__.py +43 -0
- novel_downloader/utils/file_utils/io.py +252 -0
- novel_downloader/utils/file_utils/normalize.py +68 -0
- novel_downloader/utils/file_utils/sanitize.py +77 -0
- novel_downloader/utils/fontocr/__init__.py +23 -0
- novel_downloader/utils/fontocr/ocr_v1.py +304 -0
- novel_downloader/utils/fontocr/ocr_v2.py +658 -0
- novel_downloader/utils/hash_store.py +288 -0
- novel_downloader/utils/hash_utils.py +103 -0
- novel_downloader/utils/i18n.py +41 -0
- novel_downloader/utils/logger.py +104 -0
- novel_downloader/utils/model_loader.py +72 -0
- novel_downloader/utils/network.py +287 -0
- novel_downloader/utils/state.py +156 -0
- novel_downloader/utils/text_utils/__init__.py +27 -0
- novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
- novel_downloader/utils/text_utils/diff_display.py +75 -0
- novel_downloader/utils/text_utils/font_mapping.py +31 -0
- novel_downloader/utils/text_utils/text_cleaning.py +57 -0
- novel_downloader/utils/time_utils/__init__.py +22 -0
- novel_downloader/utils/time_utils/datetime_utils.py +146 -0
- novel_downloader/utils/time_utils/sleep_utils.py +49 -0
- novel_downloader-1.1.1.dist-info/METADATA +137 -0
- novel_downloader-1.1.1.dist-info/RECORD +109 -0
- novel_downloader-1.1.1.dist-info/WHEEL +5 -0
- novel_downloader-1.1.1.dist-info/entry_points.txt +2 -0
- novel_downloader-1.1.1.dist-info/licenses/LICENSE +21 -0
- novel_downloader-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,158 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.constants
|
5
|
+
--------------------------------
|
6
|
+
|
7
|
+
Constants and default paths used throughout the NovelDownloader project.
|
8
|
+
"""
|
9
|
+
|
10
|
+
from importlib.resources import files
|
11
|
+
from pathlib import Path
|
12
|
+
|
13
|
+
from platformdirs import user_config_dir
|
14
|
+
|
15
|
+
# -----------------------------------------------------------------------------
|
16
|
+
# Application identity
|
17
|
+
# -----------------------------------------------------------------------------
|
18
|
+
# Python package (import) name.
PACKAGE_NAME = "novel_downloader"

# Human-readable display name of the application.
APP_NAME = "NovelDownloader"

# Directory name handed to platformdirs when locating the config folder.
APP_DIR_NAME = "novel_downloader"

# Root logger name; kept identical to the package name.
LOGGER_NAME = PACKAGE_NAME
|
22
|
+
|
23
|
+
|
24
|
+
# -----------------------------------------------------------------------------
|
25
|
+
# Base directories
|
26
|
+
# -----------------------------------------------------------------------------
|
27
|
+
# Base config directory (e.g. ~/AppData/Local/novel_downloader/)
|
28
|
+
BASE_CONFIG_DIR = Path(user_config_dir(APP_DIR_NAME, appauthor=False))

# Root of the installed package (two levels above this module's file)
# and the locale files that live inside it.
PACKAGE_ROOT: Path = Path(__file__).parent.parent
LOCALES_DIR: Path = PACKAGE_ROOT.joinpath("locales")

# Subdirectories under BASE_CONFIG_DIR
LOGGER_DIR = BASE_CONFIG_DIR.joinpath("logs")
JS_SCRIPT_DIR = BASE_CONFIG_DIR.joinpath("scripts")
STATE_DIR = BASE_CONFIG_DIR.joinpath("state")
DATA_DIR = BASE_CONFIG_DIR.joinpath("data")
CONFIG_DIR = BASE_CONFIG_DIR.joinpath("config")
MODEL_CACHE_DIR = BASE_CONFIG_DIR.joinpath("models")

# -----------------------------------------------------------------------------
# Default file paths
# -----------------------------------------------------------------------------
STATE_FILE = STATE_DIR.joinpath("state.json")
HASH_STORE_FILE = DATA_DIR.joinpath("image_hashes.json")
SETTING_FILE = CONFIG_DIR.joinpath("settings.json")
SITE_RULES_FILE = CONFIG_DIR.joinpath("site_rules.json")
DEFAULT_USER_DATA_DIR = DATA_DIR.joinpath("browser_data")
|
48
|
+
|
49
|
+
|
50
|
+
# -----------------------------------------------------------------------------
|
51
|
+
# Default preferences & headers
|
52
|
+
# -----------------------------------------------------------------------------
|
53
|
+
DEFAULT_USER_PROFILE_NAME = "Profile_1"
DEFAULT_IMAGE_SUFFIX = ".jpg"

# User-Agent string shared by both header sets below.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/134.0.0.0 Safari/537.36"
)

# Minimal header set: only the User-Agent.
DEFAULT_HEADERS = {"User-Agent": DEFAULT_USER_AGENT}

DEFAULT_ACCEPT = (
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
)

# Fuller header set built on the same User-Agent and the Accept value above.
DEFAULT_USER_HEADERS = {
    "Accept": DEFAULT_ACCEPT,
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en,zh;q=0.9,zh-CN;q=0.8",
    "User-Agent": DEFAULT_USER_AGENT,
    "Connection": "keep-alive",
}
|
74
|
+
|
75
|
+
# -----------------------------------------------------------------------------
|
76
|
+
# Embedded resources (via importlib.resources)
|
77
|
+
# -----------------------------------------------------------------------------
|
78
|
+
# Bundled default configuration files.
BASE_CONFIG_PATH = files("novel_downloader.resources.config") / "settings.yaml"
BASE_RULE_PATH = files("novel_downloader.resources.config") / "rules.toml"

DEFAULT_SETTINGS_PATHS = [
    BASE_CONFIG_PATH,
    BASE_RULE_PATH,
]

# CSS Styles
CSS_MAIN_PATH = files("novel_downloader.resources.css_styles") / "main.css"
CSS_VOLUME_INTRO_PATH = (
    files("novel_downloader.resources.css_styles") / "volume-intro.css"
)

# Images
VOLUME_BORDER_IMAGE_PATH = (
    files("novel_downloader.resources.images") / "volume_border.png"
)

# JSON
REPLACE_WORD_MAP_PATH = (
    files("novel_downloader.resources.json") / "replace_word_map.json"
)

# JavaScript
QD_DECRYPT_SCRIPT_PATH = (
    files("novel_downloader.resources.js_scripts") / "qidian_decrypt_node.js"
)

# Text Files
BLACKLIST_PATH = files("novel_downloader.resources.text") / "blacklist.txt"
|
109
|
+
|
110
|
+
# -----------------------------------------------------------------------------
|
111
|
+
# EPUB defaults
|
112
|
+
# -----------------------------------------------------------------------------
|
113
|
+
# Folder names used for images and text inside the EPUB.
EPUB_IMAGE_FOLDER = "Images"
EPUB_TEXT_FOLDER = "Text"

EPUB_OPTIONS = {
    # The guide is an EPUB 2 component holding key navigation info such as
    # the cover, table of contents and index.
    "epub2_guide": True,
    # Landmarks are the EPUB 3 <nav> structure that marks important pages
    # (e.g. table of contents, cover, start page).
    "epub3_landmark": True,
    # EPUB 3 allows a page list so the book keeps a relatively consistent
    # pagination across devices.
    "epub3_pages": True,
    # This title appears in the EPUB reader's navigation bar.
    "landmark_title": "Guide",
    # This title is shown in the EPUB reader's page-navigation bar.
    "pages_title": "Pages",
    # Whether to set the reader's page-progression-direction automatically
    # from the order of book.spine.
    "spine_direction": True,
    # Controls the reader's default page-turn direction (LTR or RTL).
    "package_direction": False,
    # Whether to add a play order to the chapters of the EPUB book.
    "play_order": {"enabled": True, "start_from": 1},
}
|
134
|
+
|
135
|
+
# ---------------------------------------------------------------------
|
136
|
+
# Pretrained model registry (e.g. used in font recovery or OCR)
|
137
|
+
# ---------------------------------------------------------------------
|
138
|
+
|
139
|
+
# Hugging Face model repo for character recognition
|
140
|
+
REC_CHAR_MODEL_REPO = "saudadez/rec_chinese_char"

# Required files to be downloaded for the model
REC_CHAR_MODEL_FILES = [
    "inference.pdmodel",
    "inference.pdiparams",
    "rec_custom_keys.txt",
    "char_freq.json",
]

# Character-vector files, listed separately from the model files above.
REC_CHAR_VECTOR_FILES = [
    "char_vectors.npy",
    "char_vectors.txt",
]

# Image shape string keyed by model version.
REC_IMAGE_SHAPE_MAP = {
    "v1.0": "3,32,32",
    "v2.0": "3,48,48",
}
|
@@ -0,0 +1,144 @@
|
|
1
|
+
"""
|
2
|
+
novel_downloader.utils.crypto_utils
|
3
|
+
-----------------------------------
|
4
|
+
|
5
|
+
Generic cryptographic utilities
|
6
|
+
"""
|
7
|
+
|
8
|
+
from __future__ import annotations
|
9
|
+
|
10
|
+
import base64
|
11
|
+
import hashlib
|
12
|
+
import json
|
13
|
+
import random
|
14
|
+
import time
|
15
|
+
from typing import Any, Dict, List
|
16
|
+
|
17
|
+
|
18
|
+
def rc4_crypt(
    key: str,
    data: str,
    *,
    mode: str = "encrypt",
    encoding: str = "utf-8",
) -> str:
    """
    Encrypt or decrypt data using RC4 and Base64.

    :param key: RC4 key (encoded using the specified encoding). Must not be empty.
    :type key: str
    :param data: Plain-text (for 'encrypt') or Base64 cipher-text (for 'decrypt').
    :type data: str
    :param mode: Operation mode, either 'encrypt' or 'decrypt'. Defaults to 'encrypt'.
    :type mode: str, optional
    :param encoding: Character encoding for the key and the plain-text.
        Defaults to 'utf-8'.
    :type encoding: str, optional

    :return: Base64 cipher-text (for encryption) or decoded plain-text (for decryption).
        Bytes that cannot be decoded on decryption are replaced rather than raising.
    :rtype: str

    :raises ValueError: If mode is not 'encrypt' or 'decrypt', or if the key is empty.
    """
    key_bytes = key.encode(encoding)

    if mode not in ("encrypt", "decrypt"):
        raise ValueError("Mode must be 'encrypt' or 'decrypt'.")
    if not key_bytes:
        # The key schedule indexes the key modulo its length, so an empty key
        # would otherwise surface as an opaque ZeroDivisionError.
        raise ValueError("RC4 key must not be empty.")

    def _rc4(data_bytes: bytes) -> bytes:
        # Key-Scheduling Algorithm (KSA)
        key_len = len(key_bytes)
        S = list(range(256))
        j = 0
        for i in range(256):
            j = (j + S[i] + key_bytes[i % key_len]) % 256
            S[i], S[j] = S[j], S[i]

        # Pseudo-Random Generation Algorithm (PRGA)
        i = j = 0
        out = bytearray()
        for byte in data_bytes:
            i = (i + 1) % 256
            j = (j + S[i]) % 256
            S[i], S[j] = S[j], S[i]
            out.append(byte ^ S[(S[i] + S[j]) % 256])

        return bytes(out)

    if mode == "encrypt":
        cipher_bytes = _rc4(data.encode(encoding))
        # Base64 output is pure ASCII; decoding it with the caller's encoding
        # would corrupt it for encodings that are not ASCII-compatible.
        return base64.b64encode(cipher_bytes).decode("ascii")

    plain_bytes = _rc4(base64.b64decode(data))
    return plain_bytes.decode(encoding, errors="replace")
|
76
|
+
|
77
|
+
|
78
|
+
def _get_key() -> str:
    """
    Return the built-in key, recovered from its obfuscated form.

    The key is stored as Base64 text whose decoded bytes are each
    XOR-ed with 0x5A.

    :return: The recovered key string.
    :rtype: str
    """
    masked = base64.b64decode("Lj1qYxMuaXBjMg==")
    # latin-1 maps every byte value to the code point of the same number,
    # i.e. one character per unmasked byte.
    return bytes(b ^ 0x5A for b in masked).decode("latin-1")
|
83
|
+
|
84
|
+
|
85
|
+
def _d(b64str: str) -> str:
    """
    Decode a Base64 string and return the result as UTF-8 text.

    :param b64str: Base64-encoded text.
    :type b64str: str
    :return: The decoded string.
    :rtype: str
    """
    raw = base64.b64decode(b64str)
    return str(raw, "utf-8")
|
87
|
+
|
88
|
+
|
89
|
+
def patch_qd_payload_token(
    enc_token: str,
    new_uri: str,
    *,
    key: str = "",
) -> str:
    """
    Refresh the timing and checksum fields of an encrypted payload token.

    The token is RC4-decrypted into a JSON object, a new payload is built
    from fresh timing values plus the two fields carried over from the old
    payload, and the result is encrypted again.

    :param enc_token: Encrypted (Base64) token string from a live request.
    :type enc_token: str
    :param new_uri: URI mixed into the checksum.
    :type new_uri: str
    :param key: RC4 key; when empty, the built-in key is used.
    :type key: str, optional

    :return: Re-encrypted token carrying the new timing and checksum values.
    :rtype: str
    """
    rc4_key = key or _get_key()

    # Decrypt the incoming token into its JSON payload.
    old_payload: Dict[str, Any] = json.loads(
        rc4_crypt(rc4_key, enc_token, mode="decrypt")
    )

    # Fresh load time in ms since the epoch, plus a simulated duration drawn
    # from N(600, 150) and clamped to [300, 1000].
    loadts = int(time.time() * 1000)
    sampled = int(random.normalvariate(600, 150))
    timestamp = loadts + min(1000, max(300, sampled))

    # Payload field names are kept Base64-encoded in the source.
    fp_key, ab_key, ck_key, lt_key, ts_key = (
        _d(name)
        for name in (
            "ZmluZ2VycHJpbnQ=",
            "YWJub3JtYWw=",
            "Y2hlY2tzdW0=",
            "bG9hZHRz",
            "dGltZXN0YW1w",
        )
    )

    # Two values are carried over from the old payload; the checksum is the
    # MD5 of URI + load time + the first carried-over value.
    fp_val = old_payload.get(fp_key, "")
    ab_val = old_payload.get(ab_key, "0" * 32)
    digest_input = f"{new_uri}{loadts}{fp_val}".encode("utf-8")
    ck_val = hashlib.md5(digest_input).hexdigest()

    refreshed = {
        lt_key: loadts,
        ts_key: timestamp,
        fp_key: fp_val,
        ab_key: ab_val,
        ck_key: ck_val,
    }

    # Serialize compactly and encrypt with the same key.
    serialized = json.dumps(refreshed, separators=(",", ":"))
    return rc4_crypt(rc4_key, serialized, mode="encrypt")
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.file_utils
|
5
|
+
---------------------------------
|
6
|
+
|
7
|
+
High-level file I/O utility re-exports for convenience.
|
8
|
+
|
9
|
+
This module aggregates commonly used low-level file utilities such as:
|
10
|
+
- Path sanitization (for safe filenames)
|
11
|
+
- Text normalization (e.g. Windows/Linux line endings)
|
12
|
+
- JSON, plain text, and binary file reading/writing
|
13
|
+
|
14
|
+
Included utilities:
|
15
|
+
- sanitize_filename: remove invalid characters from filenames
|
16
|
+
- normalize_txt_line_endings: standardize line endings in text files
|
17
|
+
- save_as_json / save_as_txt: write dict or text to file
|
18
|
+
- read_text_file / read_json_file / read_binary_file: load content from file
|
19
|
+
"""
|
20
|
+
|
21
|
+
from .io import (
|
22
|
+
load_blacklisted_words,
|
23
|
+
load_text_resource,
|
24
|
+
read_binary_file,
|
25
|
+
read_json_file,
|
26
|
+
read_text_file,
|
27
|
+
save_as_json,
|
28
|
+
save_as_txt,
|
29
|
+
)
|
30
|
+
from .normalize import normalize_txt_line_endings
|
31
|
+
from .sanitize import sanitize_filename
|
32
|
+
|
33
|
+
__all__ = [
|
34
|
+
"sanitize_filename",
|
35
|
+
"save_as_json",
|
36
|
+
"save_as_txt",
|
37
|
+
"read_text_file",
|
38
|
+
"read_json_file",
|
39
|
+
"read_binary_file",
|
40
|
+
"load_text_resource",
|
41
|
+
"load_blacklisted_words",
|
42
|
+
"normalize_txt_line_endings",
|
43
|
+
]
|
@@ -0,0 +1,252 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.file_utils.io
|
5
|
+
------------------------------------
|
6
|
+
|
7
|
+
File I/O utilities for reading and writing text, JSON, and binary data.
|
8
|
+
|
9
|
+
Includes:
|
10
|
+
- Safe, atomic file saving with optional overwrite and auto-renaming
|
11
|
+
- JSON pretty-printing with size-aware formatting
|
12
|
+
- Simple helpers for reading files with fallback and logging
|
13
|
+
"""
|
14
|
+
|
15
|
+
import json
|
16
|
+
import logging
|
17
|
+
import tempfile
|
18
|
+
from importlib.resources import files
|
19
|
+
from pathlib import Path
|
20
|
+
from typing import Any, Dict, List, Literal, Optional, Set, Union
|
21
|
+
|
22
|
+
from .sanitize import sanitize_filename
|
23
|
+
|
24
|
+
logger = logging.getLogger(__name__)
|
25
|
+
|
26
|
+
_JSON_INDENT_THRESHOLD = 50 * 1024 # bytes
|
27
|
+
|
28
|
+
|
29
|
+
def _get_non_conflicting_path(path: Path) -> Path:
    """
    Return ``path`` itself if nothing exists there, otherwise the first
    sibling named ``<stem>_1<suffix>``, ``<stem>_2<suffix>``, ... that
    does not exist yet.
    """
    candidate = path
    index = 0
    while candidate.exists():
        index += 1
        candidate = path.with_name(f"{path.stem}_{index}{path.suffix}")
    return candidate
|
41
|
+
|
42
|
+
|
43
|
+
def _write_file(
    content: Union[str, bytes, Dict[Any, Any], List[Any], Any],
    filepath: Union[str, Path],
    mode: Optional[str] = None,
    *,
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
    dump_json: bool = False,
    encoding: str = "utf-8",
) -> bool:
    """
    Write content to a file atomically, with optional JSON serialization.

    The data is first written to a temporary file in the destination
    directory and then moved over the target path, so a failed write never
    leaves a half-written target behind.

    :param content: The content to write; can be text, bytes, or a
                    JSON-serializable object.
    :param filepath: Destination path (str or Path). The file name part is
                     passed through ``sanitize_filename``.
    :param mode: Accepted for backward compatibility but not used; the write
                 mode is always derived from ``content`` and ``dump_json``.
    :param on_exist: Behavior if file exists: 'overwrite', 'skip',
                     or 'rename'.
    :param dump_json: If True, serialize content as JSON (indented, or
                      compact once the indented form exceeds
                      ``_JSON_INDENT_THRESHOLD`` bytes).
    :param encoding: Text encoding for writing.
    :return: True if writing succeeds, False if the file was skipped or
             writing failed.
    :raises TypeError: If ``dump_json`` is False and content is neither
                       str nor bytes.
    """
    path = Path(filepath)
    path = path.with_name(sanitize_filename(path.name))
    path.parent.mkdir(parents=True, exist_ok=True)

    if path.exists():
        if on_exist == "skip":
            logger.debug("[file] '%s' exists, skipping", path)
            return False
        if on_exist == "rename":
            path = _get_non_conflicting_path(path)
            logger.debug("[file] Renaming target to avoid conflict: %s", path)
        else:
            logger.debug("[file] '%s' exists, will overwrite", path)

    # Prepare content and write mode.
    # NOTE: serialization errors from json.dumps propagate to the caller.
    content_to_write: Union[str, bytes]
    if dump_json:
        json_str = json.dumps(content, ensure_ascii=False, indent=2)
        if len(json_str.encode(encoding)) > _JSON_INDENT_THRESHOLD:
            # Large payloads are stored compactly instead of indented.
            json_str = json.dumps(content, ensure_ascii=False, separators=(",", ":"))
        content_to_write = json_str
        write_mode = "w"
    elif isinstance(content, (str, bytes)):
        content_to_write = content
        write_mode = "wb" if isinstance(content, bytes) else "w"
    else:
        raise TypeError("Non-JSON content must be str or bytes.")

    is_binary = "b" in write_mode
    tmp_path: Optional[Path] = None
    try:
        with tempfile.NamedTemporaryFile(
            mode=write_mode,
            encoding=None if is_binary else encoding,
            newline=None if is_binary else "\n",
            delete=False,
            dir=path.parent,
        ) as tmp:
            # Record the name first so a failed write can still be cleaned up.
            tmp_path = Path(tmp.name)
            tmp.write(content_to_write)
        tmp_path.replace(path)
    except Exception as exc:
        logger.warning("[file] Error writing %r: %s", path, exc)
        if tmp_path is not None:
            # Best-effort removal of the leftover temp file (created with
            # delete=False); it is already gone if replace() succeeded.
            try:
                tmp_path.unlink()
            except OSError:
                pass
        return False

    logger.info("[file] '%s' written successfully", path)
    return True
|
112
|
+
|
113
|
+
|
114
|
+
def save_as_txt(
    content: str,
    filepath: Union[str, Path],
    *,
    encoding: str = "utf-8",
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
) -> bool:
    """
    Write plain text to ``filepath`` by delegating to ``_write_file``.

    :param content: Text content to write.
    :param filepath: Destination file path.
    :param encoding: Text encoding to use (default: 'utf-8').
    :param on_exist: How to handle existing files: 'overwrite', 'skip', or 'rename'.
    :return: The boolean result reported by ``_write_file``.
    """
    written = _write_file(
        content,
        filepath,
        "w",
        on_exist=on_exist,
        dump_json=False,
        encoding=encoding,
    )
    return written
|
138
|
+
|
139
|
+
|
140
|
+
def save_as_json(
    content: Any,
    filepath: Union[str, Path],
    *,
    encoding: str = "utf-8",
    on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
) -> bool:
    """
    Serialize ``content`` as JSON and write it to ``filepath`` by
    delegating to ``_write_file``.

    :param content: Data to write as JSON.
    :param filepath: Destination file path.
    :param encoding: Text encoding to use (default: 'utf-8').
    :param on_exist: How to handle existing files: 'overwrite', 'skip', or 'rename'.
    :return: The boolean result reported by ``_write_file``.
    """
    written = _write_file(
        content,
        filepath,
        "w",
        on_exist=on_exist,
        dump_json=True,
        encoding=encoding,
    )
    return written
|
164
|
+
|
165
|
+
|
166
|
+
def read_text_file(
    filepath: Union[str, Path], encoding: str = "utf-8"
) -> Optional[str]:
    """
    Read a text file and return its content.

    Any error while opening or decoding is logged as a warning and
    reported as ``None`` instead of being raised.

    :param filepath: Path to file.
    :param encoding: Encoding to use (default: 'utf-8').
    :return: Text content or None on failure.
    """
    path = Path(filepath)
    try:
        with path.open("r", encoding=encoding) as handle:
            return handle.read()
    except Exception as e:
        logger.warning("[file] Failed to read %r: %s", path, e)
    return None
|
182
|
+
|
183
|
+
|
184
|
+
def read_json_file(
    filepath: Union[str, Path], encoding: str = "utf-8"
) -> Optional[Any]:
    """
    Load a JSON file and return the parsed Python object.

    Any error while reading or parsing is logged as a warning and
    reported as ``None`` instead of being raised.

    :param filepath: Path to file.
    :param encoding: Encoding to use (default: 'utf-8').
    :return: Python object or None on failure.
    """
    path = Path(filepath)
    try:
        with path.open("r", encoding=encoding) as handle:
            return json.load(handle)
    except Exception as e:
        logger.warning("[file] Failed to read %r: %s", path, e)
    return None
|
200
|
+
|
201
|
+
|
202
|
+
def read_binary_file(filepath: Union[str, Path]) -> Optional[bytes]:
    """
    Read a file in binary mode and return its raw bytes.

    Any error while reading is logged as a warning and reported as
    ``None`` instead of being raised.

    :param filepath: Path to file.
    :return: Bytes or None on failure.
    """
    path = Path(filepath)
    try:
        with path.open("rb") as handle:
            return handle.read()
    except Exception as e:
        logger.warning("[file] Failed to read %r: %s", path, e)
    return None
|
215
|
+
|
216
|
+
|
217
|
+
def load_text_resource(
    filename: str,
    package: str = "novel_downloader.resources.text",
) -> str:
    """
    Read a packaged text resource and return it as a string.

    The resource is located through ``importlib.resources.files`` and is
    always decoded as UTF-8.

    :param filename: Name of the text file (e.g. "blacklist.txt").
    :param package: Package path where resources live (default: text resources).
                    For other resource types, point to the appropriate subpackage
                    (e.g. "novel_downloader.resources.css_styles").
    :return: File contents as a string.
    """
    resource = files(package) / filename
    return resource.read_text(encoding="utf-8")
|
232
|
+
|
233
|
+
|
234
|
+
def load_blacklisted_words() -> Set[str]:
    """
    Load blacklist.txt from the text resources as a set of words.

    :return: A set of non-empty, stripped lines from blacklist.txt.
    """
    words: Set[str] = set()
    for raw_line in load_text_resource("blacklist.txt").splitlines():
        word = raw_line.strip()
        if word:
            words.add(word)
    return words
|
242
|
+
|
243
|
+
|
244
|
+
__all__ = [
|
245
|
+
"save_as_txt",
|
246
|
+
"save_as_json",
|
247
|
+
"read_text_file",
|
248
|
+
"read_json_file",
|
249
|
+
"read_binary_file",
|
250
|
+
"load_text_resource",
|
251
|
+
"load_blacklisted_words",
|
252
|
+
]
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.utils.file_utils.normalize
|
5
|
+
-------------------------------------------
|
6
|
+
|
7
|
+
Utilities for normalizing the contents of text files for consistency
|
8
|
+
across platforms or output formats.
|
9
|
+
|
10
|
+
Currently includes line-ending normalization for .txt files.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import logging
|
14
|
+
from pathlib import Path
|
15
|
+
from typing import Union
|
16
|
+
|
17
|
+
logger = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
def normalize_txt_line_endings(folder_path: Union[str, Path]) -> None:
    """
    Convert all .txt files in the given folder (recursively)
    to use Unix-style LF (\\n) line endings.

    Files are processed as bytes so the routine does not depend on
    ``Path.write_text(newline=...)``, which only exists on Python 3.10+.
    Files that are not valid UTF-8, or that cannot be read or written, are
    left untouched and counted as failures. Files that already use LF only
    are not rewritten.

    :param folder_path: Path to the folder containing .txt files.
                        Can be a str or Path.
    :return: None
    """
    path = Path(folder_path).resolve()
    if not path.is_dir():
        logger.warning("[file] Invalid folder: %s", path)
        return

    count_success, count_fail = 0, 0

    for txt_file in path.rglob("*.txt"):
        try:
            raw = txt_file.read_bytes()
            # Validate only: non-UTF-8 files must be reported, not rewritten.
            raw.decode("utf-8")
            # CR and LF are single bytes that never occur inside a multi-byte
            # UTF-8 sequence, so a byte-level replace is safe.
            normalized = raw.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
            if normalized != raw:
                txt_file.write_bytes(normalized)
            logger.debug("[file] Normalized: %s", txt_file)
            count_success += 1
        except (OSError, UnicodeDecodeError) as e:
            logger.warning("[file] Failed: %s | %s", txt_file, e)
            count_fail += 1

    logger.info("[file] Completed. Success: %s, Failed: %s", count_success, count_fail)
|
49
|
+
|
50
|
+
|
51
|
+
__all__ = ["normalize_txt_line_endings"]
|
52
|
+
|
53
|
+
if __name__ == "__main__":  # pragma: no cover
    # Command-line entry point: normalize every .txt file under one folder.
    import argparse

    logging.basicConfig(
        format="[%(asctime)s] [%(levelname)s] %(name)s: %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(
        description="Normalize line endings of .txt files in a folder to LF."
    )
    cli.add_argument(
        "folder",
        type=str,
        help="Path to the folder containing .txt files.",
    )

    normalize_txt_line_endings(cli.parse_args().folder)
|