novel-downloader 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +3 -3
- novel_downloader/cli/export.py +1 -1
- novel_downloader/cli/ui.py +7 -7
- novel_downloader/config/adapter.py +191 -154
- novel_downloader/core/__init__.py +5 -6
- novel_downloader/core/exporters/common/txt.py +9 -9
- novel_downloader/core/exporters/linovelib/txt.py +9 -9
- novel_downloader/core/fetchers/qidian.py +20 -35
- novel_downloader/core/interfaces/fetcher.py +2 -2
- novel_downloader/core/interfaces/parser.py +2 -2
- novel_downloader/core/parsers/base.py +1 -0
- novel_downloader/core/parsers/eightnovel.py +2 -2
- novel_downloader/core/parsers/esjzone.py +3 -3
- novel_downloader/core/parsers/qidian/main_parser.py +747 -12
- novel_downloader/core/parsers/qidian/utils/__init__.py +2 -21
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
- novel_downloader/core/parsers/xiguashuwu.py +6 -12
- novel_downloader/locales/en.json +3 -3
- novel_downloader/locales/zh.json +3 -3
- novel_downloader/utils/__init__.py +0 -2
- novel_downloader/utils/chapter_storage.py +2 -3
- novel_downloader/utils/constants.py +1 -3
- novel_downloader/utils/cookies.py +32 -17
- novel_downloader/utils/crypto_utils/__init__.py +0 -6
- novel_downloader/utils/crypto_utils/rc4.py +40 -50
- novel_downloader/utils/epub/__init__.py +2 -3
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +5 -5
- novel_downloader/utils/epub/documents.py +7 -7
- novel_downloader/utils/epub/models.py +8 -8
- novel_downloader/utils/epub/utils.py +10 -10
- novel_downloader/utils/file_utils/io.py +48 -73
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -11
- novel_downloader/utils/fontocr/__init__.py +13 -0
- novel_downloader/utils/{fontocr.py → fontocr/core.py} +70 -61
- novel_downloader/utils/fontocr/loader.py +50 -0
- novel_downloader/utils/logger.py +80 -56
- novel_downloader/utils/network.py +16 -40
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/sleep_utils.py +53 -43
- novel_downloader/web/main.py +1 -1
- novel_downloader/web/pages/search.py +3 -3
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/METADATA +2 -1
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/RECORD +51 -55
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -14,8 +14,6 @@ __all__ = ["normalize_txt_line_endings"]
|
|
14
14
|
import logging
|
15
15
|
from pathlib import Path
|
16
16
|
|
17
|
-
logger = logging.getLogger(__name__)
|
18
|
-
|
19
17
|
|
20
18
|
def normalize_txt_line_endings(folder_path: str | Path) -> None:
|
21
19
|
"""
|
@@ -28,7 +26,6 @@ def normalize_txt_line_endings(folder_path: str | Path) -> None:
|
|
28
26
|
"""
|
29
27
|
path = Path(folder_path).resolve()
|
30
28
|
if not path.exists() or not path.is_dir():
|
31
|
-
logger.warning("[file] Invalid folder: %s", path)
|
32
29
|
return
|
33
30
|
|
34
31
|
count_success, count_fail = 0, 0
|
@@ -38,13 +35,10 @@ def normalize_txt_line_endings(folder_path: str | Path) -> None:
|
|
38
35
|
content = txt_file.read_text(encoding="utf-8")
|
39
36
|
normalized = content.replace("\r\n", "\n").replace("\r", "\n")
|
40
37
|
txt_file.write_text(normalized, encoding="utf-8", newline="\n")
|
41
|
-
logger.debug("[file] Normalized: %s", txt_file)
|
42
38
|
count_success += 1
|
43
|
-
except (OSError, UnicodeDecodeError)
|
44
|
-
logger.warning("[file] Failed: %s | %s", txt_file, e)
|
39
|
+
except (OSError, UnicodeDecodeError):
|
45
40
|
count_fail += 1
|
46
41
|
|
47
|
-
logger.info("[file] Completed. Success: %s, Failed: %s", count_success, count_fail)
|
48
42
|
return
|
49
43
|
|
50
44
|
|
@@ -9,13 +9,9 @@ on different operating systems.
|
|
9
9
|
|
10
10
|
__all__ = ["sanitize_filename"]
|
11
11
|
|
12
|
-
import logging
|
13
12
|
import os
|
14
13
|
import re
|
15
14
|
|
16
|
-
logger = logging.getLogger(__name__)
|
17
|
-
|
18
|
-
# Windows 保留名称列表 (忽略大小写)
|
19
15
|
_WIN_RESERVED_NAMES = {
|
20
16
|
"CON",
|
21
17
|
"PRN",
|
@@ -36,8 +32,8 @@ def sanitize_filename(filename: str, max_length: int | None = 255) -> str:
|
|
36
32
|
|
37
33
|
This function checks the operating system environment and applies the appropriate
|
38
34
|
filtering rules:
|
39
|
-
|
40
|
-
|
35
|
+
* On Windows, it replaces characters: <>:"/\\|?*
|
36
|
+
* On POSIX systems, it replaces the forward slash '/'
|
41
37
|
|
42
38
|
:param filename: The input filename to sanitize.
|
43
39
|
:param max_length: Optional maximum length of the output filename. Defaults to 255.
|
@@ -47,7 +43,7 @@ def sanitize_filename(filename: str, max_length: int | None = 255) -> str:
|
|
47
43
|
|
48
44
|
name = pattern.sub("_", filename).strip(" .")
|
49
45
|
|
50
|
-
stem, dot, ext = name.
|
46
|
+
stem, dot, ext = name.rpartition(".")
|
51
47
|
if os.name == "nt" and stem.upper() in _WIN_RESERVED_NAMES:
|
52
48
|
stem = f"_{stem}"
|
53
49
|
cleaned = f"{stem}{dot}{ext}" if ext else stem
|
@@ -59,7 +55,4 @@ def sanitize_filename(filename: str, max_length: int | None = 255) -> str:
|
|
59
55
|
else:
|
60
56
|
cleaned = cleaned[:max_length]
|
61
57
|
|
62
|
-
|
63
|
-
cleaned = "_untitled"
|
64
|
-
logger.debug("[file] Sanitized filename: %r -> %r", filename, cleaned)
|
65
|
-
return cleaned
|
58
|
+
return cleaned or "_untitled"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.utils.fontocr
|
4
|
+
------------------------------
|
5
|
+
|
6
|
+
Lazy-loading interface for FontOCR. Provides a safe entry point
|
7
|
+
to obtain an OCR utility instance if optional dependencies are available.
|
8
|
+
"""
|
9
|
+
|
10
|
+
__all__ = ["get_font_ocr"]
|
11
|
+
__version__ = "4.0"
|
12
|
+
|
13
|
+
from .loader import get_font_ocr
|
@@ -1,28 +1,23 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.utils.fontocr
|
4
|
-
|
3
|
+
novel_downloader.utils.fontocr.core
|
4
|
+
-----------------------------------
|
5
5
|
|
6
6
|
This class provides utility methods for optical character recognition (OCR),
|
7
7
|
primarily used for decrypting custom font encryption.
|
8
8
|
"""
|
9
9
|
|
10
|
-
|
11
|
-
"FontOCR",
|
12
|
-
"get_font_ocr",
|
13
|
-
]
|
14
|
-
__version__ = "4.0"
|
15
|
-
|
10
|
+
import io
|
16
11
|
import logging
|
17
|
-
from
|
18
|
-
from typing import Any
|
12
|
+
from pathlib import Path
|
13
|
+
from typing import Any
|
19
14
|
|
20
15
|
import numpy as np
|
21
|
-
from
|
16
|
+
from fontTools.ttLib import TTFont
|
17
|
+
from paddleocr import TextRecognition
|
22
18
|
from PIL import Image, ImageDraw, ImageFont
|
23
19
|
from PIL.Image import Transpose
|
24
20
|
|
25
|
-
T = TypeVar("T")
|
26
21
|
logger = logging.getLogger(__name__)
|
27
22
|
|
28
23
|
|
@@ -39,22 +34,20 @@ class FontOCR:
|
|
39
34
|
device: str | None = None,
|
40
35
|
precision: str = "fp32",
|
41
36
|
cpu_threads: int = 10,
|
42
|
-
batch_size: int = 32,
|
43
|
-
threshold: float = 0.0,
|
44
37
|
**kwargs: Any,
|
45
38
|
) -> None:
|
46
39
|
"""
|
47
40
|
Initialize a FontOCR instance.
|
48
41
|
|
49
|
-
:param
|
50
|
-
:param
|
51
|
-
:param
|
52
|
-
:param
|
42
|
+
:param model_name: If set to None, PP-OCRv5_server_rec is used.
|
43
|
+
:param model_dir: Model storage path.
|
44
|
+
:param input_shape: Input image size for the model in the format (C, H, W).
|
45
|
+
:param device: Device for inference.
|
46
|
+
:param precision: Precision for TensorRT.
|
47
|
+
:param cpu_threads: Number of threads to use for inference on CPUs.
|
53
48
|
:param kwargs: reserved for future extensions
|
54
49
|
"""
|
55
|
-
self.
|
56
|
-
self._threshold = threshold
|
57
|
-
self._ocr_model = TextRecognition(
|
50
|
+
self._ocr_model = TextRecognition( # takes 5 ~ 12 sec to init
|
58
51
|
model_name=model_name,
|
59
52
|
model_dir=model_dir,
|
60
53
|
input_shape=input_shape,
|
@@ -66,18 +59,18 @@ class FontOCR:
|
|
66
59
|
def predict(
|
67
60
|
self,
|
68
61
|
images: list[np.ndarray],
|
69
|
-
|
70
|
-
) -> list[
|
62
|
+
batch_size: int = 1,
|
63
|
+
) -> list[tuple[str, float]]:
|
71
64
|
"""
|
72
65
|
Run OCR on input images.
|
73
66
|
|
74
67
|
:param images: list of np.ndarray objects to predict
|
75
|
-
:param
|
76
|
-
:return: list of
|
68
|
+
:param batch_size: batch size for OCR inference (minimum 1)
|
69
|
+
:return: list of tuple containing (character, score)
|
77
70
|
"""
|
78
71
|
return [
|
79
|
-
|
80
|
-
for pred in self._ocr_model.predict(images, batch_size=
|
72
|
+
(pred.get("rec_text"), pred.get("rec_score"))
|
73
|
+
for pred in self._ocr_model.predict(images, batch_size=batch_size)
|
81
74
|
]
|
82
75
|
|
83
76
|
@staticmethod
|
@@ -86,7 +79,7 @@ class FontOCR:
|
|
86
79
|
render_font: ImageFont.FreeTypeFont,
|
87
80
|
is_reflect: bool = False,
|
88
81
|
size: int = 64,
|
89
|
-
) -> Image.Image
|
82
|
+
) -> Image.Image:
|
90
83
|
"""
|
91
84
|
Render a single character into an RGB square image.
|
92
85
|
|
@@ -107,10 +100,6 @@ class FontOCR:
|
|
107
100
|
if is_reflect:
|
108
101
|
img = img.transpose(Transpose.FLIP_LEFT_RIGHT)
|
109
102
|
|
110
|
-
img_np = np.array(img)
|
111
|
-
if np.unique(img_np).size == 1:
|
112
|
-
return None
|
113
|
-
|
114
103
|
return img
|
115
104
|
|
116
105
|
@staticmethod
|
@@ -119,7 +108,7 @@ class FontOCR:
|
|
119
108
|
render_font: ImageFont.FreeTypeFont,
|
120
109
|
is_reflect: bool = False,
|
121
110
|
size: int = 64,
|
122
|
-
) -> np.ndarray
|
111
|
+
) -> np.ndarray:
|
123
112
|
"""
|
124
113
|
Render a single character into an RGB square image.
|
125
114
|
|
@@ -140,11 +129,7 @@ class FontOCR:
|
|
140
129
|
if is_reflect:
|
141
130
|
img = img.transpose(Transpose.FLIP_LEFT_RIGHT)
|
142
131
|
|
143
|
-
|
144
|
-
if np.unique(img_np).size == 1:
|
145
|
-
return None
|
146
|
-
|
147
|
-
return img_np
|
132
|
+
return np.array(img)
|
148
133
|
|
149
134
|
@staticmethod
|
150
135
|
def render_text_image(
|
@@ -176,32 +161,56 @@ class FontOCR:
|
|
176
161
|
return img
|
177
162
|
|
178
163
|
@staticmethod
|
179
|
-
def
|
164
|
+
def load_image_array_from_bytes(data: bytes) -> np.ndarray:
|
180
165
|
"""
|
181
|
-
|
166
|
+
Decode image bytes into an RGB NumPy array.
|
167
|
+
|
168
|
+
Reads common image formats (e.g. PNG/JPEG/WebP) from an
|
169
|
+
in-memory byte buffer using Pillow, converts the image to RGB,
|
170
|
+
and returns a NumPy array suitable for OCR inference.
|
171
|
+
|
172
|
+
:param data: Image file content as raw bytes.
|
173
|
+
:return: NumPy array of shape (H, W, 3), dtype=uint8, in RGB order.
|
174
|
+
:raises PIL.UnidentifiedImageError, OSError: If input bytes cannot be decoded.
|
182
175
|
"""
|
183
|
-
|
184
|
-
|
176
|
+
with Image.open(io.BytesIO(data)) as im:
|
177
|
+
im = im.convert("RGB")
|
178
|
+
return np.asarray(im)
|
185
179
|
|
180
|
+
@staticmethod
|
181
|
+
def load_render_font(
|
182
|
+
font_path: Path | str, char_size: int = 52
|
183
|
+
) -> ImageFont.FreeTypeFont:
|
184
|
+
"""
|
185
|
+
Load a FreeType font face at the given pixel size for rendering helpers.
|
186
186
|
|
187
|
-
|
187
|
+
:param font_path: Path to a TTF/OTF font file.
|
188
|
+
:param char_size: Target glyph size in pixels (e.g. 52).
|
189
|
+
:return: A PIL `ImageFont.FreeTypeFont` instance.
|
190
|
+
:raises OSError: If the font file cannot be opened by PIL.
|
191
|
+
"""
|
192
|
+
return ImageFont.truetype(str(font_path), char_size)
|
188
193
|
|
194
|
+
@staticmethod
|
195
|
+
def extract_font_charset(font_path: Path | str) -> set[str]:
|
196
|
+
"""
|
197
|
+
Extract the set of Unicode characters encoded by a TrueType/OpenType font.
|
189
198
|
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
199
|
+
This reads the font's best available character map (cmap) and returns the
|
200
|
+
corresponding set of characters.
|
201
|
+
|
202
|
+
:param font_path: Path to a TTF/OTF font file.
|
203
|
+
:return: A set of Unicode characters present in the font's cmap.
|
204
|
+
"""
|
205
|
+
with TTFont(font_path) as font_ttf:
|
206
|
+
cmap = font_ttf.getBestCmap() or {}
|
207
|
+
|
208
|
+
charset: set[str] = set()
|
209
|
+
for cp in cmap:
|
210
|
+
# guard against invalid/surrogate code points
|
211
|
+
if 0 <= cp <= 0x10FFFF and not (0xD800 <= cp <= 0xDFFF):
|
212
|
+
try:
|
213
|
+
charset.add(chr(cp))
|
214
|
+
except ValueError:
|
215
|
+
continue
|
216
|
+
return charset
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.utils.fontocr.loader
|
4
|
+
-------------------------------------
|
5
|
+
|
6
|
+
Lazily load the FontOCR class.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import logging
|
10
|
+
from typing import TYPE_CHECKING
|
11
|
+
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from .core import FontOCR
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
_FONT_OCR: "FontOCR | None" = None
|
18
|
+
|
19
|
+
|
20
|
+
def get_font_ocr(
|
21
|
+
model_name: str | None = None,
|
22
|
+
model_dir: str | None = None,
|
23
|
+
input_shape: tuple[int, int, int] | None = None,
|
24
|
+
) -> "FontOCR | None":
|
25
|
+
"""
|
26
|
+
Try to initialize and return a singleton FontOCR instance.
|
27
|
+
Returns None if FontOCR or its dependencies are not available.
|
28
|
+
"""
|
29
|
+
global _FONT_OCR
|
30
|
+
if _FONT_OCR is None:
|
31
|
+
try:
|
32
|
+
from .core import FontOCR
|
33
|
+
|
34
|
+
_FONT_OCR = FontOCR(
|
35
|
+
model_name=model_name,
|
36
|
+
model_dir=model_dir,
|
37
|
+
input_shape=input_shape,
|
38
|
+
)
|
39
|
+
except ImportError:
|
40
|
+
logger.warning(
|
41
|
+
"FontOCR dependency not available "
|
42
|
+
"(paddleocr / numpy / pillow / fonttools). "
|
43
|
+
"Font decoding will be skipped."
|
44
|
+
)
|
45
|
+
return None
|
46
|
+
except Exception as e:
|
47
|
+
logger.warning("FontOCR initialization failed: %s", e, exc_info=True)
|
48
|
+
return None
|
49
|
+
|
50
|
+
return _FONT_OCR
|
novel_downloader/utils/logger.py
CHANGED
@@ -4,17 +4,17 @@ novel_downloader.utils.logger
|
|
4
4
|
-----------------------------
|
5
5
|
|
6
6
|
Provides a configurable logging setup for Python applications.
|
7
|
-
Log files are rotated daily and named with the given logger name and current date.
|
8
7
|
"""
|
9
8
|
|
9
|
+
from __future__ import annotations
|
10
|
+
|
10
11
|
__all__ = ["setup_logging"]
|
11
12
|
|
12
13
|
import logging
|
13
|
-
from datetime import datetime
|
14
14
|
from logging.handlers import TimedRotatingFileHandler
|
15
15
|
from pathlib import Path
|
16
16
|
|
17
|
-
from .constants import LOGGER_DIR,
|
17
|
+
from .constants import LOGGER_DIR, PACKAGE_NAME
|
18
18
|
|
19
19
|
LOG_LEVELS: dict[str, int] = {
|
20
20
|
"DEBUG": logging.DEBUG,
|
@@ -22,75 +22,99 @@ LOG_LEVELS: dict[str, int] = {
|
|
22
22
|
"WARNING": logging.WARNING,
|
23
23
|
"ERROR": logging.ERROR,
|
24
24
|
}
|
25
|
+
_MUTE_LOGGERS: set[str] = {
|
26
|
+
"fontTools.ttLib.tables._p_o_s_t",
|
27
|
+
}
|
28
|
+
|
29
|
+
|
30
|
+
def _normalize_level(level: int | str) -> int:
|
31
|
+
if isinstance(level, int):
|
32
|
+
return level
|
33
|
+
if isinstance(level, str):
|
34
|
+
lvl = LOG_LEVELS.get(level.upper())
|
35
|
+
if isinstance(lvl, int):
|
36
|
+
return lvl
|
37
|
+
return logging.INFO
|
25
38
|
|
26
39
|
|
27
40
|
def setup_logging(
|
28
|
-
|
29
|
-
|
41
|
+
log_filename: str | None = None,
|
42
|
+
console_level: int | str = "INFO",
|
43
|
+
file_level: int | str = "DEBUG",
|
30
44
|
log_dir: str | Path | None = None,
|
45
|
+
*,
|
46
|
+
console: bool = True,
|
47
|
+
file: bool = True,
|
48
|
+
backup_count: int = 7,
|
49
|
+
when: str = "midnight",
|
31
50
|
) -> logging.Logger:
|
32
51
|
"""
|
33
|
-
Create and configure a logger
|
52
|
+
Create and configure a package logger with optional console and file handlers.
|
34
53
|
|
35
|
-
:param
|
36
|
-
:param
|
37
|
-
|
54
|
+
:param log_filename: Base log file name (without date suffix).
|
55
|
+
:param console_level: Minimum level for the console handler (string or int).
|
56
|
+
:param file_level: Minimum level for the file handler (string or int).
|
38
57
|
:param log_dir: Directory where log files will be saved.
|
39
|
-
:
|
58
|
+
:param console: Add a console handler.
|
59
|
+
:param file: Add a file handler.
|
60
|
+
:param backup_count: How many rotated files to keep.
|
61
|
+
:param when: Rotation interval for TimedRotatingFileHandler (e.g., "midnight").
|
62
|
+
:return: The configured logger.
|
40
63
|
"""
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
level_str: str = log_level or "INFO"
|
47
|
-
console_level: int = LOG_LEVELS.get(level_str) or logging.INFO
|
48
|
-
|
49
|
-
# Resolve log file path
|
50
|
-
log_path = Path(log_dir) if log_dir else LOGGER_DIR
|
51
|
-
log_path.mkdir(parents=True, exist_ok=True)
|
52
|
-
|
53
|
-
# Resolve log file name
|
54
|
-
if not log_filename_prefix:
|
55
|
-
log_filename_prefix = LOGGER_NAME
|
56
|
-
date_str = datetime.now().strftime("%Y-%m-%d")
|
57
|
-
log_filename = log_path / f"{log_filename_prefix}_{date_str}.log"
|
64
|
+
# Tame noisy third-party loggers
|
65
|
+
for name in _MUTE_LOGGERS:
|
66
|
+
ml = logging.getLogger(name)
|
67
|
+
ml.setLevel(logging.ERROR)
|
68
|
+
ml.propagate = False
|
58
69
|
|
59
|
-
|
60
|
-
logger
|
61
|
-
logger.
|
62
|
-
logger.propagate = False
|
70
|
+
logger = logging.getLogger(PACKAGE_NAME)
|
71
|
+
logger.setLevel(logging.DEBUG)
|
72
|
+
logger.propagate = False # otherwise may affected by PaddleOCR
|
63
73
|
|
64
74
|
# Clear existing handlers to avoid duplicate logs
|
65
75
|
if logger.hasHandlers():
|
66
76
|
logger.handlers.clear()
|
67
77
|
|
68
|
-
# File handler
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
78
|
+
# File handler (rotates daily)
|
79
|
+
if file:
|
80
|
+
file_level = _normalize_level(file_level)
|
81
|
+
|
82
|
+
base_dir = Path(log_dir) if log_dir else LOGGER_DIR
|
83
|
+
base_dir.mkdir(parents=True, exist_ok=True)
|
84
|
+
base_name = log_filename or PACKAGE_NAME
|
85
|
+
log_path = base_dir / f"{base_name}.log"
|
86
|
+
|
87
|
+
fh = TimedRotatingFileHandler(
|
88
|
+
filename=log_path,
|
89
|
+
when=when,
|
90
|
+
interval=1,
|
91
|
+
backupCount=backup_count,
|
92
|
+
encoding="utf-8",
|
93
|
+
utc=False,
|
94
|
+
delay=True,
|
95
|
+
)
|
96
|
+
|
97
|
+
file_formatter = logging.Formatter(
|
98
|
+
fmt="%(asctime)s [%(levelname)s] %(name)s.%(funcName)s: %(message)s",
|
99
|
+
datefmt="%Y-%m-%d %H:%M:%S",
|
100
|
+
)
|
101
|
+
fh.setFormatter(file_formatter)
|
102
|
+
fh.setLevel(file_level)
|
103
|
+
logger.addHandler(fh)
|
104
|
+
|
105
|
+
print(f"Logging to {log_path}")
|
84
106
|
|
85
107
|
# Console handler
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
108
|
+
if console:
|
109
|
+
console_level = _normalize_level(console_level)
|
110
|
+
|
111
|
+
console_handler = logging.StreamHandler()
|
112
|
+
console_formatter = logging.Formatter(
|
113
|
+
fmt="%(asctime)s [%(levelname)s] %(message)s",
|
114
|
+
datefmt="%H:%M:%S",
|
115
|
+
)
|
116
|
+
console_handler.setFormatter(console_formatter)
|
117
|
+
console_handler.setLevel(console_level)
|
118
|
+
logger.addHandler(console_handler)
|
95
119
|
|
96
120
|
return logger
|
@@ -8,7 +8,6 @@ Utilities for handling HTTP requests and downloading remote resources.
|
|
8
8
|
|
9
9
|
__all__ = ["download"]
|
10
10
|
|
11
|
-
import logging
|
12
11
|
from pathlib import Path
|
13
12
|
from typing import Literal
|
14
13
|
from urllib.parse import unquote, urlparse
|
@@ -19,10 +18,7 @@ from urllib3.util.retry import Retry
|
|
19
18
|
|
20
19
|
from .constants import DEFAULT_HEADERS
|
21
20
|
from .file_utils import sanitize_filename
|
22
|
-
from .file_utils.io import
|
23
|
-
|
24
|
-
logger = logging.getLogger(__name__)
|
25
|
-
_DEFAULT_CHUNK_SIZE = 8192 # 8KB per chunk for streaming downloads
|
21
|
+
from .file_utils.io import _unique_path, write_file
|
26
22
|
|
27
23
|
|
28
24
|
def _normalize_url(url: str) -> str:
|
@@ -37,8 +33,8 @@ def _normalize_url(url: str) -> str:
|
|
37
33
|
|
38
34
|
|
39
35
|
def _build_filepath(
|
40
|
-
folder: Path,
|
41
36
|
url: str,
|
37
|
+
folder: Path,
|
42
38
|
filename: str | None,
|
43
39
|
default_suffix: str,
|
44
40
|
on_exist: Literal["overwrite", "skip", "rename"],
|
@@ -48,20 +44,18 @@ def _build_filepath(
|
|
48
44
|
|
49
45
|
raw_name = filename or url_path.name or "unnamed"
|
50
46
|
name = sanitize_filename(raw_name)
|
51
|
-
suffix = default_suffix or url_path.suffix
|
52
|
-
if suffix and not suffix.startswith("."):
|
53
|
-
suffix = "." + suffix
|
54
47
|
|
55
|
-
|
56
|
-
|
57
|
-
file_path = file_path.with_suffix(suffix)
|
48
|
+
if "." not in name and (url_path.suffix or default_suffix):
|
49
|
+
name += url_path.suffix or default_suffix
|
58
50
|
|
51
|
+
file_path = folder / name
|
59
52
|
if on_exist == "rename":
|
60
|
-
file_path =
|
53
|
+
file_path = _unique_path(file_path)
|
54
|
+
|
61
55
|
return file_path
|
62
56
|
|
63
57
|
|
64
|
-
def
|
58
|
+
def _new_session(
|
65
59
|
retries: int,
|
66
60
|
backoff: float,
|
67
61
|
headers: dict[str, str] | None,
|
@@ -72,7 +66,7 @@ def _make_session(
|
|
72
66
|
retry = Retry(
|
73
67
|
total=retries,
|
74
68
|
backoff_factor=backoff,
|
75
|
-
status_forcelist=[429, 500, 502, 503, 504],
|
69
|
+
status_forcelist=[413, 429, 500, 502, 503, 504],
|
76
70
|
allowed_methods={"GET", "HEAD", "OPTIONS"},
|
77
71
|
)
|
78
72
|
adapter = HTTPAdapter(max_retries=retry)
|
@@ -90,10 +84,8 @@ def download(
|
|
90
84
|
retries: int = 3,
|
91
85
|
backoff: float = 0.5,
|
92
86
|
headers: dict[str, str] | None = None,
|
93
|
-
stream: bool = False,
|
94
87
|
on_exist: Literal["overwrite", "skip", "rename"] = "overwrite",
|
95
88
|
default_suffix: str = "",
|
96
|
-
chunk_size: int = _DEFAULT_CHUNK_SIZE,
|
97
89
|
) -> Path | None:
|
98
90
|
"""
|
99
91
|
Download a URL to disk, with retries, optional rename/skip, and cleanup on failure.
|
@@ -105,10 +97,8 @@ def download(
|
|
105
97
|
:param retries: GET retry count.
|
106
98
|
:param backoff: exponential backoff base.
|
107
99
|
:param headers: optional headers.
|
108
|
-
:param stream: Whether to stream the response.
|
109
100
|
:param on_exist: if 'skip', return filepath; if 'rename', auto-rename.
|
110
101
|
:param default_suffix: used if no suffix in URL or filename.
|
111
|
-
:param chunk_size: streaming chunk size.
|
112
102
|
:return: path to the downloaded file.
|
113
103
|
"""
|
114
104
|
url = _normalize_url(url)
|
@@ -117,8 +107,8 @@ def download(
|
|
117
107
|
folder.mkdir(parents=True, exist_ok=True)
|
118
108
|
|
119
109
|
save_path = _build_filepath(
|
120
|
-
folder,
|
121
110
|
url,
|
111
|
+
folder,
|
122
112
|
filename,
|
123
113
|
default_suffix,
|
124
114
|
on_exist,
|
@@ -126,34 +116,20 @@ def download(
|
|
126
116
|
|
127
117
|
# Handle existing file
|
128
118
|
if save_path.exists() and on_exist == "skip":
|
129
|
-
logger.debug("Skipping download; file exists: %s", save_path)
|
130
119
|
return save_path
|
131
120
|
|
132
|
-
with
|
121
|
+
with _new_session(retries, backoff, headers) as session:
|
133
122
|
try:
|
134
|
-
resp = session.get(url, timeout=timeout
|
123
|
+
resp = session.get(url, timeout=timeout)
|
135
124
|
resp.raise_for_status()
|
136
|
-
except Exception as e:
|
137
|
-
logger.warning("[download] request failed: %s", e)
|
138
|
-
return None
|
139
125
|
|
140
|
-
|
141
|
-
if stream:
|
142
|
-
try:
|
143
|
-
with open(save_path, "wb") as f:
|
144
|
-
for chunk in resp.iter_content(chunk_size=chunk_size):
|
145
|
-
if chunk:
|
146
|
-
f.write(chunk)
|
147
|
-
return save_path
|
148
|
-
except Exception as e:
|
149
|
-
logger.warning("[download] write failed: %s", e)
|
150
|
-
save_path.unlink(missing_ok=True)
|
151
|
-
return None
|
152
|
-
else:
|
126
|
+
# Write to disk
|
153
127
|
return write_file(
|
154
128
|
content=resp.content,
|
155
129
|
filepath=save_path,
|
156
|
-
write_mode="wb",
|
157
130
|
on_exist=on_exist,
|
158
131
|
)
|
132
|
+
except Exception:
|
133
|
+
return None
|
134
|
+
|
159
135
|
return None
|