novel_downloader-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +14 -0
- novel_downloader/cli/__init__.py +14 -0
- novel_downloader/cli/clean.py +134 -0
- novel_downloader/cli/download.py +132 -0
- novel_downloader/cli/interactive.py +67 -0
- novel_downloader/cli/main.py +45 -0
- novel_downloader/cli/settings.py +177 -0
- novel_downloader/config/__init__.py +52 -0
- novel_downloader/config/adapter.py +153 -0
- novel_downloader/config/loader.py +177 -0
- novel_downloader/config/models.py +173 -0
- novel_downloader/config/site_rules.py +97 -0
- novel_downloader/core/__init__.py +25 -0
- novel_downloader/core/downloaders/__init__.py +22 -0
- novel_downloader/core/downloaders/base_async_downloader.py +157 -0
- novel_downloader/core/downloaders/base_downloader.py +187 -0
- novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
- novel_downloader/core/downloaders/common_downloader.py +191 -0
- novel_downloader/core/downloaders/qidian_downloader.py +208 -0
- novel_downloader/core/factory/__init__.py +33 -0
- novel_downloader/core/factory/downloader_factory.py +149 -0
- novel_downloader/core/factory/parser_factory.py +62 -0
- novel_downloader/core/factory/requester_factory.py +106 -0
- novel_downloader/core/factory/saver_factory.py +49 -0
- novel_downloader/core/interfaces/__init__.py +32 -0
- novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
- novel_downloader/core/interfaces/downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/parser_protocol.py +40 -0
- novel_downloader/core/interfaces/requester_protocol.py +65 -0
- novel_downloader/core/interfaces/saver_protocol.py +61 -0
- novel_downloader/core/parsers/__init__.py +28 -0
- novel_downloader/core/parsers/base_parser.py +96 -0
- novel_downloader/core/parsers/common_parser/__init__.py +14 -0
- novel_downloader/core/parsers/common_parser/helper.py +321 -0
- novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
- novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
- novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
- novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
- novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
- novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
- novel_downloader/core/requesters/__init__.py +31 -0
- novel_downloader/core/requesters/base_async_session.py +297 -0
- novel_downloader/core/requesters/base_browser.py +210 -0
- novel_downloader/core/requesters/base_session.py +243 -0
- novel_downloader/core/requesters/common_requester/__init__.py +18 -0
- novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
- novel_downloader/core/requesters/common_requester/common_session.py +126 -0
- novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
- novel_downloader/core/savers/__init__.py +20 -0
- novel_downloader/core/savers/base_saver.py +169 -0
- novel_downloader/core/savers/common_saver/__init__.py +13 -0
- novel_downloader/core/savers/common_saver/common_epub.py +232 -0
- novel_downloader/core/savers/common_saver/common_txt.py +176 -0
- novel_downloader/core/savers/common_saver/main_saver.py +86 -0
- novel_downloader/core/savers/epub_utils/__init__.py +27 -0
- novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
- novel_downloader/core/savers/epub_utils/initializer.py +98 -0
- novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
- novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
- novel_downloader/core/savers/qidian_saver.py +22 -0
- novel_downloader/locales/en.json +91 -0
- novel_downloader/locales/zh.json +91 -0
- novel_downloader/resources/config/rules.toml +196 -0
- novel_downloader/resources/config/settings.yaml +73 -0
- novel_downloader/resources/css_styles/main.css +104 -0
- novel_downloader/resources/css_styles/volume-intro.css +56 -0
- novel_downloader/resources/images/volume_border.png +0 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
- novel_downloader/resources/json/replace_word_map.json +4 -0
- novel_downloader/resources/text/blacklist.txt +22 -0
- novel_downloader/utils/__init__.py +0 -0
- novel_downloader/utils/cache.py +24 -0
- novel_downloader/utils/constants.py +158 -0
- novel_downloader/utils/crypto_utils.py +144 -0
- novel_downloader/utils/file_utils/__init__.py +43 -0
- novel_downloader/utils/file_utils/io.py +252 -0
- novel_downloader/utils/file_utils/normalize.py +68 -0
- novel_downloader/utils/file_utils/sanitize.py +77 -0
- novel_downloader/utils/fontocr/__init__.py +23 -0
- novel_downloader/utils/fontocr/ocr_v1.py +304 -0
- novel_downloader/utils/fontocr/ocr_v2.py +658 -0
- novel_downloader/utils/hash_store.py +288 -0
- novel_downloader/utils/hash_utils.py +103 -0
- novel_downloader/utils/i18n.py +41 -0
- novel_downloader/utils/logger.py +104 -0
- novel_downloader/utils/model_loader.py +72 -0
- novel_downloader/utils/network.py +287 -0
- novel_downloader/utils/state.py +156 -0
- novel_downloader/utils/text_utils/__init__.py +27 -0
- novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
- novel_downloader/utils/text_utils/diff_display.py +75 -0
- novel_downloader/utils/text_utils/font_mapping.py +31 -0
- novel_downloader/utils/text_utils/text_cleaning.py +57 -0
- novel_downloader/utils/time_utils/__init__.py +22 -0
- novel_downloader/utils/time_utils/datetime_utils.py +146 -0
- novel_downloader/utils/time_utils/sleep_utils.py +49 -0
- novel_downloader-1.1.0.dist-info/METADATA +157 -0
- novel_downloader-1.1.0.dist-info/RECORD +115 -0
- novel_downloader-1.1.0.dist-info/WHEEL +5 -0
- novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
- novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
- novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
novel_downloader/core/parsers/qidian_parser/session/chapter_router.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.qidian_parser.session.chapter_router
------------------------------------------------------------------

Routing logic for selecting the correct chapter parser for Qidian session pages.
"""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Dict

from ..shared import (
    can_view_chapter,
    html_to_soup,
    is_encrypted,
)
from .chapter_normal import parse_normal_chapter

if TYPE_CHECKING:
    from .main_parser import QidianSessionParser

logger = logging.getLogger(__name__)


def parse_chapter(
    parser: QidianSessionParser,
    html_str: str,
    chapter_id: str,
) -> Dict[str, Any]:
    """
    Extract and return the formatted textual content of the chapter.

    :param parser: Instance of QidianSessionParser.
    :param html_str: Raw HTML content of the chapter page.
    :param chapter_id: Identifier of the chapter being parsed.
    :return: Parsed chapter data as a dict, or an empty dict if not parsable.
    """
    try:
        soup = html_to_soup(html_str)

        if not can_view_chapter(soup):
            logger.warning(
                "[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
            )
            return {}

        if is_encrypted(soup):
            if not parser._decode_font:
                return {}
            try:
                from .chapter_encrypted import parse_encrypted_chapter

                return parse_encrypted_chapter(parser, soup, chapter_id, parser._fuid)
            except ImportError:
                logger.warning(
                    "[Parser] Encrypted chapter '%s' requires extra dependencies.",
                    chapter_id,
                )
                return {}

        return parse_normal_chapter(soup, chapter_id, parser._fuid)
    except Exception as e:
        logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
        return {}
novel_downloader/core/parsers/qidian_parser/session/main_parser.py
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.qidian_parser.session.main_parser
---------------------------------------------------------------

Main parser class for handling Qidian chapters rendered via a session.

This module defines `QidianSessionParser`, a parser implementation that supports
content extracted from dynamically rendered Qidian HTML pages.
"""

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional

from novel_downloader.config.models import ParserConfig
from novel_downloader.core.parsers.base_parser import BaseParser
from novel_downloader.utils.state import state_mgr

from ..shared import (
    is_encrypted,
    parse_book_info,
)
from .chapter_router import parse_chapter

if TYPE_CHECKING:
    from novel_downloader.utils.fontocr import FontOCR


class QidianSessionParser(BaseParser):
    """
    Parser for Qidian site using a session HTML workflow.
    """

    def __init__(self, config: ParserConfig):
        """
        Initialize the QidianSessionParser with the given configuration.

        :param config: ParserConfig object controlling parsing behavior.
        """
        super().__init__(config)

        # Extract and store parser flags from config
        self._decode_font: bool = config.decode_font
        self._save_font_debug: bool = config.save_font_debug

        self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
        self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
        self._font_debug_dir: Optional[Path] = None

        qd_cookies = state_mgr.get_cookies("qidian")
        self._fuid: str = qd_cookies.get("ywguid", "")

        self._font_ocr: Optional[FontOCR] = None
        if self._decode_font:
            from novel_downloader.utils.fontocr import FontOCR

            self._font_ocr = FontOCR(
                cache_dir=self._base_cache_dir,
                use_freq=config.use_freq,
                use_ocr=config.use_ocr,
                use_vec=config.use_vec,
                batch_size=config.batch_size,
                ocr_weight=config.ocr_weight,
                vec_weight=config.vec_weight,
                font_debug=config.save_font_debug,
            )
            self._font_debug_dir = self._base_cache_dir / "font_debug"
            self._font_debug_dir.mkdir(parents=True, exist_ok=True)

    def parse_book_info(self, html: str) -> Dict[str, Any]:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html: Raw HTML of the book info page.
        :return: Parsed metadata and chapter structure as a dictionary.
        """
        return parse_book_info(html)

    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
        """
        :param html_str: Raw HTML of the chapter page.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Parsed chapter content as a dictionary.
        """
        return parse_chapter(self, html_str, chapter_id)

    def is_encrypted(self, html_str: str) -> bool:
        """
        Return True if content is encrypted.

        :param html_str: Raw HTML of the chapter page.
        """
        return is_encrypted(html_str)

    def _init_cache_folders(self) -> None:
        """
        Prepare cache folders for plain/encrypted HTML and font debug data.
        Folders are only created if corresponding debug/save flags are enabled.
        """
        base = self._base_cache_dir

        # Font debug folder
        if self._save_font_debug and self.book_id:
            self._font_debug_dir = base / self.book_id / "font_debug"
            self._font_debug_dir.mkdir(parents=True, exist_ok=True)
        else:
            self._font_debug_dir = None

    def _on_book_id_set(self) -> None:
        self._init_cache_folders()
novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.qidian_parser.session.node_decryptor
------------------------------------------------------------------

Provides QidianNodeDecryptor, which ensures a Node.js environment,
downloads or installs the required JS modules (Fock + decrypt script),
and invokes a Node.js subprocess to decrypt Qidian chapter content.
"""

import json
import logging
import shutil
import subprocess
import uuid
from pathlib import Path
from typing import Union

from novel_downloader.utils.constants import (
    JS_SCRIPT_DIR,
    QD_DECRYPT_SCRIPT_PATH,
)

logger = logging.getLogger(__name__)


class QidianNodeDecryptor:
    """
    A decryptor that uses Node.js plus Qidian's Fock JavaScript module
    to decrypt encrypted chapter payloads.

    On initialization, this class will:
    1. Verify that `node` is on PATH.
    2. Copy our bundled `qidian_decrypt_node.js` into `JS_SCRIPT_DIR`.
    3. Download the remote Fock module JS if not already present.

    Calling `decrypt()` will:
    - Write a temp JSON input file with [ciphertext, chapter_id, fkp, fuid].
    - Spawn `node qidian_decrypt_node.js <in> <out>`.
    - Read and return the decrypted text.
    - Clean up the temp files.
    """

    QIDIAN_FOCK_JS_URL: str = (
        "https://cococdn.qidian.com/coco/s12062024/4819793b.qeooxh.js"
    )
    QIDIAN_FOCK_JS_PATH: Path = JS_SCRIPT_DIR / "4819793b.qeooxh.js"
    QIDIAN_DECRYPT_SCRIPT_FILE: str = "qidian_decrypt_node.js"
    QIDIAN_DECRYPT_SCRIPT_PATH: Path = JS_SCRIPT_DIR / QIDIAN_DECRYPT_SCRIPT_FILE

    def __init__(self) -> None:
        """
        Prepare the script directory and verify that both Node.js
        and the necessary JS files are available.
        """
        self.script_dir: Path = JS_SCRIPT_DIR
        self.script_dir.mkdir(parents=True, exist_ok=True)
        self.script_path: Path = self.QIDIAN_DECRYPT_SCRIPT_PATH
        self._check_environment()

    def _check_environment(self) -> None:
        """
        Ensure Node.js is installed, our decrypt script is copied from
        package resources, and the Fock JS module is downloaded.

        :raises EnvironmentError: if `node` is not on the system PATH.
        """
        # 1) Check Node.js
        if not shutil.which("node"):
            raise EnvironmentError("Node.js is not installed or not in PATH.")

        # 2) Copy bundled decrypt script into place if missing
        if not self.QIDIAN_DECRYPT_SCRIPT_PATH.exists():
            try:
                resource = QD_DECRYPT_SCRIPT_PATH
                shutil.copyfile(str(resource), str(self.QIDIAN_DECRYPT_SCRIPT_PATH))
                logger.info(
                    "[decryptor] Copied decrypt script to %s",
                    self.QIDIAN_DECRYPT_SCRIPT_PATH,
                )
            except Exception as e:
                logger.error("[decryptor] Failed to copy decrypt script: %s", e)
                raise

        # 3) Download the Fock JS module from Qidian CDN if missing
        if not self.QIDIAN_FOCK_JS_PATH.exists():
            from novel_downloader.utils.network import download_js_file

            try:
                download_js_file(
                    self.QIDIAN_FOCK_JS_URL,
                    self.script_dir,
                    on_exist="overwrite",
                )
                logger.info(
                    "[decryptor] Downloaded Fock module to %s", self.QIDIAN_FOCK_JS_PATH
                )
            except Exception as e:
                logger.error("[decryptor] Failed to download Fock JS module: %s", e)
                raise

    def decrypt(
        self,
        ciphertext: Union[str, bytes],
        chapter_id: Union[str, int],
        fkp: str,
        fuid: str,
    ) -> str:
        """
        Decrypt a chapter payload via our Node.js script.

        :param ciphertext: Base64-encoded encrypted content (str or bytes).
        :param chapter_id: The chapter's numeric ID.
        :param fkp: Base64-encoded Fock key param from the page.
        :param fuid: Fock user ID param from the page.
        :return: The decrypted plain-text content.
        :raises RuntimeError: if the Node.js subprocess exits with a non-zero code.
        """
        # Normalize inputs
        cipher_str = (
            ciphertext.decode("utf-8")
            if isinstance(ciphertext, (bytes, bytearray))
            else str(ciphertext)
        )
        chapter_str = str(chapter_id)

        # Create unique temp file names
        task_id = uuid.uuid4().hex
        input_path = self.script_dir / f"input_{task_id}.json"
        output_path = self.script_dir / f"output_{task_id}.txt"

        try:
            # Write arguments as JSON array
            input_path.write_text(
                json.dumps([cipher_str, chapter_str, fkp, fuid]),
                encoding="utf-8",
            )

            logger.debug(
                "[decryptor] Invoking Node.js: node %s %s %s",
                self.script_path.name,
                input_path.name,
                output_path.name,
            )

            proc = subprocess.run(
                ["node", self.script_path.name, input_path.name, output_path.name],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                cwd=str(self.script_dir),
            )

            if proc.returncode != 0:
                raise RuntimeError(f"Node error: {proc.stderr.strip()}")

            # Return decrypted content
            return output_path.read_text(encoding="utf-8").strip()

        finally:
            # Clean up temp files
            input_path.unlink(missing_ok=True)
            output_path.unlink(missing_ok=True)
novel_downloader/core/parsers/qidian_parser/shared/__init__.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.qidian_parser.shared
--------------------------------------------------

Shared parsing utilities for Qidian parser components.

This subpackage provides common functions used across
different Qidian parsing strategies. It encapsulates logic for:

- Parsing the SSR-rendered page context and chapter metadata.
- Determining access control and encryption status of chapters.
- Basic HTML preprocessing and fallback parsing behavior.
- Extracting structured book info from the main book page.
"""

from .book_info_parser import parse_book_info
from .helpers import (
    can_view_chapter,
    extract_chapter_info,
    find_ssr_page_context,
    html_to_soup,
    is_encrypted,
    is_vip,
    vip_status,
)

__all__ = [
    "parse_book_info",
    "html_to_soup",
    "is_vip",
    "can_view_chapter",
    "is_encrypted",
    "vip_status",
    "find_ssr_page_context",
    "extract_chapter_info",
]
novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.qidian_parser.shared.book_info_parser
-------------------------------------------------------------------

This module provides parsing of Qidian book info pages.

It extracts metadata such as title, author, cover URL, update
time, status, word count, summary, and volume-chapter structure.
"""

import logging
import re
from typing import Any, Dict

from bs4.element import Tag

from .helpers import html_to_soup

logger = logging.getLogger(__name__)


def _chapter_url_to_id(url: str) -> str:
    """
    Extract chapterId as the last non-empty segment of the URL.
    """
    return url.rstrip("/").split("/")[-1]


def _get_volume_name(vol_div: Tag) -> str:
    """
    Extract the volume title from a <div class="volume"> element.
    """
    h3 = vol_div.select_one("h3")
    if not h3:
        return ""
    for a in h3.find_all("a"):
        a.decompose()
    text: str = h3.get_text(strip=True)
    return text.split(chr(183))[0].strip()


def parse_book_info(html_str: str) -> Dict[str, Any]:
    """
    Extract metadata: title, author, cover_url, update_time, status,
    word_count, summary, and volumes with chapters.

    :param html_str: Raw HTML of the book info page.
    :return: A dict containing book metadata.
    """
    info: Dict[str, Any] = {}
    try:
        soup = html_to_soup(html_str)
        info["book_name"] = soup.select_one("em#bookName").get_text(strip=True)
        info["author"] = soup.select_one("a.writer").get_text(strip=True)
        info["cover_url"] = soup.select_one("div.book-img img")["src"].strip()
        info["update_time"] = (
            soup.select_one("span.book-update-time")
            .get_text(strip=True)
            .replace("更新时间", "")
            .strip()
        )
        info["serial_status"] = soup.select_one("span.blue").get_text(strip=True)
        # word count via regex
        match = re.search(
            r"<em>([\d.]+)</em>\s*<cite>(.*?)字</cite>",
            html_str,
        )
        if match:
            info["word_count"] = match.group(1) + match.group(2) + "字"
        else:
            info["word_count"] = "Unknown"
        info["summary"] = soup.select_one("div.book-intro p").get_text(
            separator="\n", strip=True
        )
        # volumes
        vols = []
        for vol_div in soup.select("div.volume-wrap div.volume"):
            name = _get_volume_name(vol_div)
            chaps = []
            for li in vol_div.select("li"):
                a = li.select_one("a")
                chaps.append(
                    {
                        "title": a.get_text(strip=True),
                        "url": a["href"].strip(),
                        "chapterId": _chapter_url_to_id(a["href"]),
                    }
                )
            vols.append({"volume_name": name, "chapters": chaps})
        info["volumes"] = vols
    except Exception as e:
        logger.warning("[Parser] Error parsing book info: %s", e)
    return info
novel_downloader/core/parsers/qidian_parser/shared/helpers.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.qidian_parser.shared.helpers
----------------------------------------------------------

Shared utility functions for parsing Qidian browser-rendered pages.

This module provides reusable helpers to:
- Convert HTML into BeautifulSoup objects with fallback.
- Extract SSR-rendered JSON page context and structured chapter metadata.
- Identify VIP chapters, encrypted content, and viewability conditions.
"""

import json
import logging
from typing import Any, Dict, Union

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def html_to_soup(html_str: str) -> BeautifulSoup:
    """
    Convert an HTML string to a BeautifulSoup object with fallback.

    :param html_str: Raw HTML string.
    :return: Parsed BeautifulSoup object.
    """
    try:
        return BeautifulSoup(html_str, "lxml")
    except Exception as e:
        logger.warning("[Parser] lxml parse failed, falling back: %s", e)
        return BeautifulSoup(html_str, "html.parser")


def is_vip(html_str: str) -> bool:
    """
    Return True if page indicates VIP-only content.

    :param html_str: Raw HTML string.
    """
    markers = ["这是VIP章节", "需要订阅", "订阅后才能阅读"]
    return any(m in html_str for m in markers)


def vip_status(soup: BeautifulSoup) -> bool:
    """
    :param soup: Parsed BeautifulSoup object of the HTML page.
    :return: True if VIP, False otherwise.
    """
    ssr_data = find_ssr_page_context(soup)
    chapter_info = extract_chapter_info(ssr_data)
    vip_flag = chapter_info.get("vipStatus", 0)
    fens_flag = chapter_info.get("fEnS", 0)
    return bool(vip_flag == 1 and fens_flag != 0)


def can_view_chapter(soup: BeautifulSoup) -> bool:
    """
    Return True if the chapter is viewable by the current user.

    A chapter is not viewable if it is marked as VIP
    and has not been purchased.

    :param soup: Parsed BeautifulSoup object of the HTML page.
    :return: True if viewable, False otherwise.
    """
    ssr_data = find_ssr_page_context(soup)
    chapter_info = extract_chapter_info(ssr_data)

    is_buy = chapter_info.get("isBuy", 0)
    vip_status = chapter_info.get("vipStatus", 0)

    return not (vip_status == 1 and is_buy == 0)


def is_encrypted(content: Union[str, BeautifulSoup]) -> bool:
    """
    Return True if content is encrypted.

    Chapter Encryption Status (cES):
    - 0: content is plain text
    - 2: content is font-encrypted

    :param content: HTML content, either as a raw string or a BeautifulSoup object.
    :return: True if encrypted marker is found, else False.
    """
    # main = soup.select_one("div#app div#reader-content main")
    # return bool(main and "r-font-encrypt" in main.get("class", []))
    # Normalize to BeautifulSoup
    soup = html_to_soup(content) if isinstance(content, str) else content

    ssr_data = find_ssr_page_context(soup)
    chapter_info = extract_chapter_info(ssr_data)
    return int(chapter_info.get("cES", 0)) == 2


def find_ssr_page_context(soup: BeautifulSoup) -> Dict[str, Any]:
    """
    Extract SSR JSON from <script id="vite-plugin-ssr_pageContext">.
    """
    try:
        tag = soup.find("script", id="vite-plugin-ssr_pageContext")
        if tag and tag.string:
            data: Dict[str, Any] = json.loads(tag.string.strip())
            return data
    except Exception as e:
        logger.warning("[Parser] SSR JSON parse error: %s", e)
    return {}


def extract_chapter_info(ssr_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract the 'chapterInfo' dictionary from the SSR page context.

    This handles nested key access and returns an empty dict if missing.

    :param ssr_data: The full SSR data object from find_ssr_page_context().
    :return: A dict with chapter metadata such as chapterName, authorSay, etc.
    """
    try:
        page_context = ssr_data.get("pageContext", {})
        page_props = page_context.get("pageProps", {})
        page_data = page_props.get("pageData", {})
        chapter_info = page_data.get("chapterInfo", {})

        assert isinstance(chapter_info, dict)
        return chapter_info
    except Exception as e:
        logger.warning("[Parser] Failed to extract chapterInfo: %s", e)
        return {}
novel_downloader/core/requesters/__init__.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.requesters
--------------------------------

This package provides requester implementations for different novel platforms.
Each submodule corresponds to a specific site and encapsulates the logic needed
to perform network interactions, such as logging in, sending requests,
or interacting with browser/session-based sources.

Subpackages:
- common_requester: Handles all common-site requesting logic.
- qidian_requester: Handles all Qidian-related requesting logic.
"""

from .common_requester import (
    CommonAsyncSession,
    CommonSession,
)
from .qidian_requester import (
    QidianBrowser,
    QidianSession,
)

__all__ = [
    "CommonAsyncSession",
    "CommonSession",
    "QidianBrowser",
    "QidianSession",
]