novel-downloader 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +132 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +153 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +173 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +22 -0
  15. novel_downloader/core/downloaders/base_async_downloader.py +157 -0
  16. novel_downloader/core/downloaders/base_downloader.py +187 -0
  17. novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
  18. novel_downloader/core/downloaders/common_downloader.py +191 -0
  19. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  20. novel_downloader/core/factory/__init__.py +33 -0
  21. novel_downloader/core/factory/downloader_factory.py +149 -0
  22. novel_downloader/core/factory/parser_factory.py +62 -0
  23. novel_downloader/core/factory/requester_factory.py +106 -0
  24. novel_downloader/core/factory/saver_factory.py +49 -0
  25. novel_downloader/core/interfaces/__init__.py +32 -0
  26. novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
  27. novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
  28. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  29. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  30. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  31. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  32. novel_downloader/core/parsers/__init__.py +28 -0
  33. novel_downloader/core/parsers/base_parser.py +96 -0
  34. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  35. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  36. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  37. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  39. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  40. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  41. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  42. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  43. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  44. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  45. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  46. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  47. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  48. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  49. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  50. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  51. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  52. novel_downloader/core/requesters/__init__.py +31 -0
  53. novel_downloader/core/requesters/base_async_session.py +297 -0
  54. novel_downloader/core/requesters/base_browser.py +210 -0
  55. novel_downloader/core/requesters/base_session.py +243 -0
  56. novel_downloader/core/requesters/common_requester/__init__.py +18 -0
  57. novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
  58. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  59. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  60. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  61. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  62. novel_downloader/core/savers/__init__.py +20 -0
  63. novel_downloader/core/savers/base_saver.py +169 -0
  64. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  65. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  66. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  67. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  68. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  69. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  70. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  71. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  72. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  73. novel_downloader/core/savers/qidian_saver.py +22 -0
  74. novel_downloader/locales/en.json +91 -0
  75. novel_downloader/locales/zh.json +91 -0
  76. novel_downloader/resources/config/rules.toml +196 -0
  77. novel_downloader/resources/config/settings.yaml +73 -0
  78. novel_downloader/resources/css_styles/main.css +104 -0
  79. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  80. novel_downloader/resources/images/volume_border.png +0 -0
  81. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  82. novel_downloader/resources/json/replace_word_map.json +4 -0
  83. novel_downloader/resources/text/blacklist.txt +22 -0
  84. novel_downloader/utils/__init__.py +0 -0
  85. novel_downloader/utils/cache.py +24 -0
  86. novel_downloader/utils/constants.py +158 -0
  87. novel_downloader/utils/crypto_utils.py +144 -0
  88. novel_downloader/utils/file_utils/__init__.py +43 -0
  89. novel_downloader/utils/file_utils/io.py +252 -0
  90. novel_downloader/utils/file_utils/normalize.py +68 -0
  91. novel_downloader/utils/file_utils/sanitize.py +77 -0
  92. novel_downloader/utils/fontocr/__init__.py +23 -0
  93. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  94. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  95. novel_downloader/utils/hash_store.py +288 -0
  96. novel_downloader/utils/hash_utils.py +103 -0
  97. novel_downloader/utils/i18n.py +41 -0
  98. novel_downloader/utils/logger.py +104 -0
  99. novel_downloader/utils/model_loader.py +72 -0
  100. novel_downloader/utils/network.py +287 -0
  101. novel_downloader/utils/state.py +156 -0
  102. novel_downloader/utils/text_utils/__init__.py +27 -0
  103. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  104. novel_downloader/utils/text_utils/diff_display.py +75 -0
  105. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  106. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  107. novel_downloader/utils/time_utils/__init__.py +22 -0
  108. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  109. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  110. novel_downloader-1.1.0.dist-info/METADATA +157 -0
  111. novel_downloader-1.1.0.dist-info/RECORD +115 -0
  112. novel_downloader-1.1.0.dist-info/WHEEL +5 -0
  113. novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
  114. novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
  115. novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.session.chapter_router
5
+ ------------------------------------------------------------------
6
+
7
+ Routing logic for selecting the correct chapter parser for Qidian session pages.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from typing import TYPE_CHECKING, Any, Dict
14
+
15
+ from ..shared import (
16
+ can_view_chapter,
17
+ html_to_soup,
18
+ is_encrypted,
19
+ )
20
+ from .chapter_normal import parse_normal_chapter
21
+
22
+ if TYPE_CHECKING:
23
+ from .main_parser import QidianSessionParser
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def parse_chapter(
    parser: QidianSessionParser,
    html_str: str,
    chapter_id: str,
) -> Dict[str, Any]:
    """
    Route a Qidian chapter page to the matching chapter parser.

    The page is parsed once into a soup, then dispatched:

    - not viewable (see ``can_view_chapter``): nothing is parsed;
    - encrypted (see ``is_encrypted``): handled by ``parse_encrypted_chapter``,
      but only when the parser has font decoding enabled;
    - otherwise: handled by ``parse_normal_chapter``.

    :param parser: Instance of QidianSessionParser; its ``_decode_font`` and
                   ``_fuid`` attributes are read here.
    :param html_str: Raw HTML content of the chapter page.
    :param chapter_id: Identifier of the chapter being parsed.
    :return: The dict produced by the selected chapter parser, or an empty
             dict when the chapter is inaccessible, is encrypted while font
             decoding is disabled, needs missing optional dependencies, or
             raises any error during parsing.
    """
    try:
        soup = html_to_soup(html_str)

        if not can_view_chapter(soup):
            logger.warning(
                "[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
            )
            return {}

        if is_encrypted(soup):
            if not parser._decode_font:
                # Previously this path returned silently, leaving no trace of
                # why the chapter came back empty.
                logger.warning(
                    "[Parser] Encrypted chapter '%s' skipped: "
                    "font decoding is disabled.",
                    chapter_id,
                )
                return {}
            try:
                # Imported lazily so the optional decoding dependencies are
                # only required when an encrypted chapter is actually met.
                from .chapter_encrypted import parse_encrypted_chapter

                return parse_encrypted_chapter(parser, soup, chapter_id, parser._fuid)
            except ImportError:
                logger.warning(
                    "[Parser] Encrypted chapter '%s' requires extra dependencies.",
                    chapter_id,
                )
                return {}

        return parse_normal_chapter(soup, chapter_id, parser._fuid)
    except Exception as e:
        # Best-effort boundary: a single bad chapter must not abort the caller.
        logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
    return {}
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.session.main_parser
5
+ ---------------------------------------------------------------
6
+
7
+ Main parser class for handling Qidian chapters rendered via a session.
8
+
9
+ This module defines `QidianSessionParser`, a parser implementation that supports
10
+ content extracted from dynamically rendered Qidian HTML pages.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+ from typing import TYPE_CHECKING, Any, Dict, Optional
17
+
18
+ from novel_downloader.config.models import ParserConfig
19
+ from novel_downloader.core.parsers.base_parser import BaseParser
20
+ from novel_downloader.utils.state import state_mgr
21
+
22
+ from ..shared import (
23
+ is_encrypted,
24
+ parse_book_info,
25
+ )
26
+ from .chapter_router import parse_chapter
27
+
28
+ if TYPE_CHECKING:
29
+ from novel_downloader.utils.fontocr import FontOCR
30
+
31
+
32
class QidianSessionParser(BaseParser):
    """
    Parser for Qidian site using a session HTML workflow.
    """

    def __init__(self, config: ParserConfig):
        """
        Initialize the QidianSessionParser with the given configuration.

        :param config: ParserConfig object. This class reads ``decode_font``
                       and ``save_font_debug`` directly, and forwards
                       ``use_freq``, ``use_ocr``, ``use_vec``, ``batch_size``,
                       ``ocr_weight``, ``vec_weight`` and ``save_font_debug``
                       to FontOCR when font decoding is enabled.
        """
        super().__init__(config)

        # Extract and store parser flags from config
        self._decode_font: bool = config.decode_font
        self._save_font_debug: bool = config.save_font_debug

        # NOTE(review): _base_cache_dir is presumably set by BaseParser.
        self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
        self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
        self._font_debug_dir: Optional[Path] = None

        # The "ywguid" cookie value is passed on to the chapter parsers as fuid.
        qd_cookies = state_mgr.get_cookies("qidian")
        self._fuid: str = qd_cookies.get("ywguid", "")

        self._font_ocr: Optional[FontOCR] = None
        if self._decode_font:
            # Imported lazily: only needed when font decoding is requested.
            from novel_downloader.utils.fontocr import FontOCR

            self._font_ocr = FontOCR(
                cache_dir=self._base_cache_dir,
                use_freq=config.use_freq,
                use_ocr=config.use_ocr,
                use_vec=config.use_vec,
                batch_size=config.batch_size,
                ocr_weight=config.ocr_weight,
                vec_weight=config.vec_weight,
                font_debug=config.save_font_debug,
            )
            # Only create the debug folder when debugging is enabled, matching
            # _init_cache_folders (which resets it to None otherwise).
            if self._save_font_debug:
                self._font_debug_dir = self._base_cache_dir / "font_debug"
                self._font_debug_dir.mkdir(parents=True, exist_ok=True)

    def parse_book_info(self, html: str) -> Dict[str, Any]:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html: Raw HTML of the book info page.
        :return: Parsed metadata and chapter structure as a dictionary.
        """
        return parse_book_info(html)

    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
        """
        Parse a chapter page by delegating to the session chapter router.

        :param html_str: Raw HTML of the chapter page.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: The dictionary returned by the router (empty when the
                 chapter could not be parsed).
        """
        return parse_chapter(self, html_str, chapter_id)

    def is_encrypted(self, html_str: str) -> bool:
        """
        Return True if content is encrypted.

        :param html_str: Raw HTML of the chapter page.
        """
        return is_encrypted(html_str)

    def _init_cache_folders(self) -> None:
        """
        Point the font debug folder at the current book.

        The folder is only created when ``save_font_debug`` is enabled and a
        book id is set; otherwise the debug directory is cleared to None.
        """
        base = self._base_cache_dir

        # Font debug folder
        if self._save_font_debug and self.book_id:
            self._font_debug_dir = base / self.book_id / "font_debug"
            self._font_debug_dir.mkdir(parents=True, exist_ok=True)
        else:
            self._font_debug_dir = None

    def _on_book_id_set(self) -> None:
        """
        Refresh the per-book cache folders.

        NOTE(review): presumably a BaseParser hook fired when book_id
        changes - confirm against BaseParser.
        """
        self._init_cache_folders()
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.session.node_decryptor
5
+ ------------------------------------------------------------------
6
+
7
+ Provides QidianNodeDecryptor, which ensures a Node.js environment,
8
+ downloads or installs the required JS modules (Fock + decrypt script),
9
+ and invokes a Node.js subprocess to decrypt Qidian chapter content.
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import shutil
15
+ import subprocess
16
+ import uuid
17
+ from pathlib import Path
18
+ from typing import Union
19
+
20
+ from novel_downloader.utils.constants import (
21
+ JS_SCRIPT_DIR,
22
+ QD_DECRYPT_SCRIPT_PATH,
23
+ )
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class QidianNodeDecryptor:
    """
    A decryptor that uses Node.js plus Qidian's Fock JavaScript module
    to decrypt encrypted chapter payloads.

    On initialization, this class will:
      1. Verify that `node` is on PATH.
      2. Copy our bundled `qidian_decrypt_node.js` into `JS_SCRIPT_DIR`.
      3. Download the remote Fock module JS if not already present.

    Calling `decrypt()` will:
      - Write a temp JSON input file with [ciphertext, chapter_id, fkp, fuid].
      - Spawn `node qidian_decrypt_node.js <in> <out>`.
      - Read and return the decrypted text.
      - Clean up the temp files.
    """

    QIDIAN_FOCK_JS_URL: str = (
        "https://cococdn.qidian.com/coco/s12062024/4819793b.qeooxh.js"
    )
    QIDIAN_FOCK_JS_PATH: Path = JS_SCRIPT_DIR / "4819793b.qeooxh.js"
    QIDIAN_DECRYPT_SCRIPT_FILE: str = "qidian_decrypt_node.js"
    QIDIAN_DECRYPT_SCRIPT_PATH: Path = JS_SCRIPT_DIR / QIDIAN_DECRYPT_SCRIPT_FILE

    def __init__(self) -> None:
        """
        Prepare the script directory and verify that both Node.js
        and the necessary JS files are available.

        :raises EnvironmentError: if `node` is not on the system PATH.
        """
        self.script_dir: Path = JS_SCRIPT_DIR
        self.script_dir.mkdir(parents=True, exist_ok=True)
        self.script_path: Path = self.QIDIAN_DECRYPT_SCRIPT_PATH
        self._check_environment()

    def _check_environment(self) -> None:
        """
        Ensure Node.js is installed, our decrypt script is copied from
        package resources, and the Fock JS module is downloaded.

        Errors from the copy or the download are logged and re-raised.

        :raises EnvironmentError: if `node` is not on the system PATH.
        """
        # 1) Check Node.js
        if not shutil.which("node"):
            raise EnvironmentError("Node.js is not installed or not in PATH.")

        # 2) Copy bundled decrypt script into place if missing
        if not self.QIDIAN_DECRYPT_SCRIPT_PATH.exists():
            try:
                resource = QD_DECRYPT_SCRIPT_PATH
                shutil.copyfile(str(resource), str(self.QIDIAN_DECRYPT_SCRIPT_PATH))
                logger.info(
                    "[decryptor] Copied decrypt script to %s",
                    self.QIDIAN_DECRYPT_SCRIPT_PATH,
                )
            except Exception as e:
                logger.error("[decryptor] Failed to copy decrypt script: %s", e)
                raise

        # 3) Download the Fock JS module from Qidian CDN if missing
        if not self.QIDIAN_FOCK_JS_PATH.exists():
            from novel_downloader.utils.network import download_js_file

            try:
                download_js_file(
                    self.QIDIAN_FOCK_JS_URL,
                    self.script_dir,
                    on_exist="overwrite",
                )
                logger.info(
                    "[decryptor] Downloaded Fock module to %s", self.QIDIAN_FOCK_JS_PATH
                )
            except Exception as e:
                logger.error("[decryptor] Failed to download Fock JS module: %s", e)
                raise

    def decrypt(
        self,
        ciphertext: Union[str, bytes],
        chapter_id: Union[str, int],
        fkp: str,
        fuid: str,
    ) -> str:
        """
        Decrypt a chapter payload via our Node.js script.

        :param ciphertext: Base64-encoded encrypted content (str or bytes).
        :param chapter_id: The chapter's numeric ID.
        :param fkp: Base64-encoded Fock key param from the page.
        :param fuid: Fock user ID param from the page.
        :return: The decrypted plain-text content, stripped of surrounding
                 whitespace.
        :raises RuntimeError: if the Node.js subprocess exits with a non-zero code.
        :raises FileNotFoundError: if Node.js exits with code 0 but the
                                   output file was never written.
        """
        # Normalize inputs
        cipher_str = (
            ciphertext.decode("utf-8")
            if isinstance(ciphertext, (bytes, bytearray))
            else str(ciphertext)
        )
        chapter_str = str(chapter_id)

        # Unique names so concurrent calls sharing script_dir do not collide.
        task_id = uuid.uuid4().hex
        input_path = self.script_dir / f"input_{task_id}.json"
        output_path = self.script_dir / f"output_{task_id}.txt"

        try:
            # Write arguments as JSON array
            input_path.write_text(
                json.dumps([cipher_str, chapter_str, fkp, fuid]),
                encoding="utf-8",
            )

            logger.debug(
                "[decryptor] Invoking Node.js: node %s %s %s",
                self.script_path.name,
                input_path.name,
                output_path.name,
            )

            # Bare file names are used because cwd is the script directory.
            # Output is decoded explicitly as UTF-8 (with replacement) rather
            # than with the locale encoding, so a non-UTF-8 locale cannot
            # raise UnicodeDecodeError and mask the real Node.js error.
            proc = subprocess.run(
                ["node", self.script_path.name, input_path.name, output_path.name],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf-8",
                errors="replace",
                cwd=str(self.script_dir),
            )

            if proc.returncode != 0:
                raise RuntimeError(f"Node error: {proc.stderr.strip()}")

            # Return decrypted content
            return output_path.read_text(encoding="utf-8").strip()

        finally:
            # Clean up temp files
            input_path.unlink(missing_ok=True)
            output_path.unlink(missing_ok=True)
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.shared
5
+ --------------------------------------------------
6
+
7
+ Shared parsing utilities for Qidian parser components.
8
+
9
+ This subpackage provides common functions used across
10
+ different Qidian parsing strategies. It encapsulates logic for:
11
+
12
+ - Parsing the SSR-rendered page context and chapter metadata.
13
+ - Determining access control and encryption status of chapters.
14
+ - Basic HTML preprocessing and fallback parsing behavior.
15
+ - Extracting structured book info from the main book page.
16
+ """
17
+
18
+ from .book_info_parser import parse_book_info
19
+ from .helpers import (
20
+ can_view_chapter,
21
+ extract_chapter_info,
22
+ find_ssr_page_context,
23
+ html_to_soup,
24
+ is_encrypted,
25
+ is_vip,
26
+ vip_status,
27
+ )
28
+
29
+ __all__ = [
30
+ "parse_book_info",
31
+ "html_to_soup",
32
+ "is_vip",
33
+ "can_view_chapter",
34
+ "is_encrypted",
35
+ "vip_status",
36
+ "find_ssr_page_context",
37
+ "extract_chapter_info",
38
+ ]
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.shared.book_info_parser
5
+ -------------------------------------------------------------------
6
+
7
+ This module provides parsing of Qidian book info pages.
8
+
9
+ It extracts metadata such as title, author, cover URL, update
10
+ time, status, word count, summary, and volume-chapter structure.
11
+ """
12
+
13
+ import logging
14
+ import re
15
+ from typing import Any, Dict
16
+
17
+ from bs4.element import Tag
18
+
19
+ from .helpers import html_to_soup
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def _chapter_url_to_id(url: str) -> str:
    """
    Return the final path segment of *url*, ignoring any trailing slashes.

    A URL without any slash is returned unchanged.
    """
    trimmed = url.rstrip("/")
    _, _, last_segment = trimmed.rpartition("/")
    return last_segment
29
+
30
+
31
def _get_volume_name(vol_div: Tag) -> str:
    """
    Return the volume title found in the ``<h3>`` of a volume element.

    Every ``<a>`` inside the heading is removed from the tree first, so this
    call mutates *vol_div*. The remaining text is cut at the first middle
    dot (U+00B7) and stripped. An empty string is returned when there is no
    ``<h3>``.
    """
    heading = vol_div.select_one("h3")
    if heading:
        # Drop links nested in the heading so only the title text remains.
        for anchor in heading.find_all("a"):
            anchor.decompose()
        title: str = heading.get_text(strip=True)
        before_dot, _, _ = title.partition(chr(183))
        return before_dot.strip()
    return ""
42
+
43
+
44
def parse_book_info(html_str: str) -> Dict[str, Any]:
    """
    Extract metadata: title, author, cover_url, update_time, status,
    word_count, summary, and volumes with chapters.

    Fields are filled in order. If any step raises, a warning is logged and
    the dict is returned with only the keys collected so far.

    :param html_str: Raw HTML of the book info page.
    :return: A dict containing book metadata. ``volumes`` is a list of
             ``{"volume_name", "chapters"}`` dicts, each chapter being
             ``{"title", "url", "chapterId"}``.
    """
    info: Dict[str, Any] = {}
    try:
        soup = html_to_soup(html_str)
        info["book_name"] = soup.select_one("em#bookName").get_text(strip=True)
        info["author"] = soup.select_one("a.writer").get_text(strip=True)
        info["cover_url"] = soup.select_one("div.book-img img")["src"].strip()
        info["update_time"] = (
            soup.select_one("span.book-update-time")
            .get_text(strip=True)
            .replace("更新时间", "")
            .strip()
        )
        info["serial_status"] = soup.select_one("span.blue").get_text(strip=True)
        # word count via regex on the raw HTML (number + unit before "字")
        match = re.search(
            r"<em>([\d.]+)</em>\s*<cite>(.*?)字</cite>",
            html_str,
        )
        if match:
            info["word_count"] = match.group(1) + match.group(2) + "字"
        else:
            info["word_count"] = "Unknown"
        info["summary"] = soup.select_one("div.book-intro p").get_text(
            separator="\n", strip=True
        )
        # volumes
        vols = []
        for vol_div in soup.select("div.volume-wrap div.volume"):
            name = _get_volume_name(vol_div)
            chaps = []
            for li in vol_div.select("li"):
                a = li.select_one("a")
                href = (a.get("href") or "").strip() if a else ""
                if not href:
                    # An <li> without a usable link is not a chapter; skip it
                    # instead of letting it abort the whole volume list.
                    continue
                chaps.append(
                    {
                        "title": a.get_text(strip=True),
                        "url": href,
                        # Derived from the stripped href so stray whitespace
                        # cannot leak into the id.
                        "chapterId": _chapter_url_to_id(href),
                    }
                )
            vols.append({"volume_name": name, "chapters": chaps})
        info["volumes"] = vols
    except Exception as e:
        logger.warning("[Parser] Error parsing book info: %s", e)
    return info
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.parsers.qidian_parser.shared.helpers
5
+ ----------------------------------------------------------
6
+
7
+ Shared utility functions for parsing Qidian browser-rendered pages.
8
+
9
+ This module provides reusable helpers to:
10
+ - Convert HTML into BeautifulSoup objects with fallback.
11
+ - Extract SSR-rendered JSON page context and structured chapter metadata.
12
+ - Identify VIP chapters, encrypted content, and viewability conditions.
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ from typing import Any, Dict, Union
18
+
19
+ from bs4 import BeautifulSoup
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def html_to_soup(html_str: str) -> BeautifulSoup:
    """
    Build a BeautifulSoup tree from an HTML string.

    The ``lxml`` parser is tried first; if constructing the soup with it
    raises, a warning is logged and ``html.parser`` is used instead.

    :param html_str: Raw HTML string.
    :return: Parsed BeautifulSoup object.
    """
    try:
        soup = BeautifulSoup(html_str, "lxml")
    except Exception as e:
        logger.warning("[Parser] lxml parse failed, falling back: %s", e)
        soup = BeautifulSoup(html_str, "html.parser")
    return soup
36
+
37
+
38
def is_vip(html_str: str) -> bool:
    """
    Tell whether the raw page text carries one of the known VIP notices.

    This is a plain substring search on the HTML; no parsing is done.

    :param html_str: Raw HTML string.
    :return: True if any VIP marker occurs in the text, False otherwise.
    """
    for marker in ("这是VIP章节", "需要订阅", "订阅后才能阅读"):
        if marker in html_str:
            return True
    return False
46
+
47
+
48
def vip_status(soup: BeautifulSoup) -> bool:
    """
    Report the VIP state recorded in the page's SSR chapter info.

    The result is True only when ``vipStatus`` equals 1 and ``fEnS`` is not 0.
    Both keys default to 0 when absent.
    NOTE(review): the meaning of ``fEnS`` is not evident from this module.

    :param soup: Parsed BeautifulSoup object of the HTML page.
    :return: True if VIP, False otherwise.
    """
    info = extract_chapter_info(find_ssr_page_context(soup))
    marked_vip = info.get("vipStatus", 0) == 1
    if not marked_vip:
        return False
    return bool(info.get("fEnS", 0) != 0)
58
+
59
+
60
def can_view_chapter(soup: BeautifulSoup) -> bool:
    """
    Decide whether the current user can read the chapter.

    The chapter is treated as locked only when its SSR chapter info has
    ``vipStatus`` equal to 1 and ``isBuy`` equal to 0. Both keys default to 0
    when absent, so a page without chapter info counts as viewable.

    :param soup: Parsed BeautifulSoup object of the HTML page.
    :return: True if viewable, False otherwise.
    """
    info = extract_chapter_info(find_ssr_page_context(soup))

    purchased_flag = info.get("isBuy", 0)
    vip_flag = info.get("vipStatus", 0)

    if vip_flag == 1 and purchased_flag == 0:
        return False
    return True
77
+
78
+
79
def is_encrypted(content: Union[str, BeautifulSoup]) -> bool:
    """
    Tell whether the chapter page is marked as encrypted.

    Chapter Encryption Status (cES), read from the SSR chapter info:
      - 0: content is plain text
      - 2: font encryption

    A missing ``cES`` is treated as 0.

    :param content: HTML content, either as a raw string or a BeautifulSoup object.
    :return: True if ``cES`` converts to the integer 2, else False.
    """
    # Accept both raw HTML and an already parsed tree.
    if isinstance(content, str):
        soup = html_to_soup(content)
    else:
        soup = content

    encryption_status = extract_chapter_info(find_ssr_page_context(soup)).get(
        "cES", 0
    )
    return int(encryption_status) == 2
98
+
99
+
100
def find_ssr_page_context(soup: BeautifulSoup) -> Dict[str, Any]:
    """
    Load the JSON held in ``<script id="vite-plugin-ssr_pageContext">``.

    :param soup: Parsed BeautifulSoup object of the HTML page.
    :return: The decoded JSON value, or an empty dict when the script tag is
             missing, has no string content, or cannot be decoded (a warning
             is logged in the last case).
    """
    try:
        tag = soup.find("script", id="vite-plugin-ssr_pageContext")
        raw = tag.string if tag else None
        if not raw:
            return {}
        payload: Dict[str, Any] = json.loads(raw.strip())
        return payload
    except Exception as e:
        logger.warning("[Parser] SSR JSON parse error: %s", e)
        return {}
112
+
113
+
114
def extract_chapter_info(ssr_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract the 'chapterInfo' dictionary from the SSR page context.

    Walks ``pageContext -> pageProps -> pageData -> chapterInfo``. A missing
    key at any level yields an empty dict. A non-dict value anywhere on the
    path (including ``chapterInfo`` itself) logs a warning and also yields an
    empty dict.

    :param ssr_data: The full SSR data object from find_ssr_page_context().
    :return: The ``chapterInfo`` dict, or an empty dict if unavailable.
    """
    node: Any = ssr_data
    try:
        for key in ("pageContext", "pageProps", "pageData", "chapterInfo"):
            node = node.get(key, {})
    except (AttributeError, TypeError) as e:
        # An intermediate level was not a mapping (e.g. JSON null or a list).
        logger.warning("[Parser] Failed to extract chapterInfo: %s", e)
        return {}

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let a non-dict value reach callers that use `.get`.
    if not isinstance(node, dict):
        logger.warning(
            "[Parser] Failed to extract chapterInfo: expected dict, got %s",
            type(node).__name__,
        )
        return {}
    return node
@@ -0,0 +1,31 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.requesters
5
+ --------------------------------
6
+
7
+ This package provides requester implementations for different novel platforms.
8
+ Each submodule corresponds to a specific site and encapsulates the logic needed
9
+ to perform network interactions, such as logging in, sending requests,
10
+ or interacting with browser/session-based sources.
11
+
12
+ Subpackages:
13
+ - common_requester: Handles all common-site requesting logic.
14
+ - qidian_requester: Handles all Qidian-related requesting logic.
15
+ """
16
+
17
+ from .common_requester import (
18
+ CommonAsyncSession,
19
+ CommonSession,
20
+ )
21
+ from .qidian_requester import (
22
+ QidianBrowser,
23
+ QidianSession,
24
+ )
25
+
26
+ __all__ = [
27
+ "CommonAsyncSession",
28
+ "CommonSession",
29
+ "QidianBrowser",
30
+ "QidianSession",
31
+ ]