novel-downloader 1.3.2__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/clean.py +97 -78
  3. novel_downloader/cli/config.py +177 -0
  4. novel_downloader/cli/download.py +132 -87
  5. novel_downloader/cli/export.py +77 -0
  6. novel_downloader/cli/main.py +21 -28
  7. novel_downloader/config/__init__.py +1 -25
  8. novel_downloader/config/adapter.py +32 -31
  9. novel_downloader/config/loader.py +3 -3
  10. novel_downloader/config/site_rules.py +1 -2
  11. novel_downloader/core/__init__.py +3 -6
  12. novel_downloader/core/downloaders/__init__.py +10 -13
  13. novel_downloader/core/downloaders/base.py +233 -0
  14. novel_downloader/core/downloaders/biquge.py +27 -0
  15. novel_downloader/core/downloaders/common.py +414 -0
  16. novel_downloader/core/downloaders/esjzone.py +27 -0
  17. novel_downloader/core/downloaders/linovelib.py +27 -0
  18. novel_downloader/core/downloaders/qianbi.py +27 -0
  19. novel_downloader/core/downloaders/qidian.py +352 -0
  20. novel_downloader/core/downloaders/sfacg.py +27 -0
  21. novel_downloader/core/downloaders/yamibo.py +27 -0
  22. novel_downloader/core/exporters/__init__.py +37 -0
  23. novel_downloader/core/{savers → exporters}/base.py +73 -44
  24. novel_downloader/core/exporters/biquge.py +25 -0
  25. novel_downloader/core/exporters/common/__init__.py +12 -0
  26. novel_downloader/core/{savers → exporters}/common/epub.py +40 -52
  27. novel_downloader/core/{savers/common/main_saver.py → exporters/common/main_exporter.py} +36 -39
  28. novel_downloader/core/{savers → exporters}/common/txt.py +20 -24
  29. novel_downloader/core/exporters/epub_utils/__init__.py +40 -0
  30. novel_downloader/core/{savers → exporters}/epub_utils/css_builder.py +2 -1
  31. novel_downloader/core/exporters/epub_utils/image_loader.py +131 -0
  32. novel_downloader/core/{savers → exporters}/epub_utils/initializer.py +6 -3
  33. novel_downloader/core/{savers → exporters}/epub_utils/text_to_html.py +49 -2
  34. novel_downloader/core/{savers → exporters}/epub_utils/volume_intro.py +2 -1
  35. novel_downloader/core/exporters/esjzone.py +25 -0
  36. novel_downloader/core/exporters/linovelib/__init__.py +10 -0
  37. novel_downloader/core/exporters/linovelib/epub.py +449 -0
  38. novel_downloader/core/exporters/linovelib/main_exporter.py +127 -0
  39. novel_downloader/core/exporters/linovelib/txt.py +129 -0
  40. novel_downloader/core/exporters/qianbi.py +25 -0
  41. novel_downloader/core/{savers → exporters}/qidian.py +8 -8
  42. novel_downloader/core/exporters/sfacg.py +25 -0
  43. novel_downloader/core/exporters/yamibo.py +25 -0
  44. novel_downloader/core/factory/__init__.py +5 -17
  45. novel_downloader/core/factory/downloader.py +24 -126
  46. novel_downloader/core/factory/exporter.py +58 -0
  47. novel_downloader/core/factory/fetcher.py +96 -0
  48. novel_downloader/core/factory/parser.py +17 -12
  49. novel_downloader/core/{requesters → fetchers}/__init__.py +22 -15
  50. novel_downloader/core/{requesters → fetchers}/base/__init__.py +2 -4
  51. novel_downloader/core/fetchers/base/browser.py +383 -0
  52. novel_downloader/core/fetchers/base/rate_limiter.py +86 -0
  53. novel_downloader/core/fetchers/base/session.py +419 -0
  54. novel_downloader/core/fetchers/biquge/__init__.py +14 -0
  55. novel_downloader/core/{requesters/biquge/async_session.py → fetchers/biquge/browser.py} +18 -6
  56. novel_downloader/core/{requesters → fetchers}/biquge/session.py +23 -30
  57. novel_downloader/core/fetchers/common/__init__.py +14 -0
  58. novel_downloader/core/fetchers/common/browser.py +79 -0
  59. novel_downloader/core/{requesters/common/async_session.py → fetchers/common/session.py} +8 -25
  60. novel_downloader/core/fetchers/esjzone/__init__.py +14 -0
  61. novel_downloader/core/fetchers/esjzone/browser.py +202 -0
  62. novel_downloader/core/{requesters/esjzone/async_session.py → fetchers/esjzone/session.py} +62 -42
  63. novel_downloader/core/fetchers/linovelib/__init__.py +14 -0
  64. novel_downloader/core/fetchers/linovelib/browser.py +178 -0
  65. novel_downloader/core/fetchers/linovelib/session.py +178 -0
  66. novel_downloader/core/fetchers/qianbi/__init__.py +14 -0
  67. novel_downloader/core/{requesters/qianbi/session.py → fetchers/qianbi/browser.py} +30 -48
  68. novel_downloader/core/{requesters/qianbi/async_session.py → fetchers/qianbi/session.py} +18 -6
  69. novel_downloader/core/fetchers/qidian/__init__.py +14 -0
  70. novel_downloader/core/fetchers/qidian/browser.py +266 -0
  71. novel_downloader/core/fetchers/qidian/session.py +326 -0
  72. novel_downloader/core/fetchers/sfacg/__init__.py +14 -0
  73. novel_downloader/core/fetchers/sfacg/browser.py +189 -0
  74. novel_downloader/core/{requesters/sfacg/async_session.py → fetchers/sfacg/session.py} +43 -73
  75. novel_downloader/core/fetchers/yamibo/__init__.py +14 -0
  76. novel_downloader/core/fetchers/yamibo/browser.py +229 -0
  77. novel_downloader/core/{requesters/yamibo/async_session.py → fetchers/yamibo/session.py} +62 -44
  78. novel_downloader/core/interfaces/__init__.py +8 -12
  79. novel_downloader/core/interfaces/downloader.py +54 -0
  80. novel_downloader/core/interfaces/{saver.py → exporter.py} +12 -12
  81. novel_downloader/core/interfaces/fetcher.py +162 -0
  82. novel_downloader/core/interfaces/parser.py +6 -7
  83. novel_downloader/core/parsers/__init__.py +5 -6
  84. novel_downloader/core/parsers/base.py +9 -13
  85. novel_downloader/core/parsers/biquge/main_parser.py +12 -13
  86. novel_downloader/core/parsers/common/helper.py +3 -3
  87. novel_downloader/core/parsers/common/main_parser.py +39 -34
  88. novel_downloader/core/parsers/esjzone/main_parser.py +24 -17
  89. novel_downloader/core/parsers/linovelib/__init__.py +10 -0
  90. novel_downloader/core/parsers/linovelib/main_parser.py +210 -0
  91. novel_downloader/core/parsers/qianbi/main_parser.py +21 -15
  92. novel_downloader/core/parsers/qidian/__init__.py +2 -11
  93. novel_downloader/core/parsers/qidian/book_info_parser.py +113 -0
  94. novel_downloader/core/parsers/qidian/{browser/chapter_encrypted.py → chapter_encrypted.py} +162 -135
  95. novel_downloader/core/parsers/qidian/chapter_normal.py +150 -0
  96. novel_downloader/core/parsers/qidian/{session/chapter_router.py → chapter_router.py} +15 -15
  97. novel_downloader/core/parsers/qidian/{browser/main_parser.py → main_parser.py} +49 -40
  98. novel_downloader/core/parsers/qidian/utils/__init__.py +27 -0
  99. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +145 -0
  100. novel_downloader/core/parsers/qidian/{shared → utils}/helpers.py +41 -68
  101. novel_downloader/core/parsers/qidian/{session → utils}/node_decryptor.py +64 -50
  102. novel_downloader/core/parsers/sfacg/main_parser.py +12 -12
  103. novel_downloader/core/parsers/yamibo/main_parser.py +10 -10
  104. novel_downloader/locales/en.json +18 -2
  105. novel_downloader/locales/zh.json +18 -2
  106. novel_downloader/models/__init__.py +64 -0
  107. novel_downloader/models/browser.py +21 -0
  108. novel_downloader/models/chapter.py +25 -0
  109. novel_downloader/models/config.py +100 -0
  110. novel_downloader/models/login.py +20 -0
  111. novel_downloader/models/site_rules.py +99 -0
  112. novel_downloader/models/tasks.py +33 -0
  113. novel_downloader/models/types.py +15 -0
  114. novel_downloader/resources/config/settings.toml +31 -25
  115. novel_downloader/resources/json/linovelib_font_map.json +3573 -0
  116. novel_downloader/tui/__init__.py +7 -0
  117. novel_downloader/tui/app.py +32 -0
  118. novel_downloader/tui/main.py +17 -0
  119. novel_downloader/tui/screens/__init__.py +14 -0
  120. novel_downloader/tui/screens/home.py +191 -0
  121. novel_downloader/tui/screens/login.py +74 -0
  122. novel_downloader/tui/styles/home_layout.tcss +79 -0
  123. novel_downloader/tui/widgets/richlog_handler.py +24 -0
  124. novel_downloader/utils/__init__.py +6 -0
  125. novel_downloader/utils/chapter_storage.py +25 -38
  126. novel_downloader/utils/constants.py +15 -5
  127. novel_downloader/utils/cookies.py +66 -0
  128. novel_downloader/utils/crypto_utils.py +1 -74
  129. novel_downloader/utils/file_utils/io.py +1 -1
  130. novel_downloader/utils/fontocr/ocr_v1.py +2 -1
  131. novel_downloader/utils/fontocr/ocr_v2.py +2 -2
  132. novel_downloader/utils/hash_store.py +10 -18
  133. novel_downloader/utils/hash_utils.py +3 -2
  134. novel_downloader/utils/logger.py +2 -3
  135. novel_downloader/utils/network.py +53 -39
  136. novel_downloader/utils/text_utils/chapter_formatting.py +6 -1
  137. novel_downloader/utils/text_utils/font_mapping.py +1 -1
  138. novel_downloader/utils/text_utils/text_cleaning.py +1 -1
  139. novel_downloader/utils/time_utils/datetime_utils.py +3 -3
  140. novel_downloader/utils/time_utils/sleep_utils.py +3 -3
  141. {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/METADATA +72 -38
  142. novel_downloader-1.4.0.dist-info/RECORD +170 -0
  143. {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/WHEEL +1 -1
  144. {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/entry_points.txt +1 -0
  145. novel_downloader/cli/interactive.py +0 -66
  146. novel_downloader/cli/settings.py +0 -177
  147. novel_downloader/config/models.py +0 -187
  148. novel_downloader/core/downloaders/base/__init__.py +0 -14
  149. novel_downloader/core/downloaders/base/base_async.py +0 -153
  150. novel_downloader/core/downloaders/base/base_sync.py +0 -208
  151. novel_downloader/core/downloaders/biquge/__init__.py +0 -14
  152. novel_downloader/core/downloaders/biquge/biquge_async.py +0 -27
  153. novel_downloader/core/downloaders/biquge/biquge_sync.py +0 -27
  154. novel_downloader/core/downloaders/common/__init__.py +0 -14
  155. novel_downloader/core/downloaders/common/common_async.py +0 -218
  156. novel_downloader/core/downloaders/common/common_sync.py +0 -210
  157. novel_downloader/core/downloaders/esjzone/__init__.py +0 -14
  158. novel_downloader/core/downloaders/esjzone/esjzone_async.py +0 -27
  159. novel_downloader/core/downloaders/esjzone/esjzone_sync.py +0 -27
  160. novel_downloader/core/downloaders/qianbi/__init__.py +0 -14
  161. novel_downloader/core/downloaders/qianbi/qianbi_async.py +0 -27
  162. novel_downloader/core/downloaders/qianbi/qianbi_sync.py +0 -27
  163. novel_downloader/core/downloaders/qidian/__init__.py +0 -10
  164. novel_downloader/core/downloaders/qidian/qidian_sync.py +0 -227
  165. novel_downloader/core/downloaders/sfacg/__init__.py +0 -14
  166. novel_downloader/core/downloaders/sfacg/sfacg_async.py +0 -27
  167. novel_downloader/core/downloaders/sfacg/sfacg_sync.py +0 -27
  168. novel_downloader/core/downloaders/yamibo/__init__.py +0 -14
  169. novel_downloader/core/downloaders/yamibo/yamibo_async.py +0 -27
  170. novel_downloader/core/downloaders/yamibo/yamibo_sync.py +0 -27
  171. novel_downloader/core/factory/requester.py +0 -144
  172. novel_downloader/core/factory/saver.py +0 -56
  173. novel_downloader/core/interfaces/async_downloader.py +0 -36
  174. novel_downloader/core/interfaces/async_requester.py +0 -84
  175. novel_downloader/core/interfaces/sync_downloader.py +0 -36
  176. novel_downloader/core/interfaces/sync_requester.py +0 -82
  177. novel_downloader/core/parsers/qidian/browser/__init__.py +0 -12
  178. novel_downloader/core/parsers/qidian/browser/chapter_normal.py +0 -93
  179. novel_downloader/core/parsers/qidian/browser/chapter_router.py +0 -71
  180. novel_downloader/core/parsers/qidian/session/__init__.py +0 -12
  181. novel_downloader/core/parsers/qidian/session/chapter_encrypted.py +0 -443
  182. novel_downloader/core/parsers/qidian/session/chapter_normal.py +0 -115
  183. novel_downloader/core/parsers/qidian/session/main_parser.py +0 -128
  184. novel_downloader/core/parsers/qidian/shared/__init__.py +0 -37
  185. novel_downloader/core/parsers/qidian/shared/book_info_parser.py +0 -150
  186. novel_downloader/core/requesters/base/async_session.py +0 -410
  187. novel_downloader/core/requesters/base/browser.py +0 -337
  188. novel_downloader/core/requesters/base/session.py +0 -378
  189. novel_downloader/core/requesters/biquge/__init__.py +0 -14
  190. novel_downloader/core/requesters/common/__init__.py +0 -17
  191. novel_downloader/core/requesters/common/session.py +0 -113
  192. novel_downloader/core/requesters/esjzone/__init__.py +0 -13
  193. novel_downloader/core/requesters/esjzone/session.py +0 -235
  194. novel_downloader/core/requesters/qianbi/__init__.py +0 -13
  195. novel_downloader/core/requesters/qidian/__init__.py +0 -21
  196. novel_downloader/core/requesters/qidian/broswer.py +0 -307
  197. novel_downloader/core/requesters/qidian/session.py +0 -290
  198. novel_downloader/core/requesters/sfacg/__init__.py +0 -13
  199. novel_downloader/core/requesters/sfacg/session.py +0 -242
  200. novel_downloader/core/requesters/yamibo/__init__.py +0 -13
  201. novel_downloader/core/requesters/yamibo/session.py +0 -237
  202. novel_downloader/core/savers/__init__.py +0 -34
  203. novel_downloader/core/savers/biquge.py +0 -25
  204. novel_downloader/core/savers/common/__init__.py +0 -12
  205. novel_downloader/core/savers/epub_utils/__init__.py +0 -26
  206. novel_downloader/core/savers/esjzone.py +0 -25
  207. novel_downloader/core/savers/qianbi.py +0 -25
  208. novel_downloader/core/savers/sfacg.py +0 -25
  209. novel_downloader/core/savers/yamibo.py +0 -25
  210. novel_downloader/resources/config/rules.toml +0 -196
  211. novel_downloader-1.3.2.dist-info/RECORD +0 -165
  212. {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/licenses/LICENSE +0 -0
  213. {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/top_level.txt +0 -0

novel_downloader/core/parsers/qidian/session/chapter_encrypted.py (removed)
@@ -1,443 +0,0 @@
- #!/usr/bin/env python3
- """
- novel_downloader.core.parsers.qidian.session.chapter_encrypted
- --------------------------------------------------------------
-
- Support for parsing encrypted chapters from Qidian using font OCR mapping,
- CSS rules, and custom rendering logic.
-
- Includes:
- - Font downloading and caching
- - Encrypted paragraph extraction
- - Custom CSS parsing and layout restoration
- - Font-based OCR decryption and mapping
- """
-
- from __future__ import annotations
-
- import json
- import logging
- from pathlib import Path
- from typing import TYPE_CHECKING, Any
-
- import tinycss2
- from bs4 import BeautifulSoup, Tag
-
- from novel_downloader.utils.chapter_storage import ChapterDict
- from novel_downloader.utils.network import download_font_file
- from novel_downloader.utils.text_utils import apply_font_mapping
-
- from ..shared import (
-     extract_chapter_info,
-     find_ssr_page_context,
-     html_to_soup,
-     vip_status,
- )
- from .node_decryptor import QidianNodeDecryptor
-
- if TYPE_CHECKING:
-     from .main_parser import QidianSessionParser
-
- logger = logging.getLogger(__name__)
- IGNORED_CLASS_LISTS = {"title", "review"}
- _decryptor: QidianNodeDecryptor | None = None
-
-
- def _get_decryptor() -> QidianNodeDecryptor:
-     """
-     Return the singleton QidianNodeDecryptor, initializing it on first use.
-     """
-     global _decryptor
-     if _decryptor is None:
-         _decryptor = QidianNodeDecryptor()
-     return _decryptor
-
-
- def parse_encrypted_chapter(
-     parser: QidianSessionParser,
-     soup: BeautifulSoup,
-     chapter_id: str,
-     fuid: str,
- ) -> ChapterDict | None:
-     """
-     Extract and return the formatted textual content of an encrypted chapter.
-
-     Steps:
-     1. Load SSR JSON context for CSS, fonts, and metadata.
-     3. Decode and save randomFont bytes; download fixedFont via download_font().
-     4. Extract paragraph structures and save debug JSON.
-     5. Parse CSS rules and save debug JSON.
-     6. Render encrypted paragraphs, then run OCR font-mapping.
-     7. Extracts paragraph texts and formats them.
-
-     :param html_str: Raw HTML content of the chapter page.
-     :return: Formatted chapter text or empty string if not parsable.
-     """
-     try:
-         if not (parser._decode_font and parser._font_ocr):
-             return None
-         ssr_data = find_ssr_page_context(soup)
-         chapter_info = extract_chapter_info(ssr_data)
-         if not chapter_info:
-             logger.warning(
-                 "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
-             )
-             return None
-         debug_base_dir: Path | None = None
-         if parser._font_debug_dir:
-             debug_base_dir = parser._font_debug_dir / chapter_id
-             debug_base_dir.mkdir(parents=True, exist_ok=True)
-
-         css_str = chapter_info["css"]
-         randomFont_str = chapter_info["randomFont"]
-         fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
-
-         title = chapter_info.get("chapterName", "Untitled")
-         raw_html = chapter_info.get("content", "")
-         chapter_id = chapter_info.get("chapterId", "")
-         fkp = chapter_info.get("fkp", "")
-         author_say = chapter_info.get("authorSay", "")
-         update_time = chapter_info.get("updateTime", "")
-         update_timestamp = chapter_info.get("updateTimestamp", 0)
-         modify_time = chapter_info.get("modifyTime", 0)
-         word_count = chapter_info.get("wordsCount", 0)
-         seq = chapter_info.get("seq", None)
-         volume = chapter_info.get("extra", {}).get("volumeName", "")
-
-         if not raw_html:
-             logger.warning("[Parser] raw_html not found for chapter '%s'", chapter_id)
-             return None
-
-         # extract + save font
-         rf = json.loads(randomFont_str)
-         rand_path = parser._base_cache_dir / "randomFont.ttf"
-         rand_path.parent.mkdir(parents=True, exist_ok=True)
-         rand_path.write_bytes(bytes(rf["data"]))
-
-         fixed_path = download_font_file(
-             url=fixedFontWoff2_url, target_folder=parser._fixed_font_dir
-         )
-         if fixed_path is None:
-             raise ValueError("fixed_path is None: failed to download font")
-
-         # Extract and render paragraphs from HTML with CSS rules
-
-         if vip_status(soup):
-             try:
-                 decryptor = _get_decryptor()
-                 raw_html = decryptor.decrypt(
-                     raw_html,
-                     chapter_id,
-                     fkp,
-                     fuid,
-                 )
-             except Exception as e:
-                 logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
-                 return None
-         main_paragraphs = extract_paragraphs_recursively(html_to_soup(raw_html))
-         if debug_base_dir:
-             main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
-             main_paragraphs_path.write_text(
-                 json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
-                 encoding="utf-8",
-             )
-
-         paragraphs_rules = parse_rule(css_str)
-         if debug_base_dir:
-             paragraphs_rules_path = debug_base_dir / "paragraphs_rules_debug.json"
-             paragraphs_rules_path.write_text(
-                 json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
-                 encoding="utf-8",
-             )
-
-         paragraphs_str, refl_list = render_paragraphs(main_paragraphs, paragraphs_rules)
-         if debug_base_dir:
-             paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
-             paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
-
-         # Run OCR + fallback mapping
-         char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
-         refl_set = set(refl_list)
-         char_set = char_set - refl_set
-         if debug_base_dir:
-             char_sets_path = debug_base_dir / "char_set_debug.txt"
-             temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
-             char_sets_path.write_text(
-                 temp,
-                 encoding="utf-8",
-             )
-
-         mapping_result = parser._font_ocr.generate_font_map(
-             fixed_font_path=fixed_path,
-             random_font_path=rand_path,
-             char_set=char_set,
-             refl_set=refl_set,
-             chapter_id=chapter_id,
-         )
-         if debug_base_dir:
-             mapping_json_path = debug_base_dir / "font_mapping.json"
-             mapping_json_path.write_text(
-                 json.dumps(mapping_result, ensure_ascii=False, indent=2),
-                 encoding="utf-8",
-             )
-
-         # Reconstruct final readable text
-         original_text = apply_font_mapping(paragraphs_str, mapping_result)
-
-         final_paragraphs_str = "\n\n".join(
-             line.strip() for line in original_text.splitlines() if line.strip()
-         )
-         return {
-             "id": str(chapter_id),
-             "title": str(title),
-             "content": final_paragraphs_str,
-             "extra": {
-                 "author_say": author_say.strip() if author_say else "",
-                 "updated_at": update_time,
-                 "update_timestamp": update_timestamp,
-                 "modify_time": modify_time,
-                 "word_count": word_count,
-                 "seq": seq,
-                 "volume": volume,
-             },
-         }
-
-     except Exception as e:
-         logger.warning(
-             "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
-         )
-         return None
-
-
- def extract_paragraphs_recursively(
-     soup: BeautifulSoup, chapter_id: int = -1
- ) -> list[dict[str, Any]]:
-     """
-     Extracts paragraph elements under <main id="c-{chapter_id}"> from HTML
-     and converts them to a nested data structure for further processing.
-
-     :param html_str: Full HTML content.
-     :param chapter_id: ID used to locate <main id="c-{chapter_id}">.
-
-     :return list: List of parsed <p> paragraph data.
-     """
-
-     def parse_element(elem: Any) -> dict[str, Any] | None:
-         if not isinstance(elem, Tag):
-             return None
-         result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
-         for child in elem.contents:
-             if isinstance(child, Tag):
-                 parsed = parse_element(child)
-                 if parsed:
-                     result["data"].append(parsed)
-             else:
-                 text = child
-                 if text:
-                     result["data"].append(text)
-         return result
-
-     if chapter_id > 0:
-         main_id = f"c-{chapter_id}"
-         main_tag = soup.find("main", id=main_id)
-         if not isinstance(main_tag, Tag):
-             return []
-     else:
-         main_tag = soup
-
-     result = []
-     for p in main_tag.find_all("p"):
-         parsed_p = parse_element(p)
-         if parsed_p:
-             result.append(parsed_p)
-
-     return result
-
-
- def parse_rule(css_str: str) -> dict[str, Any]:
-     """
-     Parse a CSS string and extract style rules for rendering.
-
-     Handles:
-     - font-size:0 (mark for deletion)
-     - scaleX(-1) (mark as mirrored)
-     - ::before / ::after with content or attr()
-     - class + tag selector mapping
-     - custom rendering order via 'order'
-
-     :param css_str: Raw CSS stylesheet string.
-     :return: Dict with "rules" and "orders" for rendering.
-     """
-
-     rules: dict[str, Any] = {}
-     orders = []
-
-     stylesheet = tinycss2.parse_stylesheet(
-         css_str, skip_comments=True, skip_whitespace=True
-     )
-
-     for rule in stylesheet:
-         if rule.type != "qualified-rule":
-             continue
-
-         selector = tinycss2.serialize(rule.prelude).strip()
-         declarations = tinycss2.parse_declaration_list(rule.content)
-
-         parsed = {}
-         order_val = None
-
-         for decl in declarations:
-             if decl.type != "declaration":
-                 continue
-             name = decl.lower_name
-             value = tinycss2.serialize(decl.value).strip()
-
-             if name == "font-size" and value == "0":
-                 if "::first-letter" in selector:
-                     parsed["delete-first"] = True
-                 else:
-                     parsed["delete-all"] = True
-             elif name == "transform" and value.lower() == "scalex(-1)":
-                 parsed["transform-x_-1"] = True
-             elif name == "order":
-                 order_val = value
-             elif name == "content":
-                 if "::after" in selector:
-                     if "attr(" in value:
-                         parsed["append-end-attr"] = value.split("attr(")[1].split(")")[
-                             0
-                         ]
-                     else:
-                         parsed["append-end-char"] = value.strip("\"'")
-                 elif "::before" in selector:
-                     if "attr(" in value:
-                         parsed["append-start-attr"] = value.split("attr(")[1].split(
-                             ")"
-                         )[0]
-                     else:
-                         parsed["append-start-char"] = value.strip("\"'")
-
-         # Store in structure
-         if selector.startswith(".sy-"):
-             rules.setdefault("sy", {})[selector[1:]] = parsed
-         elif selector.startswith(".p") and " " in selector:
-             class_str, tag_part = selector.split(" ", 1)
-             class_str = class_str.lstrip(".")
-             tag_part = tag_part.split("::")[0]
-             rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
-
-         if order_val:
-             orders.append((selector, order_val))
-
-     orders.sort(key=lambda x: int(x[1]))
-     return {"rules": rules, "orders": orders}
-
-
- def render_paragraphs(
-     main_paragraphs: list[dict[str, Any]], rules: dict[str, Any]
- ) -> tuple[str, list[str]]:
-     """
-     Applies the parsed CSS rules to the paragraph structure and
-     reconstructs the visible text.
-
-     Handles special class styles like .sy-*, text order control,
-     mirrored characters, etc.
-
-     :param main_paragraphs: A list of paragraph dictionaries, each with 'attrs'
-                             and 'data' fields representing structured content.
-     :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
-                   - rules['orders']: List of (selector, id) tuples.
-                   - rules['rules']: Nested dict containing transformation rules.
-
-     :return:
-         - A reconstructed paragraph string with line breaks.
-         - A list of mirrored (reflected) characters for later OCR processing.
-     """
-     orders: list[tuple[str, str]] = rules.get("orders", [])
-     rules = rules.get("rules", {})
-     refl_list: list[str] = []
-
-     def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
-         if rule.get("delete-all", False):
-             return ""
-
-         curr_str = ""
-         if isinstance(data.get("data"), list) and data["data"]:
-             first_data = data["data"][0]
-             if isinstance(first_data, str):
-                 curr_str += first_data
-
-         if rule.get("delete-first", False):
-             curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
-
-         curr_str += rule.get("append-end-char", "")
-
-         attr_name = rule.get("append-end-attr", "")
-         if attr_name:
-             curr_str += data.get("attrs", {}).get(attr_name, "")
-
-         curr_str = rule.get("append-start-char", "") + curr_str
-
-         attr_name = rule.get("append-start-attr", "")
-         if attr_name:
-             curr_str = data.get("attrs", {}).get(attr_name, "") + curr_str
-
-         if rule.get("transform-x_-1", False):
-             refl_list.append(curr_str)
-         return curr_str
-
-     paragraphs_str = ""
-     for paragraph in main_paragraphs:
-         class_list = paragraph.get("attrs", {}).get("class", [])
-         p_class_str = next((c for c in class_list if c.startswith("p")), None)
-         curr_datas = paragraph.get("data", [])
-
-         ordered_cache = {}
-         for data in curr_datas:
-             # 文本节点直接加
-             if isinstance(data, str):
-                 paragraphs_str += data
-                 continue
-
-             if isinstance(data, dict):
-                 tag = data.get("tag", "")
-                 attrs = data.get("attrs", {})
-
-                 # 跳过 span.review
-                 if tag == "span" and "class" in attrs and "review" in attrs["class"]:
-                     continue
-
-                 # sy 类型标签处理
-                 if tag == "y":
-                     tag_class_list = attrs.get("class", [])
-                     tag_class = next(
-                         (c for c in tag_class_list if c.startswith("sy-")), None
-                     )
-
-                     if tag_class in rules.get("sy", {}):
-                         curr_rule = rules["sy"][tag_class]
-                         paragraphs_str += apply_rule(data, curr_rule)
-                     continue
-
-                 if not p_class_str:
-                     if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
-                         continue
-                     logger.debug(f"[parser] not find p_class_str: {class_list}")
-                     continue
-                 # 普通标签处理,根据 orders 顺序匹配
-                 for ord_selector, _ in orders:
-                     tag_name = f"{ord_selector}"
-                     if data.get("tag") != tag_name:
-                         continue
-                     curr_rule = rules.get(p_class_str, {}).get(ord_selector)
-                     curr_rule = curr_rule if curr_rule else {}
-                     ordered_cache[ord_selector] = apply_rule(data, curr_rule)
-                     break
-         # 最后按 orders 顺序拼接
-         for ord_selector, _ in orders:
-             if ord_selector in ordered_cache:
-                 paragraphs_str += ordered_cache[ord_selector]
-
-         paragraphs_str += "\n\n"
-
-     return paragraphs_str, refl_list
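
The deleted `parse_rule` above is the heart of the layout restoration: it walks every qualified CSS rule, serializes the selector and each declaration, and records deletion, mirroring, ordering, and ::before/::after content hints. A minimal illustrative sketch of that tinycss2 pattern follows (the CSS sample is invented, not taken from Qidian):

# Illustrative only: the tinycss2 calls parse_rule() relies on, run against a made-up stylesheet.
import tinycss2

css = """
.p1 i::before { content: attr(data-prefix); order: 2; }
.p1 em { font-size: 0; }
.sy-1 { transform: scaleX(-1); }
"""

stylesheet = tinycss2.parse_stylesheet(css, skip_comments=True, skip_whitespace=True)
for rule in stylesheet:
    if rule.type != "qualified-rule":
        continue
    selector = tinycss2.serialize(rule.prelude).strip()
    for decl in tinycss2.parse_declaration_list(rule.content):
        if decl.type != "declaration":
            continue
        # Yields triples such as ('.p1 i::before', 'content', 'attr(data-prefix)'),
        # which parse_rule() folds into its "rules"/"orders" structure.
        print(selector, decl.lower_name, tinycss2.serialize(decl.value).strip())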

novel_downloader/core/parsers/qidian/session/chapter_normal.py (removed)
@@ -1,115 +0,0 @@
- #!/usr/bin/env python3
- """
- novel_downloader.core.parsers.qidian.session.chapter_normal
- -----------------------------------------------------------
-
- Provides `parse_normal_chapter`, which will:
-
- 1. Extract SSR context from a “normal” (non-VIP) chapter page and format it.
- 2. Detect VIP/encrypted chapters and fall back to Node-based decryption
-    via `QidianNodeDecryptor`.
- """
-
- import logging
-
- from bs4 import BeautifulSoup
-
- from novel_downloader.utils.chapter_storage import ChapterDict
-
- from ..shared import (
-     extract_chapter_info,
-     find_ssr_page_context,
-     html_to_soup,
-     vip_status,
- )
- from .node_decryptor import QidianNodeDecryptor
-
- logger = logging.getLogger(__name__)
- _decryptor: QidianNodeDecryptor | None = None
-
-
- def _get_decryptor() -> QidianNodeDecryptor:
-     """
-     Return the singleton QidianNodeDecryptor, initializing it on first use.
-     """
-     global _decryptor
-     if _decryptor is None:
-         _decryptor = QidianNodeDecryptor()
-     return _decryptor
-
-
- def parse_normal_chapter(
-     soup: BeautifulSoup,
-     chapter_id: str,
-     fuid: str,
- ) -> ChapterDict | None:
-     """
-     Extract structured chapter info from a normal Qidian page.
-
-     :param soup: A BeautifulSoup of the chapter HTML.
-     :param chapter_id: Chapter identifier (string).
-     :param fuid: Fock user ID parameter from the page.
-     :return: a dictionary with keys like 'id', 'title', 'content', etc.
-     """
-     try:
-         ssr_data = find_ssr_page_context(soup)
-         chapter_info = extract_chapter_info(ssr_data)
-         if not chapter_info:
-             logger.warning(
-                 "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
-             )
-             return None
-
-         title = chapter_info.get("chapterName", "Untitled")
-         raw_html = chapter_info.get("content", "")
-         chapter_id = chapter_info.get("chapterId", "")
-         fkp = chapter_info.get("fkp", "")
-         author_say = chapter_info.get("authorSay", "")
-         update_time = chapter_info.get("updateTime", "")
-         update_timestamp = chapter_info.get("updateTimestamp", 0)
-         modify_time = chapter_info.get("modifyTime", 0)
-         word_count = chapter_info.get("wordsCount", 0)
-         seq = chapter_info.get("seq", None)
-         volume = chapter_info.get("extra", {}).get("volumeName", "")
-
-         if not raw_html:
-             logger.warning("[Parser] raw_html not found for chapter '%s'", chapter_id)
-             return None
-
-         if vip_status(soup):
-             try:
-                 decryptor = _get_decryptor()
-                 raw_html = decryptor.decrypt(
-                     raw_html,
-                     chapter_id,
-                     fkp,
-                     fuid,
-                 )
-             except Exception as e:
-                 logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
-                 return None
-
-         paras_soup = html_to_soup(raw_html)
-         paras = [p.get_text(strip=True) for p in paras_soup.find_all("p")]
-         chapter_text = "\n\n".join(paras)
-
-         return {
-             "id": str(chapter_id),
-             "title": title,
-             "content": chapter_text,
-             "extra": {
-                 "author_say": author_say.strip() if author_say else "",
-                 "updated_at": update_time,
-                 "update_timestamp": update_timestamp,
-                 "modify_time": modify_time,
-                 "word_count": word_count,
-                 "seq": seq,
-                 "volume": volume,
-             },
-         }
-
-     except Exception as e:
-         logger.warning(
-             "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
-         )
-         return None
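
The deleted `parse_normal_chapter` above flattens the (possibly decrypted) chapter HTML by taking the text of each <p> element and joining paragraphs with blank lines. A minimal illustrative sketch of that BeautifulSoup step (the HTML fragment is invented):

# Illustrative only: the paragraph-flattening step on an invented HTML fragment.
from bs4 import BeautifulSoup

raw_html = "<p>First paragraph.</p><p>  Second paragraph.  </p>"
soup = BeautifulSoup(raw_html, "html.parser")

# get_text(strip=True) trims surrounding whitespace from each <p>; paragraphs
# are then joined with a blank line, as in parse_normal_chapter().
paras = [p.get_text(strip=True) for p in soup.find_all("p")]
chapter_text = "\n\n".join(paras)
print(chapter_text)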

novel_downloader/core/parsers/qidian/session/main_parser.py (removed)
@@ -1,128 +0,0 @@
- #!/usr/bin/env python3
- """
- novel_downloader.core.parsers.qidian.session.main_parser
- --------------------------------------------------------
-
- Main parser class for handling Qidian chapters rendered via a session.
-
- This module defines `QidianSessionParser`, a parser implementation that supports
- content extracted from dynamically rendered Qidian HTML pages.
- """
-
- from __future__ import annotations
-
- from pathlib import Path
- from typing import TYPE_CHECKING, Any
-
- from novel_downloader.config.models import ParserConfig
- from novel_downloader.core.parsers.base import BaseParser
- from novel_downloader.utils.chapter_storage import ChapterDict
- from novel_downloader.utils.state import state_mgr
-
- from ..shared import (
-     is_encrypted,
-     parse_book_info,
- )
- from .chapter_router import parse_chapter
-
- if TYPE_CHECKING:
-     from novel_downloader.utils.fontocr import FontOCR
-
-
- class QidianSessionParser(BaseParser):
-     """
-     Parser for Qidian site using a session HTML workflow.
-     """
-
-     def __init__(self, config: ParserConfig):
-         """
-         Initialize the QidianBrowserParser with the given configuration.
-
-         :param config: ParserConfig object controlling:
-         """
-         super().__init__(config)
-
-         # Extract and store parser flags from config
-         self._decode_font: bool = config.decode_font
-         self._save_font_debug: bool = config.save_font_debug
-
-         self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
-         self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
-         self._font_debug_dir: Path | None = None
-
-         qd_cookies = state_mgr.get_cookies("qidian")
-         self._fuid: str = qd_cookies.get("ywguid", "")
-
-         self._font_ocr: FontOCR | None = None
-         if self._decode_font:
-             from novel_downloader.utils.fontocr import FontOCR
-
-             self._font_ocr = FontOCR(
-                 cache_dir=self._base_cache_dir,
-                 use_freq=config.use_freq,
-                 use_ocr=config.use_ocr,
-                 use_vec=config.use_vec,
-                 batch_size=config.batch_size,
-                 gpu_mem=config.gpu_mem,
-                 gpu_id=config.gpu_id,
-                 ocr_weight=config.ocr_weight,
-                 vec_weight=config.vec_weight,
-                 font_debug=config.save_font_debug,
-             )
-             self._font_debug_dir = self._base_cache_dir / "qidian" / "font_debug"
-             self._font_debug_dir.mkdir(parents=True, exist_ok=True)
-
-     def parse_book_info(
-         self,
-         html_str: list[str],
-         **kwargs: Any,
-     ) -> dict[str, Any]:
-         """
-         Parse a book info page and extract metadata and chapter structure.
-
-         :param html_str: Raw HTML of the book info page.
-         :return: Parsed metadata and chapter structure as a dictionary.
-         """
-         if not html_str:
-             return {}
-         return parse_book_info(html_str[0])
-
-     def parse_chapter(
-         self,
-         html_str: list[str],
-         chapter_id: str,
-         **kwargs: Any,
-     ) -> ChapterDict | None:
-         """
-         :param html: Raw HTML of the chapter page.
-         :param chapter_id: Identifier of the chapter being parsed.
-         :return: Cleaned chapter content as plain text.
-         """
-         if not html_str:
-             return None
-         return parse_chapter(self, html_str[0], chapter_id)
-
-     def is_encrypted(self, html_str: str) -> bool:
-         """
-         Return True if content is encrypted.
-
-         :param html: Raw HTML of the chapter page.
-         """
-         return is_encrypted(html_str)
-
-     def _init_cache_folders(self) -> None:
-         """
-         Prepare cache folders for plain/encrypted HTML and font debug data.
-         Folders are only created if corresponding debug/save flags are enabled.
-         """
-         base = self._base_cache_dir
-
-         # Font debug folder
-         if self._save_font_debug and self.book_id:
-             self._font_debug_dir = base / self.book_id / "font_debug"
-             self._font_debug_dir.mkdir(parents=True, exist_ok=True)
-         else:
-             self._font_debug_dir = None
-
-     def _on_book_id_set(self) -> None:
-         self._init_cache_folders()
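
The deleted `QidianSessionParser.__init__` above only imports FontOCR inside the `if self._decode_font:` branch, so the OCR stack is never loaded for users who leave font decoding off. A minimal illustrative sketch of that deferred-import pattern (SketchParser and the json stand-in are hypothetical, not package code):

# Illustrative only: defer an optional dependency until a config flag asks for it.
from __future__ import annotations

from typing import Any


class SketchParser:
    def __init__(self, decode_font: bool) -> None:
        self._font_ocr: Any | None = None
        if decode_font:
            # Import inside the branch: the dependency is only loaded (and only
            # needs to be installed) when font decoding is actually enabled.
            import json as heavy_backend  # stand-in for the real FontOCR import

            self._font_ocr = heavy_backend


SketchParser(decode_font=False)  # branch skipped, no backend bound
SketchParser(decode_font=True)   # backend imported and kept on the instance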