novel-downloader 1.3.3__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (211)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/clean.py +97 -78
  3. novel_downloader/cli/config.py +177 -0
  4. novel_downloader/cli/download.py +132 -87
  5. novel_downloader/cli/export.py +77 -0
  6. novel_downloader/cli/main.py +21 -28
  7. novel_downloader/config/__init__.py +1 -25
  8. novel_downloader/config/adapter.py +32 -31
  9. novel_downloader/config/loader.py +3 -3
  10. novel_downloader/config/site_rules.py +1 -2
  11. novel_downloader/core/__init__.py +3 -6
  12. novel_downloader/core/downloaders/__init__.py +10 -13
  13. novel_downloader/core/downloaders/base.py +233 -0
  14. novel_downloader/core/downloaders/biquge.py +27 -0
  15. novel_downloader/core/downloaders/common.py +414 -0
  16. novel_downloader/core/downloaders/esjzone.py +27 -0
  17. novel_downloader/core/downloaders/linovelib.py +27 -0
  18. novel_downloader/core/downloaders/qianbi.py +27 -0
  19. novel_downloader/core/downloaders/qidian.py +352 -0
  20. novel_downloader/core/downloaders/sfacg.py +27 -0
  21. novel_downloader/core/downloaders/yamibo.py +27 -0
  22. novel_downloader/core/exporters/__init__.py +37 -0
  23. novel_downloader/core/{savers → exporters}/base.py +73 -39
  24. novel_downloader/core/exporters/biquge.py +25 -0
  25. novel_downloader/core/exporters/common/__init__.py +12 -0
  26. novel_downloader/core/{savers → exporters}/common/epub.py +22 -22
  27. novel_downloader/core/{savers/common/main_saver.py → exporters/common/main_exporter.py} +35 -40
  28. novel_downloader/core/{savers → exporters}/common/txt.py +20 -23
  29. novel_downloader/core/{savers → exporters}/epub_utils/__init__.py +8 -3
  30. novel_downloader/core/{savers → exporters}/epub_utils/css_builder.py +2 -2
  31. novel_downloader/core/{savers → exporters}/epub_utils/image_loader.py +46 -4
  32. novel_downloader/core/{savers → exporters}/epub_utils/initializer.py +6 -4
  33. novel_downloader/core/{savers → exporters}/epub_utils/text_to_html.py +3 -3
  34. novel_downloader/core/{savers → exporters}/epub_utils/volume_intro.py +2 -2
  35. novel_downloader/core/exporters/esjzone.py +25 -0
  36. novel_downloader/core/exporters/linovelib/__init__.py +10 -0
  37. novel_downloader/core/exporters/linovelib/epub.py +449 -0
  38. novel_downloader/core/exporters/linovelib/main_exporter.py +127 -0
  39. novel_downloader/core/exporters/linovelib/txt.py +129 -0
  40. novel_downloader/core/exporters/qianbi.py +25 -0
  41. novel_downloader/core/{savers → exporters}/qidian.py +8 -8
  42. novel_downloader/core/exporters/sfacg.py +25 -0
  43. novel_downloader/core/exporters/yamibo.py +25 -0
  44. novel_downloader/core/factory/__init__.py +5 -17
  45. novel_downloader/core/factory/downloader.py +24 -126
  46. novel_downloader/core/factory/exporter.py +58 -0
  47. novel_downloader/core/factory/fetcher.py +96 -0
  48. novel_downloader/core/factory/parser.py +17 -12
  49. novel_downloader/core/{requesters → fetchers}/__init__.py +22 -15
  50. novel_downloader/core/{requesters → fetchers}/base/__init__.py +2 -4
  51. novel_downloader/core/fetchers/base/browser.py +383 -0
  52. novel_downloader/core/fetchers/base/rate_limiter.py +86 -0
  53. novel_downloader/core/fetchers/base/session.py +419 -0
  54. novel_downloader/core/fetchers/biquge/__init__.py +14 -0
  55. novel_downloader/core/{requesters/biquge/async_session.py → fetchers/biquge/browser.py} +18 -6
  56. novel_downloader/core/{requesters → fetchers}/biquge/session.py +23 -30
  57. novel_downloader/core/fetchers/common/__init__.py +14 -0
  58. novel_downloader/core/fetchers/common/browser.py +79 -0
  59. novel_downloader/core/{requesters/common/async_session.py → fetchers/common/session.py} +8 -25
  60. novel_downloader/core/fetchers/esjzone/__init__.py +14 -0
  61. novel_downloader/core/fetchers/esjzone/browser.py +202 -0
  62. novel_downloader/core/{requesters/esjzone/async_session.py → fetchers/esjzone/session.py} +62 -42
  63. novel_downloader/core/fetchers/linovelib/__init__.py +14 -0
  64. novel_downloader/core/fetchers/linovelib/browser.py +193 -0
  65. novel_downloader/core/fetchers/linovelib/session.py +193 -0
  66. novel_downloader/core/fetchers/qianbi/__init__.py +14 -0
  67. novel_downloader/core/{requesters/qianbi/session.py → fetchers/qianbi/browser.py} +30 -48
  68. novel_downloader/core/{requesters/qianbi/async_session.py → fetchers/qianbi/session.py} +18 -6
  69. novel_downloader/core/fetchers/qidian/__init__.py +14 -0
  70. novel_downloader/core/fetchers/qidian/browser.py +266 -0
  71. novel_downloader/core/fetchers/qidian/session.py +326 -0
  72. novel_downloader/core/fetchers/sfacg/__init__.py +14 -0
  73. novel_downloader/core/fetchers/sfacg/browser.py +189 -0
  74. novel_downloader/core/{requesters/sfacg/async_session.py → fetchers/sfacg/session.py} +43 -73
  75. novel_downloader/core/fetchers/yamibo/__init__.py +14 -0
  76. novel_downloader/core/fetchers/yamibo/browser.py +229 -0
  77. novel_downloader/core/{requesters/yamibo/async_session.py → fetchers/yamibo/session.py} +62 -44
  78. novel_downloader/core/interfaces/__init__.py +8 -12
  79. novel_downloader/core/interfaces/downloader.py +54 -0
  80. novel_downloader/core/interfaces/{saver.py → exporter.py} +12 -12
  81. novel_downloader/core/interfaces/fetcher.py +162 -0
  82. novel_downloader/core/interfaces/parser.py +6 -7
  83. novel_downloader/core/parsers/__init__.py +5 -6
  84. novel_downloader/core/parsers/base.py +9 -13
  85. novel_downloader/core/parsers/biquge/main_parser.py +12 -13
  86. novel_downloader/core/parsers/common/helper.py +3 -3
  87. novel_downloader/core/parsers/common/main_parser.py +39 -34
  88. novel_downloader/core/parsers/esjzone/main_parser.py +20 -14
  89. novel_downloader/core/parsers/linovelib/__init__.py +10 -0
  90. novel_downloader/core/parsers/linovelib/main_parser.py +210 -0
  91. novel_downloader/core/parsers/qianbi/main_parser.py +21 -15
  92. novel_downloader/core/parsers/qidian/__init__.py +2 -11
  93. novel_downloader/core/parsers/qidian/book_info_parser.py +113 -0
  94. novel_downloader/core/parsers/qidian/{browser/chapter_encrypted.py → chapter_encrypted.py} +162 -135
  95. novel_downloader/core/parsers/qidian/chapter_normal.py +150 -0
  96. novel_downloader/core/parsers/qidian/{session/chapter_router.py → chapter_router.py} +15 -15
  97. novel_downloader/core/parsers/qidian/{browser/main_parser.py → main_parser.py} +49 -40
  98. novel_downloader/core/parsers/qidian/utils/__init__.py +27 -0
  99. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +145 -0
  100. novel_downloader/core/parsers/qidian/{shared → utils}/helpers.py +41 -68
  101. novel_downloader/core/parsers/qidian/{session → utils}/node_decryptor.py +64 -50
  102. novel_downloader/core/parsers/sfacg/main_parser.py +12 -12
  103. novel_downloader/core/parsers/yamibo/main_parser.py +10 -10
  104. novel_downloader/locales/en.json +18 -2
  105. novel_downloader/locales/zh.json +18 -2
  106. novel_downloader/models/__init__.py +64 -0
  107. novel_downloader/models/browser.py +21 -0
  108. novel_downloader/models/chapter.py +25 -0
  109. novel_downloader/models/config.py +100 -0
  110. novel_downloader/models/login.py +20 -0
  111. novel_downloader/models/site_rules.py +99 -0
  112. novel_downloader/models/tasks.py +33 -0
  113. novel_downloader/models/types.py +15 -0
  114. novel_downloader/resources/config/settings.toml +31 -25
  115. novel_downloader/resources/json/linovelib_font_map.json +3573 -0
  116. novel_downloader/tui/__init__.py +7 -0
  117. novel_downloader/tui/app.py +32 -0
  118. novel_downloader/tui/main.py +17 -0
  119. novel_downloader/tui/screens/__init__.py +14 -0
  120. novel_downloader/tui/screens/home.py +191 -0
  121. novel_downloader/tui/screens/login.py +74 -0
  122. novel_downloader/tui/styles/home_layout.tcss +79 -0
  123. novel_downloader/tui/widgets/richlog_handler.py +24 -0
  124. novel_downloader/utils/__init__.py +6 -0
  125. novel_downloader/utils/chapter_storage.py +25 -38
  126. novel_downloader/utils/constants.py +11 -5
  127. novel_downloader/utils/cookies.py +66 -0
  128. novel_downloader/utils/crypto_utils.py +1 -74
  129. novel_downloader/utils/fontocr/ocr_v1.py +2 -1
  130. novel_downloader/utils/fontocr/ocr_v2.py +2 -2
  131. novel_downloader/utils/hash_store.py +10 -18
  132. novel_downloader/utils/hash_utils.py +3 -2
  133. novel_downloader/utils/logger.py +2 -3
  134. novel_downloader/utils/network.py +2 -1
  135. novel_downloader/utils/text_utils/chapter_formatting.py +6 -1
  136. novel_downloader/utils/text_utils/font_mapping.py +1 -1
  137. novel_downloader/utils/text_utils/text_cleaning.py +1 -1
  138. novel_downloader/utils/time_utils/datetime_utils.py +3 -3
  139. novel_downloader/utils/time_utils/sleep_utils.py +1 -1
  140. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/METADATA +69 -35
  141. novel_downloader-1.4.1.dist-info/RECORD +170 -0
  142. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/WHEEL +1 -1
  143. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/entry_points.txt +1 -0
  144. novel_downloader/cli/interactive.py +0 -66
  145. novel_downloader/cli/settings.py +0 -177
  146. novel_downloader/config/models.py +0 -187
  147. novel_downloader/core/downloaders/base/__init__.py +0 -14
  148. novel_downloader/core/downloaders/base/base_async.py +0 -153
  149. novel_downloader/core/downloaders/base/base_sync.py +0 -208
  150. novel_downloader/core/downloaders/biquge/__init__.py +0 -14
  151. novel_downloader/core/downloaders/biquge/biquge_async.py +0 -27
  152. novel_downloader/core/downloaders/biquge/biquge_sync.py +0 -27
  153. novel_downloader/core/downloaders/common/__init__.py +0 -14
  154. novel_downloader/core/downloaders/common/common_async.py +0 -210
  155. novel_downloader/core/downloaders/common/common_sync.py +0 -202
  156. novel_downloader/core/downloaders/esjzone/__init__.py +0 -14
  157. novel_downloader/core/downloaders/esjzone/esjzone_async.py +0 -27
  158. novel_downloader/core/downloaders/esjzone/esjzone_sync.py +0 -27
  159. novel_downloader/core/downloaders/qianbi/__init__.py +0 -14
  160. novel_downloader/core/downloaders/qianbi/qianbi_async.py +0 -27
  161. novel_downloader/core/downloaders/qianbi/qianbi_sync.py +0 -27
  162. novel_downloader/core/downloaders/qidian/__init__.py +0 -10
  163. novel_downloader/core/downloaders/qidian/qidian_sync.py +0 -219
  164. novel_downloader/core/downloaders/sfacg/__init__.py +0 -14
  165. novel_downloader/core/downloaders/sfacg/sfacg_async.py +0 -27
  166. novel_downloader/core/downloaders/sfacg/sfacg_sync.py +0 -27
  167. novel_downloader/core/downloaders/yamibo/__init__.py +0 -14
  168. novel_downloader/core/downloaders/yamibo/yamibo_async.py +0 -27
  169. novel_downloader/core/downloaders/yamibo/yamibo_sync.py +0 -27
  170. novel_downloader/core/factory/requester.py +0 -144
  171. novel_downloader/core/factory/saver.py +0 -56
  172. novel_downloader/core/interfaces/async_downloader.py +0 -36
  173. novel_downloader/core/interfaces/async_requester.py +0 -84
  174. novel_downloader/core/interfaces/sync_downloader.py +0 -36
  175. novel_downloader/core/interfaces/sync_requester.py +0 -82
  176. novel_downloader/core/parsers/qidian/browser/__init__.py +0 -12
  177. novel_downloader/core/parsers/qidian/browser/chapter_normal.py +0 -93
  178. novel_downloader/core/parsers/qidian/browser/chapter_router.py +0 -71
  179. novel_downloader/core/parsers/qidian/session/__init__.py +0 -12
  180. novel_downloader/core/parsers/qidian/session/chapter_encrypted.py +0 -443
  181. novel_downloader/core/parsers/qidian/session/chapter_normal.py +0 -115
  182. novel_downloader/core/parsers/qidian/session/main_parser.py +0 -128
  183. novel_downloader/core/parsers/qidian/shared/__init__.py +0 -37
  184. novel_downloader/core/parsers/qidian/shared/book_info_parser.py +0 -150
  185. novel_downloader/core/requesters/base/async_session.py +0 -410
  186. novel_downloader/core/requesters/base/browser.py +0 -337
  187. novel_downloader/core/requesters/base/session.py +0 -378
  188. novel_downloader/core/requesters/biquge/__init__.py +0 -14
  189. novel_downloader/core/requesters/common/__init__.py +0 -17
  190. novel_downloader/core/requesters/common/session.py +0 -113
  191. novel_downloader/core/requesters/esjzone/__init__.py +0 -13
  192. novel_downloader/core/requesters/esjzone/session.py +0 -235
  193. novel_downloader/core/requesters/qianbi/__init__.py +0 -13
  194. novel_downloader/core/requesters/qidian/__init__.py +0 -21
  195. novel_downloader/core/requesters/qidian/broswer.py +0 -307
  196. novel_downloader/core/requesters/qidian/session.py +0 -290
  197. novel_downloader/core/requesters/sfacg/__init__.py +0 -13
  198. novel_downloader/core/requesters/sfacg/session.py +0 -242
  199. novel_downloader/core/requesters/yamibo/__init__.py +0 -13
  200. novel_downloader/core/requesters/yamibo/session.py +0 -237
  201. novel_downloader/core/savers/__init__.py +0 -34
  202. novel_downloader/core/savers/biquge.py +0 -25
  203. novel_downloader/core/savers/common/__init__.py +0 -12
  204. novel_downloader/core/savers/esjzone.py +0 -25
  205. novel_downloader/core/savers/qianbi.py +0 -25
  206. novel_downloader/core/savers/sfacg.py +0 -25
  207. novel_downloader/core/savers/yamibo.py +0 -25
  208. novel_downloader/resources/config/rules.toml +0 -196
  209. novel_downloader-1.3.3.dist-info/RECORD +0 -166
  210. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/licenses/LICENSE +0 -0
  211. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,10 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- novel_downloader.core.parsers.qidian.browser.chapter_encrypted
4
- --------------------------------------------------------------
3
+ novel_downloader.core.parsers.qidian.chapter_encrypted
4
+ ------------------------------------------------------
5
5
 
6
6
  Support for parsing encrypted chapters from Qidian using font OCR mapping,
7
7
  CSS rules, and custom rendering logic.
8
-
9
- Includes:
10
- - Font downloading and caching
11
- - Encrypted paragraph extraction
12
- - Custom CSS parsing and layout restoration
13
- - Font-based OCR decryption and mapping
14
8
  """
15
9
 
16
10
  from __future__ import annotations
@@ -21,27 +15,30 @@ from pathlib import Path
21
15
  from typing import TYPE_CHECKING, Any
22
16
 
23
17
  import tinycss2
24
- from bs4 import BeautifulSoup, Tag
18
+ from lxml import html
25
19
 
26
- from novel_downloader.utils.chapter_storage import ChapterDict
20
+ from novel_downloader.models import ChapterDict
27
21
  from novel_downloader.utils.network import download_font_file
28
22
  from novel_downloader.utils.text_utils import apply_font_mapping
29
23
 
30
- from ..shared import (
24
+ from .utils import (
31
25
  extract_chapter_info,
32
26
  find_ssr_page_context,
27
+ get_decryptor,
28
+ vip_status,
33
29
  )
34
30
 
35
31
  if TYPE_CHECKING:
36
- from .main_parser import QidianBrowserParser
32
+ from .main_parser import QidianParser
37
33
 
38
34
  logger = logging.getLogger(__name__)
39
35
  IGNORED_CLASS_LISTS = {"title", "review"}
36
+ NON_CONTENT_KEYWORDS = {"旧版", "反馈", "扫码"}
40
37
 
41
38
 
42
39
  def parse_encrypted_chapter(
43
- parser: QidianBrowserParser,
44
- soup: BeautifulSoup,
40
+ parser: QidianParser,
41
+ html_str: str,
45
42
  chapter_id: str,
46
43
  ) -> ChapterDict | None:
47
44
  """
@@ -52,9 +49,8 @@ def parse_encrypted_chapter(
52
49
  3. Decode and save randomFont bytes; download fixedFont via download_font().
53
50
  4. Extract paragraph structures and save debug JSON.
54
51
  5. Parse CSS rules and save debug JSON.
55
- 6. Determine paragraph name prefixes and ending number; save debug text.
56
- 7. Render encrypted paragraphs, then run OCR font-mapping.
57
- 8. Extracts paragraph texts and formats them.
52
+ 6. Render encrypted paragraphs, then run OCR font-mapping.
53
+ 7. Extracts paragraph texts and formats them.
58
54
 
59
55
  :param html_str: Raw HTML content of the chapter page.
60
56
  :return: Formatted chapter text or empty string if not parsable.
@@ -62,13 +58,14 @@ def parse_encrypted_chapter(
62
58
  try:
63
59
  if not (parser._decode_font and parser._font_ocr):
64
60
  return None
65
- ssr_data = find_ssr_page_context(soup)
61
+ ssr_data = find_ssr_page_context(html_str)
66
62
  chapter_info = extract_chapter_info(ssr_data)
67
63
  if not chapter_info:
68
64
  logger.warning(
69
65
  "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
70
66
  )
71
67
  return None
68
+
72
69
  debug_base_dir: Path | None = None
73
70
  if parser._font_debug_dir:
74
71
  debug_base_dir = parser._font_debug_dir / chapter_id
@@ -79,7 +76,9 @@ def parse_encrypted_chapter(
79
76
  fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
80
77
 
81
78
  title = chapter_info.get("chapterName", "Untitled")
82
- chapter_id = chapter_info.get("chapterId", "")
79
+ raw_html = chapter_info.get("content", "")
80
+ chapter_id = chapter_info.get("chapterId", chapter_id)
81
+ fkp = chapter_info.get("fkp", "")
83
82
  author_say = chapter_info.get("authorSay", "")
84
83
  update_time = chapter_info.get("updateTime", "")
85
84
  update_timestamp = chapter_info.get("updateTimestamp", 0)
@@ -101,7 +100,26 @@ def parse_encrypted_chapter(
101
100
  raise ValueError("fixed_path is None: failed to download font")
102
101
 
103
102
  # Extract and render paragraphs from HTML with CSS rules
104
- main_paragraphs = extract_paragraphs_recursively(soup, chapter_id)
103
+ main_paragraphs = extract_paragraphs_recursively(html_str, chapter_id)
104
+ if not main_paragraphs or contains_keywords(
105
+ main_paragraphs, NON_CONTENT_KEYWORDS
106
+ ):
107
+ if vip_status(ssr_data):
108
+ try:
109
+ decryptor = get_decryptor()
110
+ raw_html = decryptor.decrypt(
111
+ raw_html,
112
+ chapter_id,
113
+ fkp,
114
+ parser._fuid,
115
+ )
116
+ except Exception as e:
117
+ logger.error(
118
+ "[Parser] decryption failed for '%s': %s", chapter_id, e
119
+ )
120
+ return None
121
+ main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
122
+
105
123
  if debug_base_dir:
106
124
  main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
107
125
  main_paragraphs_path.write_text(
@@ -117,23 +135,11 @@ def parse_encrypted_chapter(
117
135
  encoding="utf-8",
118
136
  )
119
137
 
120
- paragraph_names = parse_paragraph_names(paragraphs_rules)
121
- end_number = parse_end_number(main_paragraphs, paragraph_names)
122
- if debug_base_dir:
123
- paragraphs_rules_path = debug_base_dir / "paragraph_names_debug.txt"
124
- temp = f"names:\n{paragraph_names}\n\nend_number: {end_number}"
125
- paragraphs_rules_path.write_text(
126
- temp,
127
- encoding="utf-8",
128
- )
129
- if not end_number:
130
- logger.warning(
131
- f"[Parser] No end_number found after parsing chapter '{chapter_id}'"
132
- )
133
- return None
134
-
138
+ end_number = parse_end_number(main_paragraphs, paragraphs_rules)
135
139
  paragraphs_str, refl_list = render_paragraphs(
136
- main_paragraphs, paragraphs_rules, end_number
140
+ main_paragraphs,
141
+ paragraphs_rules,
142
+ end_number,
137
143
  )
138
144
  if debug_base_dir:
139
145
  paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
@@ -173,7 +179,7 @@ def parse_encrypted_chapter(
173
179
  )
174
180
  return {
175
181
  "id": str(chapter_id),
176
- "title": title,
182
+ "title": str(title),
177
183
  "content": final_paragraphs_str,
178
184
  "extra": {
179
185
  "author_say": author_say.strip() if author_say else "",
@@ -183,6 +189,7 @@ def parse_encrypted_chapter(
183
189
  "word_count": word_count,
184
190
  "seq": seq,
185
191
  "volume": volume,
192
+ "encrypted": True,
186
193
  },
187
194
  }
188
195
 
@@ -194,48 +201,46 @@ def parse_encrypted_chapter(
194
201
 
195
202
 
196
203
  def extract_paragraphs_recursively(
197
- soup: BeautifulSoup, chapter_id: str = ""
204
+ html_str: str,
205
+ chapter_id: str,
198
206
  ) -> list[dict[str, Any]]:
199
- """
200
- Extracts paragraph elements under <main id="c-{chapter_id}"> from HTML
201
- and converts them to a nested data structure for further processing.
207
+ def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
208
+ class_attr = elem.attrib.get("class", "")
209
+ class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
210
+ if "review" in class_list:
211
+ return {}
212
+
213
+ # Build attrs with class as list
214
+ attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
215
+
216
+ node: dict[str, Any] = {
217
+ "tag": elem.tag,
218
+ "attrs": attrs,
219
+ "data": [],
220
+ }
202
221
 
203
- :param html_str: Full HTML content.
204
- :param chapter_id: ID used to locate <main id="c-{chapter_id}">.
222
+ # Append entire elem.text if present (no splitting)
223
+ if elem.text:
224
+ node["data"].append(elem.text)
205
225
 
206
- :return list: List of parsed <p> paragraph data.
207
- """
226
+ # Recurse into children
227
+ for child in elem.iterchildren(tag=None):
228
+ child_dict = parse_element(child)
229
+ if child_dict:
230
+ node["data"].append(child_dict)
208
231
 
209
- def parse_element(elem: Any) -> dict[str, Any] | None:
210
- if not isinstance(elem, Tag):
211
- return None
212
- result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
213
- for child in elem.contents:
214
- if isinstance(child, Tag):
215
- parsed = parse_element(child)
216
- if parsed:
217
- result["data"].append(parsed)
218
- else:
219
- text = child
220
- if text:
221
- result["data"].append(text)
222
- return result
223
-
224
- if chapter_id:
225
- main_id = f"c-{chapter_id}"
226
- main_tag = soup.find("main", id=main_id)
227
- if not isinstance(main_tag, Tag):
228
- return []
229
- else:
230
- main_tag = soup
231
-
232
- result = []
233
- for p in main_tag.find_all("p"):
234
- parsed_p = parse_element(p)
235
- if parsed_p:
236
- result.append(parsed_p)
237
-
238
- return result
232
+ # Append entire tail string (no split)
233
+ if child.tail:
234
+ node["data"].append(child.tail)
235
+
236
+ return node
237
+
238
+ tree = html.fromstring(html_str)
239
+
240
+ # Try to find <main id="c-{chapter_id}">
241
+ main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
242
+ search_root = main_elem[0] if main_elem else tree
243
+ return [parse_element(p) for p in search_root.findall(".//p")]
239
244
 
240
245
 
241
246
  def parse_rule(css_str: str) -> dict[str, Any]:
@@ -317,68 +322,10 @@ def parse_rule(css_str: str) -> dict[str, Any]:
317
322
  return {"rules": rules, "orders": orders}
318
323
 
319
324
 
320
- def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
321
- """
322
- Extract all paragraph selector names from parsed rules, excluding "sy".
323
- """
324
- paragraph_names = set()
325
- for group, group_rules in rules.get("rules", {}).items():
326
- if group == "sy":
327
- continue
328
- paragraph_names.update(group_rules.keys())
329
- return paragraph_names
330
-
331
-
332
- def parse_end_number(
333
- main_paragraphs: list[dict[str, Any]], paragraph_names: set[str]
334
- ) -> int | None:
335
- """
336
- Find the most frequent numeric suffix from tag names
337
- matched by given paragraph prefixes.
338
- """
339
- end_numbers: dict[int, int] = {}
340
- sorted_names = sorted(paragraph_names, key=len, reverse=True)
341
-
342
- def rec_parse(item: list[Any] | dict[str, Any]) -> None:
343
- if isinstance(item, list):
344
- for element in item:
345
- rec_parse(element)
346
- elif isinstance(item, dict):
347
- tag = item.get("tag")
348
- if isinstance(tag, str):
349
- for prefix in sorted_names:
350
- if tag.startswith(prefix):
351
- remain = tag[len(prefix) :]
352
- if remain.isdigit():
353
- num = int(remain)
354
- end_numbers[num] = end_numbers.get(num, 0) + 1
355
- break
356
- for val in item.values():
357
- if isinstance(val, (list | dict)):
358
- rec_parse(val)
359
-
360
- rec_parse(main_paragraphs)
361
-
362
- if not end_numbers:
363
- logger.warning("[Parser] No valid ending numbers found")
364
- return None
365
-
366
- sorted_numbers = sorted(
367
- end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
368
- )
369
-
370
- logger.debug(
371
- "[Parser] Top 3 end numbers:\n%s",
372
- "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
373
- )
374
-
375
- return sorted_numbers[0][0]
376
-
377
-
378
325
  def render_paragraphs(
379
326
  main_paragraphs: list[dict[str, Any]],
380
327
  rules: dict[str, Any],
381
- end_number: int,
328
+ end_number: str = "",
382
329
  ) -> tuple[str, list[str]]:
383
330
  """
384
331
  Applies the parsed CSS rules to the paragraph structure and
@@ -392,7 +339,6 @@ def render_paragraphs(
392
339
  :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
393
340
  - rules['orders']: List of (selector, id) tuples.
394
341
  - rules['rules']: Nested dict containing transformation rules.
395
- :param end_number: HTML tag suffix (e.g. span123 -> 123).
396
342
 
397
343
  :return:
398
344
  - A reconstructed paragraph string with line breaks.
@@ -488,3 +434,84 @@ def render_paragraphs(
488
434
  paragraphs_str += "\n\n"
489
435
 
490
436
  return paragraphs_str, refl_list
437
+
438
+
439
+ def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
440
+ """
441
+ Extract all paragraph selector names from parsed rules, excluding "sy".
442
+ """
443
+ paragraph_names = set()
444
+ for group, group_rules in rules.get("rules", {}).items():
445
+ if group == "sy":
446
+ continue
447
+ paragraph_names.update(group_rules.keys())
448
+ return paragraph_names
449
+
450
+
451
+ def parse_end_number(
452
+ main_paragraphs: list[dict[str, Any]],
453
+ rules: dict[str, Any],
454
+ ) -> str:
455
+ """
456
+ Find the most frequent numeric suffix from tag names
457
+ matched by given paragraph prefixes.
458
+ """
459
+ paragraph_names = parse_paragraph_names(rules)
460
+ end_numbers: dict[int, int] = {}
461
+ prefix_hits = 0
462
+ sorted_names = sorted(paragraph_names, key=len, reverse=True)
463
+
464
+ def rec_parse(item: list[Any] | dict[str, Any]) -> None:
465
+ nonlocal prefix_hits
466
+ if isinstance(item, list):
467
+ for element in item:
468
+ rec_parse(element)
469
+ elif isinstance(item, dict):
470
+ tag = item.get("tag")
471
+ if isinstance(tag, str):
472
+ for prefix in sorted_names:
473
+ if tag.startswith(prefix):
474
+ prefix_hits += 1
475
+ remain = tag[len(prefix) :]
476
+ if remain.isdigit():
477
+ num = int(remain)
478
+ end_numbers[num] = end_numbers.get(num, 0) + 1
479
+ break
480
+ for val in item.values():
481
+ if isinstance(val, (list | dict)):
482
+ rec_parse(val)
483
+
484
+ rec_parse(main_paragraphs)
485
+
486
+ if not end_numbers:
487
+ logger.debug("[Parser] No valid ending numbers found")
488
+ return ""
489
+
490
+ sorted_numbers = sorted(
491
+ end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
492
+ )
493
+
494
+ logger.debug(
495
+ "[Parser] Top 3 end numbers:\n%s",
496
+ "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
497
+ )
498
+ most_common_number, most_common_count = sorted_numbers[0]
499
+ if most_common_count <= prefix_hits / 2:
500
+ logger.debug(
501
+ "[Parser] Top number (%s) does not exceed 50%% threshold: %d of %d",
502
+ most_common_number,
503
+ most_common_count,
504
+ prefix_hits,
505
+ )
506
+ return ""
507
+
508
+ return str(most_common_number)
509
+
510
+
511
+ def contains_keywords(paragraphs: list[dict[str, Any]], keywords: set[str]) -> bool:
512
+ for para in paragraphs:
513
+ data = para.get("data", [])
514
+ for item in data:
515
+ if isinstance(item, str) and any(kw in item for kw in keywords):
516
+ return True
517
+ return False
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.qidian.chapter_normal
4
+ ---------------------------------------------------
5
+
6
+ Parser logic for extracting readable text from Qidian chapters
7
+ that use plain (non-encrypted) browser-rendered HTML.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from typing import TYPE_CHECKING
14
+
15
+ from lxml import html
16
+
17
+ from novel_downloader.models import ChapterDict
18
+
19
+ from .utils import (
20
+ extract_chapter_info,
21
+ find_ssr_page_context,
22
+ get_decryptor,
23
+ vip_status,
24
+ )
25
+
26
+ if TYPE_CHECKING:
27
+ from .main_parser import QidianParser
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ def parse_normal_chapter(
33
+ parser: QidianParser,
34
+ html_str: str,
35
+ chapter_id: str,
36
+ ) -> ChapterDict | None:
37
+ """
38
+ Extract structured chapter info from a normal Qidian page.
39
+
40
+ :param html_str: Chapter HTML.
41
+ :param chapter_id: Chapter identifier (string).
42
+ :return: a dictionary with keys like 'id', 'title', 'content', etc.
43
+ """
44
+ try:
45
+ ssr_data = find_ssr_page_context(html_str)
46
+ chapter_info = extract_chapter_info(ssr_data)
47
+ if not chapter_info:
48
+ logger.warning(
49
+ "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
50
+ )
51
+ return None
52
+
53
+ title = chapter_info.get("chapterName", "Untitled")
54
+ raw_html = chapter_info.get("content", "")
55
+ chapter_id = chapter_info.get("chapterId", chapter_id)
56
+ fkp = chapter_info.get("fkp", "")
57
+ author_say = chapter_info.get("authorSay", "")
58
+ update_time = chapter_info.get("updateTime", "")
59
+ update_timestamp = chapter_info.get("updateTimestamp", 0)
60
+ modify_time = chapter_info.get("modifyTime", 0)
61
+ word_count = chapter_info.get("wordsCount", 0)
62
+ seq = chapter_info.get("seq", None)
63
+ volume = chapter_info.get("extra", {}).get("volumeName", "")
64
+
65
+ chapter_text = _parse_browser_paragraph(html_str)
66
+ if not chapter_text:
67
+ chapter_text = _parse_session_paragraph(
68
+ html_str=raw_html,
69
+ is_vip=vip_status(ssr_data),
70
+ chapter_id=chapter_id,
71
+ fkp=fkp,
72
+ fuid=parser._fuid,
73
+ )
74
+ if not chapter_text:
75
+ return None
76
+
77
+ return {
78
+ "id": str(chapter_id),
79
+ "title": title,
80
+ "content": chapter_text,
81
+ "extra": {
82
+ "author_say": author_say.strip() if author_say else "",
83
+ "updated_at": update_time,
84
+ "update_timestamp": update_timestamp,
85
+ "modify_time": modify_time,
86
+ "word_count": word_count,
87
+ "seq": seq,
88
+ "volume": volume,
89
+ "encrypted": False,
90
+ },
91
+ }
92
+ except Exception as e:
93
+ logger.warning(
94
+ "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
95
+ )
96
+ return None
97
+
98
+
99
+ def _parse_browser_paragraph(html_str: str) -> str:
100
+ try:
101
+ tree = html.fromstring(html_str)
102
+ main = tree.xpath('//div[@id="app"]//div[@id="reader-content"]//main')
103
+ if not main:
104
+ return ""
105
+ main = main[0]
106
+
107
+ content_spans = main.xpath('.//span[contains(@class, "content-text")]')
108
+
109
+ paragraph_texts = [
110
+ span.text_content().strip()
111
+ for span in content_spans
112
+ if span.text_content().strip()
113
+ ]
114
+
115
+ chapter_text = "\n\n".join(paragraph_texts)
116
+ return chapter_text
117
+
118
+ except Exception as e:
119
+ logger.error("[Parser] _parse_paragraph failed: %s", e)
120
+ return ""
121
+
122
+
123
+ def _parse_session_paragraph(
124
+ html_str: str,
125
+ is_vip: bool,
126
+ chapter_id: str,
127
+ fkp: str,
128
+ fuid: str,
129
+ ) -> str:
130
+ try:
131
+ raw_html = html_str
132
+
133
+ if is_vip:
134
+ try:
135
+ decryptor = get_decryptor()
136
+ raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, fuid)
137
+ except Exception as e:
138
+ logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
139
+ return ""
140
+
141
+ tree = html.fromstring(raw_html)
142
+ paras = tree.xpath(".//p")
143
+ paragraph_texts = [
144
+ p.text_content().strip() for p in paras if p.text_content().strip()
145
+ ]
146
+ return "\n\n".join(paragraph_texts)
147
+
148
+ except Exception as e:
149
+ logger.error("[Parser] _parse_paragraph failed: %s", e)
150
+ return ""
@@ -1,9 +1,9 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- novel_downloader.core.parsers.qidian.session.chapter_router
4
- -----------------------------------------------------------
3
+ novel_downloader.core.parsers.qidian.chapter_router
4
+ ---------------------------------------------------
5
5
 
6
- Routing logic for selecting the correct chapter parser for Qidian session pages.
6
+ Routing logic for selecting the correct chapter parser for Qidian pages.
7
7
  """
8
8
 
9
9
  from __future__ import annotations
@@ -11,50 +11,50 @@ from __future__ import annotations
11
11
  import logging
12
12
  from typing import TYPE_CHECKING
13
13
 
14
- from novel_downloader.utils.chapter_storage import ChapterDict
14
+ from novel_downloader.models import ChapterDict
15
15
 
16
- from ..shared import (
16
+ from .chapter_normal import parse_normal_chapter
17
+ from .utils import (
17
18
  can_view_chapter,
18
- html_to_soup,
19
+ find_ssr_page_context,
19
20
  is_encrypted,
20
21
  )
21
- from .chapter_normal import parse_normal_chapter
22
22
 
23
23
  if TYPE_CHECKING:
24
- from .main_parser import QidianSessionParser
24
+ from .main_parser import QidianParser
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
28
 
29
29
  def parse_chapter(
30
- parser: QidianSessionParser,
30
+ parser: QidianParser,
31
31
  html_str: str,
32
32
  chapter_id: str,
33
33
  ) -> ChapterDict | None:
34
34
  """
35
35
  Extract and return the formatted textual content of chapter.
36
36
 
37
- :param parser: Instance of QidianSessionParser.
37
+ :param parser: Instance of QidianParser.
38
38
  :param html_str: Raw HTML content of the chapter page.
39
39
  :param chapter_id: Identifier of the chapter being parsed.
40
40
  :return: Formatted chapter text or empty string if not parsable.
41
41
  """
42
42
  try:
43
- soup = html_to_soup(html_str)
43
+ ssr_data = find_ssr_page_context(html_str)
44
44
 
45
- if not can_view_chapter(soup):
45
+ if not can_view_chapter(ssr_data):
46
46
  logger.warning(
47
47
  "[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
48
48
  )
49
49
  return None
50
50
 
51
- if is_encrypted(soup):
51
+ if is_encrypted(ssr_data):
52
52
  if not parser._decode_font:
53
53
  return None
54
54
  try:
55
55
  from .chapter_encrypted import parse_encrypted_chapter
56
56
 
57
- return parse_encrypted_chapter(parser, soup, chapter_id, parser._fuid)
57
+ return parse_encrypted_chapter(parser, html_str, chapter_id)
58
58
  except ImportError:
59
59
  logger.warning(
60
60
  "[Parser] Encrypted chapter '%s' requires extra dependencies.",
@@ -62,7 +62,7 @@ def parse_chapter(
62
62
  )
63
63
  return None
64
64
 
65
- return parse_normal_chapter(soup, chapter_id, parser._fuid)
65
+ return parse_normal_chapter(parser, html_str, chapter_id)
66
66
  except Exception as e:
67
67
  logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
68
68
  return None