novel-downloader 1.3.3__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (211)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/clean.py +97 -78
  3. novel_downloader/cli/config.py +177 -0
  4. novel_downloader/cli/download.py +132 -87
  5. novel_downloader/cli/export.py +77 -0
  6. novel_downloader/cli/main.py +21 -28
  7. novel_downloader/config/__init__.py +1 -25
  8. novel_downloader/config/adapter.py +32 -31
  9. novel_downloader/config/loader.py +3 -3
  10. novel_downloader/config/site_rules.py +1 -2
  11. novel_downloader/core/__init__.py +3 -6
  12. novel_downloader/core/downloaders/__init__.py +10 -13
  13. novel_downloader/core/downloaders/base.py +233 -0
  14. novel_downloader/core/downloaders/biquge.py +27 -0
  15. novel_downloader/core/downloaders/common.py +414 -0
  16. novel_downloader/core/downloaders/esjzone.py +27 -0
  17. novel_downloader/core/downloaders/linovelib.py +27 -0
  18. novel_downloader/core/downloaders/qianbi.py +27 -0
  19. novel_downloader/core/downloaders/qidian.py +352 -0
  20. novel_downloader/core/downloaders/sfacg.py +27 -0
  21. novel_downloader/core/downloaders/yamibo.py +27 -0
  22. novel_downloader/core/exporters/__init__.py +37 -0
  23. novel_downloader/core/{savers → exporters}/base.py +73 -39
  24. novel_downloader/core/exporters/biquge.py +25 -0
  25. novel_downloader/core/exporters/common/__init__.py +12 -0
  26. novel_downloader/core/{savers → exporters}/common/epub.py +22 -22
  27. novel_downloader/core/{savers/common/main_saver.py → exporters/common/main_exporter.py} +35 -40
  28. novel_downloader/core/{savers → exporters}/common/txt.py +20 -23
  29. novel_downloader/core/{savers → exporters}/epub_utils/__init__.py +8 -3
  30. novel_downloader/core/{savers → exporters}/epub_utils/css_builder.py +2 -2
  31. novel_downloader/core/{savers → exporters}/epub_utils/image_loader.py +46 -4
  32. novel_downloader/core/{savers → exporters}/epub_utils/initializer.py +6 -4
  33. novel_downloader/core/{savers → exporters}/epub_utils/text_to_html.py +3 -3
  34. novel_downloader/core/{savers → exporters}/epub_utils/volume_intro.py +2 -2
  35. novel_downloader/core/exporters/esjzone.py +25 -0
  36. novel_downloader/core/exporters/linovelib/__init__.py +10 -0
  37. novel_downloader/core/exporters/linovelib/epub.py +449 -0
  38. novel_downloader/core/exporters/linovelib/main_exporter.py +127 -0
  39. novel_downloader/core/exporters/linovelib/txt.py +129 -0
  40. novel_downloader/core/exporters/qianbi.py +25 -0
  41. novel_downloader/core/{savers → exporters}/qidian.py +8 -8
  42. novel_downloader/core/exporters/sfacg.py +25 -0
  43. novel_downloader/core/exporters/yamibo.py +25 -0
  44. novel_downloader/core/factory/__init__.py +5 -17
  45. novel_downloader/core/factory/downloader.py +24 -126
  46. novel_downloader/core/factory/exporter.py +58 -0
  47. novel_downloader/core/factory/fetcher.py +96 -0
  48. novel_downloader/core/factory/parser.py +17 -12
  49. novel_downloader/core/{requesters → fetchers}/__init__.py +22 -15
  50. novel_downloader/core/{requesters → fetchers}/base/__init__.py +2 -4
  51. novel_downloader/core/fetchers/base/browser.py +383 -0
  52. novel_downloader/core/fetchers/base/rate_limiter.py +86 -0
  53. novel_downloader/core/fetchers/base/session.py +419 -0
  54. novel_downloader/core/fetchers/biquge/__init__.py +14 -0
  55. novel_downloader/core/{requesters/biquge/async_session.py → fetchers/biquge/browser.py} +18 -6
  56. novel_downloader/core/{requesters → fetchers}/biquge/session.py +23 -30
  57. novel_downloader/core/fetchers/common/__init__.py +14 -0
  58. novel_downloader/core/fetchers/common/browser.py +79 -0
  59. novel_downloader/core/{requesters/common/async_session.py → fetchers/common/session.py} +8 -25
  60. novel_downloader/core/fetchers/esjzone/__init__.py +14 -0
  61. novel_downloader/core/fetchers/esjzone/browser.py +202 -0
  62. novel_downloader/core/{requesters/esjzone/async_session.py → fetchers/esjzone/session.py} +62 -42
  63. novel_downloader/core/fetchers/linovelib/__init__.py +14 -0
  64. novel_downloader/core/fetchers/linovelib/browser.py +193 -0
  65. novel_downloader/core/fetchers/linovelib/session.py +193 -0
  66. novel_downloader/core/fetchers/qianbi/__init__.py +14 -0
  67. novel_downloader/core/{requesters/qianbi/session.py → fetchers/qianbi/browser.py} +30 -48
  68. novel_downloader/core/{requesters/qianbi/async_session.py → fetchers/qianbi/session.py} +18 -6
  69. novel_downloader/core/fetchers/qidian/__init__.py +14 -0
  70. novel_downloader/core/fetchers/qidian/browser.py +266 -0
  71. novel_downloader/core/fetchers/qidian/session.py +326 -0
  72. novel_downloader/core/fetchers/sfacg/__init__.py +14 -0
  73. novel_downloader/core/fetchers/sfacg/browser.py +189 -0
  74. novel_downloader/core/{requesters/sfacg/async_session.py → fetchers/sfacg/session.py} +43 -73
  75. novel_downloader/core/fetchers/yamibo/__init__.py +14 -0
  76. novel_downloader/core/fetchers/yamibo/browser.py +229 -0
  77. novel_downloader/core/{requesters/yamibo/async_session.py → fetchers/yamibo/session.py} +62 -44
  78. novel_downloader/core/interfaces/__init__.py +8 -12
  79. novel_downloader/core/interfaces/downloader.py +54 -0
  80. novel_downloader/core/interfaces/{saver.py → exporter.py} +12 -12
  81. novel_downloader/core/interfaces/fetcher.py +162 -0
  82. novel_downloader/core/interfaces/parser.py +6 -7
  83. novel_downloader/core/parsers/__init__.py +5 -6
  84. novel_downloader/core/parsers/base.py +9 -13
  85. novel_downloader/core/parsers/biquge/main_parser.py +12 -13
  86. novel_downloader/core/parsers/common/helper.py +3 -3
  87. novel_downloader/core/parsers/common/main_parser.py +39 -34
  88. novel_downloader/core/parsers/esjzone/main_parser.py +20 -14
  89. novel_downloader/core/parsers/linovelib/__init__.py +10 -0
  90. novel_downloader/core/parsers/linovelib/main_parser.py +210 -0
  91. novel_downloader/core/parsers/qianbi/main_parser.py +21 -15
  92. novel_downloader/core/parsers/qidian/__init__.py +2 -11
  93. novel_downloader/core/parsers/qidian/book_info_parser.py +113 -0
  94. novel_downloader/core/parsers/qidian/{browser/chapter_encrypted.py → chapter_encrypted.py} +162 -135
  95. novel_downloader/core/parsers/qidian/chapter_normal.py +150 -0
  96. novel_downloader/core/parsers/qidian/{session/chapter_router.py → chapter_router.py} +15 -15
  97. novel_downloader/core/parsers/qidian/{browser/main_parser.py → main_parser.py} +49 -40
  98. novel_downloader/core/parsers/qidian/utils/__init__.py +27 -0
  99. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +145 -0
  100. novel_downloader/core/parsers/qidian/{shared → utils}/helpers.py +41 -68
  101. novel_downloader/core/parsers/qidian/{session → utils}/node_decryptor.py +64 -50
  102. novel_downloader/core/parsers/sfacg/main_parser.py +12 -12
  103. novel_downloader/core/parsers/yamibo/main_parser.py +10 -10
  104. novel_downloader/locales/en.json +18 -2
  105. novel_downloader/locales/zh.json +18 -2
  106. novel_downloader/models/__init__.py +64 -0
  107. novel_downloader/models/browser.py +21 -0
  108. novel_downloader/models/chapter.py +25 -0
  109. novel_downloader/models/config.py +100 -0
  110. novel_downloader/models/login.py +20 -0
  111. novel_downloader/models/site_rules.py +99 -0
  112. novel_downloader/models/tasks.py +33 -0
  113. novel_downloader/models/types.py +15 -0
  114. novel_downloader/resources/config/settings.toml +31 -25
  115. novel_downloader/resources/json/linovelib_font_map.json +3573 -0
  116. novel_downloader/tui/__init__.py +7 -0
  117. novel_downloader/tui/app.py +32 -0
  118. novel_downloader/tui/main.py +17 -0
  119. novel_downloader/tui/screens/__init__.py +14 -0
  120. novel_downloader/tui/screens/home.py +191 -0
  121. novel_downloader/tui/screens/login.py +74 -0
  122. novel_downloader/tui/styles/home_layout.tcss +79 -0
  123. novel_downloader/tui/widgets/richlog_handler.py +24 -0
  124. novel_downloader/utils/__init__.py +6 -0
  125. novel_downloader/utils/chapter_storage.py +25 -38
  126. novel_downloader/utils/constants.py +11 -5
  127. novel_downloader/utils/cookies.py +66 -0
  128. novel_downloader/utils/crypto_utils.py +1 -74
  129. novel_downloader/utils/fontocr/ocr_v1.py +2 -1
  130. novel_downloader/utils/fontocr/ocr_v2.py +2 -2
  131. novel_downloader/utils/hash_store.py +10 -18
  132. novel_downloader/utils/hash_utils.py +3 -2
  133. novel_downloader/utils/logger.py +2 -3
  134. novel_downloader/utils/network.py +2 -1
  135. novel_downloader/utils/text_utils/chapter_formatting.py +6 -1
  136. novel_downloader/utils/text_utils/font_mapping.py +1 -1
  137. novel_downloader/utils/text_utils/text_cleaning.py +1 -1
  138. novel_downloader/utils/time_utils/datetime_utils.py +3 -3
  139. novel_downloader/utils/time_utils/sleep_utils.py +1 -1
  140. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/METADATA +69 -35
  141. novel_downloader-1.4.1.dist-info/RECORD +170 -0
  142. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/WHEEL +1 -1
  143. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/entry_points.txt +1 -0
  144. novel_downloader/cli/interactive.py +0 -66
  145. novel_downloader/cli/settings.py +0 -177
  146. novel_downloader/config/models.py +0 -187
  147. novel_downloader/core/downloaders/base/__init__.py +0 -14
  148. novel_downloader/core/downloaders/base/base_async.py +0 -153
  149. novel_downloader/core/downloaders/base/base_sync.py +0 -208
  150. novel_downloader/core/downloaders/biquge/__init__.py +0 -14
  151. novel_downloader/core/downloaders/biquge/biquge_async.py +0 -27
  152. novel_downloader/core/downloaders/biquge/biquge_sync.py +0 -27
  153. novel_downloader/core/downloaders/common/__init__.py +0 -14
  154. novel_downloader/core/downloaders/common/common_async.py +0 -210
  155. novel_downloader/core/downloaders/common/common_sync.py +0 -202
  156. novel_downloader/core/downloaders/esjzone/__init__.py +0 -14
  157. novel_downloader/core/downloaders/esjzone/esjzone_async.py +0 -27
  158. novel_downloader/core/downloaders/esjzone/esjzone_sync.py +0 -27
  159. novel_downloader/core/downloaders/qianbi/__init__.py +0 -14
  160. novel_downloader/core/downloaders/qianbi/qianbi_async.py +0 -27
  161. novel_downloader/core/downloaders/qianbi/qianbi_sync.py +0 -27
  162. novel_downloader/core/downloaders/qidian/__init__.py +0 -10
  163. novel_downloader/core/downloaders/qidian/qidian_sync.py +0 -219
  164. novel_downloader/core/downloaders/sfacg/__init__.py +0 -14
  165. novel_downloader/core/downloaders/sfacg/sfacg_async.py +0 -27
  166. novel_downloader/core/downloaders/sfacg/sfacg_sync.py +0 -27
  167. novel_downloader/core/downloaders/yamibo/__init__.py +0 -14
  168. novel_downloader/core/downloaders/yamibo/yamibo_async.py +0 -27
  169. novel_downloader/core/downloaders/yamibo/yamibo_sync.py +0 -27
  170. novel_downloader/core/factory/requester.py +0 -144
  171. novel_downloader/core/factory/saver.py +0 -56
  172. novel_downloader/core/interfaces/async_downloader.py +0 -36
  173. novel_downloader/core/interfaces/async_requester.py +0 -84
  174. novel_downloader/core/interfaces/sync_downloader.py +0 -36
  175. novel_downloader/core/interfaces/sync_requester.py +0 -82
  176. novel_downloader/core/parsers/qidian/browser/__init__.py +0 -12
  177. novel_downloader/core/parsers/qidian/browser/chapter_normal.py +0 -93
  178. novel_downloader/core/parsers/qidian/browser/chapter_router.py +0 -71
  179. novel_downloader/core/parsers/qidian/session/__init__.py +0 -12
  180. novel_downloader/core/parsers/qidian/session/chapter_encrypted.py +0 -443
  181. novel_downloader/core/parsers/qidian/session/chapter_normal.py +0 -115
  182. novel_downloader/core/parsers/qidian/session/main_parser.py +0 -128
  183. novel_downloader/core/parsers/qidian/shared/__init__.py +0 -37
  184. novel_downloader/core/parsers/qidian/shared/book_info_parser.py +0 -150
  185. novel_downloader/core/requesters/base/async_session.py +0 -410
  186. novel_downloader/core/requesters/base/browser.py +0 -337
  187. novel_downloader/core/requesters/base/session.py +0 -378
  188. novel_downloader/core/requesters/biquge/__init__.py +0 -14
  189. novel_downloader/core/requesters/common/__init__.py +0 -17
  190. novel_downloader/core/requesters/common/session.py +0 -113
  191. novel_downloader/core/requesters/esjzone/__init__.py +0 -13
  192. novel_downloader/core/requesters/esjzone/session.py +0 -235
  193. novel_downloader/core/requesters/qianbi/__init__.py +0 -13
  194. novel_downloader/core/requesters/qidian/__init__.py +0 -21
  195. novel_downloader/core/requesters/qidian/broswer.py +0 -307
  196. novel_downloader/core/requesters/qidian/session.py +0 -290
  197. novel_downloader/core/requesters/sfacg/__init__.py +0 -13
  198. novel_downloader/core/requesters/sfacg/session.py +0 -242
  199. novel_downloader/core/requesters/yamibo/__init__.py +0 -13
  200. novel_downloader/core/requesters/yamibo/session.py +0 -237
  201. novel_downloader/core/savers/__init__.py +0 -34
  202. novel_downloader/core/savers/biquge.py +0 -25
  203. novel_downloader/core/savers/common/__init__.py +0 -12
  204. novel_downloader/core/savers/esjzone.py +0 -25
  205. novel_downloader/core/savers/qianbi.py +0 -25
  206. novel_downloader/core/savers/sfacg.py +0 -25
  207. novel_downloader/core/savers/yamibo.py +0 -25
  208. novel_downloader/resources/config/rules.toml +0 -196
  209. novel_downloader-1.3.3.dist-info/RECORD +0 -166
  210. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/licenses/LICENSE +0 -0
  211. {novel_downloader-1.3.3.dist-info → novel_downloader-1.4.1.dist-info}/top_level.txt +0 -0

novel_downloader/core/parsers/esjzone/main_parser.py

@@ -5,13 +5,13 @@ novel_downloader.core.parsers.esjzone.main_parser
 
 """
 
+import re
 from typing import Any
 
-from lxml import etree
-from lxml.etree import _Element
+from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
-from novel_downloader.utils.chapter_storage import ChapterDict
+from novel_downloader.models import ChapterDict
 
 
 class EsjzoneParser(BaseParser):
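
Across this release the parsers switch from lxml.etree.HTML to lxml.html.fromstring. Both parse an HTML string and answer the same XPath queries, but the latter returns an lxml.html.HtmlElement with HTML-aware helpers such as text_content(), which the Qianbi parser below uses in place of an etree.tostring(..., method="text") round-trip. A minimal sketch of the difference (the HTML snippet is made up):

    # Sketch of the lxml API change seen throughout this diff.
    # The HTML snippet is made up; both calls are public lxml APIs.
    from lxml import etree, html

    doc = '<div class="page-title"><h1>Sample</h1></div>'

    old_tree = etree.HTML(doc)       # lxml.etree._Element
    new_tree = html.fromstring(doc)  # lxml.html.HtmlElement

    # The same XPath works against either tree:
    assert old_tree.xpath("string(//h1)") == new_tree.xpath("string(//h1)")

    # HtmlElement adds HTML-aware helpers such as text_content():
    print(new_tree.text_content())   # -> Sample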

@@ -40,7 +40,7 @@ class EsjzoneParser(BaseParser):
 
     def parse_book_info(
         self,
-        html_str: list[str],
+        html_list: list[str],
         **kwargs: Any,
     ) -> dict[str, Any]:
         """

@@ -49,12 +49,12 @@ class EsjzoneParser(BaseParser):
         Note: the site uses several different volume layouts; the common
         cases are handled, but uncovered cases may remain.
 
-        :param html: Raw HTML of the book info page.
+        :param html_list: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        if not html_str or self._is_forum_page(html_str):
+        if not html_list or self._is_forum_page(html_list):
             return {}
-        tree = etree.HTML(html_str[0])
+        tree = html.fromstring(html_list[0])
         result: dict[str, Any] = {}
 
         result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)

@@ -75,8 +75,14 @@ class EsjzoneParser(BaseParser):
         volumes: list[dict[str, Any]] = []
         current_vol: dict[str, Any] = {}
 
+        def _is_garbage_title(name: str) -> bool:
+            stripped = name.strip()
+            return not stripped or bool(re.fullmatch(r"[\W_]+", stripped))
+
         def _start_volume(name: str) -> None:
             nonlocal current_vol
+            if _is_garbage_title(name):
+                return
             name = name.strip() or "未命名卷"
             if name == "未命名卷" and current_vol is not None:
                 return
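
The new _is_garbage_title guard skips separator headings that contain only punctuation, symbols, or underscores. Python's re module is Unicode-aware by default, so CJK titles count as word characters and pass through. A standalone copy of the helper with made-up sample titles:

    import re

    def _is_garbage_title(name: str) -> bool:
        stripped = name.strip()
        return not stripped or bool(re.fullmatch(r"[\W_]+", stripped))

    print(_is_garbage_title("*** ---"))  # True: punctuation/symbols only
    print(_is_garbage_title("____"))     # True: underscores only
    print(_is_garbage_title("   "))      # True: blank after strip()
    print(_is_garbage_title("第一卷"))    # False: CJK characters match \w
    print(_is_garbage_title("Vol. 1"))   # False: contains word characters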

@@ -94,7 +100,7 @@ class EsjzoneParser(BaseParser):
             tag = node.tag.lower()
 
             if tag == "details":
-                # ---- DETAILSbased layout ----
+                # ---- DETAILS-based layout ----
                 summary = node.find("summary")
                 vol_name = summary.text if summary is not None else "未命名卷"
                 _start_volume(vol_name)

@@ -134,20 +140,20 @@ class EsjzoneParser(BaseParser):
 
     def parse_chapter(
         self,
-        html_str: list[str],
+        html_list: list[str],
         chapter_id: str,
         **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
 
-        :param html: Raw HTML of the chapter page.
+        :param html_list: Raw HTML of the chapter page.
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        if not html_str or self._is_forum_page(html_str):
+        if not html_list or self._is_forum_page(html_list):
             return None
-        tree = etree.HTML(html_str[0], parser=None)
+        tree = html.fromstring(html_list[0], parser=None)
 
         content_lines: list[str] = []
         content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)

@@ -198,7 +204,7 @@ class EsjzoneParser(BaseParser):
         if not html_str:
             return False
 
-        tree = etree.HTML(html_str[0])
+        tree = html.fromstring(html_str[0])
         page_title = tree.xpath('string(//div[@class="page-title"]//h1)').strip()
         if page_title != "論壇":
             return False

@@ -208,7 +214,7 @@ class EsjzoneParser(BaseParser):
 
     @staticmethod
     def _get_text(
-        tree: _Element,
+        tree: html.HtmlElement,
         xpath: str,
         join: bool = False,
         clean_comma: bool = False,

novel_downloader/core/parsers/linovelib/__init__.py

@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.linovelib
+---------------------------------------
+
+"""
+
+from .main_parser import LinovelibParser
+
+__all__ = ["LinovelibParser"]

novel_downloader/core/parsers/linovelib/main_parser.py

@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.linovelib.main_parser
+---------------------------------------------------
+
+"""
+
+import json
+from itertools import islice
+from pathlib import PurePosixPath
+from typing import Any
+
+from lxml import html
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.models import ChapterDict
+from novel_downloader.utils.constants import LINOVELIB_FONT_MAP_PATH
+
+
+class LinovelibParser(BaseParser):
+    """ """
+
+    # Book info XPaths
+    _BOOK_NAME_XPATH = '//div[@class="book-info"]/h1[@class="book-name"]/text()'
+    _AUTHOR_XPATH = '//div[@class="au-name"]/a[1]/text()'
+    _COVER_URL_XPATH = '//div[contains(@class, "book-img")]//img/@src'
+    _UPDATE_TIME_XPATH = (
+        '//div[@class="nums"]/span[contains(text(), "最后更新")]/text()'  # noqa: E501
+    )
+    _SERIAL_STATUS_XPATH = '//div[@class="book-label"]/a[@class="state"]/text()'
+    _WORD_COUNT_XPATH = '//div[@class="nums"]/span[contains(text(), "字数")]/text()'
+    _SUMMARY_XPATH = '//div[contains(@class, "book-dec")]/p//text()'
+
+    _CHAPTERS_XPATH = '//div[@class="book-new-chapter"]/div[contains(@class, "tit")]/a'
+
+    # Chapter XPaths
+    _CHAPTER_TITLE_XPATH = "//div[@id='mlfy_main_text']/h1/text()"
+    _CHAPTER_CONTENT_NODES_XPATH = "//div[@id='TextContent']/*[self::p or self::img]"
+
+    _FONT_MAP: dict[str, str] = json.loads(
+        LINOVELIB_FONT_MAP_PATH.read_text(encoding="utf-8")
+    )  # Note: do not modify the first 3500 entries of the JSON unless necessary
+    _BLANK_SET: set[str] = set(islice(_FONT_MAP.values(), 3500))
+
+    def parse_book_info(
+        self,
+        html_list: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        :param html_list: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if not html_list:
+            return {}
+        info_tree = html.fromstring(html_list[0])
+        result: dict[str, Any] = {}
+
+        result["book_name"] = self._safe_xpath(info_tree, self._BOOK_NAME_XPATH)
+        result["author"] = self._safe_xpath(info_tree, self._AUTHOR_XPATH)
+        result["cover_url"] = self._safe_xpath(info_tree, self._COVER_URL_XPATH)
+        result["update_time"] = self._safe_xpath(
+            info_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
+        )
+        result["serial_status"] = self._safe_xpath(info_tree, self._SERIAL_STATUS_XPATH)
+        result["word_count"] = self._safe_xpath(
+            info_tree, self._WORD_COUNT_XPATH, replace=("字数:", "")
+        )
+
+        result["summary"] = self._extract_intro(info_tree, self._SUMMARY_XPATH)
+
+        vol_pages = html_list[1:]
+        volumes: list[dict[str, Any]] = []
+        for vol_page in vol_pages:
+            vol_tree = html.fromstring(vol_page)
+            volume_cover = self._safe_xpath(vol_tree, self._COVER_URL_XPATH)
+            volume_name = self._safe_xpath(vol_tree, self._BOOK_NAME_XPATH)
+            update_time = self._safe_xpath(
+                vol_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
+            )
+            word_count = self._safe_xpath(
+                vol_tree, self._WORD_COUNT_XPATH, replace=("字数:", "")
+            )
+            volume_intro = self._extract_intro(vol_tree, self._SUMMARY_XPATH)
+
+            chapters = []
+            chapter_elements = vol_tree.xpath(self._CHAPTERS_XPATH)
+            for a in chapter_elements:
+                title = a.text.strip()
+                url = a.attrib.get("href", "").strip()
+                chap_path = PurePosixPath(url.rstrip("/"))
+                chapters.append(
+                    {"title": title, "url": url, "chapterId": chap_path.stem}
+                )
+
+            volumes.append(
+                {
+                    "volume_name": volume_name,
+                    "volume_cover": volume_cover,
+                    "update_time": update_time,
+                    "word_count": word_count,
+                    "volume_intro": volume_intro,
+                    "chapters": chapters,
+                }
+            )
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_list: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse chapter pages and extract clean text or simplified HTML.
+
+        :param html_list: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_list:
+            return None
+        title_text: str = ""
+        contents: list[str] = []
+        for curr_html in html_list:
+            is_encrypted = self._is_encrypted(curr_html)
+            tree = html.fromstring(curr_html)
+
+            if not title_text:
+                titles = tree.xpath(self._CHAPTER_TITLE_XPATH)
+                if titles:
+                    title_text = titles[0].strip()
+
+            content_container = tree.xpath("//div[@id='TextContent']")
+            if not content_container:
+                continue
+            container = content_container[0]
+            nodes = container.xpath("./p | ./img")
+            all_p = container.xpath("./p")
+            total_p = len(all_p)
+            p_counter = 0
+
+            for node in nodes:
+                tag = node.tag.lower()
+                if tag == "p":
+                    raw_text = "".join(node.xpath(".//text()")).strip()
+                    if not raw_text:
+                        continue
+
+                    if is_encrypted and p_counter == total_p - 2:
+                        raw_text = self._apply_font_map(raw_text)
+
+                    contents.append(raw_text)
+                    p_counter += 1
+
+                elif tag == "img":
+                    src = node.get("data-src") or node.get("src", "")
+                    src = src.strip()
+                    if src:
+                        contents.append(f'<img src="{src}" />')
+        return {
+            "id": chapter_id,
+            "title": title_text,
+            "content": "\n\n".join(contents),
+            "extra": {"site": "linovelib"},
+        }
+
+    def _safe_xpath(
+        self,
+        tree: html.HtmlElement,
+        path: str,
+        replace: tuple[str, str] | None = None,
+    ) -> str:
+        result = tree.xpath(path)
+        if not result:
+            return ""
+        value: str = result[0].strip()
+        if replace:
+            old, new = replace
+            value = value.replace(old, new)
+        return value
+
+    @staticmethod
+    def _extract_intro(tree: html.HtmlElement, xpath: str) -> str:
+        paragraphs = tree.xpath(xpath.replace("//text()", ""))
+        lines = []
+        for p in paragraphs:
+            text_segments = p.xpath(".//text()")
+            cleaned = [seg.strip() for seg in text_segments if seg.strip()]
+            lines.append("\n".join(cleaned))
+        return "\n\n".join(lines)
+
+    @staticmethod
+    def _is_encrypted(html: str) -> bool:
+        """
+        Determine whether HTML content likely uses encrypted or obfuscated fonts.
+        """
+        return "CSSStyleSheet" in html
+
+    @classmethod
+    def _apply_font_map(cls, text: str) -> str:
+        """
+        Apply the font map to the input text,
+        skipping characters in the blank set.
+        """
+        return "".join(cls._FONT_MAP.get(c, c) for c in text if c not in cls._BLANK_SET)

novel_downloader/core/parsers/qianbi/main_parser.py

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-novel_downloader.core.parsers.biquge.main_parser
+novel_downloader.core.parsers.qianbi.main_parser
 ------------------------------------------------
 
 """

@@ -8,10 +8,10 @@ novel_downloader.core.parsers.biquge.main_parser
 
 from datetime import datetime
 from typing import Any
 
-from lxml import etree
+from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
-from novel_downloader.utils.chapter_storage import ChapterDict
+from novel_downloader.models import ChapterDict
 
 
 class QianbiParser(BaseParser):

@@ -19,20 +19,20 @@ class QianbiParser(BaseParser):
 
     def parse_book_info(
         self,
-        html_str: list[str],
+        html_list: list[str],
         **kwargs: Any,
     ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
-        :param html: Raw HTML of the book info page.
+        :param html_list: Raw HTML of the book info pages.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        if len(html_str) < 2:
+        if len(html_list) < 2:
             return {}
 
-        info_tree = etree.HTML(html_str[0])
-        catalog_tree = etree.HTML(html_str[1])
+        info_tree = html.fromstring(html_list[0])
+        catalog_tree = html.fromstring(html_list[1])
         result: dict[str, Any] = {}
 
         title = info_tree.xpath('//h1[@class="page-title"]/text()')

@@ -56,9 +56,7 @@ class QianbiParser(BaseParser):
             '//div[@class="novel-info-item novel-info-content"]/span'
         )
         if summary_node and summary_node[0] is not None:
-            result["summary"] = etree.tostring(
-                summary_node[0], encoding="unicode", method="text"
-            ).strip()
+            result["summary"] = summary_node[0].text_content().strip()
        else:
            result["summary"] = ""
 

@@ -85,6 +83,8 @@ class QianbiParser(BaseParser):
                 if a_tag:
                     title = a_tag[0].xpath(".//span/text()")
                     href = a_tag[0].attrib.get("href", "")
+                    if href == "javascript:cid(0)":
+                        href = ""
                     chapter_id = (
                         href.split("/")[-1].replace(".html", "") if href else ""
                     )

@@ -105,20 +105,20 @@ class QianbiParser(BaseParser):
 
     def parse_chapter(
         self,
-        html_str: list[str],
+        html_list: list[str],
         chapter_id: str,
         **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
 
-        :param html: Raw HTML of the chapter page.
+        :param html_list: Raw HTML of the chapter page.
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        if not html_str:
+        if not html_list:
             return None
-        tree = etree.HTML(html_str[0])
+        tree = html.fromstring(html_list[0])
 
         paras = tree.xpath('//div[@class="article-content"]/p/text()')
         content_text = "\n\n".join(p.strip() for p in paras if p.strip())

@@ -131,6 +131,11 @@ class QianbiParser(BaseParser):
         volume = tree.xpath('//h3[@class="text-muted"]/text()')
         volume_text = volume[0].strip() if volume else ""
 
+        next_href = tree.xpath('//div[@class="footer"]/a[@class="f-right"]/@href')
+        next_chapter_id = (
+            next_href[0].split("/")[-1].replace(".html", "") if next_href else ""
+        )
+
         return {
             "id": chapter_id,
             "title": title_text,

@@ -138,5 +143,6 @@ class QianbiParser(BaseParser):
             "extra": {
                 "site": "qianbi",
                 "volume": volume_text,
+                "next_chapter_id": next_chapter_id,
             },
         }
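
Both the new javascript:cid(0) guard and the added next_chapter_id field lean on the same slug convention: a Qianbi chapter id is the last path segment of the href with its .html suffix stripped, and placeholder hrefs are blanked first so they yield an empty id. For example (URL made up):

    # Hypothetical Qianbi-style chapter link.
    href = "/book/4321/100123.html"
    chapter_id = href.split("/")[-1].replace(".html", "") if href else ""
    print(chapter_id)  # -> "100123"

    # Placeholder links are blanked before the split, producing an empty id.
    href = ""  # was "javascript:cid(0)"
    print(href.split("/")[-1].replace(".html", "") if href else "")  # -> ""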

novel_downloader/core/parsers/qidian/__init__.py

@@ -3,17 +3,8 @@
 novel_downloader.core.parsers.qidian
 ------------------------------------
 
-This package provides parsing implementations for the Qidian platform.
-
-Modules:
-- browser: Contains `QidianBrowserParser` for browser-rendered page parsing.
-- session: Contains `QidianSessionParser` for session page parsing.
 """
 
-from .browser import QidianBrowserParser
-from .session import QidianSessionParser
+from .main_parser import QidianParser
 
-__all__ = [
-    "QidianBrowserParser",
-    "QidianSessionParser",
-]
+__all__ = ["QidianParser"]

novel_downloader/core/parsers/qidian/book_info_parser.py

@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.qidian.book_info_parser
+-----------------------------------------------------
+
+This module provides parsing of Qidian book info pages.
+
+It extracts metadata such as title, author, cover URL, update
+time, status, word count, summary, and volume-chapter structure.
+"""
+
+import logging
+from typing import Any
+
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+_AUTHOR_XPATH = (
+    'string(//div[contains(@class, "book-info")]//a[contains(@class, "writer")])'
+)
+
+
+def _chapter_url_to_id(url: str) -> str:
+    return url.rstrip("/").split("/")[-1]
+
+
+def _get_volume_name(
+    vol_elem: html.HtmlElement,
+) -> str:
+    """
+    Extracts the volume title from a <div class="volume"> element using lxml.
+    Ignores <a> tags, and extracts text from other elements.
+    """
+    h3_candidates = vol_elem.xpath(".//h3")
+    if not h3_candidates:
+        return ""
+    texts = vol_elem.xpath(".//h3//text()[not(ancestor::a)]")
+    full_text = "".join(texts).strip()
+    return full_text.split(chr(183))[0].strip()
+
+
+def parse_book_info(html_str: str) -> dict[str, Any]:
+    """
+    Extract metadata: title, author, cover_url, update_time, status,
+    word_count, summary, and volumes with chapters.
+
+    :param html_str: Raw HTML of the book info page.
+    :return: A dict containing book metadata.
+    """
+    info: dict[str, Any] = {}
+    try:
+        doc = html.fromstring(html_str)
+
+        book_name = doc.xpath('string(//h1/em[@id="bookName"])').strip()
+        info["book_name"] = book_name
+
+        author = doc.xpath(_AUTHOR_XPATH).strip()
+        info["author"] = author
+
+        cover_url = doc.xpath('string(//div[@class="book-img"]//img/@src)').strip()
+        info["cover_url"] = cover_url
+
+        update_raw = (
+            doc.xpath('string(//span[contains(@class, "update-time")])')
+            .replace("更新时间", "")
+            .strip()
+        )
+        info["update_time"] = update_raw
+
+        status = doc.xpath('string(//p[@class="tag"]/span[@class="blue"][1])').strip()
+        info["serial_status"] = status
+
+        tags = doc.xpath('//p[@class="tag"]/a[@class="red"]/text()')
+        info["tags"] = [t.strip() for t in tags if t.strip()]
+
+        wc_number = doc.xpath("string(//p[em and cite][1]/em[1])").strip()
+        wc_unit = doc.xpath("string(//p[em and cite][1]/cite[1])").strip()
+        info["word_count"] = (
+            (wc_number + wc_unit) if wc_number and wc_unit else "Unknown"
+        )
+
+        summary = doc.xpath('string(//p[@class="intro"])').strip()
+        info["summary_brief"] = summary
+
+        intro_list = doc.xpath('//div[@class="book-intro"]/p')[0]
+        detail_intro = "\n".join(intro_list.itertext()).strip()
+        info["summary"] = detail_intro
+
+        volumes = []
+        for vol_div in doc.xpath('//div[@class="volume-wrap"]/div[@class="volume"]'):
+            volume_name = _get_volume_name(vol_div)
+            chapters = []
+            for li in vol_div.xpath(".//li"):
+                a = li.xpath(".//a")[0] if li.xpath(".//a") else None
+                if a is None or "href" not in a.attrib:
+                    continue
+                href = a.attrib["href"].strip()
+                title = "".join(a.itertext()).strip()
+                chapters.append(
+                    {
+                        "title": title,
+                        "url": href,
+                        "chapterId": _chapter_url_to_id(href),
+                    }
+                )
+            volumes.append({"volume_name": volume_name, "chapters": chapters})
+        info["volumes"] = volumes
+
+    except Exception as e:
+        logger.warning("[Parser] Error parsing book info: %s", e)
+
+    return info
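
A minimal usage sketch of the new module-level parse_book_info (the import path follows the file location introduced in this diff). The markup is a made-up reduction of a Qidian info page, just enough to satisfy the XPaths above; note the book-intro div, whose absence would raise inside the try block and abort volume parsing:

    from novel_downloader.core.parsers.qidian.book_info_parser import parse_book_info

    # Synthetic fixture, not a real Qidian page.
    sample = """
    <html><body>
      <h1><em id="bookName">示例书名</em></h1>
      <div class="book-info"><a class="writer">某作者</a></div>
      <p class="tag"><span class="blue">连载</span><a class="red">玄幻</a></p>
      <p class="intro">一句话简介</p>
      <div class="book-intro"><p>详细简介。</p></div>
      <div class="volume-wrap">
        <div class="volume">
          <h3>第一卷<a href="#">ignored</a></h3>
          <ul><li><a href="//www.qidian.com/chapter/1010/777">第一章</a></li></ul>
        </div>
      </div>
    </body></html>
    """

    info = parse_book_info(sample)
    print(info["book_name"])   # -> 示例书名
    print(info["author"])      # -> 某作者
    print(info["volumes"][0]["chapters"][0]["chapterId"])  # -> 777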