novel-downloader 1.3.0-py3-none-any.whl → 1.3.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (85)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +1 -1
  3. novel_downloader/config/adapter.py +3 -0
  4. novel_downloader/config/models.py +3 -0
  5. novel_downloader/core/downloaders/__init__.py +23 -1
  6. novel_downloader/core/downloaders/biquge/__init__.py +2 -0
  7. novel_downloader/core/downloaders/biquge/biquge_async.py +27 -0
  8. novel_downloader/core/downloaders/biquge/biquge_sync.py +5 -3
  9. novel_downloader/core/downloaders/common/common_async.py +5 -3
  10. novel_downloader/core/downloaders/common/common_sync.py +18 -10
  11. novel_downloader/core/downloaders/esjzone/__init__.py +14 -0
  12. novel_downloader/core/downloaders/esjzone/esjzone_async.py +27 -0
  13. novel_downloader/core/downloaders/esjzone/esjzone_sync.py +27 -0
  14. novel_downloader/core/downloaders/qianbi/__init__.py +14 -0
  15. novel_downloader/core/downloaders/qianbi/qianbi_async.py +27 -0
  16. novel_downloader/core/downloaders/qianbi/qianbi_sync.py +27 -0
  17. novel_downloader/core/downloaders/qidian/qidian_sync.py +9 -6
  18. novel_downloader/core/downloaders/sfacg/__init__.py +14 -0
  19. novel_downloader/core/downloaders/sfacg/sfacg_async.py +27 -0
  20. novel_downloader/core/downloaders/sfacg/sfacg_sync.py +27 -0
  21. novel_downloader/core/downloaders/yamibo/__init__.py +14 -0
  22. novel_downloader/core/downloaders/yamibo/yamibo_async.py +27 -0
  23. novel_downloader/core/downloaders/yamibo/yamibo_sync.py +27 -0
  24. novel_downloader/core/factory/downloader.py +35 -7
  25. novel_downloader/core/factory/parser.py +23 -2
  26. novel_downloader/core/factory/requester.py +32 -7
  27. novel_downloader/core/factory/saver.py +14 -2
  28. novel_downloader/core/interfaces/async_requester.py +3 -3
  29. novel_downloader/core/interfaces/parser.py +7 -2
  30. novel_downloader/core/interfaces/sync_requester.py +3 -3
  31. novel_downloader/core/parsers/__init__.py +15 -5
  32. novel_downloader/core/parsers/base.py +7 -2
  33. novel_downloader/core/parsers/biquge/main_parser.py +13 -4
  34. novel_downloader/core/parsers/common/main_parser.py +13 -4
  35. novel_downloader/core/parsers/esjzone/__init__.py +10 -0
  36. novel_downloader/core/parsers/esjzone/main_parser.py +219 -0
  37. novel_downloader/core/parsers/qianbi/__init__.py +10 -0
  38. novel_downloader/core/parsers/qianbi/main_parser.py +142 -0
  39. novel_downloader/core/parsers/qidian/browser/main_parser.py +13 -4
  40. novel_downloader/core/parsers/qidian/session/main_parser.py +13 -4
  41. novel_downloader/core/parsers/sfacg/__init__.py +10 -0
  42. novel_downloader/core/parsers/sfacg/main_parser.py +166 -0
  43. novel_downloader/core/parsers/yamibo/__init__.py +10 -0
  44. novel_downloader/core/parsers/yamibo/main_parser.py +194 -0
  45. novel_downloader/core/requesters/__init__.py +33 -3
  46. novel_downloader/core/requesters/base/async_session.py +14 -10
  47. novel_downloader/core/requesters/base/browser.py +4 -7
  48. novel_downloader/core/requesters/base/session.py +25 -11
  49. novel_downloader/core/requesters/biquge/__init__.py +2 -0
  50. novel_downloader/core/requesters/biquge/async_session.py +71 -0
  51. novel_downloader/core/requesters/biquge/session.py +6 -6
  52. novel_downloader/core/requesters/common/async_session.py +4 -4
  53. novel_downloader/core/requesters/common/session.py +6 -6
  54. novel_downloader/core/requesters/esjzone/__init__.py +13 -0
  55. novel_downloader/core/requesters/esjzone/async_session.py +211 -0
  56. novel_downloader/core/requesters/esjzone/session.py +235 -0
  57. novel_downloader/core/requesters/qianbi/__init__.py +13 -0
  58. novel_downloader/core/requesters/qianbi/async_session.py +96 -0
  59. novel_downloader/core/requesters/qianbi/session.py +125 -0
  60. novel_downloader/core/requesters/qidian/broswer.py +11 -10
  61. novel_downloader/core/requesters/qidian/session.py +14 -11
  62. novel_downloader/core/requesters/sfacg/__init__.py +13 -0
  63. novel_downloader/core/requesters/sfacg/async_session.py +204 -0
  64. novel_downloader/core/requesters/sfacg/session.py +242 -0
  65. novel_downloader/core/requesters/yamibo/__init__.py +13 -0
  66. novel_downloader/core/requesters/yamibo/async_session.py +211 -0
  67. novel_downloader/core/requesters/yamibo/session.py +237 -0
  68. novel_downloader/core/savers/__init__.py +15 -3
  69. novel_downloader/core/savers/base.py +1 -0
  70. novel_downloader/core/savers/esjzone.py +25 -0
  71. novel_downloader/core/savers/qianbi.py +25 -0
  72. novel_downloader/core/savers/sfacg.py +25 -0
  73. novel_downloader/core/savers/yamibo.py +25 -0
  74. novel_downloader/locales/en.json +1 -0
  75. novel_downloader/locales/zh.json +1 -0
  76. novel_downloader/resources/config/settings.toml +40 -4
  77. novel_downloader/utils/time_utils/__init__.py +2 -1
  78. novel_downloader/utils/time_utils/datetime_utils.py +3 -1
  79. novel_downloader/utils/time_utils/sleep_utils.py +43 -1
  80. {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/METADATA +25 -20
  81. {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/RECORD +85 -47
  82. {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/WHEEL +0 -0
  83. {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/entry_points.txt +0 -0
  84. {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/licenses/LICENSE +0 -0
  85. {novel_downloader-1.3.0.dist-info → novel_downloader-1.3.2.dist-info}/top_level.txt +0 -0

novel_downloader/core/factory/parser.py
@@ -14,19 +14,40 @@ from novel_downloader.core.interfaces import ParserProtocol
 from novel_downloader.core.parsers import (
     BiqugeParser,
     CommonParser,
+    EsjzoneParser,
+    QianbiParser,
     QidianBrowserParser,
     QidianSessionParser,
+    SfacgParser,
+    YamiboParser,
 )
 
 ParserBuilder = Callable[[ParserConfig], ParserProtocol]
 
 _site_map: dict[str, dict[str, ParserBuilder]] = {
+    "biquge": {
+        "session": BiqugeParser,
+        "async": BiqugeParser,
+    },
+    "esjzone": {
+        "session": EsjzoneParser,
+        "async": EsjzoneParser,
+    },
+    "qianbi": {
+        "session": QianbiParser,
+        "async": QianbiParser,
+    },
     "qidian": {
         "browser": QidianBrowserParser,
         "session": QidianSessionParser,
     },
-    "biquge": {
-        "session": BiqugeParser,
+    "sfacg": {
+        "session": SfacgParser,
+        "async": SfacgParser,
+    },
+    "yamibo": {
+        "session": YamiboParser,
+        "async": YamiboParser,
     },
 }
 
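
Each site now maps both non-browser modes ("session" and "async") to the same parser class, since parsing does not depend on how the HTML was fetched. A factory consuming this map would resolve in two steps, site then mode. A minimal sketch (the get_parser name and error handling here are assumptions; the actual factory function is outside this hunk):

    from novel_downloader.config import ParserConfig
    from novel_downloader.core.interfaces import ParserProtocol

    def get_parser(site: str, mode: str, config: ParserConfig) -> ParserProtocol:
        # Two-step lookup over the _site_map above: site key first,
        # then requester mode ("browser" / "session" / "async").
        try:
            builder = _site_map[site.lower()][mode]
        except KeyError:
            raise ValueError(f"Unsupported site/mode: {site}/{mode}") from None
        return builder(config)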

novel_downloader/core/factory/requester.py
@@ -15,30 +15,55 @@ from novel_downloader.core.interfaces import (
     SyncRequesterProtocol,
 )
 from novel_downloader.core.requesters import (
+    BiqugeAsyncSession,
     BiqugeSession,
     CommonAsyncSession,
     CommonSession,
+    EsjzoneAsyncSession,
+    EsjzoneSession,
+    QianbiAsyncSession,
+    QianbiSession,
     QidianBrowser,
     QidianSession,
+    SfacgAsyncSession,
+    SfacgSession,
+    YamiboAsyncSession,
+    YamiboSession,
 )
 
 AsyncRequesterBuilder = Callable[[RequesterConfig], AsyncRequesterProtocol]
 SyncRequesterBuilder = Callable[[RequesterConfig], SyncRequesterProtocol]
 
 
-# _async_site_map: dict[str, AsyncRequesterBuilder] = {
-#     # "biquge": ...
-# }
+_async_site_map: dict[str, AsyncRequesterBuilder] = {
+    "biquge": BiqugeAsyncSession,
+    "esjzone": EsjzoneAsyncSession,
+    "qianbi": QianbiAsyncSession,
+    "sfacg": SfacgAsyncSession,
+    "yamibo": YamiboAsyncSession,
+}
 _sync_site_map: dict[
     str,
     dict[str, SyncRequesterBuilder],
 ] = {
+    "biquge": {
+        "session": BiqugeSession,
+    },
+    "esjzone": {
+        "session": EsjzoneSession,
+    },
+    "qianbi": {
+        "session": QianbiSession,
+    },
     "qidian": {
         "session": QidianSession,
         "browser": QidianBrowser,
     },
-    "biquge": {
-        "session": BiqugeSession,
+    "sfacg": {
+        "session": SfacgSession,
+    },
+    "yamibo": {
+        "session": YamiboSession,
     },
 }
 
@@ -57,8 +82,8 @@ def get_async_requester(
     site_key = site.lower()
 
     # site-specific
-    # if site_key in _async_site_map:
-    #     return _async_site_map[site_key](config)
+    if site_key in _async_site_map:
+        return _async_site_map[site_key](config)
 
     # fallback
     site_rules = load_site_rules()
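
With the async map filled in, get_async_requester now short-circuits for the five mapped sites and only reaches the load_site_rules() fallback (presumably a rule-driven CommonAsyncSession) for unmapped ones. A usage sketch, assuming RequesterConfig is constructible with defaults and that the site argument comes first, as the hunk suggests:

    from novel_downloader.config import RequesterConfig
    from novel_downloader.core.factory.requester import get_async_requester

    # "sfacg" resolves directly via _async_site_map; an unknown site key
    # would instead fall through to the rule-based fallback shown above.
    requester = get_async_requester("sfacg", RequesterConfig())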

novel_downloader/core/factory/saver.py
@@ -7,17 +7,29 @@ This module implements a factory function for creating saver instances
 based on the site name and parser mode specified in the configuration.
 """
 
+from collections.abc import Callable
+
 from novel_downloader.config import SaverConfig, load_site_rules
 from novel_downloader.core.interfaces import SaverProtocol
 from novel_downloader.core.savers import (
     BiqugeSaver,
     CommonSaver,
+    EsjzoneSaver,
+    QianbiSaver,
     QidianSaver,
+    SfacgSaver,
+    YamiboSaver,
 )
 
-_site_map = {
-    "qidian": QidianSaver,
+SaverBuilder = Callable[[SaverConfig], SaverProtocol]
+
+_site_map: dict[str, SaverBuilder] = {
     "biquge": BiqugeSaver,
+    "esjzone": EsjzoneSaver,
+    "qianbi": QianbiSaver,
+    "qidian": QidianSaver,
+    "sfacg": SfacgSaver,
+    "yamibo": YamiboSaver,
 }
 
 
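
Annotating the registry as dict[str, SaverBuilder] lets a type checker verify that every registered class accepts a SaverConfig and returns a SaverProtocol, which the previous untyped _site_map could not. A hypothetical accessor over this map (the real factory function in this file is outside the hunk; this sketch assumes the file's existing imports):

    def get_saver(site: str, config: SaverConfig) -> SaverProtocol:
        # Direct one-level lookup; savers have no browser/session split.
        try:
            return _site_map[site.lower()](config)
        except KeyError:
            raise ValueError(f"Unsupported site: {site}") from None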

novel_downloader/core/interfaces/async_requester.py
@@ -40,7 +40,7 @@ class AsyncRequesterProtocol(Protocol):
         self,
         book_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of the book info page asynchronously.
 
@@ -54,7 +54,7 @@ class AsyncRequesterProtocol(Protocol):
         book_id: str,
         chapter_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of a single chapter asynchronously.
 
@@ -68,7 +68,7 @@ class AsyncRequesterProtocol(Protocol):
         self,
         page: int = 1,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Optional: Retrieve the HTML content of the authenticated
         user's bookcase page asynchronously.

novel_downloader/core/interfaces/parser.py
@@ -21,7 +21,11 @@ class ParserProtocol(Protocol):
     - accept a book_id context for multi-step workflows.
     """
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse and return a dictionary of book information from the raw HTML.
 
@@ -32,8 +36,9 @@ class ParserProtocol(Protocol):
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse and return the text content of one chapter.

novel_downloader/core/interfaces/sync_requester.py
@@ -38,7 +38,7 @@ class SyncRequesterProtocol(Protocol):
         self,
         book_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of the book info page.
 
@@ -52,7 +52,7 @@ class SyncRequesterProtocol(Protocol):
         book_id: str,
         chapter_id: str,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Fetch the raw HTML (or JSON) of a single chapter.
 
@@ -66,7 +66,7 @@ class SyncRequesterProtocol(Protocol):
         self,
         page: int = 1,
         **kwargs: Any,
-    ) -> str:
+    ) -> list[str]:
         """
         Optional: Retrieve the HTML content of the authenticated user's bookcase page.
 
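
The thread running through all three interface files is the return-type change from str to list[str]: a single logical fetch may now yield several HTML documents (for example, a chapter or bookcase split across pages), and parsers receive the whole list, typically reading html_str[0] or joining the pages. A structural sketch of a conforming pair (the class names, method name, and page-splitting behavior are illustrative; the hunks show only parameter lists):

    from typing import Any

    class PagedRequester:
        """Toy requester: returns one HTML string per fetched page."""

        def get_book_chapter(
            self, book_id: str, chapter_id: str, **kwargs: Any
        ) -> list[str]:
            # A chapter split across two pages comes back as two documents.
            return ["<p>page 1</p>", "<p>page 2</p>"]

    class PagedParser:
        """Toy parser: consumes the page list, mirroring the guards below."""

        def parse_chapter(
            self, html_str: list[str], chapter_id: str, **kwargs: Any
        ) -> dict[str, Any] | None:
            if not html_str:  # same empty-list guard the site parsers add
                return None
            return {"id": chapter_id, "content": "\n".join(html_str)}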

novel_downloader/core/parsers/__init__.py
@@ -6,24 +6,34 @@ novel_downloader.core.parsers
 This package defines all site-specific parsing modules
 for the novel_downloader framework.
 
-Currently supported:
-- Qidian (起点中文网)
-
 Modules:
-- qidian_parser
-- common_parser
+- biquge (笔趣阁)
+- esjzone (ESJ Zone)
+- qianbi (铅笔小说)
+- qidian (起点中文网)
+- sfacg (SF轻小说)
+- yamibo (百合会)
+- common (通用架构)
 """
 
 from .biquge import BiqugeParser
 from .common import CommonParser
+from .esjzone import EsjzoneParser
+from .qianbi import QianbiParser
 from .qidian import (
     QidianBrowserParser,
     QidianSessionParser,
 )
+from .sfacg import SfacgParser
+from .yamibo import YamiboParser
 
 __all__ = [
     "BiqugeParser",
     "CommonParser",
+    "EsjzoneParser",
+    "QianbiParser",
     "QidianBrowserParser",
     "QidianSessionParser",
+    "SfacgParser",
+    "YamiboParser",
 ]
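
After this change the four new parsers are re-exported at the package level alongside the existing ones:

    from novel_downloader.core.parsers import (
        EsjzoneParser,
        QianbiParser,
        SfacgParser,
        YamiboParser,
    )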

novel_downloader/core/parsers/base.py
@@ -49,7 +49,11 @@ class BaseParser(ParserProtocol, abc.ABC):
         self._cache_dir = self._base_cache_dir
 
     @abc.abstractmethod
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
@@ -64,8 +68,9 @@ class BaseParser(ParserProtocol, abc.ABC):
     @abc.abstractmethod
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.

novel_downloader/core/parsers/biquge/main_parser.py
@@ -18,14 +18,20 @@ from novel_downloader.utils.chapter_storage import ChapterDict
 class BiqugeParser(BaseParser):
     """ """
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        tree = etree.HTML(html_str, parser=None)
+        if not html_str:
+            return {}
+        tree = etree.HTML(html_str[0])
         result: dict[str, Any] = {}
 
         def extract_text(elem: _Element | None) -> str:
@@ -90,8 +96,9 @@ class BiqugeParser(BaseParser):
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -100,7 +107,9 @@ class BiqugeParser(BaseParser):
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        tree = etree.HTML(html_str, parser=None)
+        if not html_str:
+            return None
+        tree = etree.HTML(html_str[0], parser=None)
 
         # 提取标题
         title_elem = tree.xpath('//div[@class="bookname"]/h1')
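
The new guard returns early on an empty page list, where html_str[0] would otherwise raise IndexError; only the first page is parsed. The title XPath can be sanity-checked against a minimal snippet (illustrative markup, not captured site HTML):

    from lxml import etree

    html = '<div class="bookname"><h1>第一章 示例</h1></div>'
    tree = etree.HTML(html, parser=None)
    print(tree.xpath('//div[@class="bookname"]/h1')[0].text)  # 第一章 示例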

novel_downloader/core/parsers/common/main_parser.py
@@ -35,21 +35,28 @@ class CommonParser(BaseParser):
         self._site = site
         self._site_rule = site_rule
 
-    def parse_book_info(self, html_str: str) -> dict[str, Any]:
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         :param html: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        extractor = HTMLExtractor(html_str)
+        if not html_str:
+            return {}
+        extractor = HTMLExtractor(html_str[0])
         rules = self._site_rule["book_info"]
         return extractor.extract_book_info(rules)
 
     def parse_chapter(
         self,
-        html_str: str,
+        html_str: list[str],
         chapter_id: str,
+        **kwargs: Any,
     ) -> ChapterDict | None:
         """
         Parse a single chapter page and extract clean text or simplified HTML.
@@ -58,7 +65,9 @@ class CommonParser(BaseParser):
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """
-        extractor = HTMLExtractor(html_str)
+        if not html_str:
+            return None
+        extractor = HTMLExtractor(html_str[0])
         chapter_rules = self._site_rule["chapter"]
 
         # 必须有正文内容

novel_downloader/core/parsers/esjzone/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.esjzone
+-------------------------------------
+
+"""
+
+from .main_parser import EsjzoneParser
+
+__all__ = ["EsjzoneParser"]

novel_downloader/core/parsers/esjzone/main_parser.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.esjzone.main_parser
+-------------------------------------------------
+
+"""
+
+from typing import Any
+
+from lxml import etree
+from lxml.etree import _Element
+
+from novel_downloader.core.parsers.base import BaseParser
+from novel_downloader.utils.chapter_storage import ChapterDict
+
+
+class EsjzoneParser(BaseParser):
+    """ """
+
+    # Book info XPaths
+    _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
+    _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
+    _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
+    _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
+    _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
+    _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
+    _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
+    _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
+    _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
+
+    # Chapter XPaths
+    _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
+    _CHAPTER_CONTENT_NODES_XPATH = '//div[contains(@class, "forum-content")]/*'
+    _CHAPTER_TIME_XPATHS = [
+        '//i[contains(@class, "icon-clock")]/following-sibling::text()',
+        '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
+    ]
+
+    _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()'  # noqa: E501
+
+    def parse_book_info(
+        self,
+        html_str: list[str],
+        **kwargs: Any,
+    ) -> dict[str, Any]:
+        """
+        Parse a book info page and extract metadata and chapter structure.
+
+        注: 由于网站使用了多种不同的分卷格式, 已经尝试兼容常见情况,
+        但仍可能存在未覆盖的 cases
+
+        :param html: Raw HTML of the book info page.
+        :return: Parsed metadata and chapter structure as a dictionary.
+        """
+        if not html_str or self._is_forum_page(html_str):
+            return {}
+        tree = etree.HTML(html_str[0])
+        result: dict[str, Any] = {}
+
+        result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)
+        result["author"] = self._get_text(tree, self._AUTHOR_XPATH)
+        result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
+        result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
+        result["word_count"] = self._get_text(
+            tree, self._WORD_COUNT_XPATH, clean_comma=True
+        )
+        result["type"] = self._get_text(tree, self._TYPE_XPATH)
+        result["alt_name"] = self._get_text(tree, self._ALT_NAME_XPATH)
+        result["web_url"] = self._get_text(tree, self._WEB_URL_XPATH)
+        # result["summary"] = self._get_text(tree, self._SUMMARY_XPATH, join=True)
+        paras = tree.xpath('//div[@class="description"]/p')
+        texts = [p.xpath("string()").strip() for p in paras]
+        result["summary"] = "\n".join(texts).strip()
+
+        volumes: list[dict[str, Any]] = []
+        current_vol: dict[str, Any] = {}
+
+        def _start_volume(name: str) -> None:
+            nonlocal current_vol
+            name = name.strip() or "未命名卷"
+            if name == "未命名卷" and current_vol is not None:
+                return
+            current_vol = {"volume_name": name, "chapters": []}
+            volumes.append(current_vol)
+
+        _start_volume("單卷")
+
+        nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
+            '//div[@id="chapterList"]/*[not(self::details)]'
+        )
+
+        for node in nodes:
+            tag = node.tag.lower()
+
+            if tag == "details":
+                # ---- DETAILS-based layout ----
+                summary = node.find("summary")
+                vol_name = summary.text if summary is not None else "未命名卷"
+                _start_volume(vol_name)
+
+                # all chapters inside this details
+                for a in node.findall("a"):
+                    title = "".join(a.xpath(".//p//text()")).strip()
+                    href = a.get("href", "")
+                    chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
+                    current_vol["chapters"].append(
+                        {"title": title, "url": href, "chapterId": chap_id}
+                    )
+
+            elif (
+                tag in ("h2",)
+                or (tag == "p" and node.get("class") == "non")
+                or tag == "summary"
+            ):
+                # Handle possible volume title markers:
+                # - <h2>: standard volume header
+                # - <p class="non">: alternative volume header style
+                # - <summary>: fallback for stray <summary> tags outside <details>
+                _start_volume(node.xpath("string()"))
+
+            elif tag == "a":
+                # ---- chapter link, attach to current volume ----
+                title = "".join(node.xpath(".//p//text()")).strip()
+                href = node.get("href", "")
+                chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
+                current_vol["chapters"].append(
+                    {"title": title, "url": href, "chapterId": chap_id}
+                )
+        volumes = [vol for vol in volumes if vol["chapters"]]
+        result["volumes"] = volumes
+
+        return result
+
+    def parse_chapter(
+        self,
+        html_str: list[str],
+        chapter_id: str,
+        **kwargs: Any,
+    ) -> ChapterDict | None:
+        """
+        Parse a single chapter page and extract clean text or simplified HTML.
+
+        :param html: Raw HTML of the chapter page.
+        :param chapter_id: Identifier of the chapter being parsed.
+        :return: Cleaned chapter content as plain text or minimal HTML.
+        """
+        if not html_str or self._is_forum_page(html_str):
+            return None
+        tree = etree.HTML(html_str[0], parser=None)
+
+        content_lines: list[str] = []
+        content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
+        for node in content_nodes:
+            if node.tag == "p":
+                img_srcs = node.xpath(".//img/@src")
+                if img_srcs:
+                    for src in img_srcs:
+                        content_lines.append(f'<img src="{src}" />')
+                else:
+                    text = "".join(node.xpath(".//text()")).strip()
+                    if text:
+                        content_lines.append(text)
+            elif node.tag == "a":
+                img_srcs = node.xpath(".//img/@src")
+                for src in img_srcs:
+                    content_lines.append(f'<img src="{src}" />')
+
+        content = (
+            "\n\n".join(content_lines).strip()
+            if content_lines
+            else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
+        )
+        if not content:
+            return None
+
+        title_nodes = tree.xpath("//h2/text()")
+        title = title_nodes[0].strip() if title_nodes else ""
+
+        updated_at = next(
+            (
+                x.strip()
+                for xp in self._CHAPTER_TIME_XPATHS
+                for x in tree.xpath(xp)
+                if x.strip()
+            ),
+            "",
+        )
+
+        return {
+            "id": chapter_id,
+            "title": title,
+            "content": content,
+            "extra": {"site": "esjzone", "updated_at": updated_at},
+        }
+
+    def _is_forum_page(self, html_str: list[str]) -> bool:
+        if not html_str:
+            return False
+
+        tree = etree.HTML(html_str[0])
+        page_title = tree.xpath('string(//div[@class="page-title"]//h1)').strip()
+        if page_title != "論壇":
+            return False
+        breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
+        breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
+        return breadcrumb == ["Home", "論壇"]
+
+    @staticmethod
+    def _get_text(
+        tree: _Element,
+        xpath: str,
+        join: bool = False,
+        clean_comma: bool = False,
+    ) -> str:
+        data = tree.xpath(xpath)
+        if not data:
+            return ""
+        text = "\n".join(data) if join else data[0].strip()
+        return text.replace(",", "") if clean_comma else text
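
Because the XPath constants are class attributes and _get_text is a static method, they can be exercised without constructing a parser instance (BaseParser.__init__'s arguments are not shown in this diff). A quick check against illustrative markup, not a captured ESJ Zone page:

    from lxml import etree

    from novel_downloader.core.parsers.esjzone import EsjzoneParser

    html = (
        '<h2 class="text-normal">示例書名</h2>'
        '<ul><li><strong>作者:</strong><a href="#">某作者</a></li></ul>'
    )
    tree = etree.HTML(html)
    print(EsjzoneParser._get_text(tree, EsjzoneParser._BOOK_NAME_XPATH))  # 示例書名
    print(EsjzoneParser._get_text(tree, EsjzoneParser._AUTHOR_XPATH))  # 某作者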

novel_downloader/core/parsers/qianbi/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.parsers.qianbi
+------------------------------------
+
+"""
+
+from .main_parser import QianbiParser
+
+__all__ = ["QianbiParser"]