novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. novel_downloader/__init__.py +1 -2
  2. novel_downloader/cli/__init__.py +0 -1
  3. novel_downloader/cli/clean.py +2 -10
  4. novel_downloader/cli/download.py +16 -22
  5. novel_downloader/cli/interactive.py +0 -1
  6. novel_downloader/cli/main.py +1 -3
  7. novel_downloader/cli/settings.py +8 -8
  8. novel_downloader/config/__init__.py +0 -1
  9. novel_downloader/config/adapter.py +32 -27
  10. novel_downloader/config/loader.py +116 -108
  11. novel_downloader/config/models.py +35 -29
  12. novel_downloader/config/site_rules.py +2 -4
  13. novel_downloader/core/__init__.py +0 -1
  14. novel_downloader/core/downloaders/__init__.py +4 -4
  15. novel_downloader/core/downloaders/base/__init__.py +14 -0
  16. novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
  17. novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
  18. novel_downloader/core/downloaders/biquge/__init__.py +12 -0
  19. novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
  20. novel_downloader/core/downloaders/common/__init__.py +14 -0
  21. novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
  22. novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +33 -21
  23. novel_downloader/core/downloaders/qidian/__init__.py +10 -0
  24. novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +79 -62
  25. novel_downloader/core/factory/__init__.py +4 -5
  26. novel_downloader/core/factory/{downloader_factory.py → downloader.py} +25 -26
  27. novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
  28. novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
  29. novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
  30. novel_downloader/core/interfaces/__init__.py +8 -9
  31. novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
  32. novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +23 -12
  33. novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
  34. novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
  35. novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
  36. novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +31 -17
  37. novel_downloader/core/parsers/__init__.py +5 -4
  38. novel_downloader/core/parsers/{base_parser.py → base.py} +18 -9
  39. novel_downloader/core/parsers/biquge/__init__.py +10 -0
  40. novel_downloader/core/parsers/biquge/main_parser.py +126 -0
  41. novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
  42. novel_downloader/core/parsers/{common_parser → common}/helper.py +13 -13
  43. novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
  44. novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
  45. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
  46. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +40 -48
  47. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
  48. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
  49. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +14 -10
  50. novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
  51. novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +36 -44
  52. novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
  53. novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
  54. novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +14 -10
  55. novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
  56. novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
  57. novel_downloader/core/parsers/{qidian_parser → qidian}/shared/book_info_parser.py +5 -6
  58. novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +7 -8
  59. novel_downloader/core/requesters/__init__.py +9 -5
  60. novel_downloader/core/requesters/base/__init__.py +16 -0
  61. novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +177 -73
  62. novel_downloader/core/requesters/base/browser.py +340 -0
  63. novel_downloader/core/requesters/base/session.py +364 -0
  64. novel_downloader/core/requesters/biquge/__init__.py +12 -0
  65. novel_downloader/core/requesters/biquge/session.py +90 -0
  66. novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
  67. novel_downloader/core/requesters/common/async_session.py +96 -0
  68. novel_downloader/core/requesters/common/session.py +113 -0
  69. novel_downloader/core/requesters/qidian/__init__.py +21 -0
  70. novel_downloader/core/requesters/qidian/broswer.py +307 -0
  71. novel_downloader/core/requesters/qidian/session.py +287 -0
  72. novel_downloader/core/savers/__init__.py +5 -3
  73. novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
  74. novel_downloader/core/savers/biquge.py +25 -0
  75. novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
  76. novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +23 -51
  77. novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
  78. novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
  79. novel_downloader/core/savers/epub_utils/__init__.py +0 -1
  80. novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
  81. novel_downloader/core/savers/epub_utils/initializer.py +4 -5
  82. novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
  83. novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
  84. novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
  85. novel_downloader/locales/en.json +8 -4
  86. novel_downloader/locales/zh.json +5 -1
  87. novel_downloader/resources/config/settings.toml +88 -0
  88. novel_downloader/utils/cache.py +2 -2
  89. novel_downloader/utils/chapter_storage.py +340 -0
  90. novel_downloader/utils/constants.py +6 -4
  91. novel_downloader/utils/crypto_utils.py +3 -3
  92. novel_downloader/utils/file_utils/__init__.py +0 -1
  93. novel_downloader/utils/file_utils/io.py +12 -17
  94. novel_downloader/utils/file_utils/normalize.py +1 -3
  95. novel_downloader/utils/file_utils/sanitize.py +2 -9
  96. novel_downloader/utils/fontocr/__init__.py +0 -1
  97. novel_downloader/utils/fontocr/ocr_v1.py +19 -22
  98. novel_downloader/utils/fontocr/ocr_v2.py +147 -60
  99. novel_downloader/utils/hash_store.py +19 -20
  100. novel_downloader/utils/hash_utils.py +0 -1
  101. novel_downloader/utils/i18n.py +3 -4
  102. novel_downloader/utils/logger.py +5 -6
  103. novel_downloader/utils/model_loader.py +5 -8
  104. novel_downloader/utils/network.py +9 -10
  105. novel_downloader/utils/state.py +6 -7
  106. novel_downloader/utils/text_utils/__init__.py +0 -1
  107. novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
  108. novel_downloader/utils/text_utils/diff_display.py +0 -1
  109. novel_downloader/utils/text_utils/font_mapping.py +1 -4
  110. novel_downloader/utils/text_utils/text_cleaning.py +0 -1
  111. novel_downloader/utils/time_utils/__init__.py +0 -1
  112. novel_downloader/utils/time_utils/datetime_utils.py +8 -10
  113. novel_downloader/utils/time_utils/sleep_utils.py +1 -3
  114. {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/METADATA +14 -17
  115. novel_downloader-1.3.1.dist-info/RECORD +127 -0
  116. {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/WHEEL +1 -1
  117. novel_downloader/core/requesters/base_browser.py +0 -214
  118. novel_downloader/core/requesters/base_session.py +0 -246
  119. novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
  120. novel_downloader/core/requesters/common_requester/common_session.py +0 -126
  121. novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
  122. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -396
  123. novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
  124. novel_downloader/resources/config/settings.yaml +0 -76
  125. novel_downloader-1.2.2.dist-info/RECORD +0 -115
  126. {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/entry_points.txt +0 -0
  127. {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/licenses/LICENSE +0 -0
  128. {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,26 +1,25 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.async_downloader_protocol
3
+ novel_downloader.core.interfaces.async_downloader
5
4
  ----------------------------------------------------------
6
5
 
7
6
  This module defines the AsyncDownloaderProtocol, a structural interface
8
7
  that outlines the expected behavior of any downloader class.
9
8
  """
10
9
 
11
- from typing import List, Protocol
10
+ from typing import Protocol
12
11
 
13
12
 
14
13
  class AsyncDownloaderProtocol(Protocol):
15
14
  """
16
- Protocol for fullyasynchronous downloader classes.
15
+ Protocol for fully-asynchronous downloader classes.
17
16
 
18
17
  Defines the expected interface for any downloader implementation,
19
18
  including both batch and single book downloads,
20
19
  as well as optional pre-download hooks.
21
20
  """
22
21
 
23
- async def download(self, book_ids: List[str]) -> None:
22
+ async def download(self, book_ids: list[str]) -> None:
24
23
  """
25
24
  Batch download entry point.
26
25
 
@@ -1,15 +1,14 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.async_requester_protocol
3
+ novel_downloader.core.interfaces.async_requester
5
4
  --------------------------------------------------------
6
5
 
7
6
  Defines the AsyncRequesterProtocol interface for fetching raw HTML or JSON
8
7
  for book info pages, individual chapters, managing request lifecycle,
9
- and optionally retrieving a user's authenticated bookcase — all in async style.
8
+ and optionally retrieving a user's authenticated bookcase.
10
9
  """
11
10
 
12
- from typing import Literal, Optional, Protocol, runtime_checkable
11
+ from typing import Any, Literal, Protocol, runtime_checkable
13
12
 
14
13
 
15
14
  @runtime_checkable
@@ -24,7 +23,13 @@ class AsyncRequesterProtocol(Protocol):
24
23
  def is_async(self) -> Literal[True]:
25
24
  ...
26
25
 
27
- async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
26
+ async def login(
27
+ self,
28
+ username: str = "",
29
+ password: str = "",
30
+ manual_login: bool = False,
31
+ **kwargs: Any,
32
+ ) -> bool:
28
33
  """
29
34
  Attempt to log in asynchronously.
30
35
  :returns: True if login succeeded.
@@ -32,41 +37,47 @@ class AsyncRequesterProtocol(Protocol):
32
37
  ...
33
38
 
34
39
  async def get_book_info(
35
- self, book_id: str, wait_time: Optional[float] = None
40
+ self,
41
+ book_id: str,
42
+ **kwargs: Any,
36
43
  ) -> str:
37
44
  """
38
45
  Fetch the raw HTML (or JSON) of the book info page asynchronously.
39
46
 
40
47
  :param book_id: The book identifier.
41
- :param wait_time: Base number of seconds to wait before returning content.
42
48
  :return: The page content as a string.
43
49
  """
44
50
  ...
45
51
 
46
52
  async def get_book_chapter(
47
- self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
53
+ self,
54
+ book_id: str,
55
+ chapter_id: str,
56
+ **kwargs: Any,
48
57
  ) -> str:
49
58
  """
50
59
  Fetch the raw HTML (or JSON) of a single chapter asynchronously.
51
60
 
52
61
  :param book_id: The book identifier.
53
62
  :param chapter_id: The chapter identifier.
54
- :param wait_time: Base number of seconds to wait before returning content.
55
63
  :return: The chapter content as a string.
56
64
  """
57
65
  ...
58
66
 
59
- async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
67
+ async def get_bookcase(
68
+ self,
69
+ page: int = 1,
70
+ **kwargs: Any,
71
+ ) -> str:
60
72
  """
61
73
  Optional: Retrieve the HTML content of the authenticated
62
74
  user's bookcase page asynchronously.
63
75
 
64
- :param wait_time: Base number of seconds to wait before returning content.
65
76
  :return: The HTML markup of the bookcase page.
66
77
  """
67
78
  ...
68
79
 
69
- async def shutdown(self) -> None:
80
+ async def close(self) -> None:
70
81
  """
71
82
  Shutdown and clean up any resources (e.g., close aiohttp session).
72
83
  """
@@ -1,14 +1,15 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.parser_protocol
5
- ------------------------------------------------
3
+ novel_downloader.core.interfaces.parser
4
+ ---------------------------------------
6
5
 
7
6
  Defines the ParserProtocol interface for extracting book metadata,
8
7
  parsing individual chapter content, and setting parser context via book_id.
9
8
  """
10
9
 
11
- from typing import Any, Dict, Protocol, runtime_checkable
10
+ from typing import Any, Protocol, runtime_checkable
11
+
12
+ from novel_downloader.utils.chapter_storage import ChapterDict
12
13
 
13
14
 
14
15
  @runtime_checkable
@@ -20,7 +21,7 @@ class ParserProtocol(Protocol):
20
21
  - accept a book_id context for multi-step workflows.
21
22
  """
22
23
 
23
- def parse_book_info(self, html_str: str) -> Dict[str, Any]:
24
+ def parse_book_info(self, html_str: str) -> dict[str, Any]:
24
25
  """
25
26
  Parse and return a dictionary of book information from the raw HTML.
26
27
 
@@ -29,7 +30,11 @@ class ParserProtocol(Protocol):
29
30
  """
30
31
  ...
31
32
 
32
- def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
33
+ def parse_chapter(
34
+ self,
35
+ html_str: str,
36
+ chapter_id: str,
37
+ ) -> ChapterDict | None:
33
38
  """
34
39
  Parse and return the text content of one chapter.
35
40
 
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.saver_protocol
5
- ------------------------------------------------
3
+ novel_downloader.core.interfaces.saver
4
+ --------------------------------------
6
5
 
7
6
  Defines the SaverProtocol interface for persisting completed books in
8
7
  TXT, EPUB, Markdown, and PDF formats.
@@ -1,17 +1,16 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.downloader_protocol
5
- ----------------------------------------------------
3
+ novel_downloader.core.interfaces.sync_downloader
4
+ ------------------------------------------------
6
5
 
7
- This module defines the DownloaderProtocol, a structural interface
6
+ This module defines the SyncDownloaderProtocol, a structural interface
8
7
  that outlines the expected behavior of any downloader class.
9
8
  """
10
9
 
11
- from typing import List, Protocol
10
+ from typing import Protocol
12
11
 
13
12
 
14
- class DownloaderProtocol(Protocol):
13
+ class SyncDownloaderProtocol(Protocol):
15
14
  """
16
15
  Protocol for downloader classes.
17
16
 
@@ -20,7 +19,7 @@ class DownloaderProtocol(Protocol):
20
19
  as well as optional pre-download hooks.
21
20
  """
22
21
 
23
- def download(self, book_ids: List[str]) -> None:
22
+ def download(self, book_ids: list[str]) -> None:
24
23
  """
25
24
  Batch download entry point.
26
25
 
@@ -1,19 +1,18 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.requester_protocol
5
- --------------------------------------------------
3
+ novel_downloader.core.interfaces.sync_requester
4
+ -----------------------------------------------
6
5
 
7
6
  Defines the RequesterProtocol interface for fetching raw HTML or JSON
8
7
  for book info pages, individual chapters, managing request lifecycle,
9
8
  and optionally retrieving a user's authenticated bookcase.
10
9
  """
11
10
 
12
- from typing import Literal, Optional, Protocol, runtime_checkable
11
+ from typing import Any, Literal, Protocol, runtime_checkable
13
12
 
14
13
 
15
14
  @runtime_checkable
16
- class RequesterProtocol(Protocol):
15
+ class SyncRequesterProtocol(Protocol):
17
16
  """
18
17
  A requester must be able to fetch raw HTML/data for:
19
18
  - a book's info page,
@@ -23,46 +22,61 @@ class RequesterProtocol(Protocol):
23
22
  def is_async(self) -> Literal[False]:
24
23
  ...
25
24
 
26
- def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
25
+ def login(
26
+ self,
27
+ username: str = "",
28
+ password: str = "",
29
+ manual_login: bool = False,
30
+ **kwargs: Any,
31
+ ) -> bool:
27
32
  """
28
33
  Attempt to log in
29
34
  """
30
35
  ...
31
36
 
32
- def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
37
+ def get_book_info(
38
+ self,
39
+ book_id: str,
40
+ **kwargs: Any,
41
+ ) -> str:
33
42
  """
34
43
  Fetch the raw HTML (or JSON) of the book info page.
35
44
 
36
45
  :param book_id: The book identifier.
37
- :param wait_time: Base number of seconds to wait before returning content.
38
46
  :return: The page content as a string.
39
47
  """
40
48
  ...
41
49
 
42
50
  def get_book_chapter(
43
- self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
51
+ self,
52
+ book_id: str,
53
+ chapter_id: str,
54
+ **kwargs: Any,
44
55
  ) -> str:
45
56
  """
46
57
  Fetch the raw HTML (or JSON) of a single chapter.
47
58
 
48
59
  :param book_id: The book identifier.
49
60
  :param chapter_id: The chapter identifier.
50
- :param wait_time: Base number of seconds to wait before returning content.
51
61
  :return: The chapter content as a string.
52
62
  """
53
63
  ...
54
64
 
55
- def shutdown(self) -> None:
65
+ def get_bookcase(
66
+ self,
67
+ page: int = 1,
68
+ **kwargs: Any,
69
+ ) -> str:
56
70
  """
57
- Shutdown and cleans up resources.
71
+ Optional: Retrieve the HTML content of the authenticated user's bookcase page.
72
+
73
+ :param page: Page idx
74
+ :return: The HTML markup of the bookcase page.
58
75
  """
59
76
  ...
60
77
 
61
- def get_bookcase(self, wait_time: Optional[float] = None) -> str:
78
+ def close(self) -> None:
62
79
  """
63
- Optional: Retrieve the HTML content of the authenticated user's bookcase page.
64
-
65
- :param wait_time: Base number of seconds to wait before returning content.
66
- :return: The HTML markup of the bookcase page.
80
+ Shutdown and cleans up resources.
67
81
  """
68
82
  ...
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
3
  novel_downloader.core.parsers
5
4
  -----------------------------
@@ -8,20 +7,22 @@ This package defines all site-specific parsing modules
8
7
  for the novel_downloader framework.
9
8
 
10
9
  Currently supported:
11
- - Qidian (起点中文网) via browser-rendered page parsing.
10
+ - Qidian (起点中文网)
12
11
 
13
12
  Modules:
14
13
  - qidian_parser
15
14
  - common_parser
16
15
  """
17
16
 
18
- from .common_parser import CommonParser
19
- from .qidian_parser import (
17
+ from .biquge import BiqugeParser
18
+ from .common import CommonParser
19
+ from .qidian import (
20
20
  QidianBrowserParser,
21
21
  QidianSessionParser,
22
22
  )
23
23
 
24
24
  __all__ = [
25
+ "BiqugeParser",
25
26
  "CommonParser",
26
27
  "QidianBrowserParser",
27
28
  "QidianSessionParser",
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.parsers.base_parser
5
- -----------------------------------------
3
+ novel_downloader.core.parsers.base
4
+ ----------------------------------
6
5
 
7
6
  This module defines the BaseParser abstract class, which implements the
8
7
  ParserProtocol interface and provides a structured foundation for
@@ -16,10 +15,11 @@ a standard parsing interface for:
16
15
 
17
16
  import abc
18
17
  from pathlib import Path
19
- from typing import Any, Dict, Optional
18
+ from typing import Any
20
19
 
21
20
  from novel_downloader.config import ParserConfig
22
21
  from novel_downloader.core.interfaces import ParserProtocol
22
+ from novel_downloader.utils.chapter_storage import ChapterDict
23
23
 
24
24
 
25
25
  class BaseParser(ParserProtocol, abc.ABC):
@@ -33,19 +33,23 @@ class BaseParser(ParserProtocol, abc.ABC):
33
33
  Subclasses must implement actual parsing logic for specific sites.
34
34
  """
35
35
 
36
- def __init__(self, config: ParserConfig):
36
+ def __init__(
37
+ self,
38
+ config: ParserConfig,
39
+ ):
37
40
  """
38
41
  Initialize the parser with a configuration object.
39
42
 
40
43
  :param config: ParserConfig object controlling parsing behavior.
41
44
  """
42
45
  self._config = config
43
- self._book_id: Optional[str] = None
46
+ self._book_id: str | None = None
44
47
 
45
48
  self._base_cache_dir = Path(config.cache_dir)
49
+ self._cache_dir = self._base_cache_dir
46
50
 
47
51
  @abc.abstractmethod
48
- def parse_book_info(self, html_str: str) -> Dict[str, Any]:
52
+ def parse_book_info(self, html_str: str) -> dict[str, Any]:
49
53
  """
50
54
  Parse a book info page and extract metadata and chapter structure.
51
55
 
@@ -58,7 +62,11 @@ class BaseParser(ParserProtocol, abc.ABC):
58
62
  ...
59
63
 
60
64
  @abc.abstractmethod
61
- def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
65
+ def parse_chapter(
66
+ self,
67
+ html_str: str,
68
+ chapter_id: str,
69
+ ) -> ChapterDict | None:
62
70
  """
63
71
  Parse a single chapter page and extract clean text or simplified HTML.
64
72
 
@@ -69,7 +77,7 @@ class BaseParser(ParserProtocol, abc.ABC):
69
77
  ...
70
78
 
71
79
  @property
72
- def book_id(self) -> Optional[str]:
80
+ def book_id(self) -> str | None:
73
81
  """
74
82
  Current book ID in context.
75
83
 
@@ -85,6 +93,7 @@ class BaseParser(ParserProtocol, abc.ABC):
85
93
  :param value: Book identifier.
86
94
  """
87
95
  self._book_id = value
96
+ self._cache_dir = self._base_cache_dir / value
88
97
  self._on_book_id_set()
89
98
 
90
99
  def _on_book_id_set(self) -> None:
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.biquge
4
+ ------------------------------------
5
+
6
+ """
7
+
8
+ from .main_parser import BiqugeParser
9
+
10
+ __all__ = ["BiqugeParser"]
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.biquge.main_parser
4
+ ------------------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import etree
12
+ from lxml.etree import _Element
13
+
14
+ from novel_downloader.core.parsers.base import BaseParser
15
+ from novel_downloader.utils.chapter_storage import ChapterDict
16
+
17
+
18
+ class BiqugeParser(BaseParser):
19
+ """ """
20
+
21
+ def parse_book_info(self, html_str: str) -> dict[str, Any]:
22
+ """
23
+ Parse a book info page and extract metadata and chapter structure.
24
+
25
+ :param html: Raw HTML of the book info page.
26
+ :return: Parsed metadata and chapter structure as a dictionary.
27
+ """
28
+ tree = etree.HTML(html_str, parser=None)
29
+ result: dict[str, Any] = {}
30
+
31
+ def extract_text(elem: _Element | None) -> str:
32
+ if elem is None:
33
+ return ""
34
+ return "".join(elem.itertext(tag=None)).strip()
35
+
36
+ # 书名
37
+ book_name_elem = tree.xpath('//div[@id="info"]/h1')
38
+ result["book_name"] = extract_text(book_name_elem[0]) if book_name_elem else ""
39
+
40
+ # 作者
41
+ author_elem = tree.xpath('//div[@id="info"]/p[1]')
42
+ if author_elem:
43
+ author_text = extract_text(author_elem[0]).replace("\u00a0", "")
44
+ match = re.search(r"作\s*者[::]?\s*(\S+)", author_text)
45
+ result["author"] = match.group(1).strip() if match else ""
46
+ else:
47
+ result["author"] = ""
48
+
49
+ # 封面
50
+ cover_elem = tree.xpath('//div[@id="fmimg"]/img/@src')
51
+ result["cover_url"] = cover_elem[0].strip() if cover_elem else ""
52
+
53
+ # 最后更新时间
54
+ update_elem = tree.xpath('//div[@id="info"]/p[3]')
55
+ if update_elem:
56
+ update_text = extract_text(update_elem[0])
57
+ match = re.search(r"最后更新[::]\s*(\S+)", update_text)
58
+ result["update_time"] = match.group(1).strip() if match else ""
59
+ else:
60
+ result["update_time"] = ""
61
+
62
+ # 简介
63
+ intro_elem = tree.xpath('//div[@id="intro"]')
64
+ result["summary"] = extract_text(intro_elem[0]) if intro_elem else ""
65
+
66
+ # 卷和章节
67
+ chapters = []
68
+ in_main_volume = False
69
+
70
+ list_dl = tree.xpath('//div[@id="list"]/dl')[0]
71
+ for elem in list_dl:
72
+ if elem.tag == "dt":
73
+ text = "".join(elem.itertext()).strip()
74
+ in_main_volume = "正文" in text
75
+ elif in_main_volume and elem.tag == "dd":
76
+ a: list[_Element] = elem.xpath("./a")
77
+ if a:
78
+ title = "".join(a[0].itertext(tag=None)).strip()
79
+ url = a[0].get("href", "").strip()
80
+ href_cleaned = url.replace(".html", "")
81
+ chapter_id_match = re.search(r"/(\d+)$", href_cleaned)
82
+ chapter_id = chapter_id_match.group(1) if chapter_id_match else ""
83
+ chapters.append(
84
+ {"title": title, "url": url, "chapterId": chapter_id}
85
+ )
86
+
87
+ result["volumes"] = [{"volume_name": "正文", "chapters": chapters}]
88
+
89
+ return result
90
+
91
+ def parse_chapter(
92
+ self,
93
+ html_str: str,
94
+ chapter_id: str,
95
+ ) -> ChapterDict | None:
96
+ """
97
+ Parse a single chapter page and extract clean text or simplified HTML.
98
+
99
+ :param html: Raw HTML of the chapter page.
100
+ :param chapter_id: Identifier of the chapter being parsed.
101
+ :return: Cleaned chapter content as plain text or minimal HTML.
102
+ """
103
+ tree = etree.HTML(html_str, parser=None)
104
+
105
+ # 提取标题
106
+ title_elem = tree.xpath('//div[@class="bookname"]/h1')
107
+ title = "".join(title_elem[0].itertext()).strip() if title_elem else ""
108
+ if not title:
109
+ title = f"第 {chapter_id} 章"
110
+
111
+ # 提取内容
112
+ content_elem = tree.xpath('//div[@id="content"]')
113
+ paragraphs = content_elem[0].xpath(".//p") if content_elem else []
114
+ paragraph_texts = [
115
+ "".join(p.itertext()).strip() for p in paragraphs if p is not None
116
+ ]
117
+ content = "\n\n".join([p for p in paragraph_texts if p])
118
+ if not content.strip():
119
+ return None
120
+
121
+ return {
122
+ "id": chapter_id,
123
+ "title": title,
124
+ "content": content,
125
+ "extra": {"site": "biquge"},
126
+ }
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.parsers.common_parser
5
- -------------------------------------------
3
+ novel_downloader.core.parsers.common
4
+ ------------------------------------
6
5
 
7
6
  This module provides a CommonParser class that implements
8
7
  general-purpose parsing logic for extracting novel metadata
@@ -1,15 +1,15 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.parsers.common_parser.helpers
5
- ---------------------------------------------------
3
+ novel_downloader.core.parsers.common.helpers
4
+ --------------------------------------------
6
5
 
7
6
  Shared utility functions for parsing Common pages.
8
7
  """
9
8
 
10
9
  import logging
11
10
  import re
12
- from typing import Any, Dict, Iterable, Iterator, List, Optional, cast
11
+ from collections.abc import Iterable, Iterator
12
+ from typing import Any, cast
13
13
 
14
14
  from bs4 import BeautifulSoup, Tag
15
15
 
@@ -47,7 +47,7 @@ class HTMLExtractor:
47
47
  self._html = html
48
48
  self._soup = html_to_soup(html)
49
49
 
50
- def extract_book_info(self, rules: BookInfoRules) -> Dict[str, Any]:
50
+ def extract_book_info(self, rules: BookInfoRules) -> dict[str, Any]:
51
51
  """
52
52
  Extract structured book information from HTML according to the given rules.
53
53
 
@@ -56,7 +56,7 @@ class HTMLExtractor:
56
56
  :param rules: Extraction configuration specifying how to extract.
57
57
  :return: A dictionary containing extracted book information.
58
58
  """
59
- book_info: Dict[str, Any] = {}
59
+ book_info: dict[str, Any] = {}
60
60
 
61
61
  for field_name, field_rules in rules.items():
62
62
  if field_rules is None:
@@ -72,7 +72,7 @@ class HTMLExtractor:
72
72
 
73
73
  return book_info
74
74
 
75
- def extract_field(self, steps: List[RuleStep]) -> str:
75
+ def extract_field(self, steps: list[RuleStep]) -> str:
76
76
  """
77
77
  Execute a list of extraction steps on the given HTML.
78
78
 
@@ -209,7 +209,7 @@ class HTMLExtractor:
209
209
  return str(current.get_text().strip())
210
210
  return str(current or "").strip()
211
211
 
212
- def extract_mixed_volumes(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
212
+ def extract_mixed_volumes(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
213
213
  """
214
214
  Special mode: mixed <volume> and <chapter> under same parent.
215
215
  (e.g., dt / dd pattern in BiQuGe)
@@ -228,8 +228,8 @@ class HTMLExtractor:
228
228
  "chapter_selector 和 volume_name_steps"
229
229
  )
230
230
 
231
- volumes: List[Dict[str, Any]] = []
232
- current_volume: Optional[Dict[str, Any]] = None
231
+ volumes: list[dict[str, Any]] = []
232
+ current_volume: dict[str, Any] | None = None
233
233
  if not chapter_steps_list:
234
234
  chapter_steps_list = []
235
235
  chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
@@ -258,7 +258,7 @@ class HTMLExtractor:
258
258
 
259
259
  return volumes
260
260
 
261
- def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
261
+ def extract_volume_blocks(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
262
262
  volume_selector = volume_rule.get("volume_selector")
263
263
  volume_name_steps = volume_rule.get("volume_name_steps")
264
264
  chapter_selector = volume_rule["chapter_selector"]
@@ -285,7 +285,7 @@ class HTMLExtractor:
285
285
 
286
286
  return volumes
287
287
 
288
- def extract_flat_chapters(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
288
+ def extract_flat_chapters(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
289
289
  chapter_selector = volume_rule["chapter_selector"]
290
290
  chapter_steps_list = volume_rule["chapter_steps"]
291
291
  volume_selector = volume_rule.get("volume_selector")
@@ -312,7 +312,7 @@ class HTMLExtractor:
312
312
 
313
313
  def extract_volumes_structure(
314
314
  self, volume_rule: VolumesRules
315
- ) -> List[Dict[str, Any]]:
315
+ ) -> list[dict[str, Any]]:
316
316
  volume_mode = volume_rule.get("volume_mode", "normal")
317
317
  if volume_mode == "mixed":
318
318
  return self.extract_mixed_volumes(volume_rule)