novel-downloader 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. novel_downloader/__init__.py +1 -2
  2. novel_downloader/cli/__init__.py +0 -1
  3. novel_downloader/cli/clean.py +2 -10
  4. novel_downloader/cli/download.py +18 -22
  5. novel_downloader/cli/interactive.py +0 -1
  6. novel_downloader/cli/main.py +1 -3
  7. novel_downloader/cli/settings.py +8 -8
  8. novel_downloader/config/__init__.py +0 -1
  9. novel_downloader/config/adapter.py +48 -18
  10. novel_downloader/config/loader.py +116 -108
  11. novel_downloader/config/models.py +41 -32
  12. novel_downloader/config/site_rules.py +2 -4
  13. novel_downloader/core/__init__.py +0 -1
  14. novel_downloader/core/downloaders/__init__.py +4 -4
  15. novel_downloader/core/downloaders/base/__init__.py +14 -0
  16. novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
  17. novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
  18. novel_downloader/core/downloaders/biquge/__init__.py +12 -0
  19. novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
  20. novel_downloader/core/downloaders/common/__init__.py +14 -0
  21. novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
  22. novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +34 -23
  23. novel_downloader/core/downloaders/qidian/__init__.py +10 -0
  24. novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +80 -64
  25. novel_downloader/core/factory/__init__.py +4 -5
  26. novel_downloader/core/factory/{downloader_factory.py → downloader.py} +36 -35
  27. novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
  28. novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
  29. novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
  30. novel_downloader/core/interfaces/__init__.py +8 -9
  31. novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
  32. novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +26 -12
  33. novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
  34. novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
  35. novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
  36. novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +34 -17
  37. novel_downloader/core/parsers/__init__.py +5 -4
  38. novel_downloader/core/parsers/{base_parser.py → base.py} +20 -11
  39. novel_downloader/core/parsers/biquge/__init__.py +10 -0
  40. novel_downloader/core/parsers/biquge/main_parser.py +126 -0
  41. novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
  42. novel_downloader/core/parsers/{common_parser → common}/helper.py +20 -18
  43. novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
  44. novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
  45. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
  46. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +41 -49
  47. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
  48. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
  49. novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +16 -12
  50. novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
  51. novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +37 -45
  52. novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
  53. novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
  54. novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +16 -12
  55. novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
  56. novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
  57. novel_downloader/core/parsers/qidian/shared/book_info_parser.py +150 -0
  58. novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +9 -10
  59. novel_downloader/core/requesters/__init__.py +9 -5
  60. novel_downloader/core/requesters/base/__init__.py +16 -0
  61. novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +180 -73
  62. novel_downloader/core/requesters/base/browser.py +340 -0
  63. novel_downloader/core/requesters/base/session.py +364 -0
  64. novel_downloader/core/requesters/biquge/__init__.py +12 -0
  65. novel_downloader/core/requesters/biquge/session.py +90 -0
  66. novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
  67. novel_downloader/core/requesters/common/async_session.py +96 -0
  68. novel_downloader/core/requesters/common/session.py +113 -0
  69. novel_downloader/core/requesters/qidian/__init__.py +21 -0
  70. novel_downloader/core/requesters/qidian/broswer.py +306 -0
  71. novel_downloader/core/requesters/qidian/session.py +287 -0
  72. novel_downloader/core/savers/__init__.py +5 -3
  73. novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
  74. novel_downloader/core/savers/biquge.py +25 -0
  75. novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
  76. novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +24 -52
  77. novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
  78. novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
  79. novel_downloader/core/savers/epub_utils/__init__.py +0 -1
  80. novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
  81. novel_downloader/core/savers/epub_utils/initializer.py +4 -5
  82. novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
  83. novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
  84. novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
  85. novel_downloader/locales/en.json +12 -4
  86. novel_downloader/locales/zh.json +9 -1
  87. novel_downloader/resources/config/settings.toml +88 -0
  88. novel_downloader/utils/cache.py +2 -2
  89. novel_downloader/utils/chapter_storage.py +340 -0
  90. novel_downloader/utils/constants.py +8 -5
  91. novel_downloader/utils/crypto_utils.py +3 -3
  92. novel_downloader/utils/file_utils/__init__.py +0 -1
  93. novel_downloader/utils/file_utils/io.py +12 -17
  94. novel_downloader/utils/file_utils/normalize.py +1 -3
  95. novel_downloader/utils/file_utils/sanitize.py +2 -9
  96. novel_downloader/utils/fontocr/__init__.py +0 -1
  97. novel_downloader/utils/fontocr/ocr_v1.py +19 -22
  98. novel_downloader/utils/fontocr/ocr_v2.py +147 -60
  99. novel_downloader/utils/hash_store.py +19 -20
  100. novel_downloader/utils/hash_utils.py +0 -1
  101. novel_downloader/utils/i18n.py +3 -4
  102. novel_downloader/utils/logger.py +5 -6
  103. novel_downloader/utils/model_loader.py +5 -8
  104. novel_downloader/utils/network.py +9 -10
  105. novel_downloader/utils/state.py +6 -7
  106. novel_downloader/utils/text_utils/__init__.py +0 -1
  107. novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
  108. novel_downloader/utils/text_utils/diff_display.py +0 -1
  109. novel_downloader/utils/text_utils/font_mapping.py +1 -4
  110. novel_downloader/utils/text_utils/text_cleaning.py +0 -1
  111. novel_downloader/utils/time_utils/__init__.py +0 -1
  112. novel_downloader/utils/time_utils/datetime_utils.py +9 -11
  113. novel_downloader/utils/time_utils/sleep_utils.py +27 -13
  114. {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/METADATA +14 -17
  115. novel_downloader-1.3.0.dist-info/RECORD +127 -0
  116. {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/WHEEL +1 -1
  117. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +0 -95
  118. novel_downloader/core/requesters/base_browser.py +0 -210
  119. novel_downloader/core/requesters/base_session.py +0 -243
  120. novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
  121. novel_downloader/core/requesters/common_requester/common_session.py +0 -126
  122. novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
  123. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -377
  124. novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
  125. novel_downloader/resources/config/settings.yaml +0 -76
  126. novel_downloader-1.2.1.dist-info/RECORD +0 -115
  127. {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/entry_points.txt +0 -0
  128. {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/licenses/LICENSE +0 -0
  129. {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
3
  novel_downloader.core.interfaces
5
4
  --------------------------------
@@ -15,18 +14,18 @@ Included protocols:
15
14
  - SaverProtocol
16
15
  """
17
16
 
18
- from .async_downloader_protocol import AsyncDownloaderProtocol
19
- from .async_requester_protocol import AsyncRequesterProtocol
20
- from .downloader_protocol import DownloaderProtocol
21
- from .parser_protocol import ParserProtocol
22
- from .requester_protocol import RequesterProtocol
23
- from .saver_protocol import SaverProtocol
17
+ from .async_downloader import AsyncDownloaderProtocol
18
+ from .async_requester import AsyncRequesterProtocol
19
+ from .parser import ParserProtocol
20
+ from .saver import SaverProtocol
21
+ from .sync_downloader import SyncDownloaderProtocol
22
+ from .sync_requester import SyncRequesterProtocol
24
23
 
25
24
  __all__ = [
26
25
  "AsyncDownloaderProtocol",
27
26
  "AsyncRequesterProtocol",
28
- "DownloaderProtocol",
29
27
  "ParserProtocol",
30
- "RequesterProtocol",
31
28
  "SaverProtocol",
29
+ "SyncDownloaderProtocol",
30
+ "SyncRequesterProtocol",
32
31
  ]
@@ -1,26 +1,25 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.async_downloader_protocol
3
+ novel_downloader.core.interfaces.async_downloader
5
4
  ----------------------------------------------------------
6
5
 
7
6
  This module defines the AsyncDownloaderProtocol, a structural interface
8
7
  that outlines the expected behavior of any downloader class.
9
8
  """
10
9
 
11
- from typing import List, Protocol
10
+ from typing import Protocol
12
11
 
13
12
 
14
13
  class AsyncDownloaderProtocol(Protocol):
15
14
  """
16
- Protocol for fullyasynchronous downloader classes.
15
+ Protocol for fully-asynchronous downloader classes.
17
16
 
18
17
  Defines the expected interface for any downloader implementation,
19
18
  including both batch and single book downloads,
20
19
  as well as optional pre-download hooks.
21
20
  """
22
21
 
23
- async def download(self, book_ids: List[str]) -> None:
22
+ async def download(self, book_ids: list[str]) -> None:
24
23
  """
25
24
  Batch download entry point.
26
25
 
@@ -1,15 +1,14 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.async_requester_protocol
3
+ novel_downloader.core.interfaces.async_requester
5
4
  --------------------------------------------------------
6
5
 
7
6
  Defines the AsyncRequesterProtocol interface for fetching raw HTML or JSON
8
7
  for book info pages, individual chapters, managing request lifecycle,
9
- and optionally retrieving a user's authenticated bookcase — all in async style.
8
+ and optionally retrieving a user's authenticated bookcase.
10
9
  """
11
10
 
12
- from typing import Optional, Protocol, runtime_checkable
11
+ from typing import Any, Literal, Protocol, runtime_checkable
13
12
 
14
13
 
15
14
  @runtime_checkable
@@ -21,7 +20,16 @@ class AsyncRequesterProtocol(Protocol):
21
20
  and manage login/shutdown asynchronously.
22
21
  """
23
22
 
24
- async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
23
+ def is_async(self) -> Literal[True]:
24
+ ...
25
+
26
+ async def login(
27
+ self,
28
+ username: str = "",
29
+ password: str = "",
30
+ manual_login: bool = False,
31
+ **kwargs: Any,
32
+ ) -> bool:
25
33
  """
26
34
  Attempt to log in asynchronously.
27
35
  :returns: True if login succeeded.
@@ -29,41 +37,47 @@ class AsyncRequesterProtocol(Protocol):
29
37
  ...
30
38
 
31
39
  async def get_book_info(
32
- self, book_id: str, wait_time: Optional[float] = None
40
+ self,
41
+ book_id: str,
42
+ **kwargs: Any,
33
43
  ) -> str:
34
44
  """
35
45
  Fetch the raw HTML (or JSON) of the book info page asynchronously.
36
46
 
37
47
  :param book_id: The book identifier.
38
- :param wait_time: Base number of seconds to wait before returning content.
39
48
  :return: The page content as a string.
40
49
  """
41
50
  ...
42
51
 
43
52
  async def get_book_chapter(
44
- self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
53
+ self,
54
+ book_id: str,
55
+ chapter_id: str,
56
+ **kwargs: Any,
45
57
  ) -> str:
46
58
  """
47
59
  Fetch the raw HTML (or JSON) of a single chapter asynchronously.
48
60
 
49
61
  :param book_id: The book identifier.
50
62
  :param chapter_id: The chapter identifier.
51
- :param wait_time: Base number of seconds to wait before returning content.
52
63
  :return: The chapter content as a string.
53
64
  """
54
65
  ...
55
66
 
56
- async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
67
+ async def get_bookcase(
68
+ self,
69
+ page: int = 1,
70
+ **kwargs: Any,
71
+ ) -> str:
57
72
  """
58
73
  Optional: Retrieve the HTML content of the authenticated
59
74
  user's bookcase page asynchronously.
60
75
 
61
- :param wait_time: Base number of seconds to wait before returning content.
62
76
  :return: The HTML markup of the bookcase page.
63
77
  """
64
78
  ...
65
79
 
66
- async def shutdown(self) -> None:
80
+ async def close(self) -> None:
67
81
  """
68
82
  Shutdown and clean up any resources (e.g., close aiohttp session).
69
83
  """
@@ -1,14 +1,15 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.parser_protocol
5
- ------------------------------------------------
3
+ novel_downloader.core.interfaces.parser
4
+ ---------------------------------------
6
5
 
7
6
  Defines the ParserProtocol interface for extracting book metadata,
8
7
  parsing individual chapter content, and setting parser context via book_id.
9
8
  """
10
9
 
11
- from typing import Any, Dict, Protocol, runtime_checkable
10
+ from typing import Any, Protocol, runtime_checkable
11
+
12
+ from novel_downloader.utils.chapter_storage import ChapterDict
12
13
 
13
14
 
14
15
  @runtime_checkable
@@ -20,7 +21,7 @@ class ParserProtocol(Protocol):
20
21
  - accept a book_id context for multi-step workflows.
21
22
  """
22
23
 
23
- def parse_book_info(self, html_str: str) -> Dict[str, Any]:
24
+ def parse_book_info(self, html_str: str) -> dict[str, Any]:
24
25
  """
25
26
  Parse and return a dictionary of book information from the raw HTML.
26
27
 
@@ -29,7 +30,11 @@ class ParserProtocol(Protocol):
29
30
  """
30
31
  ...
31
32
 
32
- def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
33
+ def parse_chapter(
34
+ self,
35
+ html_str: str,
36
+ chapter_id: str,
37
+ ) -> ChapterDict | None:
33
38
  """
34
39
  Parse and return the text content of one chapter.
35
40
 
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.saver_protocol
5
- ------------------------------------------------
3
+ novel_downloader.core.interfaces.saver
4
+ --------------------------------------
6
5
 
7
6
  Defines the SaverProtocol interface for persisting completed books in
8
7
  TXT, EPUB, Markdown, and PDF formats.
@@ -1,17 +1,16 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.downloader_protocol
5
- ----------------------------------------------------
3
+ novel_downloader.core.interfaces.sync_downloader
4
+ ------------------------------------------------
6
5
 
7
- This module defines the DownloaderProtocol, a structural interface
6
+ This module defines the SyncDownloaderProtocol, a structural interface
8
7
  that outlines the expected behavior of any downloader class.
9
8
  """
10
9
 
11
- from typing import List, Protocol
10
+ from typing import Protocol
12
11
 
13
12
 
14
- class DownloaderProtocol(Protocol):
13
+ class SyncDownloaderProtocol(Protocol):
15
14
  """
16
15
  Protocol for downloader classes.
17
16
 
@@ -20,7 +19,7 @@ class DownloaderProtocol(Protocol):
20
19
  as well as optional pre-download hooks.
21
20
  """
22
21
 
23
- def download(self, book_ids: List[str]) -> None:
22
+ def download(self, book_ids: list[str]) -> None:
24
23
  """
25
24
  Batch download entry point.
26
25
 
@@ -1,65 +1,82 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.interfaces.requester_protocol
5
- --------------------------------------------------
3
+ novel_downloader.core.interfaces.sync_requester
4
+ -----------------------------------------------
6
5
 
7
6
  Defines the SyncRequesterProtocol interface for fetching raw HTML or JSON
8
7
  for book info pages, individual chapters, managing request lifecycle,
9
8
  and optionally retrieving a user's authenticated bookcase.
10
9
  """
11
10
 
12
- from typing import Optional, Protocol, runtime_checkable
11
+ from typing import Any, Literal, Protocol, runtime_checkable
13
12
 
14
13
 
15
14
  @runtime_checkable
16
- class RequesterProtocol(Protocol):
15
+ class SyncRequesterProtocol(Protocol):
17
16
  """
18
17
  A requester must be able to fetch raw HTML/data for:
19
18
  - a book's info page,
20
19
  - a specific chapter page.
21
20
  """
22
21
 
23
- def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
22
+ def is_async(self) -> Literal[False]:
23
+ ...
24
+
25
+ def login(
26
+ self,
27
+ username: str = "",
28
+ password: str = "",
29
+ manual_login: bool = False,
30
+ **kwargs: Any,
31
+ ) -> bool:
24
32
  """
25
33
  Attempt to log in
26
34
  """
27
35
  ...
28
36
 
29
- def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
37
+ def get_book_info(
38
+ self,
39
+ book_id: str,
40
+ **kwargs: Any,
41
+ ) -> str:
30
42
  """
31
43
  Fetch the raw HTML (or JSON) of the book info page.
32
44
 
33
45
  :param book_id: The book identifier.
34
- :param wait_time: Base number of seconds to wait before returning content.
35
46
  :return: The page content as a string.
36
47
  """
37
48
  ...
38
49
 
39
50
  def get_book_chapter(
40
- self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
51
+ self,
52
+ book_id: str,
53
+ chapter_id: str,
54
+ **kwargs: Any,
41
55
  ) -> str:
42
56
  """
43
57
  Fetch the raw HTML (or JSON) of a single chapter.
44
58
 
45
59
  :param book_id: The book identifier.
46
60
  :param chapter_id: The chapter identifier.
47
- :param wait_time: Base number of seconds to wait before returning content.
48
61
  :return: The chapter content as a string.
49
62
  """
50
63
  ...
51
64
 
52
- def shutdown(self) -> None:
65
+ def get_bookcase(
66
+ self,
67
+ page: int = 1,
68
+ **kwargs: Any,
69
+ ) -> str:
53
70
  """
54
- Shutdown and cleans up resources.
71
+ Optional: Retrieve the HTML content of the authenticated user's bookcase page.
72
+
73
+ :param page: Page index of the bookcase listing (defaults to 1).
74
+ :return: The HTML markup of the bookcase page.
55
75
  """
56
76
  ...
57
77
 
58
- def get_bookcase(self, wait_time: Optional[float] = None) -> str:
78
+ def close(self) -> None:
59
79
  """
60
- Optional: Retrieve the HTML content of the authenticated user's bookcase page.
61
-
62
- :param wait_time: Base number of seconds to wait before returning content.
63
- :return: The HTML markup of the bookcase page.
80
+ Shut down and clean up resources.
64
81
  """
65
82
  ...
@@ -1,5 +1,4 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
3
  novel_downloader.core.parsers
5
4
  -----------------------------
@@ -8,20 +7,22 @@ This package defines all site-specific parsing modules
8
7
  for the novel_downloader framework.
9
8
 
10
9
  Currently supported:
11
- - Qidian (起点中文网) via browser-rendered page parsing.
10
+ - Qidian (起点中文网)
12
11
 
13
12
  Modules:
14
13
  - qidian
15
14
  - common
16
15
  """
17
16
 
18
- from .common_parser import CommonParser
19
- from .qidian_parser import (
17
+ from .biquge import BiqugeParser
18
+ from .common import CommonParser
19
+ from .qidian import (
20
20
  QidianBrowserParser,
21
21
  QidianSessionParser,
22
22
  )
23
23
 
24
24
  __all__ = [
25
+ "BiqugeParser",
25
26
  "CommonParser",
26
27
  "QidianBrowserParser",
27
28
  "QidianSessionParser",
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.parsers.base_parser
5
- -----------------------------------------
3
+ novel_downloader.core.parsers.base
4
+ ----------------------------------
6
5
 
7
6
  This module defines the BaseParser abstract class, which implements the
8
7
  ParserProtocol interface and provides a structured foundation for
@@ -16,10 +15,11 @@ a standard parsing interface for:
16
15
 
17
16
  import abc
18
17
  from pathlib import Path
19
- from typing import Any, Dict, Optional
18
+ from typing import Any
20
19
 
21
20
  from novel_downloader.config import ParserConfig
22
21
  from novel_downloader.core.interfaces import ParserProtocol
22
+ from novel_downloader.utils.chapter_storage import ChapterDict
23
23
 
24
24
 
25
25
  class BaseParser(ParserProtocol, abc.ABC):
@@ -33,43 +33,51 @@ class BaseParser(ParserProtocol, abc.ABC):
33
33
  Subclasses must implement actual parsing logic for specific sites.
34
34
  """
35
35
 
36
- def __init__(self, config: ParserConfig):
36
+ def __init__(
37
+ self,
38
+ config: ParserConfig,
39
+ ):
37
40
  """
38
41
  Initialize the parser with a configuration object.
39
42
 
40
43
  :param config: ParserConfig object controlling parsing behavior.
41
44
  """
42
45
  self._config = config
43
- self._book_id: Optional[str] = None
46
+ self._book_id: str | None = None
44
47
 
45
48
  self._base_cache_dir = Path(config.cache_dir)
49
+ self._cache_dir = self._base_cache_dir
46
50
 
47
51
  @abc.abstractmethod
48
- def parse_book_info(self, html: str) -> Dict[str, Any]:
52
+ def parse_book_info(self, html_str: str) -> dict[str, Any]:
49
53
  """
50
54
  Parse a book info page and extract metadata and chapter structure.
51
55
 
52
56
  Depending on the site structure, the return dict may include a
53
57
  flat `chapters` list or nested `volumes` with chapter groups.
54
58
 
55
- :param html: Raw HTML of the book info page.
59
+ :param html_str: Raw HTML of the book info page.
56
60
  :return: Parsed metadata and chapter structure as a dictionary.
57
61
  """
58
62
  ...
59
63
 
60
64
  @abc.abstractmethod
61
- def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
65
+ def parse_chapter(
66
+ self,
67
+ html_str: str,
68
+ chapter_id: str,
69
+ ) -> ChapterDict | None:
62
70
  """
63
71
  Parse a single chapter page and extract clean text or simplified HTML.
64
72
 
65
- :param html: Raw HTML of the chapter page.
73
+ :param html_str: Raw HTML of the chapter page.
66
74
  :param chapter_id: Identifier of the chapter being parsed.
67
75
  :return: Cleaned chapter content as plain text or minimal HTML.
68
76
  """
69
77
  ...
70
78
 
71
79
  @property
72
- def book_id(self) -> Optional[str]:
80
+ def book_id(self) -> str | None:
73
81
  """
74
82
  Current book ID in context.
75
83
 
@@ -85,6 +93,7 @@ class BaseParser(ParserProtocol, abc.ABC):
85
93
  :param value: Book identifier.
86
94
  """
87
95
  self._book_id = value
96
+ self._cache_dir = self._base_cache_dir / value
88
97
  self._on_book_id_set()
89
98
 
90
99
  def _on_book_id_set(self) -> None:
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.biquge
4
+ ------------------------------------
5
+
6
+ """
7
+
8
+ from .main_parser import BiqugeParser
9
+
10
+ __all__ = ["BiqugeParser"]
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.biquge.main_parser
4
+ ------------------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import etree
12
+ from lxml.etree import _Element
13
+
14
+ from novel_downloader.core.parsers.base import BaseParser
15
+ from novel_downloader.utils.chapter_storage import ChapterDict
16
+
17
+
18
+ class BiqugeParser(BaseParser):
19
+ """ """
20
+
21
+ def parse_book_info(self, html_str: str) -> dict[str, Any]:
22
+ """
23
+ Parse a book info page and extract metadata and chapter structure.
24
+
25
+ :param html_str: Raw HTML of the book info page.
26
+ :return: Parsed metadata and chapter structure as a dictionary.
27
+ """
28
+ tree = etree.HTML(html_str, parser=None)
29
+ result: dict[str, Any] = {}
30
+
31
+ def extract_text(elem: _Element | None) -> str:
32
+ if elem is None:
33
+ return ""
34
+ return "".join(elem.itertext(tag=None)).strip()
35
+
36
+ # 书名
37
+ book_name_elem = tree.xpath('//div[@id="info"]/h1')
38
+ result["book_name"] = extract_text(book_name_elem[0]) if book_name_elem else ""
39
+
40
+ # 作者
41
+ author_elem = tree.xpath('//div[@id="info"]/p[1]')
42
+ if author_elem:
43
+ author_text = extract_text(author_elem[0]).replace("\u00a0", "")
44
+ match = re.search(r"作\s*者[::]?\s*(\S+)", author_text)
45
+ result["author"] = match.group(1).strip() if match else ""
46
+ else:
47
+ result["author"] = ""
48
+
49
+ # 封面
50
+ cover_elem = tree.xpath('//div[@id="fmimg"]/img/@src')
51
+ result["cover_url"] = cover_elem[0].strip() if cover_elem else ""
52
+
53
+ # 最后更新时间
54
+ update_elem = tree.xpath('//div[@id="info"]/p[3]')
55
+ if update_elem:
56
+ update_text = extract_text(update_elem[0])
57
+ match = re.search(r"最后更新[::]\s*(\S+)", update_text)
58
+ result["update_time"] = match.group(1).strip() if match else ""
59
+ else:
60
+ result["update_time"] = ""
61
+
62
+ # 简介
63
+ intro_elem = tree.xpath('//div[@id="intro"]')
64
+ result["summary"] = extract_text(intro_elem[0]) if intro_elem else ""
65
+
66
+ # 卷和章节
67
+ chapters = []
68
+ in_main_volume = False
69
+
70
+ list_dl = tree.xpath('//div[@id="list"]/dl')[0]
71
+ for elem in list_dl:
72
+ if elem.tag == "dt":
73
+ text = "".join(elem.itertext()).strip()
74
+ in_main_volume = "正文" in text
75
+ elif in_main_volume and elem.tag == "dd":
76
+ a: list[_Element] = elem.xpath("./a")
77
+ if a:
78
+ title = "".join(a[0].itertext(tag=None)).strip()
79
+ url = a[0].get("href", "").strip()
80
+ href_cleaned = url.replace(".html", "")
81
+ chapter_id_match = re.search(r"/(\d+)$", href_cleaned)
82
+ chapter_id = chapter_id_match.group(1) if chapter_id_match else ""
83
+ chapters.append(
84
+ {"title": title, "url": url, "chapterId": chapter_id}
85
+ )
86
+
87
+ result["volumes"] = [{"volume_name": "正文", "chapters": chapters}]
88
+
89
+ return result
90
+
91
+ def parse_chapter(
92
+ self,
93
+ html_str: str,
94
+ chapter_id: str,
95
+ ) -> ChapterDict | None:
96
+ """
97
+ Parse a single chapter page and extract clean text or simplified HTML.
98
+
99
+ :param html_str: Raw HTML of the chapter page.
100
+ :param chapter_id: Identifier of the chapter being parsed.
101
+ :return: Cleaned chapter content as plain text or minimal HTML.
102
+ """
103
+ tree = etree.HTML(html_str, parser=None)
104
+
105
+ # 提取标题
106
+ title_elem = tree.xpath('//div[@class="bookname"]/h1')
107
+ title = "".join(title_elem[0].itertext()).strip() if title_elem else ""
108
+ if not title:
109
+ title = f"第 {chapter_id} 章"
110
+
111
+ # 提取内容
112
+ content_elem = tree.xpath('//div[@id="content"]')
113
+ paragraphs = content_elem[0].xpath(".//p") if content_elem else []
114
+ paragraph_texts = [
115
+ "".join(p.itertext()).strip() for p in paragraphs if p is not None
116
+ ]
117
+ content = "\n\n".join([p for p in paragraph_texts if p])
118
+ if not content.strip():
119
+ return None
120
+
121
+ return {
122
+ "id": chapter_id,
123
+ "title": title,
124
+ "content": content,
125
+ "extra": {"site": "biquge"},
126
+ }
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
2
  """
4
- novel_downloader.core.parsers.common_parser
5
- -------------------------------------------
3
+ novel_downloader.core.parsers.common
4
+ ------------------------------------
6
5
 
7
6
  This module provides a CommonParser class that implements
8
7
  general-purpose parsing logic for extracting novel metadata